Today, compiled autograd runs in two phases:
- a make_fx-like phase that uses FakeTensors + fx.Proxy
to create an fx.Graph from the current autograd graph
- a second phase that applies torch.compile to the result of
the previous phase.
This PR changes it so that compiled autograd no longer uses FakeTensors in
its first phase.
At a high level:
- [Here's an example of the new graph](https://gist.github.com/zou3519/20272a3e31124621843f53ae66671ed7)
compiled autograd's first phase produces.
- In order to acquire this graph, we get compiled autograd to effectively
torch.fx.symbolic_trace over a new `python_autograd` function that runs the
autograd graph.
- The graph contains calls to `apply_with_saved`, which is a way to apply a
given node with some inputs and some specific saved values. This is different
from the existing `Node::apply_with_saved` because that one accepts
the saved values for the *entire graph*.
- There are also calls to `validate_outputs`, which also needs some
saved values because it needs to swizzle out input metadata state.
- We support graph breaks on unsupported C++ custom ops by emitting
a special `apply_with_saved_dynamo_disabled` function. The state of
C++ torch::autograd::Function is completely iterable by us, since
we ask users to only save values via `ctx->save_for_backward` and
`ctx->saved[...]`.
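Below is a rough, illustrative sketch of what the traced `python_autograd` function conceptually does; the names, attributes (`saved_values`, `next_nodes`), and helpers passed in as callables are assumptions for exposition, not the actual implementation: walk the autograd graph node by node, applying each node with its own saved values and validating/fixing output metadata.
```python
def python_autograd_sketch(topo_ordered_nodes, grad_outputs,
                           apply_with_saved, validate_outputs):
    # Illustrative only: run the autograd graph one node at a time.
    pending_inputs = {topo_ordered_nodes[0]: list(grad_outputs)}
    leaf_grads = []
    for node in topo_ordered_nodes:
        inputs = pending_inputs.pop(node, [])
        # apply this node with *its own* saved values (not the whole graph's)
        outputs = apply_with_saved(node, inputs, node.saved_values)
        # swizzle/validate output metadata against the node's input metadata
        outputs = validate_outputs(node, outputs)
        for consumer, grad in zip(node.next_nodes, outputs):
            if consumer is None:  # reached a leaf / accumulate-grad
                leaf_grads.append(grad)
            else:
                pending_inputs.setdefault(consumer, []).append(grad)
    return leaf_grads
```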
There's a long tail of things that don't work yet:
- we don't support all types of hooks yet
- we don't inline user-defined autograd.Function into this graph yet
- we don't inline the backward of torch.compile'd regions
- we need to somehow free the autograd graph when we're done with it
- many more TODOs inline.
ghstack-source-id: 23a98023d271db220a29db66631e9087fb8e2325
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138101
Added an optimization pass to the swap function which removes extraneous pytrees. Currently it removes the pytree flatten/unflatten calls between modules in very specific scenarios (all the inputs of one module go into the other).
Future work could be to remove the input pytree.flatten if the inputs go directly into an unflatten, and the output pytree.unflatten if the outputs come directly from a pytree.flatten.
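As a hedged illustration of the pattern the pass removes (module and variable names are made up, not the actual graph), the flatten/unflatten round-trip between two modules is dropped when all inputs of the second module come from the first:
```python
# Before the pass (illustrative pseudograph):
#   out_a      = call_module("module_a", (x,))
#   flat, spec = pytree.tree_flatten(out_a)
#   inp_b      = pytree.tree_unflatten(flat, spec)
#   out_b      = call_module("module_b", (inp_b,))
#
# After the pass, the redundant round-trip is removed:
#   out_a = call_module("module_a", (x,))
#   out_b = call_module("module_b", (out_a,))
```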
Differential Revision: [D62879820](https://our.internmc.facebook.com/intern/diff/D62879820)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136191
Approved by: https://github.com/avikchaudhuri
Currently FSDP2 supports only CUDA; for other backends that need to use FSDP2 it won't work, since streams and events are CUDA-based. To support other backends, use
`_get_device_handle` with the device type to get the device class, and use that
for streams and events.
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136843
Approved by: https://github.com/awgu
Summary: In unflatten, when we generate module calls for submodules whose signature has been preserved, we do not pass the original constant args. This can cause strange effects, e.g., if the module is swapped out with itself, we may suddenly go down a different path than the original, or even crash.
Test Plan: added a test
Reviewed By: angelayi
Differential Revision: D63913750
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137363
Approved by: https://github.com/angelayi
Fixes #136720
the result in this case says:
```
Traceback (most recent call last):
File "/Users/shenke/workspace/pytorch/mytest.py", line 9, in <module>
result = torch.bincount(input)
^^^^^^^^^^^^^^^^^^^^^
RuntimeError: maximum value of input overflowed, it should be < 9223372036854775807 but got 9223372036854775807
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136745
Approved by: https://github.com/Skylion007
Thanks @awgu for raising this issue and the small repro
From offline discussion with @albanD, in the case where a forward returns multiple outputs with different devices, we'd want to select the ready queue based on the device of the first one. Even though this is somewhat arbitrary, we prefer this over deciding which ready queue to push to based on whichever input buffer we happen to compute last, which can vary depending on more factors and thus be harder to reason about. This is in theory bc-breaking, but it seems unlikely that someone would depend on this behavior.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135633
Approved by: https://github.com/albanD
This change modifies the `hipify_python.py` script to properly detect all directories, `include` and `ignore` paths during the hipification process on Windows, by changing the path syntax convention to a UNIX-like one.
Since in many places the script assumes a UNIX-like convention by using paths with forward slashes `/`, I decided to accommodate it by converting Windows paths to UNIX-like ones. By doing so, the number of changes to the file is limited. Moreover, this early-on unification allows the rest of the code to keep its battle-tested Linux-like behaviour.
Another option would be to use the `Path` object from `pathlib` to represent all paths in the script; however, it would impact a broader share of the code and would hence require a more meticulous evaluation in terms of non-altered logic and edge cases.
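A minimal sketch of the kind of normalization described (illustrative only, not the exact helper used in `hipify_python.py`):
```python
import os

def to_unix_path(p: str) -> str:
    # Normalize both the platform separator and literal backslashes to "/"
    return p.replace(os.sep, "/").replace("\\", "/")

print(to_unix_path(r"C:\pytorch\aten\src"))  # C:/pytorch/aten/src
```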
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135360
Approved by: https://github.com/jeffdaily, https://github.com/jithunnair-amd
Summary:
X-link: https://github.com/pytorch/executorch/pull/5720
For smaller models the overhead of profiling ops might be prohibitively large (distorting the inference execution time significantly) so we provide users an option to disable op profiling and essentially only profile the important events such as inference execution time.
To disable operator profiling users need to do:
```
etdump_gen.set_event_tracer_profiling_level(executorch::runtime::EventTracerProfilingLevel::kNoOperatorProfiling);
```
Test Plan: Added test case.
Differential Revision: D61883224
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136838
Approved by: https://github.com/dbort
If the function is
```func(a, b, c)```
and is called as
```func(a=1, b=.., c=..)```
then before this change we did not iterate over `a`, `b`, `c`, since those appear in kwargs; this diff fixes that issue.
This function is used in `_inductor/ir.py` to iterate over custom op arguments, and when a custom pass makes changes
and passes arguments as kwargs, we were not processing them.
```
for info, arg in torch._library.utils.zip_schema(schema, args, kwargs):
handle_aliasing_and_mutation(info, arg)
```
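As a hedged illustration of the intended pairing logic (this is not the real `torch._library.utils.zip_schema` implementation, just a sketch assuming a simplified schema object):
```python
def zip_schema_sketch(schema, args, kwargs):
    # Pair each schema argument with whatever value was actually supplied,
    # whether it came positionally or as a keyword argument.
    for i, info in enumerate(schema.arguments):
        if i < len(args):
            yield info, args[i]
        elif info.name in kwargs:
            yield info, kwargs[info.name]
        # arguments left at their defaults are simply skipped
```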
Fix https://github.com/pytorch/pytorch/issues/137057
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137311
Approved by: https://github.com/zou3519
Fixes https://github.com/pytorch/pytorch/issues/136358
The bug here is that the Tensor object is actually 2 classes: `Tensor` from `_tensor.py` and `TensorBase` from c++.
Before this PR, they have the following gc methods:
Tensor:
- tp_clear subtype_clear
- tp_traverse THPVariable_subclass_traverse
- tp_dealloc THPVariable_subclass_dealloc
TensorBase:
- tp_clear THPVariable_clear
- tp_traverse THPFunction_traverse (fake function that is just an error)
- tp_dealloc object_dealloc
The problem is that when clear is called on the Tensor, subtype_clear is going to clear the things owned by the "Tensor" type, in particular, its `__dict__` attribute, before delegating to the TensorBase clear where we detect that resurrection needs to happen and skip it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137267
Approved by: https://github.com/ezyang, https://github.com/kshitij12345
This PR relaxes the even sharding requirement for the all-gather extensions.
The `fsdp_pre_all_gather` now expects signature:
```diff
def fsdp_pre_all_gather(
self,
mesh: DeviceMesh,
+ outer_size: torch.Size,
+ outer_stride: Tuple[int, ...],
module: nn.Module,
mp_policy: MixedPrecisionPolicy,
) -> Tuple[Tuple[torch.Tensor, ...], Any]:
```
- Since no one is using this new signature yet, we should be safe to change it.
- Currently, the `outer_stride` will always be contiguous strides since FSDP2 only supports contiguous strides for now.
- For the uneven sharding case, the user is responsible for returning a padded sharded tensor from `fsdp_pre_all_gather`. This is risky territory because if the user does not do so, then this may manifest as a NCCL timeout, as only the ranks with padding will error out. However, I am not aware of any way around this.
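A hedged sketch of an extension adopting the new signature and handling uneven sharding by padding its shard; the subclass, the `_local_shard` attribute, and the padding policy are illustrative assumptions, not a prescribed API:
```python
from typing import Any, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


class MyShardedWeight(torch.Tensor):
    def fsdp_pre_all_gather(
        self,
        mesh,                           # DeviceMesh
        outer_size: torch.Size,         # size of the unsharded parameter
        outer_stride: Tuple[int, ...],  # currently always contiguous strides
        module: nn.Module,
        mp_policy,                      # MixedPrecisionPolicy
    ) -> Tuple[Tuple[torch.Tensor, ...], Any]:
        shard = self._local_shard  # illustrative: local shard held by the subclass
        # Uneven sharding: pad dim 0 so every rank contributes the same numel;
        # forgetting to do this may surface as a NCCL timeout on the padded ranks.
        world_size = mesh.size()
        per_rank_dim0 = -(-outer_size[0] // world_size)  # ceil division
        pad_rows = per_rank_dim0 - shard.size(0)
        if pad_rows > 0:
            shard = F.pad(shard, [0, 0] * (shard.dim() - 1) + [0, pad_rows])
        return (shard,), None
```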
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137005
Approved by: https://github.com/weifengpy
Summary: We had attribute assignment detection and handling of registered buffer assignments when using `aot_autograd`, but not when using just `make_fx`. Fixed.
Test Plan: expanded coverage of `test_state_tensors` to use `export` instead of `torch.export.export`
Differential Revision: D63802576
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137240
Approved by: https://github.com/tugsbayasgalan
Adds lowering for `aten.searchsorted`. This entails:
1. Adding support for multi-dimensional bucket tensors to `ops.bucketize`.
2. Adding support for striding to `ops.bucketize`.
3. Adding support for sorting tensors to `ops.bucketize`.
4. Adding a lowering for `aten.searchsorted.Tensor`.
5. Adding a basic decomposition for `aten.searchsorted.Scalar` that calls into the lowering for tensors.
6. Updating the meta-function for `aten.searchsorted` to properly check some of the sizing conditions.
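For reference, this is the eager-mode behavior the lowering needs to reproduce (a minimal illustration; the compiled path should match it):
```python
import torch

boundaries = torch.tensor([1.0, 3.0, 5.0, 7.0, 9.0])
values = torch.tensor([3.0, 6.0, 9.0])

# index of the first boundary >= each value (right=False, the default)
print(torch.searchsorted(boundaries, values))  # tensor([1, 3, 4])
# the Scalar overload decomposes into the Tensor overload
print(torch.searchsorted(boundaries, 6.0))     # tensor(3)
```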
Closes #135873
Differential Revision: [D63766514](https://our.internmc.facebook.com/intern/diff/D63766514)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135701
Approved by: https://github.com/amjames, https://github.com/eellison, https://github.com/davidberard98
Summary:
As title.
Also change it in the `prepare_pt2e()` docstring.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:quantization_pt2e_qat
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization
```
Differential Revision: D63345059
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137233
Approved by: https://github.com/tugsbayasgalan
Summary:
Special autotuning configs like `num_warps` and `num_stages` can be passed to the kernel as parameters. The `config.all_kwargs()` call [here](762a7d197c/python/triton/runtime/autotuner.py (L106)) in the Triton code includes those special configs (names and values) among the potential arguments to the kernel. [Here](762a7d197c/python/triton/runtime/jit.py (L613)) some of those may be included in the actual kernel arguments, given that their names are present among the kernel parameters.
This PR replicates this behavior in user-defined Triton kernel compilation in PT2. Resolves #136550.
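A hedged sketch of the kind of user-defined kernel this affects (the kernel body is illustrative; what matters is that `num_warps`/`num_stages` appear both in the autotuner config and among the kernel parameters, so the config values are forwarded as ordinary arguments):
```python
import triton
import triton.language as tl

@triton.autotune(
    configs=[triton.Config({"BLOCK": 128}, num_warps=4, num_stages=2)],
    key=[],
)
@triton.jit
def scale_by_num_warps_kernel(x_ptr, out_ptr, n_elements,
                              num_warps: "tl.constexpr",
                              BLOCK: "tl.constexpr"):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    # `num_warps` is both a config field and a kernel parameter,
    # so its config value is visible inside the kernel body.
    tl.store(out_ptr + offsets, x * num_warps, mask=mask)
```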
Test Plan:
```
$ python test/inductor/test_triton_kernels.py -k test_triton_kernel_special_params
inductor []
inline_call []
stats [('calls_captured', 2), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.inductor []
inline_call []
stats [('calls_captured', 2), ('unique_graphs', 1)]
.inductor [('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1), ('possibly_missed_reinplacing_opportunities', 0), ('possibly_missed_reinplacing_bytes', 0)]
inline_call []
stats [('calls_captured', 2), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.inductor []
inline_call []
stats [('calls_captured', 2), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.inductor []
inline_call []
stats [('calls_captured', 2), ('unique_graphs', 1)]
.inductor [('benchmarking.TritonBenchmarker.benchmark_gpu', 2), ('fxgraph_cache_bypass', 1), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('extern_calls', 1), ('benchmarking.TritonBenchmarker.triton_do_bench', 1), ('possibly_missed_reinplacing_opportunities', 0), ('possibly_missed_reinplacing_bytes', 0)]
inline_call []
stats [('calls_captured', 2), ('unique_graphs', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.
----------------------------------------------------------------------
Ran 6 tests in 6.283s
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137236
Approved by: https://github.com/zou3519
Summary:
When we handle dynamic shapes markers like `Dim.AUTO, Dim.DYNAMIC`, we use dynamo decorators, attaching set attributes to the export input tensors, e.g. `x._dynamo_dynamic_indices = set()`.
I thought this was fine, since it's done all the time with torch.compile, but it breaks some PT2Inference tests, specifically because unpickling a set attribute isn't possible with the C++ torch::jit::pickle_load call.
We've agreed that the PT2Inference side will clone sample inputs & pickle the original inputs to be safe, but this still establishes a nice invariant that user-facing decorators are both ignored & cleaned out in the lifecycle of an export call.
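For context, a minimal example of the kind of export call that uses these markers (shapes and module are illustrative):
```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x):
        return x * 2

x = torch.randn(4, 8)
# Dim.AUTO lets export infer dynamism; internally this is implemented with
# dynamo decorators on the sample inputs, which this PR now cleans up after.
ep = export(M(), (x,), dynamic_shapes={"x": {0: Dim.AUTO}})
```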
Test Plan: test_export
Differential Revision: D63773534
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137230
Approved by: https://github.com/avikchaudhuri
When the stub file `nn/parallel/distributed.pyi` was removed (#88701), some types that existed are no longer available. This pull request adds them back.
Just for reference, these types are used in pytorch-lightning's LightningCLI. Command line interfaces are created automatically, and having type hints make them nicer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136835
Approved by: https://github.com/kwen2501
We didn't support multiple levels of vmap. The main problem is, during
the batching rule, we need to exclude the vmap dispatch key
(FuncTorchBatched) like how our C++ batching rules do it.
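A minimal illustration of what multiple levels of vmap looks like from the user side (independent of the specific operator this PR fixes):
```python
import torch

def f(x):
    return x.sin()

x = torch.randn(3, 4)
# outer vmap maps over dim 0; inner vmap maps over dim 0 of each slice
out = torch.vmap(torch.vmap(f))(x)
assert torch.allclose(out, x.sin())
```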
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137306
Approved by: https://github.com/Chillee
This PR adds new meta functions for `lerp`, `addcmul`, and `addcdiv` (including their
respective inplace versions).
These functions only had refs implementations, which was the root cause of
significant overhead ([issue][1]) when running the `AdamW` optimizer step on the PyTorch/XLA
backend. Running the meta functions resulted in the following improvements:
- `lerp` calls: 1,550ms to 140ms (10x)
- `addcdiv` calls: 640ms to 350ms (1.8x)
- `addcmul` calls: 620ms to 300ms (2.05x)
[1]: https://github.com/pytorch/xla/issues/7923
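The practical effect of having a real meta function is that shape/dtype propagation can run on the meta device without touching data; a minimal illustration:
```python
import torch

a = torch.empty(1024, device="meta")
b = torch.empty(1024, device="meta")
c = torch.empty(1024, device="meta")

out = torch.addcmul(a, b, c, value=0.5)  # only metadata is computed
print(out.shape, out.dtype, out.device)  # torch.Size([1024]) torch.float32 meta
```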
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136909
Approved by: https://github.com/jansel
One-shot all-reduce did not have a barrier at the end. It was possible for a rank to write to its p2p buffer for the next collective before another rank finished reading it for the previous collective.
Also removing the fuse-input-copy optimization. The synchronization complexity probably outweighs the saving.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137257
Approved by: https://github.com/Chillee
This PR contains multiple fixes for issue https://github.com/pytorch/pytorch/issues/135279:
## First part:
Moves the GPU guard (`cudaSetDevice`) before the `currentStreamCaptureStatusMayInitCtx` call.
As its name suggests, it May Init Ctx.
## Second part:
Even with the above fix, additional contexts are still observed during Work object destruction, e.g.
```
work = dist.all_reduce(tensor, async_op=True)
time.sleep(5) <-- no additional context yet
del work <-- additional context shows up
```
### Debug process
Chasing it down to destruction of a `Future` object -- a member variable of `Work`.
Then further down to the following member of `Future`:
```
std::vector<c10::Event> events_;
```
When the `events_` are destroyed, we hit the road down to:
1f3a793790/c10/cuda/impl/CUDAGuardImpl.h (L106-L121)
When there is no "preset" CUDA context (**which is the case for python garbage collector**), line 112: `c10::cuda::GetDevice(&orig_device)` will set `orig_device` to 0. Then, at line 120, `c10::cuda::SetDevice(orig_device)` will "officially" set the context to device 0 --
**that's where rank 1, 2, ... can create extra context on device 0!**
### Solution
This PR adds an explicit destructor to `Future`. In this destructor, destroy each event with a device guard.
## Test
Added test_extra_cuda_context, implemented via
- `pynvml` (if available), or
- memory consumption check.
`python test/distributed/test_c10d_nccl.py -k test_extra_cuda_context`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135273
Approved by: https://github.com/fduwjj, https://github.com/wconstab, https://github.com/eqy
This should be a no-op change, i.e. it runs the same code, but replaces verbose ObjectiveC invocation with helper function from OperationUtils.h, which this example already depends on
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137313
Approved by: https://github.com/atalman
Previously, all integer inputs to user-defined triton kernels were assumed to be int32. This would result in errors if your input was actually an int64.
This PR checks the value to determine which dtype to use for indexing: if it is known to be < int_max, then use int32 (and add guards if relevant); if we can't check (e.g. unbacked symint), then use int64.
Differential Revision: [D63797975](https://our.internmc.facebook.com/intern/diff/D63797975)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137234
Approved by: https://github.com/eellison
As title. When testing on an internal case, we found that we have very similar output for the error when certain ranks do not join one collective. This is because we didn't put all ranks into `candidate_ranks`, so they didn't get wiped out from entries and get checked again.
Ideally for the given case, we should report this is an out of order case, because rank 0, 1 calls all-to-all while all the rest ranks call all-gather-base. But when we select entries to compare, we don't have global view of the entries.
In the specific case, on ranks 0 and 1 there is a collective of PG 7 on entry 1130 with seq ID = 1130. However, on the other ranks, there is a collective of PG 0 on entry 1130 with seq ID = 2. It's hard to use the entry index to do the match, because if we later consider p2p this assumption will collapse, so we still defer it to users or to further downstream debugging to figure out. To make the message clearer, I also include both the seq ID and the record_id (aka the entry index) in the message. (That does not mean this is impossible to implement in the code; for example, we could have every record_id subtract the maximum p2p seq id before it, but users will easily see the wrong order, so we don't think that logic is necessary now.)
P1626755348
Differential Revision: [D63815335](https://our.internmc.facebook.com/intern/diff/D63815335/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137256
Approved by: https://github.com/c-p-i-o
Summary: We had a report of crashes in parallel compile subprocesses linked to reading justknobs. See https://fburl.com/workplace/14a4mcbh internally. This is a known issue with justknobs. It looks like we don't have a lot of control over evaluating knobs. Some are read in inductor (`"pytorch/remote_cache:autotune_memcache_version`), but many are read by the triton compiler. According to this advice https://fburl.com/workplace/imx9lsx3, we can import thread_safe_fork, which installs some functionality to destroy some singletons before forking and re-enable them after. This approach works for the failing workload.
Test Plan: See D63719673 where the reporting user was kind enough to provide us with a local repro. Without the relevant import, we can reproduce the crash. With the import, the training runs successfully to completion.
Differential Revision: D63736829
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137155
Approved by: https://github.com/xmfan, https://github.com/eellison
When we populate unlifted graph module, we actually only "unlift" constant tensor inputs which is problematic because export de-duplicates aliasing constants. As a result, we only register one constant instead of two constants. This PR fixes that by querying ep.constants table instead of ep.graph_signature.lifted_tensor_constants.
Differential Revision: [D63743111](https://our.internmc.facebook.com/intern/diff/D63743111)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137162
Approved by: https://github.com/pianpwk
Summary: This just adds a config option and JK for turning on remote AOTAutogradCache. It does not implement anything with the new options being passed in. That will come next diff.
This PR also changes the command for turning on the local AOTAutogradCache to be more consistent with that of FXGraphCache: TORCHINDUCTOR_AUTOGRAD_CACHE
Test Plan: Existing tests should pass and should build
Reviewed By: oulgen
Differential Revision: D63321965
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137011
Approved by: https://github.com/oulgen
This greatly reduces compile time; TorchBench models that were previously 50-100x slower (vs the cpp backend) are now ~20x slower. More work needs to be done on the Triton side, but smaller block sizes will still be helpful.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136612
Approved by: https://github.com/desertfire
ghstack dependencies: #135342
Function `_get_pg_default_device` is being used outside of `distributed_c10d.py`.
A concern is that people may not be aware of what it actually does, due to bad naming of this function:
`Return the device to use with ``group`` for control flow usage (object collectives, barrier).`
The remediation is as follows:
- Added a deprecation warning to `_get_pg_default_device`;
- Added a private function `_get_object_coll_device` to undertake what it does;
- Added a `_device_capability` function for users who want to query the device support of a PG -- it returns a plain list, no more "default" choice.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136790
Approved by: https://github.com/H-Huang
Previously we were making a fairly restrictive assumption when unflattening an exported program: for any submodule, we would assert that the graph of every call to that submodule must be the same. This assertion is load-bearing, i.e., if we simply remove the assertion then we can get incorrect results, as shown by the following example.
```
class N(torch.nn.Module):
def forward(self, x, b):
if b:
return x + 1
else:
return x + 2
class M(torch.nn.Module):
def __init__(self):
super().__init__()
self.n = N()
def forward(self, x):
x0 = x + 3
x1 = self.n(x0, True)
x2 = x1 + 4
x3 = self.n(x2, False)
return x3 + 5
m = M()
inp = (torch.ones(1),)
print(m(*inp)) # tensor([16.])
ep = torch.export.export(m, inp)
print(ep.module()(*inp)) # tensor([16.])
unflattened = torch.export.unflatten(ep)
print(unflattened(*inp)) # tensor([15.])
```
However, this goes against the spirit of specializing graphs when exporting: we should *expect* that for every call to a submodule we *might* generate a different graph. The goal of this PR is to fix unflattening to handle multiple specialized graphs corresponding to multiple calls to the same submodule.
The idea is simple: for every call to a child module `foo`, we will create potentially different child modules `foo`, `foo@1`, `foo@2`, etc. and use those names as targets in `callmodule` instructions in the parent graph. An immediate consequence of this is that the list of fqns in an unflattened module may not be the same as an exported module. Note that all these variants share the same parameters / buffers, so that multiple calls to the same submodule can share state as expected.
However, as described so far this scheme may end up with needlessly too many submodules. Thus, between calls to the same submodule, if graphs are equal then we optimize away the extra submodules and reuse call names as much as possible. Moreover, when submodules are shared across fqns, we also try to de-duplicate graphs corresponding to their calls as much as possible. Note that no matter what, information about which submodule was called is still preserved, so that if a submodule has to be swapped with another, one can still find all calls to the former submodule and replace them with calls to the latter.
A note on the choice of naming scheme for call names: instead of generating "sibling" modules `foo@1`, `foo@2`, etc. for `foo`, we had considered generating "children" modules `foo._1`, `foo._2`, etc. of `foo`. However this can cause spurious cycles when de-duplicating graphs. E.g., suppose that `foo` is an alias for `bar._1` and `foo._1` is an alias for `bar`, then we must either introduce a cycle or drop the opportunity to optimize. Another idea would be to make `foo` a dummy module that contains `foo._0` corresponding to the first call, but this necessitates too many changes to existing tests and hurts the common case.
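Applied to the example above, a hedged illustration of what the unflattened parent graph ends up calling (the exact generated names and call arguments may differ):
```python
# Parent module M after unflattening (illustrative pseudograph):
#   x0 = x + 3
#   x1 = call_module("n",   (x0,))   # graph specialized for the b=True call
#   x2 = x1 + 4
#   x3 = call_module("n@1", (x2,))   # graph specialized for the b=False call
#   return x3 + 5
#
# "n" and "n@1" are sibling modules sharing the same parameters/buffers.
```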
Differential Revision: D63642479
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137013
Approved by: https://github.com/pianpwk
* Upload_metrics function to upload to ossci-raw-job-status bucket instead of dynamo
* Moves all added metrics to a field called "info" so ingesting into a database table with a strict schema is easier
* Removes the dynamo_key field since it is no longer needed
* Removes the concept of reserved metrics, since they cannot be overwritten by user added metrics anymore
* Moves s3 resource initialization behind a function so import is faster
---
Tested by emitting a metric during run_test and seeing that documents got added to s3
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136799
Approved by: https://github.com/ZainRizvi
Fixes #129366
Since NJT has custom serialization logic, we need an NJT-specific fix to clear out cached sizes / strides PyCapsules. Eventually, we should switch NJT to use the default serialization logic, but this depends on #125622 being addressed.
This PR also makes serialization more complete by explicitly handling `lengths`, `ragged_idx`, and the `metadata_cache`, ensuring working operation for both contiguous and non-contiguous NJTs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137031
Approved by: https://github.com/soulitzer
ghstack dependencies: #137030
Summary:
Given an op, with a pair (output buffer, input buffer) from that op, we consider marking the output buffer as inline. However, if the parent of the input buffer and the current op are going to be fused, then we don't want to mark the output buffer as inline. This change checks that criterion, and skips inlining if so.
Test Plan:
New unit test "layer_norm_should_not_inplace" runs LayerNorm and checks for no "in_out" pointers.
Fixes #120217
A diagram of the issue is attached to the PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137042
Approved by: https://github.com/eellison
Fixes #130154
This PR takes the strategy outlined in the above issue and clears out any cached sizes / strides PyCapsules before serialization. This affects the default subclass serialization logic.
The PyCapsule issue also affects `deepcopy`, so that's fixed here as well.
Note: I originally tried utilizing a context manager to remove / restore cached PyCapsules after serialization, but in practice the state returned from `_reduce_ex_internal()` references the actual `tensor.__dict__()`, so the problem persists once the cached values are restored. Instead, we have to be careful to remove the cached values in the right place so they're not re-cached when pulling out size / stride information for serialization.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137030
Approved by: https://github.com/albanD
This PR unblocks a unit test with a single Float8Linear module. It fixes the following error:
```
torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs)
[rank0]:E0913 13:44:29.829000 2179476 torch/testing/_internal/common_distributed.py:671] RuntimeError: "foreach_tensor_copy" not implemented for 'Float8_e4m3fn'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135955
Approved by: https://github.com/vkuzo, https://github.com/eqy
Summary:
Tests in test_mkldnn_pattern_matcher.py can take too long to finish. Splitting them into smaller tests, using `parametrize`.
I guess this means this test file has some refactoring opportunities as well. A next step would be to parametrize the add functions.
Differential Revision: D63723925
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137153
Approved by: https://github.com/desertfire
For the reusable action checkout-pytorch, skips cleaning workspace when running from a container environment.
The motivation for this change is twofold:
* There is no need for cleanup when running in ephemeral containers, as any changes will be discarded when the docker container is terminated;
* In the specific case of GITHUB_WORKSPACE, to enable sharing it between multiple containers, it needs to be mounted with `-v`. This prevents the possibility of running `rm -r` and deleting this mount path.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137168
Approved by: https://github.com/huydhn
Previously if we had a graph like:
```
triton_kernel_wrapper_functional_proxy = triton_kernel_wrapper_functional(...)
getitem: "f32[3][1]cuda:0" = triton_kernel_wrapper_functional_proxy['out_ptr']
getitem_1: "f32[3][1]cuda:0" = triton_kernel_wrapper_functional_proxy['out2_ptr']
sigmoid: "f32[3][1]cuda:0" = torch.ops.aten.sigmoid.default(getitem_1)
mul: "f32[3][1]cuda:0" = torch.ops.aten.mul.Tensor(tangents_1, sigmoid)
```
The partitioner would assume that the `sigmoid()` could be fused into either its user (the pointwise mul), or its producer (the user triton kernel). This could lead to a bad partitioning:
(1) If the partitioner thinks we can fuse the sigmoid with its producer triton kernel, we would keep the sigmoid compute in the forward, and have to generate two separate kernels in the forward (user triton kernel, dedicated sigmoid kernel)
(2) if the partitioner puts the sigmoid in the backward instead, we could fuse it with an existing backward kernel (the mul with a tangent)
Reviewed By: embg
Differential Revision: D63551393
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136878
Approved by: https://github.com/zou3519
Summary:
# Context
Goal: Enable CK for Inductor in FBCode
We split this stack into three diffs to help with review & in case we need to revert anything.
# This Diff
* Gets us to have CK kernels as an option for GEMM autotuning in Inductor.
Reviewed By: zjing14
Differential Revision: D62662705
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136234
Approved by: https://github.com/tenpercent, https://github.com/chenyang78
The 'set_requires_grad' dict appears to always be full of "False" values,
and we always set requires_grad based on the value of 'has_backward'.
Setting the requires_grad field was being done repeatedly during
get_fwd_recv_ops, but it should be done just once, so move it to the
function that creates the recv buffers in the first place.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136804
Approved by: https://github.com/kwen2501
Summary:
We skip `save_gpu_kernel` if the kernel has already been saved.
This gives us a more accurate Triton profiling result. The
following traces show before/after the change for benchmarking a
trivial addmm:
Before:
<img width="1255" alt="Screenshot 2024-09-23 at 10 26 53 AM" src="https://github.com/user-attachments/assets/5aea05ef-6ef0-464c-8da9-17b31c97b43a">
After:
<img width="910" alt="Screenshot 2024-09-23 at 10 27 03 AM" src="https://github.com/user-attachments/assets/488b7d4f-268f-41cf-8553-cb16ceeae118">
We can see that before the change, the benchmarking includes two parts:
(1) The overhead of our triton_heuristic call, which includes the
save/get, and the (expensive) hash computation.
(2) The exact computation of the Triton kernel.
We see that (1) accounts for >50% of the time, which makes kernel selection
for profiling choose aten kernels over Triton kernels.
Test Plan:
Existing OSS CI
python test/inductor/test_cuda_cpp_wrapper.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137073
Approved by: https://github.com/desertfire
For Traceable FSDP2, the most common use case is to have `fullgraph=False` for forward pass (to allow user-level graph breaks), and `fullgraph=True` for compiled autograd backward pass (required for queue_callback support).
With `torch._dynamo.compiled_autograd=True`, previously we were not able to set different `fullgraph` config values for the forward vs. backward pass, since `rebuild_ctx` just reuses the forward compile config as-is. This PR adds the `torch._dynamo.config.compiled_autograd_kwargs_override` config to allow forcing `fullgraph=True` for CA Dynamo tracing.
With this PR, we can remove standalone compiled autograd ctx manager usage in Traceable FSDP2 unit tests, and consolidate on using `torch._dynamo.compiled_autograd=True`.
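A hedged sketch of the intended usage (config names as described in this PR; details may differ):
```python
import torch

# Allow graph breaks in the forward...
torch._dynamo.config.compiled_autograd = True
# ...but force fullgraph=True when compiled autograd traces the backward.
torch._dynamo.config.compiled_autograd_kwargs_override = {"fullgraph": True}

model = torch.nn.Linear(8, 8)
compiled = torch.compile(model, fullgraph=False)
```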
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_True`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136967
Approved by: https://github.com/xmfan
Fixes #127920
This commit addresses a build failure occurring with GCC 12 and above due to the -Werror=nonnull flag. The error manifests in the test_api target.
**Issue:**
When building with GCC 12+, the following error occurs:
```
error: argument 1 null where non-null expected [-Werror=nonnull]
431 | __builtin_memmove(__result, __first, sizeof(_Tp) * _Num);
| ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
This change ensures that:
1. The flag is only added for GCC 12 or higher
2. The flag is only added if it's supported by the compiler
3. The flag is added specifically to the test_api target, not globally
By disabling this specific error, we allow the build to proceed while maintaining other compiler warnings.
**Test Plan:**
- Verified successful build with GCC 12 and above
- Ensured no regression in builds with earlier GCC versions and other compilers
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137092
Approved by: https://github.com/malfet
`json.dumps(float("inf"))` returns `Infinity`, which is technically invalid json
This is fine if you json.load, but ClickHouse cannot handle it
Solution here: cast inf and nan to string (which ClickHouse is able to cast back to float)
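A small self-contained illustration of the problem and the workaround described here:
```python
import json
import math

def sanitize(value):
    # Cast non-finite floats to strings so the emitted JSON stays standard-compliant;
    # ClickHouse can cast the strings back to floats on ingestion.
    if isinstance(value, float) and not math.isfinite(value):
        return str(value)
    return value

print(json.dumps(float("inf")))            # Infinity  <- not valid JSON
print(json.dumps(sanitize(float("inf"))))  # "inf"
```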
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136877
Approved by: https://github.com/huydhn
Summary:
# Why
We want this to run internally
# What
- fix python path issue on the test
- reenable the test
# Background
(copied from similar issue resolved earlier)
It appears that the parent process does not pass the entire path down to the child process. Namely, if there is some setup that makes the sys.path effectively look different than, say, PYTHONPATH or something like this, the child will not inherit this setup. To avoid needing to keep track of specific setups, we pass the effective `sys.path` from the parent to the child through the PYTHONPATH env variable
Test Plan: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:kernel_benchmark
Differential Revision: D63498897
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136876
Approved by: https://github.com/henrylhtsang
By even further reducing precisions of imprecise FP16 ops, introducing new BF16_LOW_PRECISION_OPS category and marking BF16 tests as xfail for `divfloor_rounding`, `floor_divide` and `remainder`.
I guess the nature of the low-precision results is that MPSGraph, unlike the rest of PyTorch, does not do accumulation over fp32 for reduction operations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136987
Approved by: https://github.com/albanD
ghstack dependencies: #137070
Related to #107302.
When built and tested with NumPy 2 the following unit tests failed.
```
=========================================================== short test summary info ============================================================
FAILED [0.0026s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex128 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex64 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0025s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float32 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float64 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0016s] test/test_linalg.py::TestLinalgCPU::test_nuclear_norm_axes_small_brute_force_old_cpu - ValueError: Unable to avoid copy while creating an array as requested.
FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex128 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
FAILED [0.0055s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
FAILED [0.0048s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float32 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
=========================================== 9 failed, 1051 passed, 118 skipped in 152.51s (0:02:32) ============================================
```
This PR fixes them. The test is now compatible with both NumPy 1 & 2.
Some more details:
1. The `np.linalg.solve` has changed its behavior. So I added an adapter function in the unit test to keep its behavior the same regardless of whether it is NumPy 1 or NumPy 2.
2. The cause of the failure is that when passing a `torch.Tensor` to `np.linalg.qr`, the return type in NumPy 1 is `(np.ndarray, np.ndarray)`, while it is `(torch.Tensor, torch.Tensor)` in NumPy 2.
3. NumPy 2 does not allow `np.array(obj, copy=False)`, and recommends using `np.asarray(obj)` instead.
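A small illustration of point 3 (behavior depends on the installed NumPy version):
```python
import numpy as np
import torch

t = torch.zeros(3)

# Under NumPy 2, copy=False means "never copy" and raises if a copy is needed:
#   np.array(t, copy=False)  ->  ValueError: Unable to avoid copy ...
# np.asarray works under both NumPy 1 and NumPy 2:
a = np.asarray(t)
```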
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136800
Approved by: https://github.com/lezcano
# Motivation
This PR intends to make the device-specific Event inherit from the generic torch.Event. The benefit is providing a generic abstract class `torch.Event` for different devices, like `torch.Stream`. This makes it easier for Dynamo to capture the Event of different devices, like torch.cuda.Event and torch.xpu.Event.
A follow-up PR will remove the previous, now-unnecessary base classes `_StreamBase` and `_EventBase` to avoid multiple inheritance.
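A minimal illustration of the intended relationship after this PR (requires a CUDA build; the assertion reflects the hierarchy described above):
```python
import torch

if torch.cuda.is_available():
    cuda_event = torch.cuda.Event()
    # After this PR, device-specific events derive from the generic torch.Event,
    # mirroring how torch.cuda.Stream relates to torch.Stream.
    assert isinstance(cuda_event, torch.Event)
```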
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134845
Approved by: https://github.com/albanD, https://github.com/EikanWang
Inductor mutates the AOT backward graph. A solution could be to copy the graph, but since we don't know whether compiled autograd is applied or not, it would be expensive to always clone it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136741
Approved by: https://github.com/jansel
ghstack dependencies: #135663
Summary: The problem is that when the GPU is not big enough, we omit the test cases in the test class. We expect the test to be skipped, but due to fbcode CI it can throw an error instead. This causes the test to be flaky.
Test Plan: ci
Differential Revision: D62037908
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137055
Approved by: https://github.com/masnesral
Fixes #136440
**Issue:**
When building PyTorch in debug mode on aarch64 architecture using GCC, we encounter relocation errors due to the R_AARCH64_CALL26 relocation limit. This occurs because debug builds with -O0 optimization generate larger code sizes, potentially exceeding the range limit for these relocations.
**Fix:**
Apply -Og optimization instead of -O0 for aarch64 GCC debug builds. This slightly reduces code size while maintaining debuggability, bringing function calls back within the range of R_AARCH64_CALL26 relocations.
The fix is implemented by conditionally setting compiler and linker flags in CMakeLists.txt:
- For aarch64 GCC debug builds: use -Og
- For all other debug builds: retain -O0
This change affects only debug builds on aarch64 with GCC, leaving other configurations unchanged.
**Testing:**
Verified that the build succeeds without relocation errors on aarch64 systems with GCC in debug mode. Ensured that debugging information is still available and useful for debugging purposes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136990
Approved by: https://github.com/malfet
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Since our implementation currently assumes contiguous strides, let us add an explicit check and raise an error at construction time if the parameter is not contiguous.
We can try to support this in the future. Mainly, I want to first learn more about how DTensor support for non-contiguous memory formats works.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137000
Approved by: https://github.com/weifengpy
## The problem.
[A commit from three weeks ago](82d00acfee) appears to have broken five tests but was not caught by CI.
[A later commit](https://github.com/pytorch/pytorch/commit/e05ea2b1797) which added a decomposition of `transpose_copy` added another broken test, also seemingly not detected, making six total (listed below).
They came to my attention when I updated some pending decomposition pull requests which passed CI, and started getting failures like [this](https://hud.pytorch.org/pr/134319) for a test unrelated to any of these pull requests, `TestCommonCPU.test_out__refs_transpose_copy_cpu_float32`
Running `python test/test_ops.py -k _copy` on `viable/strict` found failures for six `_refs` ops: `copysign`, `expand_copy`, `index_copy`, `t_copy`, `transpose_copy`, `view_copy`
## The solution
The original commit did actually cause breakage by slightly changing user-visible behavior (in a special case involving scalar tensors being copied between different devices).
This pull request fixes that breakage in a reasonable way, but I don't understand why this error didn't appear in CI until I made later changes in the same area.
## To reproduce
To reproduce the six cases in your own client:
```
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=5 python test/test_ops.py TestCommonCPU.test_out__refs_view_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=2 python test/test_ops.py TestCommonCPU.test_out__refs_t_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=0 python test/test_ops.py TestCommonCPU.test_out__refs_index_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=7 python test/test_ops.py TestCommonCPU.test_out__refs_expand_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=0 python test/test_ops.py TestCommonCPU.test_out__refs_copysign_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=4 python test/test_ops.py TestCommonCPU.test_out__refs_transpose_copy_cpu_float32
```
@amjames
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136653
Approved by: https://github.com/zou3519
This PR is for supporting calling `parallelize_module` from within a model definition, making the model a parallel one.
Calling `parallelize_module` is an alternative to maintaining a set of `ColumnWiseLinear`, `RowWiseLinear`, etc, while still being able to directly author a parallel model.
(The motivation for authoring a parallel model is that there may be other distributed operations, which may not be easily captured by any module, see the forward function below. Alternatively speaking, the purpose is to exploit the expressiveness of DTensor -- we need to first create DTensors before calling ops on them. Having parallelized modules in model is one way of creating DTensors.)
For example:
```
class FeedForward(nn.Module):
def __init__(self, config: TransformerArgs) -> None:
super().__init__()
w1 = nn.Linear(config.dim, config.hidden_dim, bias=False)
w2 = nn.Linear(config.hidden_dim, config.dim, bias=False)
w3 = nn.Linear(config.dim, config.hidden_dim, bias=False)
self.w1 = parallelize_module(w1, Colwise)
self.w2 = parallelize_module(w2, Rowwise)
self.w3 = parallelize_module(w3, Colwise)
def forward(self, x: Tensor) -> Tensor:
y: DTensor = self.w2(F.silu(self.w1(x)) * self.w3(x))
# y is a DTensor with Partial placement; we can return it as is.
return y
# Or we can convert it to Replicate -- there is modeling flexibility here.
return y.redistribute(Replicate())
with device_mesh:
model = FeedForward(config)
# Now model is a model parallelized onto device_mesh
y = model(x)
```
The `device_mesh` actually used for `parallelize_module` would be retrieved from the ambient context.
Calling `parallelize_module` from within model hierarchy also saves the use of *FQNs* as in the out-of-model annotation case.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134247
Approved by: https://github.com/tianyu-l
Compile time benchmark for the min cut partitioner. I'm hoping that this is a reasonable benchmark because:
(1) it consists of a single input + many weights that are used sequentially
(2) contains a mix of recompute vs non-recomputed ops (matmul + sin)
(3) it is relatively simple
From running locally:
```
collecting compile time instruction count for aotdispatcher_partitioner_cpu
compile time instruction count for iteration 0 is 21764219181
compile time instruction count for iteration 1 is 12475020009
compile time instruction count for iteration 2 is 12463710140
compile time instruction count for iteration 3 is 12455676489
compile time instruction count for iteration 4 is 12451344330
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136760
Approved by: https://github.com/ezyang
ghstack dependencies: #136670, #136759
This adds a few compile time benchmarks for some disjoint paths in AOTDispatcher:
(1) inference vs training code paths
(2) "subclasses" vs "no subclasses" codepaths
Also see https://github.com/pytorch/pytorch/pull/136760 for a partitioner benchmark (I'm not sure why ghstack didn't display the stack nicely)
I ran locally, and got these numbers on the 4 paths:
```
collecting compile time instruction count for aotdispatcher_inference_nosubclass_cpu
compile time instruction count for iteration 0 is 11692348671
compile time instruction count for iteration 1 is 3026287204
compile time instruction count for iteration 2 is 3011467318
compile time instruction count for iteration 3 is 3004485935
compile time instruction count for iteration 4 is 3003087410
collecting compile time instruction count for aotdispatcher_training_nosubclass_cpu
compile time instruction count for iteration 0 is 6068003223
compile time instruction count for iteration 1 is 5585418102
compile time instruction count for iteration 2 is 5581856618
compile time instruction count for iteration 3 is 5581651794
compile time instruction count for iteration 4 is 5578742619
collecting compile time instruction count for aotdispatcher_inference_subclass_cpu
compile time instruction count for iteration 0 is 8634984264
compile time instruction count for iteration 1 is 8633467573
compile time instruction count for iteration 2 is 8632182092
compile time instruction count for iteration 3 is 8632056925
compile time instruction count for iteration 4 is 8632543871
collecting compile time instruction count for aotdispatcher_training_subclass_cpu
compile time instruction count for iteration 0 is 14737239311
compile time instruction count for iteration 1 is 14734346427
compile time instruction count for iteration 2 is 14736493730
compile time instruction count for iteration 3 is 14734121272
compile time instruction count for iteration 4 is 14733852882
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136759
Approved by: https://github.com/laithsakka
ghstack dependencies: #136670
Fixes https://github.com/pytorch/pytorch/issues/136640
Today, inductor has some logic to figure out when it needs to do broadcasting during lowering, which just checks if any of the input shapes have sizes equal to 1.
In particular: we should already have this information by the time we get to inductor, because our FakeTensor compute will have branched/guarded on whether any ops performed broadcasting, appropriately.
In particular, if we have a tensor with a size value of `(64//((2048//(s3*((s2//s3)))))))`, and it happens to be equal to one (and it is used in an op that requires this dim to be broadcasted), FakeTensorProp will have generated a guard:
```
Eq((64//((2048//(s3*((s2//s3))))))), 1)
```
I chose the simplest possible way to beef up inductor's checks to know when a given size is equal to 1: loop over the existing shape env guards, and if our current size is a sympy expression on the LHS of one of our `Eq(LHS, 1)` guards, then return True.
I'm hoping for feedback on whether or not this approach is reasonable. One better option I could imagine is that our symbolic reasoning should have automatically simplified the size of our tensor down to a constant as part of evaluating that guard. I was originally going to try to do this directly in the shape env, but I ran into a few issues:
(1) I wanted to call some version of `set_replacement(expr, 1)`. But `set_replacement()` only accepts plain symbols on the LHS, not expressions
(2) in theory I could get this to work if I could rework the above expression to move everything that is not a free variable to the RHS, e.g. `Eq(s2, 32)`. It looks like our existing `try_solve()` logic is... [not quite able](https://github.com/pytorch/pytorch/blob/main/torch/utils/_sympy/solve.py#L27) to do this generally though.
Checking the guards feels pretty simple-and-easy. Are we worried that it is too slow to iterate over all the guards? I could also cache the lookup so we only need to iterate over guards that are of the form `Eq(LHS, 1)`
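A hedged sketch of the simple check described here (names are illustrative, not the actual inductor helpers): treat a symbolic size as statically 1 if the shape env already recorded an `Eq(expr, 1)` guard.
```python
import sympy

def size_is_statically_one(size_expr, guard_exprs):
    # size_expr: a sympy expression for the dim's size
    # guard_exprs: sympy expressions recorded by the shape env as guards
    if size_expr == 1:
        return True
    target = sympy.Eq(size_expr, 1)
    return any(guard == target for guard in guard_exprs)
```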
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136670
Approved by: https://github.com/ezyang
To see the payoff, look at test/dynamo/test_logging.py
The general idea is to refactor produce_guards into produce_guards_verbose which also returns verbose code parts, which have our annotations.
The rest of the logic is plumbing around SLocs to the places they need to be so we can print them. Guards are easy; value ranges and duck sizing take more care.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136917
Approved by: https://github.com/anijain2305
1. Example of a failing diff:
https://github.com/pytorch/pytorch/pull/136740
2. Test this by running:
python check_results.py test_check_result/expected_test.csv test_check_result/result_test.csv
Results:
```
WIN: benchmark ('a', ' instruction count') failed, actual result 90 is 18.18% lower than expected 110 ±1.00% please update the expected results.
REGRESSION: benchmark ('b', ' memory') failed, actual result 200 is 100.00% higher than expected 100 ±10.00% if this is an expected regression, please update the expected results.
MISSING REGRESSION TEST: benchmark ('d', ' missing-test') does not have a regression test enabled for it
```
MISSING REGRESSION TEST does not fail, but it is logged.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136551
Approved by: https://github.com/ezyang
ghstack dependencies: #136383
* This fixes a major CMake/Bazel configuration bug where we were leaving CUTLASS performance on the table, especially with FlashAttention. This now enables using MMA instructions on SM90+, which should close the gap between SDPA and the external FA2. Note these operations only affect H100 and newer GPUs. Thankfully, this seems to have been updated recently into being a noop on the CUTLASS side. Still, it is better to set the CMake variable properly.
* Also enables additional new shape kernels added in the recent CUTLASS 3.5.1+ update. This was the original motivation of the PR before I realized the basic MMA kernels were accidentally disabled since we didn't go through the submodule's CMake/Bazel files.
* Adds a bit to compile time and code size, but well worth it considering it speeds up our internal flash attention significantly on H100s at the cost of some minor additional compile time.
* These kernels and settings will be needed for Flash Attention 3 whenever we add that too.
Fixes #133695
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133686
Approved by: https://github.com/ezyang
Summary: In D60803317, we added CompileContext (trace_id) information to Kineto traces using caching when a CompileContext exits. As pointed out by some users, this gives inaccurate IDs because we are not getting the context that is being looked up within eval_frame. For this reason, we decided to revert that change and go with an approach that involves getting the trace_id associated with a given CacheEntry. To do this, we add a trace_id to the GuardedCode so that it can be passed on to a CacheEntry. Then, we change the lookup function to return said trace_id alongside the code so that we can pass both into our eval function. Once we get to a Torch-Compiled Region, we can just append the context information to the name of the annotation, thus bypassing any need for kwargs.
Test Plan: Added more comprehensive unit test. Saw that all the trace_ids appeared within the graph.
Differential Revision: D63138786
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136460
Approved by: https://github.com/ezyang
This is to avoid cache confusion between normal vs pydebug vs nogil builds in cpp extensions which can lead to catastrophic ABI issues.
It is rare today for people to run both normal and pydebug builds on the same machine, but we expect quite a few people will run normal and nogil builds on the same machine going forward.
This is tested locally by running each version alternatively.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136890
Approved by: https://github.com/colesbury
In #136512, we fixed handling for tl.constexpr and dynamic shapes: if a symint is passed to tl.constexpr, you should specialize on it, because tl.constexpr implies needing to know the concrete value at compile time.
However, when using triton_op, capture_triton, or non-strict export, the regression remains (and #136512 might technically regress some specific export scenarios) - see [Richard's comment](https://github.com/pytorch/pytorch/pull/136512/files#r1775999871).
This fixes these scenarios: implement the handling differently depending on whether we're expecting a SymNodeVariable or a SymInt(/SymBool/SymFloat)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136686
Approved by: https://github.com/zou3519
DeviceIndex is an int8_t, which cout interprets as a char; it then shows up as a control character in logs (e.g. ^A).
Explicitly cast to int to have the numbers printed out correctly.
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135405
Approved by: https://github.com/wconstab
Fixes #134714 (or attempts to; I'm not sure how to test it yet)
For posterity, how one can test:
1. make sure you have USE_PTHREADPOOL=1 or pull a packaged binary
2. run gdb --args python, with `r` to enter, `Ctrl-C` to pause, and `c` to get back into Python
3. import torch
4. torch.set_num_threads(1), make sure this does not trigger any additional threads getting created.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136793
Approved by: https://github.com/albanD
Summary:
This PR adds `torch.float8_e4m3fn` support to cuSPARSELt and `to_sparse_semi_structured`.
This will let users run fp8 + 2:4 sparse matmuls on Hopper GPUs with
cuSPARSELt >= 0.6.2, via the `torch._scaled_mm` API.
```
A = rand_sparse_semi_structured_mask(256, 128, dtype=torch.float16)
B = torch.rand(dense_input_shape, device=device).to(torch.float16).t()
A_fp8, A_scale = to_float8(A)
B_fp8, B_scale = to_float8(B)
dense_result = torch._scaled_mm(
    A_fp8, B_fp8,
    scale_a=A_scale, scale_b=B_scale,
    out_dtype=out_dtype,
)
A_fp8_sparse = to_sparse_semi_structured(A_fp8)
sparse_result = torch._scaled_mm(
    A_fp8_sparse, B_fp8,
    scale_a=A_scale, scale_b=B_scale,
    out_dtype=out_dtype,
)
```
Note that to keep this consistent with normal torch behavior, calling
`torch.mm(A_fp8_sparse, B_fp8)` will raise a NotImplementedError.
I also turned on cuSPARSELt by default and added CUSPARSELT_MAX_ID to the
backend to make the tests a bit cleaner
Test Plan:
```
python test/test_sparse_semi_structured.py -k scaled_mm
python test/test_sparse_semi_structured.py -k fp8
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136397
Approved by: https://github.com/drisspg
Summary: Previously is_fbcode just checked whether the checkout was git or not. This is extremely error prone. Let's make it fool-proof.
Test Plan: unit tests
Reviewed By: masnesral
Differential Revision: D63545169
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136871
Approved by: https://github.com/masnesral
Before this change, an attempt to run something like
```
% python -c "import torch;dev,dt='mps',torch.int; print(torch.normal(mean=torch.arange(1., 11., device=dev, dtype=dt), std=torch.arange(10, 0, -1, device=dev, dtype=dt)))"
```
resulted in a hard error:
```
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %5 = "mps.multiply"(%2, %arg1) : (tensor<10xf32>, tensor<10xsi32>) -> tensor<*xf32>
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %5 = "mps.multiply"(%2, %arg1) : (tensor<10xf32>, tensor<10xsi32>) -> tensor<*xf32>
/AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:953: failed assertion `original module failed verification'
```
After the change, it raises a clear type error instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136863
Approved by: https://github.com/Skylion007
ghstack dependencies: #136754, #136755, #136821, #136822
Fixes https://github.com/pytorch/pytorch/issues/136494
Currently, CUDASymmetricMemory::rendezvous() initializes a multicast address if multicast support is present. However, if we believe multicast support is present but cuMulticastCreate still fails for some reason, we do not fall back gracefully.
- In addition to the CUDART and driver version checks, query CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED to determine multicast support for a rank/device.
- Before initializing multicast for a block, ensure all ranks/devices have multicast support.
- This is unlikely, but if cuMulticastCreate still fails on rank 0, print the corresponding driver error message as a warning, and gracefully skip multicast initialization for the block.
- Introduced an environment variable (TORCH_SYMM_MEM_DISABLE_MULTICAST) to allow users to explicitly disable multicast support as a workaround (see the sketch below).
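A minimal sketch of that opt-out, assuming the variable just needs to be set before any symmetric-memory rendezvous happens:
```python
import os

# Assumed usage of the workaround described above: disable multicast entirely
# by setting the env var before symmetric memory is initialized.
os.environ["TORCH_SYMM_MEM_DISABLE_MULTICAST"] = "1"
```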
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136577
Approved by: https://github.com/Chillee, https://github.com/eqy
- Also makes the scales and zero-point dtypes reconcile with the meta impl, as well as with how other quantized ops represent scales and zero points.
- Makes sure quantize_per_token's output_dtype is respected.
There are a few places where we still need to reconcile scale and zero-point dtypes, but that will come later. These fixes are mainly being done to enable the quantized KV cache through the ET stack.
Differential Revision: [D62301840](https://our.internmc.facebook.com/intern/diff/D62301840/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136807
Approved by: https://github.com/jerryzh168
Removing `_transform_shapes_for_default_dynamic` and `assume_static_by_default=False` as added in https://github.com/pytorch/pytorch/pull/133620.
This reverts back to `assume_static_by_default=True`, using dynamo decorators (e.g. `maybe_mark_dynamic`, `mark_static`) to handle Dim.AUTO & Dim.STATIC instead. This is easier to maintain, as it doesn't require reasoning about "inverting" the dynamic_shapes specs, and it also opens up usage of other decorators (`mark_dynamic`, `mark_unbacked`).
On the user side this change has no effect, but internally it means dynamic behavior is determined only by the `dynamic_shapes` specs (ignoring user-side input decorators, following https://github.com/pytorch/pytorch/pull/135536), with this information transferred for _DimHints via decorators so that Dynamo/non-strict can create symbolic_contexts accordingly, e.g. 7c6d543a5b/torch/_dynamo/variables/builder.py (L2646-L2666).
One caveat is that we don't raise errors for dynamic decorators on the user side, since we don't know if they're from user markings or from re-exporting with inputs we've previously marked.
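For reference, a minimal sketch of the decorators mentioned above (illustrative only; in this flow export applies them internally based on the `dynamic_shapes` spec rather than asking users to call them):
```python
import torch
import torch._dynamo as dynamo

x = torch.randn(4, 8)
dynamo.mark_static(x, 0)          # pin dim 0 as static (Dim.STATIC-style behavior)
dynamo.maybe_mark_dynamic(x, 1)   # allow dim 1 to be dynamic (Dim.AUTO-style behavior)
```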
Differential Revision: D63358628
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136591
Approved by: https://github.com/avikchaudhuri
Summary:
This diff logs the time_taken_ns for the forward and backward graphs in AOTAutogradCache, saving it into the cache entry.
This information will be helpful later when I make the cache remote, and it is also just useful to have in tlparse and chromium events.
Test Plan: Run benchmark, see that the times are in the chromium events.
Reviewed By: aorenste
Differential Revision: D62590077
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136529
Approved by: https://github.com/oulgen
I think this could help many teams, especially the compile/export teams (/cc @ezyang), by letting end users/bug reporters quickly test a WIP PR when reporting a related bug.
It can be run quickly in an official nightly Docker container or in a nightly venv/conda env.
Let me know what you think.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136535
Approved by: https://github.com/ezyang
This was a stupid cast error that caused MPSGraph to crash with the following exception:
```
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %3 = "mps.multiply"(%2, %arg1) : (tensor<1x3x9x9xf16>, tensor<1xf32>) -> tensor<*xf32>
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %3 = "mps.multiply"(%2, %arg1) : (tensor<1x3x9x9xf16>, tensor<1xf32>) -> tensor<*xf32>
/AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:953: failed assertion `original module failed verification'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136822
Approved by: https://github.com/Skylion007
ghstack dependencies: #136754, #136755, #136821
This is a retry of https://github.com/pytorch/pytorch/pull/136594, which is having trouble landing.
Summary: We have an internal report of a Triton compiler error `ValueError: Cannot broadcast, rank mismatch: [1], [1, 2048]` coming from a line like this:
`tmp25 = tl.broadcast_to(((tl.full([1], 1.00000000000000, tl.float64)) + ((ks0 // 3278).to(tl.float64))) / (((tl.full([1], 0.500000000000000, tl.float64))*(libdevice.sqrt((1 + ((ks0 // 3278)*(ks0 // 3278)) + ((-2)*(ks0 // 3278))).to(tl.float64).to(tl.float32)))) + ((tl.full([1], 0.500000000000000, tl.float64))*((1 + (ks0 // 3278)).to(tl.float64)))), [XBLOCK, RBLOCK])`
https://github.com/pytorch/pytorch/pull/135260 is the cause, presumably because we turn a constant into a 1-element tensor with: `(tl.full([1], const, tl.float64))`. It looks like changing the syntax to `(tl.full([], const, tl.float64))` gives us what we want?
Differential Revision: [D63540693](https://our.internmc.facebook.com/intern/diff/D63540693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136858
Approved by: https://github.com/atalman
Original Issue: https://github.com/pytorch/pytorch/issues/134644
We assume trace_tangents have the same memory_format as the corresponding inputs, outputs, and intermediates during the first tracing.
As a result:
Tracing time:
- Store trace_tangents_memory_formats in metadata
- Coerce tangents to deduced memory_format
Runtime:
- Coerce tangents to tracing memory format from metadata
Subclasses logic:
- Previously, the tangent-coercion logic did not handle the nested subclasses case; this is fixed now.
For subclasses we deduce the memory format for the subclass tensor first, then for each of its elements:
[subclass_tensor_memory_format, subclass_tensor_elem0_memory_format, ...]
If a subclass element (one of the __tensor_flatten__()[0] tensors) is itself a subclass, a nested list of the same structure takes its place.
The recursive traversal of the subclass tree is expensive, so we do memory-format deduction and coercion at the same time to keep it to a single traversal (the `coerce_tangent_and_suggest_memory_format` method); see the sketch below. With this approach there is no regression compared with the previous logic, which also did a single traversal.
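A minimal sketch of the per-tensor deduce-and-coerce step (illustrative only, not the actual AOTDispatch implementation; it assumes `suggest_memory_format`-style deduction and leaves out the subclass recursion):
```python
import torch
from torch._prims_common import suggest_memory_format

def deduce_and_coerce_tangent(t: torch.Tensor):
    # Deduce the memory format the tangent should be traced with, then coerce
    # in the same pass so the tensor is only visited once.
    fmt = suggest_memory_format(t)
    return t.contiguous(memory_format=fmt), fmt

# Example: a channels-last tangent keeps its layout instead of being forced contiguous.
g = torch.randn(2, 3, 4, 4).to(memory_format=torch.channels_last)
coerced, fmt = deduce_and_coerce_tangent(g)
assert fmt == torch.channels_last
```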
Other small change:
Removed a duplicated, unrelated comment.
Testing
```
python test/functorch/test_aotdispatch.py -k test_channels_last_grads_no_force_contiguous
```
Benchmarking:
After change:
```
└─ $ PYTORCH_AOTD_DEBUG_PROFILE=1 python test/functorch/test_aotdispatch.py -k test_benchmark_grads_no_force_contiguous
Benchmark SUBCLASS avg_bwd_duration:4.059906005859375 ms
Benchmark NO_SUBCLASS avg_bwd_duration:3.1563830375671387 ms
```
Before change:
```
BEFORE_CHANGE SUBCLASS 4.1194
```
No significant change in processing time.
(We do a single traversal of the subclass tree for collecting memory formats and coercing during tracing.)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135225
Approved by: https://github.com/bdhirsh
Fixes the max-autotune failure of `soft_actor_critic` in Torchbench in the FP32, single-thread, dynamic-shape case:
```log
File "/home/user/inductor/pytorch/torch/_inductor/codegen/cpp_micro_gemm.py", line 136, in codegen_call
C_ptr = f"&({kernel.index(C, [0, 0])})"
File "/home/user/inductor/pytorch/torch/_inductor/codegen/cpp_template_kernel.py", line 135, in index
else self.args.input(node.get_name())
File "/home/user/inductor/pytorch/torch/_inductor/codegen/common.py", line 1251, in input
assert name not in V.graph.removed_buffers, name
AssertionError: buf_GemmOut
```
The 1st and 2nd linears do not need to use a local buffer, while the 3rd linear does.
The 3rd linear, which uses a local buffer, adds its global buffer (named `buf_GemmOut`) into `V.graph.removed_buffers`.
When scheduling the nodes, the 1st linear (which doesn't use a local buffer) gets its output buffer (also named `buf_GemmOut`) from the input, finds that it's in `V.graph.removed_buffers`, and raises the AssertionError. The issue is that the output buffers of all these linears are named `buf_GemmOut`, which causes the conflict.
The fix renames these buffers by adding the name of the `template_buffer` as a prefix.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136419
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #136418, #136518
Cleanup:
1/ We do not need to call unwrap_subclasses() in the freezing wrapper, as it will be wrapped by the AOTD wrappers, which include SubclassesWrapper.
2/ No need to use weak references for the unwrapped list; dynamo optimizers need to clean the unwrapped list along with the original params_flat.
Verified with the fbcode compiled_optimizers tests.
Differential Revision: [D63393651](https://our.internmc.facebook.com/intern/diff/D63393651)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136549
Approved by: https://github.com/bdhirsh
Fixes #135439
This PR adds support for the `is_inference` method on torch tensors, so the following example fn compiles without graph breaks:
```python
def fn_simple(x):
    if x.is_inference():
        return x.sum()
    else:
        return x.min()
```
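For reference, a hedged end-to-end check of this behavior (the actual test added in the PR may differ):
```python
import torch

def fn_simple(x):
    if x.is_inference():
        return x.sum()
    else:
        return x.min()

compiled = torch.compile(fn_simple, fullgraph=True)

with torch.inference_mode():
    x = torch.randn(8)           # created under inference_mode, so is_inference() is True
    print(compiled(x))           # takes the .sum() branch, no graph break
print(compiled(torch.randn(8)))  # regular tensor, takes the .min() branch
```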
I've also tried to add guards on the tensor for `is_inference`. I wasn't 100% sure where these should go, so please don't hesitate to correct me.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136450
Approved by: https://github.com/ezyang
Related issue: #125077
### Feature
Inductor tries to remove dimensions with stride 0 from block pointers. Rather than loading with stride 0, it's more efficient to load a smaller block pointer, then use `tl.broadcast_to` to broadcast it up to the desired size. This already worked for simpler block pointers, but it was disabled for more complex block pointers which used `tl.reshape` to change the dimensionality after loading.
This PR generalizes the approach to work for all block pointers. The idea is to first reshape, adding singleton dimensions, then broadcast those singletons up to something larger, then reshape again to the final output shape. For readability, we emit this code only if it actually does something. Simpler loads will just have `tl.load`.
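As a rough analogy of that transformation in plain PyTorch (hypothetical shapes, not Inductor's actual code):
```python
import torch

# Instead of materializing a block with stride-0 dimensions, load only the
# distinct values, then reshape (add singletons) -> broadcast -> reshape.
small = torch.arange(8, dtype=torch.float32)           # the smaller block actually loaded
with_singletons = small.reshape(8, 1, 1)               # first reshape: add singleton dims
broadcasted = with_singletons.broadcast_to((8, 1, 8))  # broadcast singletons up
final = broadcasted.reshape(64)                        # second reshape: final output shape
print(final.shape)  # torch.Size([64])
```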
Here's an example of a complicated kernel that uses `reshape` -> `load` -> `reshape`. (The first reshape is actually the slice `[None,None,:]`).
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 8)
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
    tmp1 = tl.reshape(tl.broadcast_to(tl.load(tl.make_block_ptr(in_ptr1, shape=[8], strides=[8], block_shape=[((7 + XBLOCK) // 8)], order=[0], offsets=[(xoffset // 8)]), boundary_check=[0], eviction_policy='evict_last')[:, None, None], [((7 + XBLOCK) // 8), ((1) * ((1) <= (((7 + XBLOCK) // 8))) + (((7 + XBLOCK) // 8)) * ((((7 + XBLOCK) // 8)) < (1))), ((8) * ((8) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (8)))]), [XBLOCK])
    tmp2 = tmp0 + tmp1
    tl.store(tl.make_block_ptr(out_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tmp2.to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
Before this PR, we would have stride-0 dimensions:
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 64
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x2 = xindex
    x1 = (xindex // 8)
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
    tmp1 = tl.reshape(tl.load(tl.make_block_ptr(in_ptr1, shape=[8, 1, 8], strides=[8, 0, 0], block_shape=[((7 + XBLOCK) // 8), ((1) * ((1) <= (((7 + XBLOCK) // 8))) + (((7 + XBLOCK) // 8)) * ((((7 + XBLOCK) // 8)) < (1))), ((8) * ((8) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (8)))], order=[2, 1, 0], offsets=[(xoffset // 8), 0, xoffset % 8]), boundary_check=[0], eviction_policy='evict_last'), [XBLOCK])
    tmp2 = tmp0 + tmp1
    tl.store(tl.make_block_ptr(out_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp2, [XBLOCK]).to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
Here's a simpler example where we use 2D tiling. In this case we don't actually need the broadcast. The broadcast is implied via a slice adding a new singleton dimension. This code is not changed by this PR, but it's important to know that we don't accidentally insert unnecessary broadcasts.
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
    ynumel = 8
    xnumel = 8
    yoffset = tl.program_id(1) * YBLOCK
    yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
    ymask = yindex < ynumel
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = xindex < xnumel
    x1 = xindex
    y0 = yindex
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[8, 8], strides=[1, 8], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), boundary_check=[0, 1])
    tmp1 = tl.load(tl.make_block_ptr(in_ptr1, shape=[8], strides=[8], block_shape=[YBLOCK], order=[0], offsets=[yoffset]), boundary_check=[0], eviction_policy='evict_last')[None, :]
    tmp2 = tmp0 + tmp1
    tl.store(tl.make_block_ptr(out_ptr0, shape=[8, 8], strides=[1, 8], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), tmp2.to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')
```
### Test Plan
Added a new expecttest to check the emitted code for broadcast addition. Looking at the test, we can see that stride 0 dimensions are removed. (This test generated the example kernels in the previous section.)
This change also removed a stride-0 dimension in an existing block pointer test. I updated the expected code accordingly.
Bonus: I noticed that the test parametrization for `config.prefer_nd_tiling` wasn't working as intended. It ended up always setting this option to `True`. Fixed it so we get the intended test coverage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135557
Approved by: https://github.com/shunting314, https://github.com/jansel
Co-authored-by: Yueming Hao <yhao@meta.com>
Summary: We have an internal report of a Triton compiler error `ValueError: Cannot broadcast, rank mismatch: [1], [1, 2048]` coming from a line like this:
`tmp25 = tl.broadcast_to(((tl.full([1], 1.00000000000000, tl.float64)) + ((ks0 // 3278).to(tl.float64))) / (((tl.full([1], 0.500000000000000, tl.float64))*(libdevice.sqrt((1 + ((ks0 // 3278)*(ks0 // 3278)) + ((-2)*(ks0 // 3278))).to(tl.float64).to(tl.float32)))) + ((tl.full([1], 0.500000000000000, tl.float64))*((1 + (ks0 // 3278)).to(tl.float64)))), [XBLOCK, RBLOCK])`
https://github.com/pytorch/pytorch/pull/135260 is the cause, presumably because we turn a constant into a 1-element tensor with: `(tl.full([1], const, tl.float64))`. It looks like changing the syntax to `(tl.full([], const, tl.float64))` gives us what we want?
Differential Revision: [D63465169](https://our.internmc.facebook.com/intern/diff/D63465169)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136594
Approved by: https://github.com/mengluy0125, https://github.com/jansel