Compare commits

...

128 Commits

Author SHA1 Message Date
515abb7744 [CI] Add Triton 3.13t build (#143212)
By just extending the matrix and invoking the script with the appropriate CPython runtime
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143212
Approved by: https://github.com/clee2000, https://github.com/atalman, https://github.com/seemethere
2024-12-13 23:45:47 +00:00
8621b9ff0c Infer whether prologues can be computed without upcasting to fp32 without changing numerics (#142402)
For prologues that only perform loads (e.g., gathers) or dtype conversions, and no actual arithmetic on lower-precision types, we can codegen them without upcasting to fp32 and without changing numerics.

Prologues that actually do arithmetic will need to use invoke quant. But I would like to support upcasts/gathers out of the box.

We could potentially extend this in the future to avoid upcasting max pooling operations as well, if there were perf benefits to be had (less likely).
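
To make the class of prologue concrete, here is a minimal sketch of the kind of load-only prologue this applies to: a gather feeding a max-autotune matmul. The function name, shapes, and CUDA device are illustrative assumptions, not code from this PR.

```
import torch

# Illustrative only: the gather (a load) is the entire prologue; there is no
# arithmetic on the low-precision values before the tl.dot, so no fp32 upcast
# is needed for correct numerics. Assumes a CUDA device is available.
@torch.compile(mode="max-autotune")
def gathered_mm(w, idx, x):
    return w[idx] @ x

w = torch.randn(1024, 64, dtype=torch.float16, device="cuda")
x = torch.randn(64, 64, dtype=torch.float16, device="cuda")
idx = torch.randint(0, 1024, (128,), device="cuda")
out = gathered_mm(w, idx, x)
```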

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142402
Approved by: https://github.com/jansel
ghstack dependencies: #142401
2024-12-13 23:25:15 +00:00
4e0de50eb5 Revert "[CI] Add Triton 3.13t build (#143212)"
This reverts commit 571cd92d7c4c7bd2d5f068b5a285e0e70b8d0a40.

Reverted https://github.com/pytorch/pytorch/pull/143212 on behalf of https://github.com/janeyx99 due to lint is failing, the other failures don't seem relevant but ci has turned red after this change haha ([comment](https://github.com/pytorch/pytorch/pull/143212#issuecomment-2542521875))
2024-12-13 23:03:45 +00:00
f406207af2 Revert "[ROCm] Prune old gfx archs gfx900/gfx906 from binaries (#142827)"
This reverts commit 1e2b841675e50a6abd8dab9a95b33fda64b12e2b.

Reverted https://github.com/pytorch/pytorch/pull/142827 on behalf of https://github.com/jeffdaily due to prematurely dropped support for gfx900/gfx906 ([comment](https://github.com/pytorch/pytorch/pull/142827#issuecomment-2542507857))
2024-12-13 22:48:44 +00:00
ad2faec8bb Add a pass which analyzes whether a prologue preserves zero mask (#142401)
We load inputs to prologue fusion with a mask. The masked-out values must still be zero before we run `tl.dot`. Previously, we would always apply the mask:
```
        tmp0 = tl.load(in_ptr1 + (tl.broadcast_to(xindex, xindex.shape)), a_mask, eviction_policy='evict_last')
        tmp1 = tmp0.to(tl.float32)
        a = tl.where(a_mask, tmp1, 0.0)
```
Now we do not need to:
```
        tmp0 = tl.load(in_ptr1 + (tl.broadcast_to(xindex, xindex.shape)), a_mask, eviction_policy='evict_last')
        tmp1 = tmp0.to(tl.float32)
        a = tmp1
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142401
Approved by: https://github.com/jansel
2024-12-13 22:37:33 +00:00
b29fc52f82 [Profiler] Add Optional Flag to turn off external correlations (#142516)
Summary: External correlations are super spammy and oftentimes not even useful. Add a flag during init to remove them entirely.

Test Plan: https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/dynocli/devvm2185.cco0.facebook.com/rank-0.Dec_10_12_33_31.531106.pt.trace.json.gz&bucket=gpu_traces

Differential Revision: D67048206

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142516
Approved by: https://github.com/ngimel
2024-12-13 22:32:09 +00:00
bb574abe73 [BC-Breaking] Remove capture_pre_autograd_graph references in quantization (#139505)
Summary:
As title

This is a BC-breaking change because a graph produced by "capture_pre_autograd_graph" cannot be input to quantization anymore. But this is OK, since this API has been deprecated for a while and is going to be deleted. We have removed all call sites of it.

We remove the deprecated API references in code, docs, and tests.

We also removed two tests that are specific to the capture_pre_autograd_graph API.
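
For reference, a minimal sketch of the replacement flow, using torch.export.export_for_training to feed PT2E quantization; the toy module and the XNNPACKQuantizer choice are illustrative, not taken from this diff.

```
import torch
from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)

example_inputs = (torch.randn(2, 8),)
# export_for_training takes the place of the deprecated capture_pre_autograd_graph
m = torch.export.export_for_training(M(), example_inputs).module()

quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config())
m = prepare_pt2e(m, quantizer)
m(*example_inputs)  # calibration
m = convert_pt2e(m)
```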

Test Plan: CI

Differential Revision: D65351887

Pull Request resolved: https://github.com/pytorch/pytorch/pull/139505
Approved by: https://github.com/tugsbayasgalan, https://github.com/andrewor14, https://github.com/jerryzh168
2024-12-13 22:26:22 +00:00
d25e6e623f Fix unused Python variables in test/[a-d]* (#134665)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134665
Approved by: https://github.com/albanD
2024-12-13 22:13:12 +00:00
e19f493f02 add private config to temporarily preserve old FSDP guard behavior (#142871)
Summary: https://github.com/pytorch/pytorch/pull/138819 wobbled dynamo guards in a way that caused some performance regression, so this PR temporarily adds a config to get the old behavior back while we investigate.

Test Plan: CI

Differential Revision: D67096751

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142871
Approved by: https://github.com/yf225
2024-12-13 22:06:48 +00:00
8fae4397b4 Add "inductor_pre_grad_graph" logging (#142717) (#143126)
Summary:

Add new structured logging "inductor_pre_grad_graph"

This is for the inductor provenance-tracking front end to load this graph from tlparse.
ghstack-source-id: 257581974
exported-using-ghexport

Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' //caffe2/test/dynamo:test_dynamo -- -r StructuredTraceTest
```

Differential Revision: D67150288

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143126
Approved by: https://github.com/desertfire
2024-12-13 21:48:25 +00:00
8a04018329 [MPS] Fix conv backward for channels last (cont) (#143196)
This is a continuation of https://github.com/pytorch/pytorch/issues/140902 but extends the same logic to input.

Looks like the existing channels-last logic just produced incorrect results on pre-MacOS-15 versions and fails on MacOS-15, so removing it feels like the right idea.

Fixes https://github.com/pytorch/pytorch/issues/142344
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143196
Approved by: https://github.com/manuelcandales
2024-12-13 21:32:42 +00:00
571cd92d7c [CI] Add Triton 3.13t build (#143212)
By just extending the matrix and invoking the script with the appropriate CPython runtime
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143212
Approved by: https://github.com/clee2000, https://github.com/atalman, https://github.com/seemethere
2024-12-13 21:28:52 +00:00
60c54467db [logging] Log runtime autotuning timing to scuba (#141919)
See test plan in internal diff [D66679369](https://our.internmc.facebook.com/intern/diff/D66679369)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141919
Approved by: https://github.com/jamesjwu, https://github.com/ezyang
2024-12-13 21:22:13 +00:00
0d6d29af38 [CUDA] Follow up to clean up some set_per_process_memory_fraction usage in tests (#142811)
follow-up to #140852 now that #140620 has landed

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142811
Approved by: https://github.com/Skylion007
2024-12-13 21:09:05 +00:00
65d0a25289 [associative_scan] patch inductor tests to always run with static shape (#143161)
fixes #143053

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143161
Approved by: https://github.com/eellison
2024-12-13 21:06:12 +00:00
52f31cc238 dynamo tracing perf: Guard slots: 51.76 -> 51.34 (#143060)
See #143056 for overall docs.

This PR: Add slots to Guard
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143060
Approved by: https://github.com/jansel
ghstack dependencies: #143066, #143056, #143058, #143059
2024-12-13 21:02:50 +00:00
e87f07d3b8 Revert "Migrate compiler config to Config (#143152)"
This reverts commit 1ebdfd56053dafa8880a0dedf535fff70aa92e09.

Reverted https://github.com/pytorch/pytorch/pull/143152 on behalf of https://github.com/oulgen due to lint failure ([comment](https://github.com/pytorch/pytorch/pull/143152#issuecomment-2542342073))
2024-12-13 20:55:14 +00:00
625b4edb97 [CD] Test torch.compile on 3.13 (#143207)
Follow up after https://github.com/pytorch/pytorch/pull/143162
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143207
Approved by: https://github.com/atalman, https://github.com/ZainRizvi
2024-12-13 20:01:36 +00:00
fe9365f3f5 Add check_binary workflow to pytorch/pytorch (#143201)
Migrated from pytorch/builder
Related to: https://github.com/pytorch/builder/issues/2054

Copying from : 3468139e81
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143201
Approved by: https://github.com/seemethere, https://github.com/malfet
2024-12-13 19:30:10 +00:00
8f40446770 Fix precedence of bitwise and/or printing (#143197)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143197
Approved by: https://github.com/albanD, https://github.com/williamwen42
2024-12-13 19:29:42 +00:00
1ebdfd5605 Migrate compiler config to Config (#143152)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143152
Approved by: https://github.com/ezyang
ghstack dependencies: #143150, #143151
2024-12-13 19:29:07 +00:00
f1ff8bc1c5 Add type to Config (#143151)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143151
Approved by: https://github.com/ezyang
ghstack dependencies: #143150
2024-12-13 19:29:07 +00:00
9d05c8110d Require Config to have a default (#143150)
With aliases coming soon, we want to reject the alias + default combo, so we need defaults to be passed in. On top of this, it simplifies statically type checking the config.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143150
Approved by: https://github.com/ezyang
2024-12-13 19:28:59 +00:00
bf711a9cce [ROCm] Improve performance of reduce sum for 3D shapes (#143137)
Improve performance of reduce sum for 3D shapes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143137
Approved by: https://github.com/jeffdaily, https://github.com/eqy
2024-12-13 19:02:00 +00:00
6178be822d dynamo tracing perf: direct Guard: 52.58 -> 51.76 (#143059)
See #143056 for overall docs.

This PR: Remove the explicit constant check from `VariableBuilder.install_guards()` and its args calling convention. Also remove a lambda binding.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143059
Approved by: https://github.com/williamwen42, https://github.com/jansel
ghstack dependencies: #143066, #143056, #143058
2024-12-13 18:20:48 +00:00
6bcda3a21a dynamo tracing perf: cache on import_source: 52.9 -> 52.58 (#143058)
See #143056 for overall docs.

This PR: add cache to `InstructionTranslatorBase.import_source()`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143058
Approved by: https://github.com/jansel
ghstack dependencies: #143066, #143056
2024-12-13 18:20:48 +00:00
b472d82c96 dynamo tracing perf: import in build: 60.48 -> 59.92 (#143056)
A series of directed perf improvements to drive down the dynamo tracing cost of
the given test. Before this PR stack the compile took about 60s, and after it takes about
30s. Individual improvements are listed below along with the approximate
improvement of that change.

Tested with this model:
```
@torch.compile(backend="eager")
def model_add(x, y):
    out = x
    for i in range(5000):
        out = torch.add(out, y)
    return out
```

This PR: Stop importing builder in the inner loop of `VariableTracker.build()`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143056
Approved by: https://github.com/jansel
ghstack dependencies: #143066
2024-12-13 18:20:48 +00:00
63e1f97f4b dynamo tracing perf: don't unnecessarily call getframeinfo on the hot path: 47.26 -> 37.66 (#143066)
See #143056 for overall docs.

This PR: Stop using `getframeinfo()` when we only care about the function name
and throw the rest away.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143066
Approved by: https://github.com/jansel
2024-12-13 18:20:48 +00:00
e0c8abda76 Fix potentially undefined behaviour in index_put sample input (#143116)
From the [docs](https://pytorch.org/docs/stable/generated/torch.Tensor.index_put_.html) for index_put_:

> If accumulate is True, the elements in values are added to self. If accumulate is False, the behavior is undefined if indices contain duplicate elements.

Currently the sample inputs for `index_put` generate 2 indices. Because they are generated randomly, they could be the same, leading to undefined behaviour if `accumulate=False`.

This PR changes the input generation to only generate a single index if `accumulate=False` preventing duplicate indices and undefined behaviour.
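
A small sketch of why duplicates matter, following the documented `index_put_` semantics; the tensors are illustrative.

```
import torch

idx = (torch.tensor([1, 1]),)        # duplicate indices
vals = torch.tensor([10.0, 20.0])

x = torch.zeros(5)
x.index_put_(idx, vals, accumulate=True)   # well defined: x[1] == 30.0

y = torch.zeros(5)
y.index_put_(idx, vals, accumulate=False)  # undefined: y[1] may end up 10.0 or 20.0
```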

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143116
Approved by: https://github.com/albanD
2024-12-13 17:59:01 +00:00
23b8ea3094 Allow disabling int specialization on nn.Modules (#142829)
Resolves issue #140464 by adding an option to not specialize int from nn.Modules (False by default to maintain existing behavior).
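
A minimal sketch of how the option would be used; the exact config name (`allow_unspec_int_on_nn_module`) is an assumption here, since this commit message does not spell it out.

```
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.steps = 10  # a plain int attribute on the module

    def forward(self, x):
        return x + self.steps

# Assumed knob name; when enabled, the int attribute is treated as
# unspecialized instead of being baked into the graph as a constant.
torch._dynamo.config.allow_unspec_int_on_nn_module = True
out = torch.compile(M())(torch.randn(4))
```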

Test Plan: `buck2 test mode/opt caffe2/test/dynamo:test_dynamo -- test_modules.py::NNModuleTests::test_nn_module_unspec_int_attr`

Differential Revision: D66837042

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142829
Approved by: https://github.com/ezyang, https://github.com/yanboliang
2024-12-13 17:26:11 +00:00
82a45d19b4 Expose sharedMemPerMultiprocessor device property to python (#143119)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143119
Approved by: https://github.com/ezyang
2024-12-13 16:53:57 +00:00
3f62054de1 [ROCm] upgrade nightly wheels to rocm6.3 - 1 of 2 (docker images) (#142151)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142151
Approved by: https://github.com/jeffdaily
2024-12-13 16:21:17 +00:00
7968732f5b Fix int8 mm V.ops.mul dispatching (#143127)
This is sort of subtle - because we were doing `V.ops.mul` at binding time, we don't redispatch later when we invoke the epilogue, and then later run into the assertion checking in the PR above.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143127
Approved by: https://github.com/drisspg
ghstack dependencies: #143048
2024-12-13 16:17:23 +00:00
da67a6a7bb [inductor] Replace set by OrderedSet (#138466)
Uses the set_linter from https://github.com/pytorch/pytorch/pull/138454
and considerable manual editing

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138466
Approved by: https://github.com/eellison
2024-12-13 16:08:45 +00:00
fbfc530442 [export][ez] Fix forward D67044185 (#143193)
Summary: Fixing forward D67044185 and T210459833 by adding the missing build file.

Test Plan: buck2 build --flagfile fbcode//mode/opt fbcode//admarket/training_data/augmentation/processors/tests:model_manager_test

Differential Revision: D67200056

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143193
Approved by: https://github.com/tugsbayasgalan
2024-12-13 16:06:42 +00:00
04bb82f097 Linux Wheels: Remove triton dependency python < 3.13 constraint (#143162)
We do build the pytorch-triton package for Python 3.13: https://github.com/pytorch/pytorch/actions/runs/12304476674/job/34344764271
Hence the constraint is no longer needed.
This stack enabled torch.compile for Python 3.13: https://github.com/pytorch/pytorch/pull/141264
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143162
Approved by: https://github.com/kit1980
2024-12-13 15:08:44 +00:00
810808d97d Enable cutlass-based all-gather matmul when TORCH_SYMM_MEM_ENABLE_NATIVE_ASYNC_TP is set (#142283)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142283
Approved by: https://github.com/weifengpy, https://github.com/Chillee
2024-12-13 10:29:14 +00:00
3e1f587514 [AOTI] Fix an autotune block grid computation issue (#143098)
Summary: There is a grid computation issue after switching to one-pass codegen in https://github.com/pytorch/pytorch/pull/141980. When max-autotune is turned on, there is an incorrect grid codegen in some cases.

Reviewed By: henrylhtsang

Differential Revision: D67120987

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143098
Approved by: https://github.com/henrylhtsang
2024-12-13 07:52:30 +00:00
9f90583ca2 [CI] Run aarch64 tests on Graviton3 (#143129)
Graviton3 is Armv8.6, which has SVE and BF16 capability.

mkldnn_pattern_matcher skips are tracked in https://github.com/pytorch/pytorch/issues/143146

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143129
Approved by: https://github.com/digantdesai
2024-12-13 07:39:22 +00:00
c37185c76a [BE] Stop using deprecated APIs in mkldnn_pattern_matcher (#143156)
This should fix
```
/var/lib/jenkins/workspace/test/inductor/test_mkldnn_pattern_matcher.py:157: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
```
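
The non-deprecated spelling suggested by the warning looks like the following (a minimal sketch; the dtype choice is illustrative):

```
import torch

# torch.amp.autocast with an explicit device type replaces torch.cpu.amp.autocast
with torch.amp.autocast("cpu", dtype=torch.bfloat16):
    y = torch.mm(torch.randn(8, 8), torch.randn(8, 8))
```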

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143156
Approved by: https://github.com/kit1980
2024-12-13 06:37:20 +00:00
cyy
075905b7bd [14/N] Fix extra warnings brought by clang-tidy-17 (#141644)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141644
Approved by: https://github.com/ezyang

Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>
2024-12-13 06:22:13 +00:00
72fd7abb35 [ca] fix flex attention backward HOP capture in initial graph (#143155)
FIXES https://github.com/pytorch/pytorch/issues/142313

So with previous HOPs, compiled autograd could just inline into their body and get their post-dispatch aten representation. You can't do that with this flex attention HOP, which just wants any proxy tracing mechanism to insert it into its graph. Okay, compiled autograd does use proxy tracing, so we can do that.

This is safe because other than the reenter_make_fx call, there were no other make_fx internals usage in the HOP. And compiled autograd specializes on the AOT backward's saved symints which should cover any changes in shapes to the inputs of the HOP.

However, there's still an issue: Dynamo doesn't know how to handle `FlexAttentionBackwardHOP` and will graph break, so the flex attention backward is running in eager as of this PR. The tlparse looks really scuffed after the compiled autograd capture: https://manifold.edge.x2p.facebook.net/v0/read/tree/logs/.tmpMMHBEH/index.html?bucketName=tlparse_reports&apiKey=tlparse_reports-key&withPayload=1&timeoutMsec=10

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143155
Approved by: https://github.com/drisspg
2024-12-13 06:04:39 +00:00
b4f4c75e19 [dynamo] Support multiple inheritance for custom dict construction (#142416)
This patch applies a local and practical workaround for custom dict
construction when multiple inheritance is involved.

Handling multiple inheritance in general could be a lot more involved,
so I created #142414 to track that.

Fixes #141118.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142416
Approved by: https://github.com/jansel
2024-12-13 05:13:05 +00:00
b5d8d2444a add README.md for compile time benchmarks (#143145)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143145
Approved by: https://github.com/laithsakka
ghstack dependencies: #141517, #143143
2024-12-13 05:12:26 +00:00
b7ad52abb0 Use new group instead of split group on non-CUDA device (#141469)
Motivation:

Currently, `split_group` only works for the NCCL backend (https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py#L4745), so we need to use `new_group` on other, non-CUDA devices.
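
A minimal sketch of that fallback path on a backend without `split_group` support; the gloo backend and rank selection are illustrative, and it assumes the process group was already initialized by the launcher.

```
import torch.distributed as dist

# assumes dist.init_process_group(backend="gloo") has already run under torchrun
ranks = list(range(dist.get_world_size() // 2))
subgroup = dist.new_group(ranks=ranks)  # works on any backend, unlike split_group
```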

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141469
Approved by: https://github.com/kwen2501, https://github.com/gujinghui, https://github.com/albanD
2024-12-13 05:11:33 +00:00
57c46af47a [Inductor][CPU] Add torchao da8w8 pattern with sym quantized act & wgt (#142110)
### Summary

Extends #142036 for Inductor pattern-matching pattern covered for torchao API `int8_dynamic_activation_int8_weight` in the following scenario (inference-only, freezing enabled) -

- int8 quantized (symmetrically) activation (per token quantized).
- Statically (so, scales are also constant. But then they would have been constant even in case of dynamic quantization due to constant weights, anyway) per-channel int8 quantized (symmetrically) weights (which are also constant because freezing is enabled).

The pattern that's matched is `torch._int_mm` -> convert to FP32/BF16 -> [optional expand for activation scale] -> `mul` -> `mul`.

We don't check whether the activation is dynamically quantized or whether the weights are statically quantized, though (since the implementation won't have any side effects even if that weren't true).

In practice, it also matches the smooth-quant int8 quantized linear pattern if its output is not reshaped (if activation is 2D).
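
A rough eager-mode sketch of the decomposed da8w8 linear that the pattern matcher targets; shapes, scale layouts, and random data are illustrative assumptions.

```
import torch

act_int8 = torch.randint(-128, 127, (32, 64), dtype=torch.int8)
wgt_int8 = torch.randint(-128, 127, (64, 32), dtype=torch.int8)
act_scale = torch.rand(32, 1)                  # per-token activation scale
wgt_scale = torch.rand(32)                     # per-channel weight scale

out_int32 = torch._int_mm(act_int8, wgt_int8)  # int8 x int8 -> int32 GEMM
out = out_int32.to(torch.float32) * act_scale * wgt_scale
```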

### More details

oneDNN int8 matmul supports application of per-channel weight scale but not a vector activation scale, which could be applied as a post op, but is currently unsupported in ATen. Bias addition (which could be supported with an add post-op) is also unfused.

The fusion pattern used in this PR is `torch._int_mm` -> convert to FP32/BF16 -> `mul`, which will be replaced by the oneDNN qlinear op.

The speedup over eager-mode is due to 2 reasons -
1. fusion of int8xint8 -> int32 GEMM, conversion to FP32/BF16 & application of weight scale. (In case of BF16, many intermediate conversions are also avoided).
2. weight is pre-packed & cached by Inductor, so a reorder is avoided at run-time.

But, in the future, the whole pattern (including application of activation scale, which would be a mul post-op) + bias could be fused if corresponding support would be enabled in ATen.

### Verification

Added UT in this PR
```
python test/inductor/test_mkldnn_pattern_matcher.py -v -k test_da8w8_sym_act_sym_wgt_with_int_mm
```

#### Corresponding torchao UTs

1. int8 Smoothquant legacy API - `TORCHINDUCTOR_FREEZING=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+inductor" python test/integration/test_integration.py -v -k test_non_dynamically_quantizable_linear`.
The difference from #139595 is that there are no reshapes of the linear output in this pattern.

2. int8 da8w8 - symmetrically quantized activation (dynamically) & statically quantized weights -  ` TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+inductor" TORCHINDUCTOR_FREEZING=1 python test/integration/test_integration.py -v -k test_int8_dynamic_quant_subclass_api_0_cpu`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142110
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #142036
2024-12-13 04:59:03 +00:00
b731ced91f Prologue Fusion (#134532)
This PR extends our ability to fuse pointwise nodes onto the outputs of triton templates (epilogue fusion) with the ability to fuse pointwise nodes into the inputs of triton templates - prologue fusion.

Similar to the store_output api:
`{{store_output(("idx_m", "idx_n"), "acc", "mask")}}`

And the modification api:

```
{{ modification(
    subgraph_number=0,
    output_name="post_mod_scores",
    score="qk",
    out="qk"
) | indent_except_first(1) }}
```

We have:

```{{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}```

Because we are now loading the input with explicit indices and mask, I needed to rewrite the mm kernel to no longer update the [pointers by BLOCK_K](bb03ef7aca/torch/_inductor/kernel/mm.py (L110-L111)) on every iteration and instead compute indices on each iteration from the k_idx of each loop. This did not have any perf difference.

There are a couple main use cases for prologue fusion:

- Fusing dequants into a matmul. particularly for more bandwidth bound scenarios.
- Fusing gather into a matmul. This is useful particularly in MOE. See https://github.com/pytorch/pytorch/issues/134535 for more details.

Prologue fusion is generally much less profitable than epilogue fusion, because it must be applied to an element of an input on each loop of the matmul, compared to only once in the epilogue (gather into matmul is a potential exception). Accordingly, we are much less aggressive in attempting prologue fusion. We only attempt fusion if it does not increase the number of memory bytes read inside the triton template, multiplied by a small factor to allow gathers. This restricts reliably unprofitable fusions like fp32->fp16 conversion inside the kernel. In a future PR we could potentially add an API for being more aggressive if we know we are in a bandwidth-bound regime. See: https://github.com/pytorch/pytorch/pull/134532/files#diff-d2539c9c8dc6a3d7e457767a880612e96d3c85752a77ead49a9e4e00a3e4c3c7R3060-R3066

Other notes:

By default we will upcast to fp32 inside every kernel. This matches eager numerics. This is fine enough for epilogue because it is only done once (although it is probably unnecessary for say a relu) but tanks perf for prologue. I am currently using the `codegen_upcast_to_fp32` option to avoid it, but that will not work for libdevice calls that require fp32. We will need https://github.com/pytorch/pytorch/pull/136778/ and dtype-aware codegen to upcast fp16 ops into libdevice calls.

With prologue fusion, we now have essentially separate kernels for each input, and for the output. I had to increase the number of fields that are swapped out in `set_subgraph_body` by a large number :/ I also updated the fusion logic because the inputs will have a different group than the outputs. Maybe as part of enabling multiple outputs, this could get cleaned up a bit.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134532
Approved by: https://github.com/jansel
2024-12-13 04:18:25 +00:00
ceb664aca6 add float_args benchmark (#143143)
71% improvement with automatic dynamic float arguments

with specialize_float=False
```
float_args,compile_time_instruction_count,346293869
```

with specialize_float=True
```
float_args,compile_time_instruction_count,1198546486
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143143
Approved by: https://github.com/laithsakka
ghstack dependencies: #141517
2024-12-13 03:35:59 +00:00
ab04f3aee1 [ca] set autograd graph task state (#143108)
GraphTask holds metadata needed for a single execution of backward(), it is 1:1 with backward calls, at least for compiled autograd. It is used for certain torch._C global autograd state APIs.

In SAC, we use torch._C._current_graph_task_id() as a dict key to store information during unpack hook execution: a5fb07af27/torch/utils/checkpoint.py (L1128)

If we don't set an active task, it will randomize the key and will do its logic as if each unpacked tensor were from a different graph task:
a5fb07af27/torch/utils/checkpoint.py (L1112-L1115)
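
A minimal sketch of the dict-keying pattern in question, using saved-tensor hooks; the bookkeeping itself is illustrative, not the checkpoint code.

```
import torch

per_task_count = {}

def pack(t):
    return t

def unpack(t):
    # key per-backward bookkeeping by the current graph task, as checkpoint does
    task_id = torch._C._current_graph_task_id()  # -1 if no graph task is active
    per_task_count[task_id] = per_task_count.get(task_id, 0) + 1
    return t

x = torch.randn(3, requires_grad=True)
with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
    y = (x * x).sum()
y.backward()  # all unpacks during this backward should share one task id
```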

The sketchy part of this PR is that in eager autograd, GraphTask is mutated during execution. But inspecting the struct, the mutation seems to only be used to communicate between autograd threads (created when multiple devices are involved) or for deprecated uses. We shouldn't run into the mutation case at all in compiled autograd. Also, only the graph task id is accessible from python hooks.

FIXES https://github.com/pytorch/pytorch/issues/142862

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143108
Approved by: https://github.com/jansel, https://github.com/albanD
2024-12-13 03:10:48 +00:00
dbe4b69df0 [Inductor] Fix cooperative reduction tests broken in recent refactor (#143135)
These tests were broken by https://github.com/pytorch/pytorch/pull/142020. This PR updates the fixed configs accordingly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143135
Approved by: https://github.com/jansel, https://github.com/huydhn
2024-12-13 02:03:43 +00:00
cyy
9f5ebf3fc6 Clang-format aten/src/ATen/native/Tensor*{cpp,h} (#143089)
These files are relatively stable, so it should be safe to format them without incurring conflicts

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143089
Approved by: https://github.com/albanD
2024-12-13 00:06:48 +00:00
2533a5a843 upgrade sccache to 0.9.0 (#142854)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142854
Approved by: https://github.com/malfet, https://github.com/ZainRizvi
2024-12-12 22:49:50 +00:00
fb93462904 [Reopen][Inductor][CPU] Fuse SmoothQuant int8 linear pattern (#142036)
Reopen of https://github.com/pytorch/pytorch/pull/139595

**About the PR**
In the implementation of SmoothQuant in Torchao, quantized linear is computed by `_int_mm(a, b)` + `mul(b_scale)` + `mul(a_scale)` (+ optional `add` for bias) with `reshape` and `convert_dtype` in between.
This PR adds a pass to fuse the corresponding patterns:
- (no bias) `reshape -> _int_mm -> convert_element_type -> (expand -> mul) -> mul -> reshape`
- (with bias) `pattern_no_bias -> add -> reshape -> reshape`

The patterns are replaced by `onednn.qlinear_pointwise` and `onednn.qlinear_prepack`, the latter of which is evaluated and frozen during the freezing process of Inductor. The final graph contains `onednn.qlinear_pointwise` only with packed weight constants.

Note that `onednn.qlinear_pointwise` only supports a scalar activation scale, which is a limitation of oneDNN library, so in that case we set activation scale to 1 and bias to none and apply scales and add bias after `onednn.qlinear_pointwise`.

**Validation results**
Accuracy/perplexity is not changed with or without this fusion pass.
Latency is improved by >10% with the fusion pass.
Test method:
- Model: EleutherAI/gpt-j-6b
- Hardware: Intel(R) Xeon(R) Platinum 8490H, running on 1 socket, 60 cores
- Using Intel OMP and Tcmalloc
- Running [the example script of SmoothQuant in Torchao](https://github.com/pytorch/ao/blob/main/torchao/prototype/smoothquant/example.py) with `TORCHINDUCTOR_FREEZING=1 numactl -N1 python example.py -m EleutherAI/gpt-j-6b --device=cpu --quant-mode=dynamic --compile`

**Test plan**
```
python test/inductor/test_mkldnn_pattern_matcher.py -k test_smooth_quant_with_int_mm
```

Differential Revision: [D66796966](https://our.internmc.facebook.com/intern/diff/D66796966)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142036
Approved by: https://github.com/jerryzh168, https://github.com/jgong5

Co-authored-by: sanchitintel <sanchit.jain@intel.com>
2024-12-12 21:18:03 +00:00
602c86a420 [DSD] Fix strict=False case for DDP (#143038)
Summary:
As title

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143038
Approved by: https://github.com/mori360
2024-12-12 21:15:21 +00:00
a7509e98c5 [pipelining] fix backward_one_chunk when the output of the model is a… (#142237)
fixes #142229

If any of the ``stage_output`` tensors is a view, it cannot be detached in place. Replacing it with ``t = t.detach()`` or similar would not free the graph for the output given to the user. Detaching the base tensor could cause a side effect.

The same code is used in ``_backward.py`` (b64a537993/torch/distributed/pipelining/_backward.py (L215)) but does not seem to cause any issue in my case. Maybe needs some investigation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142237
Approved by: https://github.com/H-Huang
2024-12-12 20:59:35 +00:00
39cacc1d81 Fix missing tests on test tool lint job (#143052)
A follow-up from https://github.com/pytorch/pytorch/pull/142476#discussion_r1878888558 where some tests are not discovered correctly by pytest

### Testing

https://github.com/pytorch/pytorch/actions/runs/12287448581/job/34289531307?pr=143052#step:14:162 shows the correct number of tests now
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143052
Approved by: https://github.com/ZainRizvi
2024-12-12 20:29:32 +00:00
82ce888273 c10::string_view -> std::string_view in more places (#142517)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142517
Approved by: https://github.com/malfet
2024-12-12 19:45:59 +00:00
0b75b7ff2b [Easy] factor out inductor ophandler decompositions (#142400)
Factor out inductor operator decompositions

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142400
Approved by: https://github.com/Chillee, https://github.com/jansel
2024-12-12 19:03:26 +00:00
c170248b78 [Profiler] Enable Iterative Step without profiler in fbcode (#142077)
Summary: Adds a post-optimizer hook for fbcode so that we can run the iterative step on demand without having to use a frontend profiler interface. Since this is being used more frequently, it would be convenient for users to be able to trigger this on-demand feature without having to worry about being within some timing window.

Test Plan: Ran iterative tracing without profiler.profile

Differential Revision: D66734119

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142077
Approved by: https://github.com/briancoutinho
2024-12-12 19:00:13 +00:00
e3fe5f62b6 Remove Checkout pytorch/builder for Linux Binary Builds (#143125)
Follow Up after: https://github.com/pytorch/pytorch/pull/142282

Remove Checkout pytorch/builder for Linux Binary Builds
I believe we were not using builder already, hence remove this checkout.
We should be using scripts from this folder:
```
/pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh
```

TODO: Will follow up with removing BUILDER_ROOT everywhere from the PyTorch repo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143125
Approved by: https://github.com/kit1980
2024-12-12 18:55:00 +00:00
d48b16a725 Revert "[Dynamo] only import einops if version is lower than 0.7.0 (#142847)"
This reverts commit 357e261b1eded933d98de18ddcef2b083f87259d.

Reverted https://github.com/pytorch/pytorch/pull/142847 on behalf of https://github.com/atalman due to Breaks binary builds, see the comment above ([comment](https://github.com/pytorch/pytorch/pull/142847#issuecomment-2539759580))
2024-12-12 18:44:35 +00:00
b0c3d39e0d [pipelining] Update tutorials and documentation (#143045)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143045
Approved by: https://github.com/wconstab, https://github.com/kwen2501
2024-12-12 18:42:17 +00:00
ee5bceaee6 [sigmoid] Write the new export schema format to archive without breaking compatibility. (#142511)
Summary:
This diff makes it possible to migrate to PyTorch's OSS export schema from sigmoid. Basically, we add a new field called "methods" to ExportedProgram in the Model definition, which contains the thrift schema generated based on schema.py from OSS. This way, we can keep writing the old fields while double-writing a new format in equivalent form. Since thrift doesn't support inlining type definitions, we do it manually here, and it shouldn't break on-wire compatibility. As long as every sigmoid user is using sigmoid.frontend.serialization.serialize, we always guarantee to have the new format saved as well.

Eventually we will use JSON deserialization from OSS, so we will only keep this double writing for a couple of months. Eventually, we will migrate every serialization path to the OSS workflow.

Test Plan:
buck test mode/opt sigmoid/frontend:serialization_test
buck test mode/opt sigmoid/frontend/test_gpu:serializer_test

Differential Revision: D67044185

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142511
Approved by: https://github.com/desertfire
2024-12-12 18:41:10 +00:00
5dabe2d464 Fix NJT backward tests (#143072)
This PR fixes some issues with NJT backward / compile backward tests:
1. `requires_grad` was not being propagated appropriately during `SampleInput` generation, so a LOT of backward cases were untested before (sad times). This PR utilizes a helper function `_clone()` to clone() / detach() NJTs for SampleInputs while preserving `requires_grad` status. Note: the clone() / detach() stuff is for autograd; can't have two SampleInputs as part of the same autograd graph.
2. Per-sample skips weren't -fully- working; the op logic would still be invoked even with a skip. I found this out thanks to `split_with_sizes`, which segfaults during backwards because it tries to use an NST-specific formula. As annoying as it is, I tried a ton of things but ultimately had to split the `subtest_ctx` into that + a `skip_xfail_ctx` to run the subtests within.
    * Updated all uses of per-sample skips / xfails: 4 in `test_nestedtensor.py` and 1 in `test_vmap.py`
3. Added the appropriate skips / xfails to get everything passing. There are a shitton of bugs to fix!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143072
Approved by: https://github.com/cpuhrsch, https://github.com/soulitzer
2024-12-12 18:06:23 +00:00
d47a80246a [dynamo][pytree][3/N] make CXX pytree traceable: tree_map / tree_map_ (#137399)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137399
Approved by: https://github.com/jansel
ghstack dependencies: #137398
2024-12-12 18:05:25 +00:00
7edeb1005a [dynamo][pytree][2/N] make CXX pytree traceable: tree_flatten / tree_unflatten / tree_structure (#137398)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137398
Approved by: https://github.com/jansel
2024-12-12 18:05:25 +00:00
c85323c5e8 Revert "Tests Generelization for multiple accelerator devices (#139184)"
This reverts commit b576a8c318201b63269f7ff25ec5830d00662a7a.

Reverted https://github.com/pytorch/pytorch/pull/139184 on behalf of https://github.com/clee2000 due to Failing internally when trying to pickle distributed test files D67098795 ([comment](https://github.com/pytorch/pytorch/pull/139184#issuecomment-2539610187))
2024-12-12 17:48:30 +00:00
2f0fe82f6d Revert "[14/N] Fix extra warnings brought by clang-tidy-17 (#141644)"
This reverts commit 24a5a2ef258d2b482ded674cdb9555afaf081402.

Reverted https://github.com/pytorch/pytorch/pull/141644 on behalf of https://github.com/clee2000 due to failing internally D67112938 ([comment](https://github.com/pytorch/pytorch/pull/141644#issuecomment-2539602023))
2024-12-12 17:43:36 +00:00
dc23f1944a Remove unused Python variables in torch/[_-a]* (#133492)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133492
Approved by: https://github.com/albanD
2024-12-12 17:39:14 +00:00
7667235a23 c10::optional -> std::optional (#142514)
Fixes issues introduced in https://github.com/pytorch/pytorch/pull/141348 and https://github.com/pytorch/pytorch/pull/139578

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142514
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2024-12-12 17:23:46 +00:00
520ba556cd [Inductor] Refactor "r" reduction prefix to {"r0_", "r1_"}. (#142020)
Preparatory refactor for https://github.com/pytorch/pytorch/pull/137243.

# Feature

This PR changes the `RINDEX` / `"r"` symbol type to `(R0_INDEX, R1_INDEX)` and `("r0_", "r1_")`, respectively. This allows the relevant code to support 2D (often ND) reductions. Unlike the parent PR, this one does not change the tiling algorithm, so `"r1_"` is never used. However, it prepares other parts of the system to handle `"r1_"` once we start using it. This should significantly reduce the chances of hitting merge conflicts, making the parent PR much easier to land.

The only change to the generated triton code is to rename `"rindex"` -> `"r0_index"`, `"RBLOCK"` -> `"R0_BLOCK"`, etc. To maintain compatibility with existing codegen, this also generates aliases to the old reduction variables like `rindex = r0_index`. If we generated 2D reductions (which this PR will not do), the aliases would be more complicated and would collapse 2D multi-indices to linear indices. See some example kernels in the parent PR.

These aliases can be eliminated by the Triton compiler, and should not impact the final machine code running on the GPU. See the perf testing in the parent PR which confirms the aliases do not impact perf.

# Test plan

The existing CI provides good coverage. This PR modifies the expected code in a few places, renaming reduction variables from `r.*` to `r0_.*`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142020
Approved by: https://github.com/jansel

Co-authored-by: Jason Ansel <jansel@meta.com>
2024-12-12 17:22:20 +00:00
cf538efd0c Revert "Hide torch_python symbols (#142214)"
This reverts commit da76e912a4c58c649061fc84b29a42714897a0ca.

Reverted https://github.com/pytorch/pytorch/pull/142214 on behalf of https://github.com/huydhn due to The MacOS failure looks legit as it shows up in trunk ([comment](https://github.com/pytorch/pytorch/pull/142214#issuecomment-2539543504))
2024-12-12 17:15:51 +00:00
15ee2960e1 [aot] Functionalize aot backward prologue and epilogue wrappers (#142415)
For functional compiled autograd, we're having dynamo trace through the aot backward implementation. To avoid graph breaking and imposing too many restrictions, we allow_in_graph the prologue and epilogue. This adds 2 restrictions:
- code must be available in the global context
- inputs other than tensors/symnodes must be const foldable
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142415
Approved by: https://github.com/bdhirsh
2024-12-12 17:14:29 +00:00
30b61e521c [logging] Populate compile_time_autotune_time_us (#143104)
See testing in attached diff

Differential Revision: [D67128210](https://our.internmc.facebook.com/intern/diff/D67128210)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143104
Approved by: https://github.com/ezyang
2024-12-12 17:08:43 +00:00
e3ddc0ca33 Support remote caching requiring redis auth (#141679)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/141679
Approved by: https://github.com/masnesral
2024-12-12 17:07:50 +00:00
0f78be5573 Fix search icon (#142808)
Removing:

.pytorch-left-menu-search input[type=text] {
    background-image: none;
}
so that the search icon correctly appears in the sphinx searchbox

Also, fixing scrolling

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142808
Approved by: https://github.com/albanD
2024-12-12 16:09:30 +00:00
725526abc5 Fix scan dtypes (#143048)
Fix for https://github.com/pytorch/pytorch/issues/142883. We weren't getting test coverage of scan because the tests were being skipped; see https://github.com/pytorch/pytorch/issues/143053.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143048
Approved by: https://github.com/arui-meta, https://github.com/blaine-rister
2024-12-12 15:57:00 +00:00
d83a049232 [EZ] Update lintrunner in CI to 0.12.7 (#143073)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143073
Approved by: https://github.com/wdvr
2024-12-12 15:35:37 +00:00
7cc3a591c2 [FlexAttention] Fix a few more symbolic shape issues (#142816)
# Summary

See  https://github.com/pytorch/pytorch/issues/139064 for more details. This fixes a number of issues with dynamic shapes. Thanks to @alexdremov for finding most of these

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142816
Approved by: https://github.com/yanboliang, https://github.com/ezyang
2024-12-12 15:29:21 +00:00
84f791381a Python 3.13 CI add crossref test to existing linux-focal-py3_13-clang10-build (#143074)
Add  linux-jammy-py3_13-gcc11-build and test - similar to Py 3.9
Add crossref test to existing linux-focal-py3_13-clang10-build
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143074
Approved by: https://github.com/malfet
2024-12-12 14:45:56 +00:00
cd1b5924d5 Revert "[Inductor] Use sleef implementation for CPP backend asinh codegen (#142360)"
This reverts commit 79cf8fa75176a8f6bb79d426c6d0f9369d03ff98.

Reverted https://github.com/pytorch/pytorch/pull/142360 on behalf of https://github.com/jeanschmidt due to seems to have broken macos tests ([comment](https://github.com/pytorch/pytorch/pull/142360#issuecomment-2539143039))
2024-12-12 14:42:55 +00:00
30e2b322a1 Add <string> to uninteresting_files (#142984)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142984
Approved by: https://github.com/albanD, https://github.com/IvanKobzarev
2024-12-12 14:35:30 +00:00
91261107e0 debug handler maintain through decomposition (#141612)
Add checks in the ao numeric debugger to guard debug handle consistency across aten op decomposition.

Differential Revision: [D66517480](https://our.internmc.facebook.com/intern/diff/D66517480/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/141612
Approved by: https://github.com/jerryzh168
2024-12-12 12:26:45 +00:00
18785c1af9 [BE][accelerator] formalize API name {current,set}_device_{idx => index} (#140542)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/140542
Approved by: https://github.com/guangyey, https://github.com/albanD
2024-12-12 10:53:48 +00:00
a5fb07af27 [Torch][#142396]Resolve Failure When Uploading To Remote Storage (#143046)
Summary: Catch the io.UnsupportedOperation exception so that streams without fileno support don't cause a failure.

Test Plan: UT

Differential Revision: D67108487

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143046
Approved by: https://github.com/saumishr
2024-12-12 08:17:15 +00:00
497f89ff83 fix dynamo nn module stack fqn (#142823)
Dynamo can produce sources that have funny patterns in their `.name()` that break `nn_module_stack` fqns. Added a test that used to have `._modules` inside nn_module_stack fqns, now doesn't. (Unfortunately couldn't repro a case mentioned in the GH issue where `.slice(...)` is claimed to appear as well.)

Fixes https://github.com/pytorch/pytorch/issues/141939

Differential Revision: [D67064189](https://our.internmc.facebook.com/intern/diff/D67064189/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142823
Approved by: https://github.com/pianpwk, https://github.com/zhxchen17
2024-12-12 07:02:13 +00:00
da76e912a4 Hide torch_python symbols (#142214)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142214
Approved by: https://github.com/ezyang
2024-12-12 07:00:54 +00:00
dcb128d495 [ROCm] TunableOp use thread-safe getenv functions (#142274)
Fixes #142403

~~PR fixes breakage due to this commit
8cd7ad8b48~~

PR is a partial reland of this https://github.com/pytorch/pytorch/pull/140594 with a unit test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142274
Approved by: https://github.com/jeffdaily, https://github.com/eqy
2024-12-12 06:49:26 +00:00
5ad7d5304c [DTensor][random] add HSDP+TP model init test (#143077)
**Summary**
1. Move the model init tests from `DistTensorRandomOpTest` to `DistTensorRandomInitTest`
2. Added an HSDP+TP meta init test to show the correct model init result in this use case. Note that this test requires 8 GPUs to run, and our CI doesn't have that capacity, so this test will be skipped in CI. A local run shows that the test passes on an 8-GPU host.

**Test**
`pytest test/distributed/_tensor/test_random_ops.py -s -k test_hsdp_tp_model_meta_init`

<details>
<summary> Test Result </summary>
<img width="3343" alt="image" src="https://github.com/user-attachments/assets/a960c5e6-37bc-49be-9e36-ecc29ed47eb0" />

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143077
Approved by: https://github.com/weifengpy
2024-12-12 06:46:16 +00:00
357e261b1e [Dynamo] only import einops if version is lower than 0.7.0 (#142847)
Fixes internal xref (https://fb.workplace.com/groups/257735836456307/posts/804793021750583/?comment_id=805229281706957&reply_comment_id=805232695039949)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142847
Approved by: https://github.com/zou3519
2024-12-12 06:38:22 +00:00
9701c50bdc [Dynamo] Add missing tensor builtins to allowed functions (#142841)
Fixes https://github.com/pytorch/pytorch/issues/141232

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142841
Approved by: https://github.com/yanboliang
2024-12-12 06:38:19 +00:00
b25f64b613 Add -o pipefail for all bash scripts (#143050)
Fixes #142380
I have added -o pipefail in all bash scripts in pytorch/.ci/pytorch. Sorry I didn't double-check the submodule in my last PR. Thanks for the correction! Please contact me again if there are any problems with this fix^^. (Actually contributing to the open source community is an assignment for one of my courses and today is the deadline so I rushed to revise it when I saw an email early in the morning. Haha.)
 @ezyang @malfet @huydhn @zou3519

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143050
Approved by: https://github.com/ezyang, https://github.com/huydhn

Co-authored-by: Edward Z. Yang <ezyang@mit.edu>
2024-12-12 06:18:41 +00:00
79cf8fa751 [Inductor] Use sleef implementation for CPP backend asinh codegen (#142360)
**Summary**
Fix https://github.com/pytorch/pytorch/issues/142345. Previously, we used `asinh(x) = log(x + sqrt(1 + x**2))` to calculate the result of `asinh`. The issue happens with an input of `-10000.1`, which makes `x + sqrt(1 + x**2)` close to 0, and log(0) is invalid. We use the `sleef` implementation in this PR to fix this issue.
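
A small repro sketch of the cancellation (exact printed values depend on the platform):

```
import torch

x = torch.tensor(-10000.1, dtype=torch.float32)
naive = torch.log(x + torch.sqrt(1 + x * x))  # catastrophic cancellation: the sum rounds to ~0
print(naive)           # -inf or a badly inaccurate value in float32
print(torch.asinh(x))  # roughly -9.9035, the correct reference value
```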

**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_asinh_with_corner_inputs
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142360
Approved by: https://github.com/jgong5
2024-12-12 05:40:48 +00:00
1e2b841675 [ROCm] Prune old gfx archs gfx900/gfx906 from binaries (#142827)
Remove gfx900 and gfx906 archs as they're long-in-the-tooth. Should help reduce the increasing size of ROCm binaries.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142827
Approved by: https://github.com/jeffdaily
2024-12-12 05:33:40 +00:00
cyy
fda43c98d1 Improve implementation of quantized_batch_norm (#141570)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141570
Approved by: https://github.com/albanD
2024-12-12 04:35:00 +00:00
cyy
20df80a669 Remove unneeded optional dereference (#141578)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141578
Approved by: https://github.com/swolchok
2024-12-12 04:34:43 +00:00
cyy
f7b9533c3f [4/N] Apply bugprone-unchecked-optional-access (#142832)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142832
Approved by: https://github.com/albanD
2024-12-12 04:33:32 +00:00
fbbafd0320 Turn on AOTAutogradCache by default on open source (#141981)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/141981
Approved by: https://github.com/bdhirsh, https://github.com/oulgen
2024-12-12 04:21:11 +00:00
4d0775462e E2E composability testing (#141398)
Add a 3D (pp+tp+fsdp) test `test_3d_with_tp_dp_pp` in test_pp_composability.
Currently we provide @parametrize on:
- "ScheduleClass" for pp in [ScheduleGPipe, Schedule1F1B, ScheduleInterleaved1F1B, ScheduleLoopedBFS, ScheduleInterleavedZeroBubble]
- "MixedPrecisionParam" for fsdp in [torch.bfloat16, torch.float32]

Future work:
1. add fp8
2. add cp(context parallelism) to enable 4D test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141398
Approved by: https://github.com/wconstab, https://github.com/kwen2501
2024-12-12 04:19:29 +00:00
cyy
2903cf0ad8 Re-enable some C++ warnings (#142332)
This PR enables some C++ warnings since the code base is fairly clean. Meanwhile, Wextra-semi is disabled on CUDA-generated code since there is no way to fix those warnings without the cooperation of the CUDA team.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142332
Approved by: https://github.com/albanD, https://github.com/eqy
2024-12-12 04:02:12 +00:00
f892f9862a [ROCM] Enable *_load_dwordx4 ISA for BFloat16 and Half. (#141397)
Remove the input_vec_size constexpr and move it to a template parameter. This enables generation of vectorized loads in the ROCm AMDGPU backend.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141397
Approved by: https://github.com/jeffdaily

Co-authored-by: Jerry Mannil <jerry.mannil@amd.com>
2024-12-12 03:27:49 +00:00
4d8357e912 [CD] Use Anaconda cmake for Mac builds (#143054)
To find the Anaconda-env-installed OpenMP
(as OpenMP from PyPI is looking for it in different places)

For posterity: our build script names are very confusing:
 - [`.ci/wheel/build_wheel.sh`](6cb6e8d790/.ci/wheel/build_wheel.sh) is only used for MacOS wheel/libtorch builds
 - [`.ci/manywheel/build.sh`](6cb6e8d790/.ci/manywheel/build.sh) are used for Linux wheel/libtorch builds
 - [`.ci/pytorch/windows/build_pytorch.bat`](6cb6e8d790/.ci/pytorch/windows/build_pytorch.bat) is used for Windows wheel builds

Fixes https://github.com/pytorch/pytorch/issues/142873
Pull Request resolved: https://github.com/pytorch/pytorch/pull/143054
Approved by: https://github.com/Jack-Khuu, https://github.com/atalman
2024-12-12 03:05:46 +00:00
cb354f8b47 [PGNCCL] Move NCCLComm impl to cpp (#142826)
BE as titled. No behavior change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142826
Approved by: https://github.com/wconstab, https://github.com/c-p-i-o
2024-12-12 02:45:52 +00:00
06075d3d18 [Inductor][CPP] Fix Mask Dtype mismatch (#142103)
**Summary**
Fix issue: https://github.com/pytorch/pytorch/issues/141559. The `vec_mask` store data type isn't aligned when doing `bitwise_and`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142103
Approved by: https://github.com/jgong5
2024-12-12 01:21:32 +00:00
d68403df3b filelock: Make waitcounter variant to use (#139816)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139816
Approved by: https://github.com/ezyang
2024-12-12 01:18:34 +00:00
6cb6e8d790 Python 3.11, 3.12 Remove tests covered by 3.13 (#143078)
We do have linux-focal-py3_13-clang10-build and test. Hence removing linux-focal-py3_11-clang10-build/test and linux-focal-py3_12-clang10-build/test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143078
Approved by: https://github.com/huydhn, https://github.com/malfet
2024-12-12 01:12:00 +00:00
1dd6f21029 Cuda 12.1 - Remove from trunk tests (#143076)
Remove cuda 12.1 from trunk tests. This is covered by 12.4 tests.
Move ``libtorch-linux-focal-cuda12_4-py3_7-gcc9-debug-build`` -> ``libtorch-linux-focal-cuda12_4-py3_10-gcc9-debug-build``

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143076
Approved by: https://github.com/huydhn, https://github.com/malfet
2024-12-12 01:10:09 +00:00
bd7d81db9e Use validate-docker-images workflow from test-infra (#143081)
After PR: https://github.com/pytorch/test-infra/pull/6029 use validate-docker-images.yml from test-infra.
Related to: https://github.com/pytorch/builder/issues/2054

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143081
Approved by: https://github.com/huydhn
2024-12-12 00:24:27 +00:00
cyy
db81a3f31c [TorchGen] remove remove_non_owning_ref_types from valuetype_type (#142449)
It is not used
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142449
Approved by: https://github.com/ezyang
2024-12-12 00:15:44 +00:00
1b3f8b7589 Revert "[RELAND] Add UTs for accelerator device-agnostic runtime APIs (#133572)"
This reverts commit 209119424922b135fef39aba1f25da3b67f5879a.

Reverted https://github.com/pytorch/pytorch/pull/133572 on behalf of https://github.com/huydhn due to Sorry for reverting your change but the new test is still very flaky on MacOS even when it does not segfault anymore ([comment](https://github.com/pytorch/pytorch/pull/133572#issuecomment-2537256522))
2024-12-11 21:47:18 +00:00
dfe5669076 Revert "[RELAND] Add device-agnostic runtime Device/Stream C++ API (#138677)"
This reverts commit 734bb01460d59e661e9114e7aa17e04821e4b57a.

Reverted https://github.com/pytorch/pytorch/pull/138677 on behalf of https://github.com/huydhn due to Sorry for reverting your change but the new test is still very flaky on MacOS even when it does not segfault anymore ([comment](https://github.com/pytorch/pytorch/pull/133572#issuecomment-2537256522))
2024-12-11 21:47:17 +00:00
cd50bd8477 Revert "[BE][accelerator] formalize API name {current,set}_device_{idx => index} (#140542)"
This reverts commit fb02b40d27737213e0547dec0e30977dfc50f2f3.

Reverted https://github.com/pytorch/pytorch/pull/140542 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but I need to revert this in order to revert https://github.com/pytorch/pytorch/pull/133572#issuecomment-2537204202 due to a conflict ([comment](https://github.com/pytorch/pytorch/pull/140542#issuecomment-2537253665))
2024-12-11 21:44:23 +00:00
de313f1155 [foreach_map] Initial foreach map HOP impl for inference (#142098)
This is the initial foreach map HOP for pointwise ops which will be extended in the future to support grouped GEMMs and other ops.

This PR utilizes PrimHOPBase class to represent foreach_map as a HOP with a single subgraph. The way this is implemented is that the user API `foreach_map` provides a single pointwise torch op, and internally this function calls a polyfill which has the same semantics as a foreach op (ie iterates over lists of operands applying the op elementwise). The higher order op is passed through the stack down to inductor where a lowering in essence inlines the subgraph into the main graph. This is done by interpreting it with a pointwise subgraph lowering, grouping the outputs by device, and registering the output buffers as foreach groups as applicable. For testing I was able to reuse the existing foreach tests by creating a wrapper function which matches the foreach op interfaces for those tests and then run all of the existing foreach tests on foreach_map.
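
A minimal sketch of the polyfill semantics described above, written as plain Python rather than the actual HOP machinery; the helper name is made up.

```
import torch

# apply one pointwise op elementwise across lists of operands, like a foreach op would
def foreach_map_like(op, xs, *rest):
    return [op(x, *(r[i] for r in rest)) for i, x in enumerate(xs)]

xs = [torch.randn(3) for _ in range(4)]
ys = [torch.randn(3) for _ in range(4)]
out = foreach_map_like(torch.add, xs, ys)  # matches torch._foreach_add(xs, ys)
```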

TODO before landing:
* Add tests for general functions
* Test warning if unsupported op will block fusion

Followups:
* I need to add tests for backwards (this will be a follow-up PR because backwards will require other work as well)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142098
Approved by: https://github.com/eellison
2024-12-11 21:32:11 +00:00
bd199bc754 [EZ] Move slow job from CU12.1 to CU12.4 (#142856)
I thought it was migrated a while back

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142856
Approved by: https://github.com/huydhn, https://github.com/atalman, https://github.com/ZainRizvi
2024-12-11 21:12:35 +00:00
688f44824b DistributedDataParallel: add init_sync option to control collectives during initialization (#142824)
This controls whether or not we run collectives during the DDP init function, which makes it easier to use fault-tolerant ProcessGroup implementations whose ranks may not all start at the same time.

torchft uses a dummy process group and a comm hook to get around these checks. With this change torchft can use the normal ProcessGroup API via the stock comm hook.

https://github.com/pytorch-labs/torchft/blob/main/torchft/ddp.py#L50-L59
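
A minimal sketch of the intended usage (assuming the new keyword lands as `init_sync` on the DDP constructor, per this PR's title; everything else here is placeholder setup):

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes a process group has already been initialized, e.g. with
# torch.distributed.init_process_group(...), possibly a fault-tolerant implementation.
model = torch.nn.Linear(8, 8)
ddp_model = DDP(
    model,
    init_sync=False,  # skip the verification/broadcast collectives at construction time
)
```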

Test plan:

```
pytest test/distributed/test_c10d_pypg.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142824
Approved by: https://github.com/wconstab, https://github.com/fegin, https://github.com/H-Huang
2024-12-11 20:28:38 +00:00
fd65bd755d [BE] replace incorrect .. note:: invocations (#142868)
Something I've noticed is that a lot of the distributed sites don't render on our docs at all, but if they ever do, the notes will render properly now 😛

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142868
Approved by: https://github.com/albanD
2024-12-11 19:58:18 +00:00
0b96413dbf Upgrade expecttest to 0.3.0 (#142869)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142869
Approved by: https://github.com/albanD, https://github.com/malfet
2024-12-11 19:04:16 +00:00
cyy
e5f08c0cbf [TorchGen] Remove cpp_type_registration_declarations (#142452)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142452
Approved by: https://github.com/ezyang
2024-12-11 19:01:36 +00:00
cyy
e228381846 [TorchGen] Simplify argument_type_str (#142491)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142491
Approved by: https://github.com/ezyang
2024-12-11 19:01:20 +00:00
42d4eec5f3 Don't install lintrunner on S390 (#142876)
Not sure if there are many users of this platform, but hopefully this will fix https://github.com/pytorch/pytorch/issues/142872

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142876
Approved by: https://github.com/jeanschmidt
2024-12-11 18:54:12 +00:00
e647b6d590 Fix undesired specialization on slice after split. (#142372)
Fix: #141251

This PR adds a few static guard checks when decomposing and lowering the `slice` operation so that we avoid adding unnecessary guards, specifically when clamping the end values.

In summary, the changes are:

- `slice` dynamo decomposition: check `end >= sizes[dim]` statically; only if that can't be
  determined statically do we add a guard establishing whether clamping is needed.
- `evaluate_min` inductor `sizevar` function: check whether the comparison can be resolved
  statically before actually creating a new guard.

The latter had to be changed because `evaluate_min` (called by the `ir.SliceView` constructor)
would always try to create a guard based on the hint values. If both the `left` and `right`
hints held, it would default to a `left <= right` guard. By checking statically first, we
avoid that (a sketch of this pattern follows the repro below).

```python
N = 16

@torch.compile(backend="inductor", dynamic=False, fullgraph=True)
def fn(x):
    splits = torch.ops.aten.split.Tensor(x, N)
    first = splits[0]
    return torch.ops.aten.slice.Tensor(first, 0, 0, N)

x = torch.arange(N)
torch._dynamo.mark_dynamic(x, 0)

fn(x)
```
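
For intuition, a minimal sketch of the check-statically-before-guarding pattern (hypothetical helper names, not Inductor's actual API):

```python
# Hedged sketch of the pattern with hypothetical helpers: try to resolve the
# comparison symbolically first, and only guard as a last resort.
def evaluate_min(left, right, statically_known_leq, guard_leq):
    if statically_known_leq(left, right):   # provable without adding a guard
        return left
    if statically_known_leq(right, left):   # provable without adding a guard
        return right
    # Unknown statically: fall back to a hint-based guard.
    return left if guard_leq(left, right) else right
```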

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142372
Approved by: https://github.com/ezyang
2024-12-11 18:52:17 +00:00
0ddb33ba22 [ONNX] Avoid overwriting overlapped decomposed functions (#142831)
Fixes #141770

The decomposed functions in `torch.export.default_decompositions().items()` were being overwritten by `torch._decomp.decomposition_table`. From the `torch.onnx.export()` perspective, we should instead respect the decomposition table from `torch.export.default_decompositions().items()` and avoid overwriting it with `torch._decomp.decomposition_table`.
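
A rough sketch of the intended precedence (a hand-written illustration, not the exact code in this PR): build the combined table so that the export defaults are applied last and therefore never overwritten.

```python
import torch
from torch._decomp import decomposition_table

# Start from the generic ATen decomposition table, then apply the export
# defaults last so they take precedence and are never overwritten.
decomp_table = dict(decomposition_table)
decomp_table.update(dict(torch.export.default_decompositions().items()))
```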
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142831
Approved by: https://github.com/justinchuby
2024-12-11 18:47:40 +00:00
c632e29774 [hop][dynamo] support torch.SymInt inputs (#141524)
Fixes https://github.com/pytorch/pytorch/issues/141305.

```python
        class M(torch.nn.Module):
            def forward(self, x, y, z):
                a = y.shape[0]
                b = z.shape[0]

                def true_fn(x):
                    return x + a

                def false_fn(x):
                    return x + b * z

                # When exporting with non-strict: a and b are symints,
                # so torch.compile needs to wrap and trace symint inputs.
                return torch.cond(x.shape[0] > 5, true_fn, false_fn, (x,))
```

In non-strict export, when inputs are annotated with dynamic shapes, `a` and `b` in the above example are torch.SymInt, so true_fn and false_fn close over values of torch.SymInt type. The error is triggered because dynamo didn't handle SymInt inputs and ended up using a UserDefinedObjectVariable for them, which doesn't have a proxy. We added support by following how SymBool inputs were handled previously.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141524
Approved by: https://github.com/zou3519
ghstack dependencies: #142185
2024-12-11 18:46:58 +00:00
a8fa98ccef skip test dynamo for aot_dispatch tests on ci (#142185)
A lot of tests in test_aotdispatch.py are not meaningful (from a user's perspective) when run with dynamo, so we skip them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/142185
Approved by: https://github.com/zou3519
2024-12-11 18:46:58 +00:00
cyy
24a5a2ef25 [14/N] Fix extra warnings brought by clang-tidy-17 (#141644)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/141644
Approved by: https://github.com/ezyang
2024-12-11 18:40:42 +00:00
be27dbf2b8 Enable CPP/CUDAExtension with py_limited_api for python agnosticism (#138088)
This was getting tested with ao, but now there is a real test I added.

## What does this PR do?

We want to allow custom PyTorch extensions to be able to build one wheel for multiple Python versions, in other words, to achieve python agnosticism. It turns out setuptools/Python already provides such a way! Namely, if the user promises to use only the Python limited API in their extension, they can pass `py_limited_api` to their Extension class and to the bdist_wheel command (with a minimum python version) in order to build one wheel that will suffice across multiple Python versions.

Sounds lovely! Why don't people do that already with PyTorch? Well, two things. This workflow is hardly documented (even searching specifically for python agnostic does not reveal many answers), so I'd expect that people simply don't know about it. But even if they did, _PyTorch_ custom Extensions would still not work, because we always link torch_python, which does not abide by py_limited_api rules.

So this is where this PR comes in! We respect when the user specifies py_limited_api and skip linking torch_python under that condition, allowing users to enroll in the provided functionality I just described.
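
A minimal `setup.py` sketch of the workflow described above (the project name, source file, and minimum version tag are placeholders/assumptions; the point is passing `py_limited_api` through both the Extension and bdist_wheel):

```python
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension

setup(
    name="my_extension",                      # placeholder project name
    ext_modules=[
        CppExtension(
            "my_extension._C",
            ["csrc/extension.cpp"],           # placeholder source file
            py_limited_api=True,              # promise to use only the limited API
        ),
    ],
    cmdclass={"build_ext": BuildExtension},
    # Tag the wheel as abi3 so one artifact covers Python >= 3.9.
    options={"bdist_wheel": {"py_limited_api": "cp39"}},
)
```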

## How do I know this PR works?

I manually tested my silly little ultra_norm locally (with `import python_agnostic`) and wrote a test case for the extension showing that
- torch_python doesn't show up in the ldd tree
- no Py- symbols show up
It may be a little confusing that our test case is actually python-free (which is cleaner than python-agnostic), but it is sufficient (though not necessary) for showing that this change works.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138088
Approved by: https://github.com/ezyang, https://github.com/albanD
2024-12-11 18:22:55 +00:00
fb02b40d27 [BE][accelerator] formalize API name {current,set}_device_{idx => index} (#140542)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/140542
Approved by: https://github.com/guangyey, https://github.com/albanD
2024-12-11 17:57:56 +00:00
cyy
82aaf64422 [3/N] Apply py39 ruff fixes (#142115)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/142115
Approved by: https://github.com/ezyang
2024-12-11 17:50:10 +00:00
619 changed files with 14257 additions and 7694 deletions

View File

@ -9,7 +9,7 @@ install_ubuntu() {
# Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
apt-get install -y cargo
echo "Checking out sccache repo"
git clone https://github.com/mozilla/sccache -b v0.8.2
git clone https://github.com/mozilla/sccache -b v0.9.0
cd sccache
echo "Building sccache"
cargo build --release

View File

@ -30,10 +30,10 @@ dill==0.3.7
#Pinned versions: 0.3.7
#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
expecttest==0.2.1
expecttest==0.3.0
#Description: method for writing tests where test framework auto populates
# the expected output based on previous runs
#Pinned versions: 0.2.1
#Pinned versions: 0.3.0
#test that import:
fbscribelogger==0.1.7
@ -280,9 +280,9 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
#test that import:
#lintrunner is supported on aarch64-linux only from 0.12.4 version
lintrunner==0.12.5
lintrunner==0.12.7
#Description: all about linters!
#Pinned versions: 0.12.5
#Pinned versions: 0.12.7
#test that import:
redis>=4.0.0

View File

@ -247,7 +247,7 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
fi
if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
set -e
set -e -o pipefail
get_bazel

View File

@ -3,7 +3,7 @@
# Common setup for all Jenkins scripts
# shellcheck source=./common_utils.sh
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
set -ex
set -ex -o pipefail
# Required environment variables:
# $BUILD_ENVIRONMENT (should be set by your Docker image)

View File

@ -160,7 +160,7 @@ function install_torchvision() {
}
function install_tlparse() {
pip_install --user "tlparse==0.3.25"
pip_install --user "tlparse==0.3.30"
PATH="$(python -m site --user-base)/bin:$PATH"
}

View File

@ -40,7 +40,7 @@ echo "Building PyTorch C++ API docs..."
rm -rf cppdocs
git clone https://github.com/pytorch/cppdocs
set -ex
set -ex -o pipefail
# Generate ATen files
pushd "${pt_checkout}"

View File

@ -5,7 +5,7 @@ pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex
set -ex -o pipefail
version=${DOCS_VERSION:-nightly}
echo "version: $version"

View File

@ -6,7 +6,7 @@
# return the same thing, ex checks for for rocm, CUDA, and changing the path
# where sccache is installed, and not changing /etc/environment.
set -ex
set -ex -o pipefail
install_binary() {
echo "Downloading sccache binary from S3 repo"

View File

@ -7,7 +7,7 @@ source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "python_doc_push_script.sh: Invoked with $*"
set -ex
set -ex -o pipefail
# for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
# the order of operations goes:
@ -63,7 +63,7 @@ build_docs () {
echo "(tried to echo the WARNINGS above the ==== line)"
echo =========================
fi
set -ex
set -ex -o pipefail
return $code
}

View File

@ -180,7 +180,7 @@ def smoke_test_cuda(
# torch.compile is available on macos-arm64 and Linux for python 3.8-3.13
if (
torch_compile_check == "enabled"
and sys.version_info < (3, 13, 0)
and sys.version_info < (3, 14, 0)
and target_os in ["linux", "linux-aarch64", "macos-arm64", "darwin"]
):
smoke_test_compile("cuda" if torch.cuda.is_available() else "cpu")

View File

@ -4,7 +4,7 @@
# (This is set by default in the Docker images we build, so you don't
# need to set it yourself.
set -ex
set -ex -o pipefail
# Suppress ANSI color escape sequences
export TERM=vt100
@ -313,6 +313,7 @@ test_dynamo_wrapped_shard() {
--exclude-jit-executor \
--exclude-distributed-tests \
--exclude-torch-export-tests \
--exclude-aot-dispatch-tests \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose \
--upload-artifacts-while-running
@ -1243,7 +1244,7 @@ EOF
}
test_bazel() {
set -e
set -e -o pipefail
# bazel test needs sccache setup.
# shellcheck source=./common-build.sh

View File

@ -38,7 +38,7 @@ if [[ $PYLONG_API_CHECK == 0 ]]; then
echo "PyLong_AsUnsignedLong -> THPUtils_unpackUInt32 / THPUtils_unpackUInt64"
exit 1
fi
set -ex
set -ex -o pipefail
"$SCRIPT_HELPERS_DIR"/build_pytorch.bat

View File

@ -1,5 +1,5 @@
#!/bin/bash
set -ex
set -ex -o pipefail
SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
# shellcheck source=./common.sh
@ -41,7 +41,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
python -m pip install z3-solver==4.12.2.0
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
python -m pip install tlparse==0.3.25
python -m pip install tlparse==0.3.30
# Install parameterized
python -m pip install parameterized==0.8.1

View File

@ -173,8 +173,10 @@ conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_p
source activate "$tmp_env_name"
pip install -q "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests
retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq llvm-openmp=14.0.6 cmake ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions
retry pip install -qr "${pytorch_rootdir}/requirements.txt" || true
# TODO : Remove me later (but in the interim, use Anaconda cmake, to find Anaconda installed OpenMP)
retry pip uninstall -y cmake
retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq llvm-openmp=14.0.6 cmake ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions
# For USE_DISTRIBUTED=1 on macOS, need libuv and pkg-config to find libuv.
export USE_DISTRIBUTED=1

View File

@ -75,9 +75,8 @@ export PYTORCH_BUILD_NUMBER=1
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# Only linux Python < 3.13 are supported wheels for triton
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)

View File

@ -1,6 +1,6 @@
boto3==1.35.42
hypothesis==6.56.4
expecttest==0.2.1
expecttest==0.3.0
fbscribelogger==0.1.7
librosa>=0.6.2
mpmath==1.3.0

View File

@ -19,7 +19,7 @@ fi
# if lintrunner is not installed, install it
if ! command -v lintrunner &> /dev/null; then
python3 -m pip install lintrunner==0.12.5
python3 -m pip install lintrunner==0.12.7
fi
# This has already been cached in the docker image

View File

@ -206,21 +206,6 @@ jobs:
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder to builder dir
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: Check if the job is disabled
id: filter
uses: ./pytorch/.github/actions/filter-test-configs
@ -246,7 +231,6 @@ jobs:
mkdir -p artifacts/
container_name=$(docker run \
-e BINARY_ENV_FILE \
-e BUILDER_ROOT \
-e BUILD_ENVIRONMENT \
-e DESIRED_CUDA \
-e DESIRED_DEVTOOLSET \
@ -264,7 +248,6 @@ jobs:
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
-v "${GITHUB_WORKSPACE}/builder:/builder" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w / \
"${DOCKER_IMAGE}"
@ -272,10 +255,8 @@ jobs:
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
elif [[ ${{ inputs.PACKAGE_TYPE }} == "manywheel" || ${{ inputs.PACKAGE_TYPE }} == "libtorch" ]]; then
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
else
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/${{ inputs.PACKAGE_TYPE }}/build.sh"
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
fi
- name: Chown artifacts

View File

@ -87,7 +87,7 @@ jobs:
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2.4"]
rocm_version: ["6.2.4", "6.3"]
env:
GPU_ARCH_TYPE: rocm
GPU_ARCH_VERSION: ${{ matrix.rocm_version }}

View File

@ -178,7 +178,7 @@ jobs:
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2.4"]
rocm_version: ["6.2.4", "6.3"]
env:
GPU_ARCH_TYPE: rocm-manylinux_2_28
GPU_ARCH_VERSION: ${{ matrix.rocm_version }}

View File

@ -44,7 +44,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
device: ["cuda", "rocm", "xpu"]
docker-image: ["pytorch/manylinux-builder:cpu", "pytorch/manylinux2_28-builder:cpu"]
exclude:
@ -114,6 +114,9 @@ jobs:
3.13)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python
;;
3.13t)
PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
;;
*)
echo "Unsupported python version ${PY_VERS}"
exit 1

View File

@ -165,7 +165,7 @@ jobs:
validate:
needs: build
uses: pytorch/builder/.github/workflows/validate-docker-images.yml@main
uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main
with:
channel: nightly
ref: main

View File

@ -207,8 +207,8 @@ jobs:
conda activate "${CONDA_ENV}"
# Test tools
PYTHONPATH=$(pwd) pytest tools/test/test_*.py
PYTHONPATH=$(pwd) pytest .github/scripts/test_*.py
PYTHONPATH=$(pwd) pytest tools/test -o "python_files=test*.py"
PYTHONPATH=$(pwd) pytest .github/scripts -o "python_files=test*.py"
test_run_test:
name: Test `run_test.py` is usable without boto3
@ -229,7 +229,7 @@ jobs:
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.3.* fbscribelogger==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)
run: |

View File

@ -41,6 +41,9 @@ jobs:
{ config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
]}
secrets: inherit

View File

@ -214,73 +214,6 @@ jobs:
test-matrix: ${{ needs.linux-focal-py3_9-clang10-build.outputs.test-matrix }}
secrets: inherit
linux-focal-py3_11-clang10-build:
name: linux-focal-py3.11-clang10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-py3.11-clang10
docker-image-name: pytorch-linux-focal-py3.11-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-focal-py3_11-clang10-test:
name: linux-focal-py3.11-clang10
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-py3_11-clang10-build
- target-determination
with:
build-environment: linux-focal-py3.11-clang10
docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }}
secrets: inherit
linux-focal-py3_12-clang10-build:
name: linux-focal-py3.12-clang10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-py3.12-clang10
docker-image-name: pytorch-linux-focal-py3.12-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-focal-py3_12-clang10-test:
name: linux-focal-py3.12-clang10
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-py3_12-clang10-build
with:
build-environment: linux-focal-py3.12-clang10
docker-image: ${{ needs.linux-focal-py3_12-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_12-clang10-build.outputs.test-matrix }}
timeout-minutes: 600
secrets: inherit
linux-focal-py3_13-clang10-build:
name: linux-focal-py3.13-clang10
uses: ./.github/workflows/_linux-build.yml
@ -296,6 +229,8 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },

View File

@ -47,14 +47,14 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3_10-gcc9-sm86-build:
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -64,16 +64,16 @@ jobs:
]}
secrets: inherit
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
linux-focal-cuda12_4-py3_10-gcc9-sm86-test:
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_1-py3_10-gcc9-sm86-build
- linux-focal-cuda12_4-py3_10-gcc9-sm86-build
- target-determination
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.test-matrix }}
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.test-matrix }}
secrets: inherit
linux-focal-py3_9-clang10-build:

.github/workflows/test-check-binary.yml (new file, 40 lines)
View File

@ -0,0 +1,40 @@
name: Test check_binary
on:
pull_request:
paths:
- .github/workflows/test-check-binary.yml
- .ci/pytorch/check_binary.sh
- .ci/pytorch//smoke_test/smoke_test.py
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
check_binary_linux_cpu:
if: github.repository_owner == 'pytorch'
name: Test check_binary.sh for Linux CPU
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |
pushd .ci/pytorch/
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu PACKAGE_TYPE=manywheel ./check_binary.sh
popd
check_binary_linux_cuda:
if: github.repository_owner == 'pytorch'
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.4xlarge.nvidia.gpu
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
script: |
pushd .ci/pytorch/
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124
DESIRED_PYTHON=3.11 DESIRED_CUDA=cu124 PACKAGE_TYPE=manywheel ./check_binary.sh
popd

View File

@ -45,43 +45,12 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
libtorch-linux-focal-cuda12_1-py3_7-gcc9-debug-build:
name: libtorch-linux-focal-cuda12.1-py3.7-gcc9-debug
libtorch-linux-focal-cuda12_4-py3_10-gcc9-debug-build:
name: libtorch-linux-focal-cuda12.4-py3.10-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: libtorch-linux-focal-cuda12.1-py3.7-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
build-generates-artifacts: false
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.4xlarge"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
# no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
linux-focal-cuda12_1-py3_10-gcc9-no-ops-build:
name: linux-focal-cuda12.1-py3.10-gcc9-no-ops
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-no-ops
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
secrets: inherit
libtorch-linux-focal-cuda12_4-py3_7-gcc9-debug-build:
name: libtorch-linux-focal-cuda12.4-py3.7-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: libtorch-linux-focal-cuda12.4-py3.7-gcc9
build-environment: libtorch-linux-focal-cuda12.4-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
build-generates-artifacts: false
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"

View File

@ -73,6 +73,8 @@ include_patterns = [
'aten/src/ATen/native/cudnn/*.cpp',
'aten/src/ATen/native/mkldnn/xpu/**/*.h',
'aten/src/ATen/native/mkldnn/xpu/**/*.cpp',
'aten/src/ATen/native/Tensor*.h',
'aten/src/ATen/native/Tensor*.cpp',
'c10/**/*.h',
'c10/**/*.cpp',
'torch/csrc/**/*.h',
@ -143,7 +145,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.2.1',
'expecttest==0.3.0',
'mypy==1.13.0',
'sympy==1.13.0 ; python_version >= "3.9"',
'types-requests==2.27.25',
@ -1711,7 +1713,7 @@ command = [
'@{{PATHSFILE}}'
]
include_patterns = [
'torch/**/does-not-exist.py'
"torch/_inductor/**/*.py",
]
is_formatter = true

View File

@ -997,8 +997,6 @@ if(NOT MSVC)
append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-missing-field-initializers"
CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
@ -1076,7 +1074,6 @@ if(NOT MSVC)
set(WERROR FALSE)
endif()
endif()
append_cxx_flag_if_supported("-Wno-unused-but-set-variable" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-fstandalone-debug" CMAKE_CXX_FLAGS_DEBUG)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
@ -1093,6 +1090,7 @@ if(NOT MSVC)
append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-error=dangling-reference" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS)
endif()

View File

@ -1,8 +1,6 @@
#include <ATen/Context.h>
#include <ATen/DeviceAccelerator.h>
#include <c10/core/impl/VirtualGuardImpl.h>
namespace at::accelerator {
namespace at {
std::optional<c10::DeviceType> getAccelerator(bool checked) {
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
@ -39,8 +37,8 @@ std::optional<c10::DeviceType> getAccelerator(bool checked) {
#undef DETECT_AND_ASSIGN_ACCELERATOR
}
bool isAccelerator(c10::DeviceType device_type) {
switch (device_type) {
bool isAccelerator(c10::DeviceType d) {
switch (d) {
case at::kCUDA:
case at::kMTIA:
case at::kXPU:
@ -54,50 +52,4 @@ bool isAccelerator(c10::DeviceType device_type) {
}
}
c10::DeviceIndex deviceCount() {
const auto device_type = getAccelerator(false);
if (!device_type.has_value()) {
return static_cast<c10::DeviceIndex>(0);
}
c10::impl::VirtualGuardImpl impl(device_type.value());
return static_cast<c10::DeviceIndex>(impl.deviceCount());
}
void setDeviceIndex(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
c10::impl::VirtualGuardImpl impl(device_type);
impl.setDevice({device_type, device_index});
}
c10::DeviceIndex getDeviceIndex() {
const auto device_type = getAccelerator(true).value();
c10::impl::VirtualGuardImpl impl(device_type);
return static_cast<c10::DeviceIndex>(impl.getDevice().index());
}
void setCurrentStream(c10::Stream stream) {
const auto device_type = getAccelerator(true).value();
TORCH_CHECK(
device_type == stream.device_type(),
"stream's device type ",
c10::DeviceTypeName(stream.device_type()),
" doesn't match the current accelerator ",
c10::DeviceTypeName(device_type));
c10::impl::VirtualGuardImpl impl(device_type);
impl.exchangeStream(stream);
}
c10::Stream getCurrentStream(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
c10::impl::VirtualGuardImpl impl(device_type);
return impl.getStream({device_type, device_index});
}
void synchronizeDevice(c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
c10::impl::VirtualGuardImpl impl(device_type);
// impl.synchronizeDevice should can be safely called from any device
impl.synchronizeDevice(device_index);
}
} // namespace at::accelerator
} // namespace at

View File

@ -6,8 +6,6 @@
#include <ATen/detail/MTIAHooksInterface.h>
#include <optional>
namespace at::accelerator {
// This file defines the top level Accelerator concept for PyTorch.
// A device is an accelerator per the definition here if:
// - It is mutually exclusive with all other accelerators
@ -17,39 +15,13 @@ namespace at::accelerator {
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
namespace at {
// Ensures that only one accelerator is available (at
// compile time if possible) and return it.
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
// Check if the given device type is an accelerator.
TORCH_API bool isAccelerator(c10::DeviceType device_type);
TORCH_API bool isAccelerator(c10::DeviceType d);
// Return the number of the device available. Note that this is *REQUIRED* to
// not raise any exception.
TORCH_API c10::DeviceIndex deviceCount();
// Set the current device index to the given device index.
TORCH_API void setDeviceIndex(c10::DeviceIndex device_index);
// Get the current device index.
TORCH_API c10::DeviceIndex getDeviceIndex();
// Set the current stream to a given stream. Note that this API doesn't change
// the current device index.
TORCH_API void setCurrentStream(c10::Stream stream);
// Get the current stream of the given device index.
TORCH_API c10::Stream getCurrentStream(c10::DeviceIndex device_index);
// Wait (by blocking the calling thread) until all the work previously enqueued
// on the given device index has been completed.
TORCH_API void synchronizeDevice(c10::DeviceIndex device_index);
} // namespace at::accelerator
namespace at {
// Keep BC only
using at::accelerator::getAccelerator;
using at::accelerator::isAccelerator;
} // namespace at

View File

@ -92,8 +92,8 @@ class MatrixRef {
/// The declaration here is extra complicated so that "arrayRef = {}"
/// continues to select the move assignment operator.
template <typename U>
// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
std::enable_if_t<std::is_same_v<U, T>, MatrixRef<T>>& operator=(
// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
U&& Temporary) = delete;
/// Disallow accidental assignment from a temporary.

View File

@ -1129,6 +1129,7 @@ TEST(ListTest, canAccessOptionalStringByReference) {
EXPECT_EQ("two", str1);
EXPECT_FALSE(str2.has_value());
EXPECT_TRUE(strRef1.has_value());
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
EXPECT_EQ("two", strRef1.value().get());
EXPECT_FALSE(strRef2.has_value());
}

View File

@ -66,7 +66,7 @@ struct TORCH_API EnumType : public NamedType {
}
const QualifiedName& qualifiedClassName() const {
// NOLINTLEXTLINE(bugprone-unchecked-optional-access)
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
return name().value();
}

View File

@ -95,7 +95,7 @@ struct TORCH_API Argument {
const TypePtr& real_type() const {
return real_type_;
}
std::optional<int32_t> N() const {
const std::optional<int32_t>& N() const {
return N_;
}
const std::optional<IValue>& default_value() const {
@ -651,11 +651,11 @@ template<>
hash = c10::hash_combine(hash, type_hash);
hash = c10::hash_combine(hash, kwarg_only_hash);
// hashing optional fields if they exist
if (arg.default_value()) {
auto default_value_hash = c10::hash<c10::IValue>{}(arg.default_value().value());
if (arg.default_value().has_value()) {
auto default_value_hash = c10::hash<c10::IValue>{}(*arg.default_value());
hash = c10::hash_combine(hash, default_value_hash);
}
if (arg.N()) {
if (arg.N().has_value()) {
auto N_hash = std::hash<int64_t>{}(*arg.N());
hash = c10::hash_combine(hash, N_hash);
}

View File

@ -1546,11 +1546,11 @@ struct WeakOrStrongCompilationUnit {
}
bool holdingStrongRef() const {
return strong_ptr_ != std::nullopt;
return strong_ptr_.has_value();
}
bool holdingEmptyStrongRef() const {
return holdingStrongRef() && *strong_ptr_ == nullptr;
return strong_ptr_ == nullptr;
}
std::optional<std::shared_ptr<torch::jit::CompilationUnit>> strong_ptr_;

View File

@ -625,13 +625,13 @@ struct TORCH_API TensorType : public SharedType {
return strides_;
}
std::optional<at::Device> device() const {
const std::optional<at::Device>& device() const {
return device_;
}
std::optional<at::ScalarType> scalarType() const {
const std::optional<at::ScalarType>& scalarType() const {
return scalar_type_;
}
std::optional<bool> requiresGrad() const {
const std::optional<bool>& requiresGrad() const {
return requires_grad_;
}
bool requires_grad() const override {
@ -656,7 +656,7 @@ struct TORCH_API TensorType : public SharedType {
const auto& shape = sizes();
for (size_t i = 0; i < shape.size(); i++) {
if (!shape[i]) {
if (!shape[i].has_value()) {
return std::optional<size_t>{};
}
prod *= shape[i].value();

View File

@ -292,7 +292,7 @@ TensorTypePtr TensorType::create(
scalar_type, device, symbol_sizes, sprops, requires_grad, undefined);
} else {
// strides are all null, but still have number of strides equal to number of ranks
TORCH_INTERNAL_ASSERT(sizes.sizes() && sizes.size());
TORCH_INTERNAL_ASSERT(sizes.sizes().has_value() && sizes.size());
auto symbol_sizes = SymbolicShape(*sizes.sizes());
return TensorType::create(
scalar_type, device, symbol_sizes, VaryingShape<Stride>(*sizes.size()), requires_grad, undefined);

View File

@ -61,8 +61,8 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
} else {
out << "Tensor";
}
if (auto ndim = value->sizes().size()) {
bool has_valid_strides_info = *ndim > 0 &&
if (auto ndim = value->sizes().size(); ndim.has_value()) {
bool has_valid_strides_info = ndim > 0 &&
value->strides().isComplete() && value->strides().size() == ndim;
out << "(";
@ -87,7 +87,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
if (i > 0) {
out << ", ";
}
out << *value->strides()[i];
out << value->strides()[i].value();
}
out << "]";
}
@ -903,7 +903,7 @@ bool ListType::isSubtypeOfExt(const Type& rhs_, std::ostream* why_not) const {
std::string TupleType::str() const {
std::stringstream ss;
if (schema_ && name()) {
if (schema_ && name().has_value()) {
ss << name()->qualifiedName();
} else {
ss << "(";

View File

@ -106,6 +106,7 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error)
namespace {
static cublasOperation_t _cublasOpFromChar(char op) {
// NOLINTNEXTLINE(bugprone-switch-missing-default-case)
switch (op) {
case 'n':
case 'N':

View File

@ -466,6 +466,6 @@ void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
using at::CUDAHooksRegistry;
using at::RegistererCUDAHooksRegistry;
REGISTER_CUDA_HOOKS(CUDAHooks);
REGISTER_CUDA_HOOKS(CUDAHooks)
} // namespace at::cuda::detail

View File

@ -127,8 +127,8 @@ RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) {
#define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2)
#define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3)
NVRTC_STUB2(nvrtcVersion, int*, int*);
NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const);
NVRTC_STUB2(nvrtcVersion, int*, int*)
NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const)
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
const char *src,
@ -143,32 +143,32 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
return fn(prog, src, name, numHeaders, headers, includeNames);
}
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *);
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *);
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *);
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *);
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
#endif
NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *);
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult);
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*);
NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *);
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **);
NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)
CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *);
CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *);
CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t);
CUDA_STUB2(cuGetErrorString, CUresult, const char **);
CUDA_STUB1(cuCtxGetCurrent, CUcontext *);
CUDA_STUB1(cuCtxSetCurrent, CUcontext);
CUDA_STUB1(cuModuleUnload, CUmodule);
CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *);
CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice);
CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *);
CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);
CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction);
CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
CUDA_STUB2(cuGetErrorString, CUresult, const char **)
CUDA_STUB1(cuCtxGetCurrent, CUcontext *)
CUDA_STUB1(cuCtxSetCurrent, CUcontext)
CUDA_STUB1(cuModuleUnload, CUmodule)
CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *)
CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice)
CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
CUresult CUDAAPI

View File

@ -13,6 +13,7 @@
#include <ATen/cuda/tunable/Tunable.h>
#include <c10/util/Exception.h>
#include <c10/util/StringUtil.h>
#include <c10/util/env.h>
#include <torch/version.h>
#ifndef _WIN32
@ -435,8 +436,8 @@ void TuningContext::EnableTunableOp(bool value) {
}
bool TuningContext::IsTunableOpEnabled() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_ENABLED");
if (env != nullptr && strcmp(env, "1") == 0) {
static const bool eval = c10::utils::get_env("PYTORCH_TUNABLEOP_ENABLED") == "1";
if (eval) {
return true;
}
return enable_;
@ -462,16 +463,16 @@ void TuningContext::EnableRecordUntuned(bool value) {
}
bool TuningContext::IsTuningEnabled() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_TUNING");
if (env != nullptr && strcmp(env, "0") == 0) {
static const bool eval = c10::utils::get_env("PYTORCH_TUNABLEOP_TUNING") == "0";
if (eval) {
return false;
}
return tuning_enable_;
}
bool TuningContext::IsRecordUntunedEnabled() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_RECORD_UNTUNED");
if (env != nullptr && strcmp(env, "1") == 0) {
static const bool eval = c10::utils::get_env("PYTORCH_TUNABLEOP_RECORD_UNTUNED") == "1";
if (eval) {
return true;
}
return record_untuned_enable_;
@ -479,8 +480,8 @@ bool TuningContext::IsRecordUntunedEnabled() const {
std::ofstream& TuningContext::GetUntunedFile(){
if (!untuned_file_.is_open()) {
const char *env = std::getenv("PYTORCH_TUNABLEOP_UNTUNED_FILENAME");
std::string filename = (env == nullptr) ? "tunableop_untuned.csv" : env;
const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_UNTUNED_FILENAME");
std::string filename = (!env.has_value()) ? "tunableop_untuned.csv" : env.value();
std::string device = c10::str(int(c10::cuda::current_device()));
std::size_t found = filename.rfind('.');
@ -517,9 +518,9 @@ void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
}
int TuningContext::GetMaxTuningDurationMs() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS");
if (env != nullptr) {
int val = atoi(env);
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS");
if (env.has_value()) {
int val = stoi(env.value());
return val < 0 ? 0 : val;
}
return max_tuning_duration_ms_;
@ -530,9 +531,9 @@ void TuningContext::SetMaxTuningIterations(int max_iter) {
}
int TuningContext::GetMaxTuningIterations() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS");
if (env != nullptr) {
int val = atoi(env);
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS");
if (env.has_value()) {
int val = stoi(env.value());
return val < 0 ? 0 : val;
}
return max_tuning_iterations_;
@ -543,9 +544,9 @@ void TuningContext::SetMaxWarmupDurationMs(int max_duration_ms) {
}
int TuningContext::GetMaxWarmupDurationMs() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS");
if (env != nullptr) {
int val = atoi(env);
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS");
if (env.has_value()) {
int val = stoi(env.value());
return val < 0 ? 0 : val;
}
return max_warmup_duration_ms_;
@ -556,9 +557,9 @@ void TuningContext::SetMaxWarmupIterations(int max_iter) {
}
int TuningContext::GetMaxWarmupIterations() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS");
if (env != nullptr) {
int val = atoi(env);
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS");
if (env.has_value()) {
int val = stoi(env.value());
return val < 0 ? 0 : val;
}
return max_warmup_iterations_;
@ -569,8 +570,8 @@ void TuningContext::EnableICacheFlush(bool value) {
}
bool TuningContext::IsICacheFlushEnabled() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED");
if (env != nullptr && strcmp(env, "0") == 0) {
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED");
if (env == "0") {
return false;
}
return icache_flush_;
@ -581,10 +582,10 @@ void TuningContext::SetRotatingBufferSize(int size) {
}
int TuningContext::GetRotatingBufferSize() const {
static const char *env = std::getenv("PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE");
if (env != nullptr) {
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE");
if (env.has_value()) {
constexpr int MB = 1024 * 1024;
int val = atoi(env);
int val = stoi(env.value());
return val < 0 ? 0 : val * MB; // env var is specified as MB, returned as bytes
}
else {
@ -604,8 +605,8 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() {
manager_initialized_ = true;
if (GetFilename().empty()) {
// if SetFilename() was not already called, call it now with the default or env var
const char *env = std::getenv("PYTORCH_TUNABLEOP_FILENAME");
std::string filename = (env == nullptr) ? "tunableop_results.csv" : env;
const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_FILENAME");
std::string filename = (!env.has_value()) ? "tunableop_results.csv" : env.value();
SetFilename(filename, true);
}
auto filename = GetFilename();

View File

@ -42,8 +42,9 @@ static Tensor materializeGradWrappers(const Tensor& tensor, int64_t current_leve
if (!wrapper) {
return makeTensorWrapper(tensor, current_level, /*is_immutable=*/true);
}
TORCH_INTERNAL_ASSERT(wrapper->level().value() <= current_level, "escaped?");
if (wrapper->level() == current_level) {
auto level = wrapper->level();
TORCH_INTERNAL_ASSERT(level.has_value() && level <= current_level, "escaped?");
if (level == current_level) {
TORCH_INTERNAL_ASSERT(tensor.defined());
return tensor;
}

View File

@ -54,6 +54,8 @@ struct BinaryRandomPointwiseBatchRuleHelper<F, Func, typelist<T1, T2, T...>> {
static Tensor apply(const Tensor& tensor, const Tensor& other, T... extra_args) {
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
auto maybe_layer = maybeCurrentDynamicLayer();
TORCH_INTERNAL_ASSERT(maybe_layer.has_value())
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto cur_level = maybe_layer->layerId();
RandomnessType randomness = maybe_layer->randomness();

View File

@ -19,6 +19,7 @@ struct NewBlahBatchRuleHelperSymInt<F, Func, typelist<A, B, T...>> {
std::optional<int64_t> batch_dim,
SymIntArrayRef shape,
T... extra_args) {
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
const auto bdim_size = tensor.sym_size(batch_dim.value());
c10::SmallVector<c10::SymInt> new_shape;
new_shape.reserve(shape.size() + 1);

View File

@ -9,7 +9,7 @@
namespace at::functorch {
Tensor moveBatchDimToFront(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim) {
Tensor moveBatchDimToFront(Tensor tensor, std::optional<int64_t> maybe_batch_dim) {
if (!maybe_batch_dim.has_value()) {
return tensor;
}

View File

@ -30,7 +30,7 @@ TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x);
TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x);
Tensor moveBatchDimToFront(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
Tensor moveBatchDimToFront(Tensor tensor, std::optional<int64_t> maybe_batch_dim);
int64_t rankWithoutBatchDim(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
int64_t numelWithoutBatchDim(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
std::optional<int64_t> valIfNonempty(std::optional<int64_t> maybe_empty, int64_t new_val);
@ -243,9 +243,8 @@ inline void boxed_existing_bdim_all_batch_rule(
const auto num_arguments = static_cast<int64_t>(schema.arguments().size());
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
auto maybe_layer = maybeCurrentDynamicLayer();
const auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "boxed_existing_bdim_all_batch_rule");
int64_t cur_level = maybe_layer->layerId();
const auto arguments = torch::jit::last(stack, num_arguments);
if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) {
@ -257,6 +256,8 @@ inline void boxed_existing_bdim_all_batch_rule(
SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
SmallVector<int64_t, 5> tensor_pos;
int64_t batch_size = 0;
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
find_and_unpack_tensors(
stack, num_arguments, cur_level,

View File

@ -492,6 +492,7 @@ _scaled_dot_product_flash_attention_batch_rule(
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
@ -543,6 +544,7 @@ fourOutputs _scaled_dot_product_efficient_attention_batch_rule(
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}
@ -585,6 +587,7 @@ _scaled_dot_product_cudnn_attention_batch_rule(
) {
if (dropout_p > 0) {
auto maybe_layer = maybeCurrentDynamicLayer();
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
}

View File

@ -90,6 +90,7 @@ static Tensor binary_cross_entropy_plumbing(
const std::optional<Tensor>& weight, int64_t reduction) {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "binary_cross_entropy_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(target, cur_level)
@ -126,6 +127,7 @@ static Tensor binary_cross_entropy_backward_plumbing(
const std::optional<Tensor>& weight_opt, int64_t reduction) {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "binary_cross_entropy_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({grad, input, target, weight_opt}, cur_level)) {

View File

@ -57,7 +57,7 @@ embedding_dense_backward_batch_rule(
c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq) {
Tensor grad = grad_;
Tensor indices = indices_;
if (!indices_bdim && grad_bdim) {
if (!indices_bdim.has_value() && grad_bdim) {
const auto bdim_size = grad.sym_size(*grad_bdim);
grad = reshape_dim_into(*grad_bdim, -1, grad);
auto result = at::embedding_dense_backward_symint(
@ -162,12 +162,12 @@ grid_sample_backward_helper_in(
static std::tuple<Tensor, std::optional<int64_t>, Tensor, std::optional<int64_t>>
grid_sample_backward_helper_out(
std::tuple<Tensor, Tensor> bw_out,
std::optional<int64_t> grad_input_out_bdim,
std::optional<int64_t> grad_grid_out_bdim,
int64_t grad_input_out_bdim,
int64_t grad_grid_out_bdim,
int64_t bdim_size) {
auto& [grad_input, grad_grid] = bw_out;
grad_input = reshape_dim_outof(*grad_input_out_bdim, bdim_size, grad_input);
grad_grid = reshape_dim_outof(*grad_grid_out_bdim, bdim_size, grad_grid);
grad_input = reshape_dim_outof(grad_input_out_bdim, bdim_size, grad_input);
grad_grid = reshape_dim_outof(grad_grid_out_bdim, bdim_size, grad_grid);
return std::make_tuple(std::move(grad_input), grad_input_out_bdim, std::move(grad_grid), grad_grid_out_bdim);
}

View File

@ -218,6 +218,8 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> batch_norm_backward_plumbing(
c10::MaybeOwned<Tensor> running_var_maybe_owned = at::borrow_from_optional_tensor(running_var_opt);
const Tensor& running_var = *running_var_maybe_owned;
// NB: not sure why these are optional...these are required from the forward
TORCH_INTERNAL_ASSERT(save_mean_opt.has_value());
TORCH_INTERNAL_ASSERT(save_rstd_opt.has_value());
const Tensor& save_mean = *save_mean_opt;
const Tensor& save_rstd = *save_rstd_opt;
TORCH_INTERNAL_ASSERT(save_mean.defined());
@ -226,6 +228,7 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> batch_norm_backward_plumbing(
// plumbing
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "batch_norm_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
auto [grad_out_value, grad_out_bdim] = unwrapTensorAtLevel(grad_out, cur_level);
@ -298,6 +301,7 @@ static std::tuple<Tensor,Tensor,Tensor> native_group_norm_plumbing(
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "native_group_norm_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({input, weight_opt, bias_opt}, cur_level)) {
@ -380,6 +384,7 @@ static std::tuple<Tensor,Tensor,Tensor> native_group_norm_backward_plumbing(
// plumbing
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "native_group_norm_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({grad_out, input, mean, rstd, weight_opt}, cur_level)) {
@ -579,6 +584,7 @@ static std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_p
// plumbing
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "native_layer_norm_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({grad_out, input, mean, rstd, weight_opt, bias_opt}, cur_level)) {
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
@ -721,6 +727,7 @@ struct NativeBatchNormBackwardBatchRuleHelper {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "NativeBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({grad_out, input, weight_opt, running_mean_opt,
@ -751,6 +758,7 @@ struct CudnnBatchNormBackwardBatchRuleHelper {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "CudnnBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
@ -779,6 +787,7 @@ struct MiopenBatchNormBackwardBatchRuleHelper {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "MiopenBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,

View File

@ -28,8 +28,10 @@ max_pool_with_indices_batch_rule_helper(
return std::make_tuple(std::move(std::get<0>(result)), 0, std::move(std::get<1>(result)), 0);
}
// Tensor[B, N, logical_rank...] -> Tensor[B * N, logical_rank...]
auto bdim_size = self.size(*self_bdim);
auto self_ = reshape_dim_into(*self_bdim, 0, self);
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto bdim_size = self.size(self_bdim.value());
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto self_ = reshape_dim_into(self_bdim.value(), 0, self);
auto result = pooling_fn(
self_, kernel_size, stride, padding, dilation, ceil_mode);
return std::make_tuple(

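The hunk above also swaps *self_bdim for self_bdim.value(): on an empty optional, operator* is undefined behaviour while value() throws std::bad_optional_access, so the invariant now fails loudly at runtime even though the NOLINT still silences the static analyzer. A small demonstration with plain std::optional (not the batch-rule types):

#include <cstdio>
#include <optional>

int main() {
  std::optional<int> empty;
  try {
    (void)empty.value();  // throws std::bad_optional_access on an empty optional
  } catch (const std::bad_optional_access&) {
    std::puts("value() threw as expected");
  }
  // (void)*empty;  // would be undefined behaviour rather than an exception
  return 0;
}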
View File

@ -25,6 +25,7 @@ Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) {
c10::SmallVector<SymInt> shapeVec(1, maybe_layer->batchSize());
shapeVec.reserve(shape.size() + 1);
shapeVec.insert(shapeVec.end(), shape.begin(), shape.end());
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness);
if (randomness == RandomnessType::Different) {
@ -38,9 +39,11 @@ template <typename F, F Func, typename... ExtraArgs>
Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) {
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
auto maybe_layer = maybeCurrentDynamicLayer();
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
const auto cur_level = maybe_layer->layerId();
auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level);
self_value = moveBatchDimToFront(self_value, self_bdim);
self_value = moveBatchDimToFront(std::move(self_value), self_bdim);
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness);
TORCH_CHECK(

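The functional part of this hunk is passing self_value through std::move so moveBatchDimToFront consumes the handle instead of copying it (for at::Tensor that copy is presumably just a refcount bump, so this is a small saving when the value is reassigned from the result anyway). The general idiom with a toy type:

#include <string>
#include <utility>

// By-value parameter: callers choose copy or move at the call site.
std::string take(std::string s) {
  return s;
}

int main() {
  std::string value = "batched";
  value = take(std::move(value));  // moves into the parameter, no deep copy
  return value.empty() ? 1 : 0;
}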
View File

@ -1732,11 +1732,10 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const std::option
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> ggI_maybe_owned = at::borrow_from_optional_tensor(ggI_opt);
const Tensor& ggI = *ggI_maybe_owned;
const Tensor& ggW_r = ggW_r_opt.value_or(Tensor());
Tensor ggW = ggW_r_opt.value_or(Tensor());
const Tensor& ggb = ggb_opt.value_or(Tensor());
auto ggW = ggW_r;
auto gO = gO_r;
auto weight = weight_r;

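The _convolution_double_backward tweak appears to rely on value_or returning by value: the old code bound a const Tensor& to that temporary (lifetime-extended) and then copied it into a mutable ggW anyway, so initializing a plain Tensor directly drops the extra step. The same shape with std::string standing in for Tensor:

#include <optional>
#include <string>

int main() {
  std::optional<std::string> maybe_weight;
  // value_or returns a prvalue; initialize the object directly instead of
  // binding a const reference to the temporary and copying it again later.
  std::string weight = maybe_weight.value_or(std::string("fallback"));
  return weight == "fallback" ? 0 : 1;
}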
View File

@ -251,20 +251,12 @@ Tensor kl_div(const Tensor& input, const Tensor& target, int64_t reduction, bool
}
Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor loss = at::empty_like(input);
return at::native::binary_cross_entropy_out_cpu(
input, target, weight, reduction, loss);
input, target, weight_opt, reduction, loss);
}
Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& loss) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor loss_squeezed = at::squeeze(loss);
auto iter = TensorIteratorConfig()
@ -297,8 +289,8 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target,
});
});
if (weight.defined()) {
loss.mul_(weight);
if (weight_opt.has_value() && weight_opt->defined()) {
loss.mul_(*weight_opt);
}
if (reduction != at::Reduction::None) {
Tensor loss_reduced = apply_loss_reduction(loss, reduction);
@ -308,20 +300,12 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target,
}
Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor grad_input = at::empty_like(input);
return at::native::binary_cross_entropy_backward_out_cpu(
grad, input, target, weight, reduction, grad_input);
grad, input, target, weight_opt, reduction, grad_input);
}
Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& grad_input) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor grad_input_squeezed = at::squeeze(grad_input);
auto iter = TensorIteratorConfig()
@ -350,8 +334,8 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor&
});
});
if (weight.defined()) {
grad_input.mul_(weight);
if (weight_opt.has_value() && weight_opt->defined()) {
grad_input.mul_(*weight_opt);
}
if (reduction == at::Reduction::Mean) {
grad_input.div_(input.numel());
@ -360,23 +344,17 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor&
}
Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, const std::optional<Tensor>& pos_weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
c10::MaybeOwned<Tensor> pos_weight_maybe_owned = at::borrow_from_optional_tensor(pos_weight_opt);
const Tensor& pos_weight = *pos_weight_maybe_owned;
auto log_sigmoid_input = at::log_sigmoid(input);
if (pos_weight.defined()) {
if (pos_weight_opt.has_value() && pos_weight_opt->defined()) {
// pos_weight need to be broadcasted, thus mul(target) is not inplace.
auto log_weight = (pos_weight - 1).mul(target).add_(1);
auto log_weight = (*pos_weight_opt - 1).mul(target).add_(1);
log_sigmoid_input.mul_(log_weight);
}
Tensor loss = (1 - target).mul_(input).sub_(log_sigmoid_input);
if (weight.defined()) {
loss.mul_(weight);
if (weight_opt.has_value() && weight_opt->defined()) {
loss.mul_(*weight_opt);
}
return apply_loss_reduction(loss, reduction);

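These loss-function changes follow one pattern: the std::optional<Tensor> argument is forwarded as-is and only dereferenced behind a has_value() && defined() guard, rather than being materialized up front via borrow_from_optional_tensor. A reduced sketch of that guard with a stub Tensor type (simplified, not the actual ATen signatures):

#include <optional>

struct Tensor {
  bool is_defined = true;
  bool defined() const { return is_defined; }
  void mul_(const Tensor&) {}
};

void apply_weight(Tensor& loss, const std::optional<Tensor>& weight_opt) {
  // Dereference only when a defined tensor was actually supplied.
  if (weight_opt.has_value() && weight_opt->defined()) {
    loss.mul_(*weight_opt);
  }
}

int main() {
  Tensor loss;
  std::optional<Tensor> no_weight;    // no weight supplied: guard skips the multiply
  apply_weight(loss, no_weight);
  apply_weight(loss, Tensor{});       // defined weight: multiply runs
  return 0;
}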
View File

@ -659,20 +659,12 @@ Tensor cross_entropy_loss_symint(
}
Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor total_weight = at::empty({0}, self.options());
return std::get<0>(at::nll_loss_forward_out(output, total_weight, self, target, weight, reduction, ignore_index));
return std::get<0>(at::nll_loss_forward_out(output, total_weight, self, target, weight_opt, reduction, ignore_index));
}
Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, c10::SymInt ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
return std::get<0>(at::nll_loss_forward_symint(self, target, weight, reduction, std::move(ignore_index)));
return std::get<0>(at::nll_loss_forward_symint(self, target, weight_opt, reduction, std::move(ignore_index)));
}
Tensor nll_loss_nd_symint(

View File

@ -424,14 +424,10 @@ std::tuple<Tensor, Tensor> nll_loss2d_forward_cpu(
const Tensor& target, const std::optional<Tensor>& weight_opt,
int64_t reduction,
int64_t ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
auto output = at::empty({0}, self.options());
auto total_weight = at::empty({0}, self.options());
at::native::nll_loss2d_forward_out_cpu(
self, target, weight, reduction, ignore_index, output, total_weight);
self, target, weight_opt, reduction, ignore_index, output, total_weight);
return std::make_tuple(output, total_weight);
}
@ -465,16 +461,12 @@ Tensor nll_loss2d_backward_cpu(
int64_t reduction,
int64_t ignore_index,
const Tensor& total_weight) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
auto grad_input = at::zeros_like(self);
at::native::nll_loss2d_backward_out_cpu(
grad_output,
self,
target,
weight,
weight_opt,
reduction,
ignore_index,
total_weight,
@ -483,20 +475,12 @@ Tensor nll_loss2d_backward_cpu(
}
Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor total_weight = at::empty({0}, self.options());
return std::get<0>(at::nll_loss2d_forward_out(output, total_weight, self, target, weight, reduction, ignore_index));
return std::get<0>(at::nll_loss2d_forward_out(output, total_weight, self, target, weight_opt, reduction, ignore_index));
}
Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, c10::SymInt ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, std::move(ignore_index)));
return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight_opt, reduction, std::move(ignore_index)));
}
} // namespace at::native

File diff suppressed because it is too large

View File

@ -13,21 +13,62 @@ struct TensorIterator;
namespace at::native {
using index_put_with_sort_fn = void(*)(Tensor &, const c10::List<std::optional<Tensor>> &, const Tensor &, bool accumulate, bool unsafe);
using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List<std::optional<Tensor>>& indices, const Tensor& value, double scale, int zero_point, bool unsafe);
using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src);
using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
const Tensor& src, const ReductionType& reduce);
using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
const Scalar& value, const ReductionType& reduce);
using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
const Tensor& src, const ReductionType& reduce);
using index_put_with_sort_fn = void (*)(
Tensor&,
const c10::List<std::optional<Tensor>>&,
const Tensor&,
bool accumulate,
bool unsafe);
using index_put_with_sort_quantized_fn = void (*)(
Tensor& self,
const c10::List<std::optional<Tensor>>& indices,
const Tensor& value,
double scale,
int zero_point,
bool unsafe);
using gather_fn = void (*)(
const Tensor& result,
const Tensor& self,
int64_t dim,
const Tensor& index);
using scatter_fn = void (*)(
const Tensor& self,
int64_t dim,
const Tensor& index,
const Tensor& src);
using scatter_fill_fn = void (*)(
const Tensor& self,
int64_t dim,
const Tensor& index,
const Scalar& src);
using scatter_add_fn = void (*)(
const Tensor& self,
int64_t dim,
const Tensor& index,
const Tensor& src);
using scatter_reduce_fn = void (*)(
const Tensor& self,
const int64_t dim,
const Tensor& index,
const Tensor& src,
const ReductionType& reduce);
using scatter_scalar_reduce_fn = void (*)(
const Tensor& self,
const int64_t dim,
const Tensor& index,
const Scalar& value,
const ReductionType& reduce);
using scatter_reduce_two_fn = void (*)(
const Tensor& self,
const int64_t dim,
const Tensor& index,
const Tensor& src,
const ReductionType& reduce);
DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub)
DECLARE_DISPATCH(index_put_with_sort_quantized_fn, index_put_with_sort_quantized_stub)
DECLARE_DISPATCH(
index_put_with_sort_quantized_fn,
index_put_with_sort_quantized_stub)
DECLARE_DISPATCH(gather_fn, gather_stub)
DECLARE_DISPATCH(scatter_fn, scatter_stub)
DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub)
@ -36,14 +77,26 @@ DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub)
DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub)
DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub)
TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List<std::optional<at::Tensor>>& indices);
TORCH_API Tensor& index_out(
Tensor& result,
const Tensor& self,
const c10::List<std::optional<at::Tensor>>& indices);
using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&);
using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool);
using gather_expanded_index_fn = void (*)(const Tensor&, const Tensor&, const Tensor&);
using scatter_add_expanded_index_fn =
void (*)(const Tensor&, const Tensor&, const Tensor&);
using scatter_reduce_expanded_index_fn = void (*)(
const Tensor&,
const Tensor&,
const Tensor&,
const ReductionType& reduce,
bool);
using gather_expanded_index_fn =
void (*)(const Tensor&, const Tensor&, const Tensor&);
DECLARE_DISPATCH(scatter_add_expanded_index_fn, scatter_add_expanded_index_stub)
DECLARE_DISPATCH(scatter_reduce_expanded_index_fn, scatter_reduce_expanded_index_stub)
DECLARE_DISPATCH(
scatter_reduce_expanded_index_fn,
scatter_reduce_expanded_index_stub)
DECLARE_DISPATCH(gather_expanded_index_fn, gather_expanded_index_stub)
} // namespace at::native

View File

@ -23,28 +23,38 @@ inline std::string shapes_as_str(TensorList tensors) {
#endif
} // anonymous namespace
inline std::tuple<bool, Tensor> canDispatchToMaskedFill(const Tensor& self, const torch::List<std::optional<at::Tensor>>& indices,
const Tensor& value){
if (!(value.numel() ==1 && value.device().is_cpu())){
return std::make_tuple(false,Tensor());
inline std::tuple<bool, Tensor> canDispatchToMaskedFill(
const Tensor& self,
const torch::List<std::optional<at::Tensor>>& indices,
const Tensor& value) {
if (!(value.numel() == 1 && value.device().is_cpu())) {
return std::make_tuple(false, Tensor());
}
int64_t num_ind = 0;
Tensor mask;
auto self_device = self.device();
for (const std::optional<Tensor>& i: indices) {
if (!i.has_value() || !(*i).defined()){
for (const std::optional<Tensor>& i : indices) {
if (!i.has_value() || !(*i).defined()) {
num_ind++;
} else {
const Tensor &index = *i;
const Tensor& index = *i;
if ((index.scalar_type() != kByte && index.scalar_type() != kBool) ||
index.device() != self_device || mask.defined()){
index.device() != self_device || mask.defined()) {
return std::make_tuple(false, Tensor());
} else {
mask = index;
for (const auto j : c10::irange(index.dim())) {
int64_t srcIdx = num_ind + j;
TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j,
" does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx);
TORCH_CHECK_INDEX(
index.size(j) == self.size(srcIdx),
"The shape of the mask ",
index.sizes(),
" at index ",
j,
" does not match the shape of the indexed tensor ",
self.sizes(),
" at index ",
srcIdx);
}
num_ind += mask.ndimension();
}
@ -59,14 +69,18 @@ const Tensor& value){
inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
checkIndexTensorTypes(orig, /*allow_int*/ true);
// first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
// first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more
// LongTensors
auto indices = expandTensors(self, orig);
// next broadcast all index tensors together
try {
indices = expand_outplace(indices);
} catch (std::exception& e) {
TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together"
" with shapes ", shapes_as_str(indices));
TORCH_CHECK_INDEX(
false,
"shape mismatch: indexing tensors could not be broadcast together"
" with shapes ",
shapes_as_str(indices));
}
// add missing null Tensors so that it matches self.dim()
while (indices.size() < (size_t)self.dim()) {
@ -78,12 +92,12 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
std::tie(self, indices) = transposeToFront(self, indices);
}
// Ensure indices are on the same device as self
for (auto & indice : indices) {
for (auto& indice : indices) {
if (indice.defined() && indice.device() != self.device()) {
indice = indice.to(self.device());
}
}
for (auto & indice : indices) {
for (auto& indice : indices) {
if (indice.defined() && indice.dtype() == at::kInt) {
indice = indice.to(at::kLong);
}

File diff suppressed because it is too large

View File

@ -10,7 +10,7 @@ namespace at {
class Tensor;
struct TensorIterator;
struct TensorIteratorBase;
}
} // namespace at
namespace at::native {
@ -22,28 +22,35 @@ using structured_reduce_minmax_fn =
DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub)
DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub)
using where_fn = void (*)(TensorIterator &);
using where_fn = void (*)(TensorIterator&);
DECLARE_DISPATCH(where_fn, where_kernel)
using is_infinity_op_fn = void (*)(TensorIteratorBase &);
using is_infinity_op_fn = void (*)(TensorIteratorBase&);
DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub)
DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub)
using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
DECLARE_DISPATCH(mode_fn, mode_stub)
using clamp_tensor_fn = void (*)(TensorIteratorBase &);
using clamp_tensor_fn = void (*)(TensorIteratorBase&);
DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub)
namespace detail {
enum class ClampLimits {Min, Max, MinMax};
enum class ClampLimits { Min, Max, MinMax };
}
DECLARE_DISPATCH(void (*)(TensorIteratorBase &, const c10::Scalar&, const c10::Scalar&), clamp_scalar_stub)
DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_min_scalar_stub)
DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_max_scalar_stub)
DECLARE_DISPATCH(
void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&),
clamp_scalar_stub)
DECLARE_DISPATCH(
void (*)(TensorIteratorBase&, c10::Scalar),
clamp_min_scalar_stub)
DECLARE_DISPATCH(
void (*)(TensorIteratorBase&, c10::Scalar),
clamp_max_scalar_stub)
using isin_default_fn = void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
using isin_default_fn =
void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
DECLARE_DISPATCH(isin_default_fn, isin_default_stub)
} // namespace at::native

File diff suppressed because it is too large

View File

@ -7,7 +7,7 @@
#include <optional>
namespace at {
class Tensor;
class Tensor;
namespace native {
bool to_will_alias(
const Tensor& self,
@ -20,7 +20,12 @@ bool to_will_alias(
Tensor to_meta(const Tensor& tensor);
std::optional<Tensor> to_meta(const std::optional<Tensor>& tensor);
std::vector<Tensor> to_meta(at::ITensorListRef t_list);
Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::optional<c10::Layout> layout, OptionalIntArrayRef blocksize, std::optional<int64_t> dense_dim_opt);
Tensor dense_to_sparse_with_mask(
const Tensor& self,
const Tensor& mask,
std::optional<c10::Layout> layout,
OptionalIntArrayRef blocksize,
std::optional<int64_t> dense_dim_opt);
} // namespace native
} // namespace at

View File

@ -3,10 +3,15 @@
#include <c10/util/irange.h>
namespace at::native {
//input tensors are non-zero dim and non-empty
template<typename T1, typename T2, typename Function>
// input tensors are non-zero dim and non-empty
template <typename T1, typename T2, typename Function>
void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) {
void tensor_dim_apply3(
const Tensor& self,
Tensor& values,
Tensor& indices,
int64_t dim,
Function func) {
int ndims = self.dim();
int tensor_dim_apply_has_finished = 0;
std::vector<int64_t> counter(ndims, 0);
@ -19,9 +24,16 @@ void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int6
int self_dim_size = self.size(dim);
while (!tensor_dim_apply_has_finished) {
func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride);
func(
self_data,
values_data,
indices_data,
self_dim_size,
self_stride,
values_stride,
indices_stride);
if (ndims == 1) {
break;
break;
}
for (const auto dim_i : c10::irange(ndims)) {
if (dim_i == dim) {
@ -37,18 +49,18 @@ void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int6
indices_data += indices.stride(dim_i);
if (counter[dim_i] == self.size(dim_i)) {
if (dim_i == ndims-1) {
if (dim_i == ndims - 1) {
tensor_dim_apply_has_finished = 1;
break;
} else {
self_data -= counter[dim_i]*self.stride(dim_i);
values_data -= counter[dim_i]*values.stride(dim_i);
indices_data -= counter[dim_i]*indices.stride(dim_i);
self_data -= counter[dim_i] * self.stride(dim_i);
values_data -= counter[dim_i] * values.stride(dim_i);
indices_data -= counter[dim_i] * indices.stride(dim_i);
counter[dim_i] = 0;
}
} else {
break;
}
}
}
}
}

File diff suppressed because it is too large

View File

@ -1,10 +1,10 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/EmptyTensor.h>
#include <ATen/TensorIterator.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/EmptyTensor.h>
#include <ATen/TensorIterator.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -41,9 +41,9 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
return 0;
}
// number of elements in the first row of the tril
auto m_first_row = offset > 0 ?
std::min<int64_t>(col, 1 + offset) : // upper bounded by col
row + offset > 0; // either 0 or 1
auto m_first_row = offset > 0 ? std::min<int64_t>(col, 1 + offset)
: // upper bounded by col
row + offset > 0; // either 0 or 1
// number of elements in the last row of the tril, bounded by [0, col]
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
// number of rows, bounded by [0, row]
@ -63,35 +63,49 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
}
inline void check_args(
int64_t row, int64_t col, std::optional<Layout> layout_opt) {
int64_t row,
int64_t col,
std::optional<Layout> layout_opt) {
TORCH_CHECK(row >= 0, "row must be non-negative, got", row);
TORCH_CHECK(col >= 0, "col must be non-negative, got", col);
if (layout_opt.has_value()) {
TORCH_CHECK(
*layout_opt == at::kStrided,
"only support layout=torch.strided, got",
*layout_opt)
*layout_opt == at::kStrided,
"only support layout=torch.strided, got",
*layout_opt)
}
}
using at::check_size_nonnegative;
// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n))
inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) {
inline void check_supported_max_int_with_precision(
int64_t n,
const Tensor& tensor) {
// match defined() to behavior of checks below
TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(),
"n is too large for result tensor type: '", tensor.toString(), "'");
TORCH_CHECK(
at::scalar_tensor(n > 0 ? n - 1 : n, tensor.options()).defined(),
"n is too large for result tensor type: '",
tensor.toString(),
"'");
// Ensure sufficient precision for floating point representation.
switch (tensor.scalar_type()) {
case at::ScalarType::Half:
TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type.");
TORCH_CHECK(
n <= (int64_t(1) << 11) + 1,
"n cannot be greater than 2049 for Half type.");
break;
case at::ScalarType::Float:
TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type.");
TORCH_CHECK(
n <= (int64_t(1) << 24) + 1,
"n cannot be greater than 2^24+1 for Float type.");
break;
case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to check
TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type.");
case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to
// check
TORCH_CHECK(
n <= (int64_t(1) << 53) + 1,
"n cannot be greater than 2^53+1 for Double type.");
break;
default:
break;
@ -104,14 +118,24 @@ inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tens
inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
if (tensor.is_floating_point() || tensor.is_complex()) {
AT_DISPATCH_V2(
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
}), AT_EXPAND(AT_FLOATING_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), AT_EXPAND(AT_FLOAT8_TYPES), kBFloat16, kHalf, kComplexHalf);
tensor.scalar_type(),
"fill_empty_deterministic_",
AT_WRAP([&]() {
tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
}),
AT_EXPAND(AT_FLOATING_TYPES),
AT_EXPAND(AT_COMPLEX_TYPES),
AT_EXPAND(AT_FLOAT8_TYPES),
kBFloat16,
kHalf,
kComplexHalf);
} else {
AT_DISPATCH_V2(
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
tensor.fill_(std::numeric_limits<scalar_t>::max());
}), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2));
tensor.scalar_type(),
"fill_empty_deterministic_",
AT_WRAP([&]() { tensor.fill_(std::numeric_limits<scalar_t>::max()); }),
kBool,
AT_EXPAND(AT_INTEGRAL_TYPES_V2));
}
return tensor;
}
@ -130,7 +154,10 @@ struct ZeroTensorAllocator final : public at::Allocator {
DeleterFnPtr raw_deleter() const override {
return deleter;
}
void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const final {}
void copy_data(
void* dest [[maybe_unused]],
const void* src [[maybe_unused]],
std::size_t count [[maybe_unused]]) const final {}
at::Device device_;
};

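The limits enforced by check_supported_max_int_with_precision follow from significand width: float carries 24 significand bits, so every integer up to 2^24 is exact and 2^24 + 1 is the first that rounds away (likewise 2^11 for Half and 2^53 for Double); since the largest value actually stored is n - 1, n itself may be one past the boundary. A quick stand-alone check of the float case:

#include <cstdio>

int main() {
  float exact = 16777216.0f;    // 2^24: exactly representable in float
  float rounded = 16777217.0f;  // 2^24 + 1: rounds back to 2^24
  std::printf("%s\n", exact == rounded ? "2^24 + 1 is not representable" : "distinct");
  return 0;
}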
View File

@ -1,39 +1,39 @@
#pragma once
#include <complex>
#include <type_traits>
#include <c10/core/ScalarType.h>
#include <ATen/detail/FunctionTraits.h>
#include <ATen/native/TensorIterator.h>
#include <c10/core/ScalarType.h>
#include <complex>
#include <type_traits>
// This file includes utilities for dynamic_casting done by TensorIterator, see
// CUDALoops.cuh and Loops.h.
// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h.
// dynamic_casting handles when the types expected by the iterator do not match the types of the arguments
// to the function that is being called.
// On CUDA, the cast is currently pushed down into the kernel (for performance reasons).
// On CPU, there is currently an internal assert that a dynamic_cast is not needed.
// dynamic_casting handles when the types expected by the iterator do not match
// the types of the arguments to the function that is being called. On CUDA, the
// cast is currently pushed down into the kernel (for performance reasons). On
// CPU, there is currently an internal assert that a dynamic_cast is not needed.
namespace at::native {
// `needs_dynamic_casting` compares the types expected by iterator
// (i.e. dtypes of the operands) with the actual type of the arguments
// (and returns) of func_t
template<typename func_t, int nargs=function_traits<func_t>::arity>
template <typename func_t, int nargs = function_traits<func_t>::arity>
struct needs_dynamic_casting {
static bool check(TensorIteratorBase& iter) {
using traits = function_traits<func_t>;
using cpp_type = typename traits::template arg<nargs - 1>::type;
using cpp_map = c10::CppTypeToScalarType<cpp_type>;
if (iter.input_dtype(nargs-1) != cpp_map::value) {
if (iter.input_dtype(nargs - 1) != cpp_map::value) {
return true;
}
return needs_dynamic_casting<func_t, nargs - 1>::check(iter);
}
};
template<typename func_t>
template <typename func_t>
struct needs_dynamic_casting<func_t, 0> {
static bool check(TensorIteratorBase& iter) {
using traits = function_traits<func_t>;
@ -49,4 +49,4 @@ struct needs_dynamic_casting<func_t, 0> {
}
};
} //namespace at::native
} // namespace at::native

View File

@ -1,6 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/TensorIterator.h>
#include <ATen/Parallel.h>
#include <ATen/TensorIterator.h>
#include <ATen/TensorIteratorInternal.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -22,7 +22,9 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop);
static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop);
void TensorIteratorBase::parallel_reduce(loop2d_t loop) {
TORCH_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output");
TORCH_CHECK(
ntensors() == 2,
"parallel_reduce only supports one input and one output");
int64_t numel = this->numel();
if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
at::in_parallel_region()) {
@ -54,18 +56,24 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) {
auto first_reduce = TensorIterator::reduce_op(buffer_0, iter.input(0));
TORCH_INTERNAL_ASSERT(first_reduce.output(0).is_alias_of(buffer_0));
at::parallel_for(0, iter.numel(), internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
const auto thread_num = at::get_thread_num();
auto shape = first_reduce.shape();
auto strides = first_reduce.get_strides();
at::parallel_for(
0, iter.numel(), internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
const auto thread_num = at::get_thread_num();
auto shape = first_reduce.shape();
auto strides = first_reduce.get_strides();
// Bump output ptr so each thread has its own output slice
auto base_ptrs = first_reduce.get_base_ptrs();
base_ptrs[0] += buffer_stride * thread_num;
// Bump output ptr so each thread has its own output slice
auto base_ptrs = first_reduce.get_base_ptrs();
base_ptrs[0] += buffer_stride * thread_num;
at::internal::serial_for_each(shape, strides, base_ptrs.data(),
base_ptrs.size(), loop, {begin, end});
});
at::internal::serial_for_each(
shape,
strides,
base_ptrs.data(),
base_ptrs.size(),
loop,
{begin, end});
});
auto final_reduce = TensorIterator::reduce_op(unsqueezed, buffer);
final_reduce.for_each(loop);
@ -91,8 +99,12 @@ static int find_split_dim(TensorIteratorBase& iter) {
return best_dim;
}
static std::tuple<int64_t, int64_t>
round_columns(TensorIteratorBase& iter, int dim, int multiple, int64_t begin, int64_t end) {
static std::tuple<int64_t, int64_t> round_columns(
TensorIteratorBase& iter,
int dim,
int multiple,
int64_t begin,
int64_t end) {
begin = begin - (begin % multiple);
if (end != iter.shape()[dim]) {
// only round the 'end' column down if it's not the final column
@ -113,7 +125,8 @@ static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop) {
// round columns to multiples of 128 bytes if adjacent columns are
// contiguous in memory.
int64_t cols_per_128_bytes = 128 / element_size;
std::tie(begin, end) = round_columns(iter, dim, cols_per_128_bytes, begin, end);
std::tie(begin, end) =
round_columns(iter, dim, cols_per_128_bytes, begin, end);
}
if (begin == end) {
return;
@ -124,7 +137,9 @@ static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop) {
});
}
void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) {
void TensorIteratorBase::foreach_reduced_elt(
loop_subiter_t loop,
bool parallelize) {
AT_ASSERT(ninputs() == 1);
AT_ASSERT(noutputs() >= 1);
@ -134,26 +149,26 @@ void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool paralleli
}
if (output(0).numel() == 1) {
loop(*this);
}
else if (numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
} else if (
numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
at::in_parallel_region() || !parallelize) {
auto reduce_dims = num_reduce_dims();
auto non_reduced_shape = shape.slice(reduce_dims, shape.size() - reduce_dims);
auto non_reduced_shape =
shape.slice(reduce_dims, shape.size() - reduce_dims);
int64_t non_reduced_numel = 1;
for (const auto i : non_reduced_shape) {
non_reduced_numel *= i;
}
DimCounter dims {non_reduced_shape, {0, non_reduced_numel}};
DimCounter dims{non_reduced_shape, {0, non_reduced_numel}};
while (!dims.is_done()) {
TensorIterator reduced = *this;
reduced.select_all_keeping_dim(reduce_dims, dims.values);
loop(reduced);
dims.increment({1, 1});
}
}
else {
} else {
int dim = find_split_dim(*this);
int64_t cols = shape[dim];
at::parallel_for(0, cols, 1, [&](int64_t begin, int64_t end) {
@ -177,4 +192,4 @@ void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool paralleli
}
}
} // namespace at
} // namespace at

View File

@ -1,7 +1,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/native/TensorProperties.h>
@ -36,9 +36,10 @@ bool nested_is_same_size(const Tensor& self, const Tensor& other) {
TORCH_CHECK(
self.is_nested() && other.is_nested(),
"Expected both self and other to be nested tensors. ",
"Self ", self.is_nested()? "is " : "is not ",
"Self ",
self.is_nested() ? "is " : "is not ",
"nested. While Other ",
other.is_nested()? "is " : "is not ",
other.is_nested() ? "is " : "is not ",
"nested.")
const auto self_nt_size = _nested_tensor_size(self);
const auto other_nt_size = _nested_tensor_size(other);
@ -79,16 +80,21 @@ int64_t stride(const Tensor& self, Dimname dim) {
}
bool cudnn_is_acceptable(const TensorBase& self) {
if (!globalContext().userEnabledCuDNN()) return false;
if (!self.is_cuda()) return false;
if (!globalContext().userEnabledCuDNN())
return false;
if (!self.is_cuda())
return false;
auto st = self.scalar_type();
if (!(st == kDouble || st == kFloat || st == kHalf)) return false;
if (!detail::getCUDAHooks().compiledWithCuDNN()) return false;
if (!(st == kDouble || st == kFloat || st == kHalf))
return false;
if (!detail::getCUDAHooks().compiledWithCuDNN())
return false;
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty
// tensors. Maybe some cuDNN functions actually support empty tensors, but
// native/THNN kernels shouldn't be much slower because the output is also
// likely empty.
if (self.sym_numel() == 0) return false;
if (self.sym_numel() == 0)
return false;
// NB: In the old Python code, there was also a test to see if the
// cuDNN library was actually dynamically linked or not. I'm not
// sure if we can actually test this.
@ -99,9 +105,10 @@ bool cudnn_is_acceptable(const Tensor& self) {
return cudnn_is_acceptable(static_cast<const TensorBase&>(self));
}
Tensor & detach_(Tensor & self) {
// this just exists to give us a hook in VariableType and an entry in Declarations.yaml
//TORCH_CHECK(false, "detach_ is not implemented for Tensor");
Tensor& detach_(Tensor& self) {
// this just exists to give us a hook in VariableType and an entry in
// Declarations.yaml
// TORCH_CHECK(false, "detach_ is not implemented for Tensor");
return self;
}
@ -117,7 +124,8 @@ Tensor contiguous(const Tensor& self, MemoryFormat memory_format) {
}
bool is_set_to(const Tensor& self, const Tensor& src) {
if (self.storage().unsafeGetStorageImpl() == src.storage().unsafeGetStorageImpl() &&
if (self.storage().unsafeGetStorageImpl() ==
src.storage().unsafeGetStorageImpl() &&
self.storage_offset() == src.storage_offset() &&
self.dim() == src.dim()) {
for (const auto d : c10::irange(self.dim())) {

File diff suppressed because it is too large

View File

@ -1,7 +1,7 @@
#pragma once
#include <ATen/core/IListRef.h>
#include <ATen/core/Tensor.h>
#include <c10/util/irange.h>
#include <ATen/core/IListRef.h>
namespace at::native {
@ -11,45 +11,74 @@ inline bool cat_should_skip_tensor(const Tensor& t) {
return t.sym_numel() == 0 && t.dim() == 1;
}
// Check to see if the shape of tensors is compatible
// for being concatenated along a given dimension.
inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) {
int64_t first_dims = first.dim();
int64_t second_dims = second.dim();
TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ",
first_dims, " and ", second_dims);
for (const auto dim : c10::irange(first_dims)) {
if (dim == dimension) {
continue;
}
int64_t first_dim_size = first.sizes()[dim];
int64_t second_dim_size = second.sizes()[dim];
TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ",
dimension, ". Expected size ", static_cast<long long>(first_dim_size), " but got size ", static_cast<long long>(second_dim_size), " for tensor number ", index, " in the list.");
}
}
// Check to see if the shape of tensors is compatible
// for being concatenated along a given dimension.
inline void check_cat_shape_except_dim(
const Tensor& first,
const Tensor& second,
int64_t dimension,
int64_t index) {
int64_t first_dims = first.dim();
int64_t second_dims = second.dim();
TORCH_CHECK(
first_dims == second_dims,
"Tensors must have same number of dimensions: got ",
first_dims,
" and ",
second_dims);
for (const auto dim : c10::irange(first_dims)) {
if (dim == dimension) {
continue;
}
int64_t first_dim_size = first.sizes()[dim];
int64_t second_dim_size = second.sizes()[dim];
TORCH_CHECK(
first_dim_size == second_dim_size,
"Sizes of tensors must match except in dimension ",
dimension,
". Expected size ",
static_cast<long long>(first_dim_size),
" but got size ",
static_cast<long long>(second_dim_size),
" for tensor number ",
index,
" in the list.");
}
}
inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) {
[[maybe_unused]] int64_t i = 0;
for(const Tensor& t : tensors) {
TORCH_CHECK(t.dim() > 0,
"zero-dimensional tensor (at position ", i, ") cannot be concatenated");
for (const Tensor& t : tensors) {
TORCH_CHECK(
t.dim() > 0,
"zero-dimensional tensor (at position ",
i,
") cannot be concatenated");
i++;
}
}
inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) {
inline int64_t get_num_splits(
const Tensor& self,
int64_t split_size,
int64_t dim) {
TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor");
TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size);
TORCH_CHECK(
split_size >= 0,
"split expects split_size be non-negative, but got split_size=",
split_size);
int64_t dim_size = self.size(dim);
TORCH_CHECK(split_size > 0 || dim_size == 0,
"split_size can only be 0 if dimension size is 0, "
"but got dimension size of ", dim_size);
TORCH_CHECK(
split_size > 0 || dim_size == 0,
"split_size can only be 0 if dimension size is 0, "
"but got dimension size of ",
dim_size);
// if split_size is 0 and dimension size is 0, there is 1 split.
int64_t num_splits = 1;
if (split_size != 0) {
// ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size
// (returns a single split). We might want to error here, but keep it for BC.
// ensuring num_splits is at least 1 makes consistent the case where
// split_size > dim_size (returns a single split). We might want to error
// here, but keep it for BC.
num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
}
return num_splits;
@ -58,7 +87,7 @@ inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t di
inline bool have_same_ndims(TensorList tensors) {
auto ndim = tensors[0].dim();
for (const auto tensor_idx : c10::irange(tensors.size())) {
if(tensors[tensor_idx].dim() != ndim) {
if (tensors[tensor_idx].dim() != ndim) {
return false;
}
}
@ -67,35 +96,46 @@ inline bool have_same_ndims(TensorList tensors) {
inline void leading_dimension_matches(TensorList tensors, int64_t dim) {
auto tensor_zero_size = tensors[0].sizes();
std::vector<c10::SymInt> leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
std::vector<c10::SymInt> leading_dim_sizes(
tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
for (const auto i : c10::irange(tensors.size())) {
at::Tensor tensor = tensors[i];
for(const auto j : c10::irange(dim)) {
for (const auto j : c10::irange(dim)) {
TORCH_CHECK(
tensor.size(j) == leading_dim_sizes[j],
"_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors"
);
tensor.size(j) == leading_dim_sizes[j],
"_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors");
}
}
}
inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) {
inline int64_t preprocess_chunk_cat_inputs(
TensorList tensors,
int64_t dim,
int64_t num_chunks) {
TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks");
TORCH_CHECK(!tensors.empty(),
"_chunk_cat expects a non-empty input tensor list");
TORCH_CHECK(
!tensors.empty(), "_chunk_cat expects a non-empty input tensor list");
auto expected_dtype = tensors[0].dtype();
auto expected_device = tensors[0].device();
for(const auto i : c10::irange(tensors.size())) {
for (const auto i : c10::irange(tensors.size())) {
TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor");
TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype");
TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device");
TORCH_CHECK(
tensors[i].dtype() == expected_dtype,
"_chunk_cat expects all input tensors with the same dtype");
TORCH_CHECK(
tensors[i].device() == expected_device,
"_chunk_cat expects all inputs tensors on the same device");
}
if (have_same_ndims(tensors)) {
dim = maybe_wrap_dim(dim, tensors[0].dim());
} else {
TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims")
for(const auto i : c10::irange(tensors.size())) {
TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors");
TORCH_CHECK(
dim >= 0,
"_chunk_cat expects non-negative dim when input tensors have different ndims")
for (const auto i : c10::irange(tensors.size())) {
TORCH_CHECK(
dim < tensors[i].ndimension(),
"_chunk_cat expects dim < ndim for all input tensors");
}
}
leading_dimension_matches(tensors, dim);

View File

@ -1,6 +1,6 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/IndexKernel.h> // for flip_stub
#include <ATen/native/TensorTransformations.h>
#include <ATen/native/IndexKernel.h> // for flip_stub
#include <ATen/Parallel.h>
#include <ATen/TensorIterator.h>
@ -44,28 +44,30 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
int n = 0;
auto strides = DimVector(self.strides());
for (const auto i : c10::irange(total_dims)) {
if(flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) {
if (flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) {
n++;
strides[i] = 0;
}
}
// Nothing to do, we return fast
if (n == 0 || self.numel() <=1) {
if (n == 0 || self.numel() <= 1) {
out_tensor.copy_(self);
return out_tensor;
}
//create dummy output with 0 strides at flipped dimension, to prevent tensorIterator from coalescing flipped dims
// create dummy output with 0 strides at flipped dimension, to prevent
// tensorIterator from coalescing flipped dims
const auto restrided_self = self.as_strided(self.sizes(), strides);
auto iter = TensorIteratorConfig()
.set_check_mem_overlap(false)
.check_all_same_dtype(false)
.declare_static_dtype_and_device(self.scalar_type(), self.device())
.add_output(out_tensor)
.add_const_input(self)
.add_const_input(restrided_self)
.build();
auto iter =
TensorIteratorConfig()
.set_check_mem_overlap(false)
.check_all_same_dtype(false)
.declare_static_dtype_and_device(self.scalar_type(), self.device())
.add_output(out_tensor)
.add_const_input(self)
.add_const_input(restrided_self)
.build();
auto* data = reinterpret_cast<char*>(iter.data_ptr(0));
const auto sizes = iter.shape();
@ -83,11 +85,12 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
// - We iterate in the opposite direction (invert the strides)
for (const auto i : c10::irange(iter.ndim())) {
// We know that a dimension has a zero stride and self[i] does not, as we defined above
// Note that it may be the case that strides_dummy[i] = 0 not because we set it, but because
// strides_self[i] == 0. We do not want to do anything there
// We know that a dimension has a zero stride and self[i] does not, as we
// defined above. Note that it may be the case that strides_dummy[i] = 0 not
// because we set it, but because strides_self[i] == 0. We do not want to do
// anything there
if (strides_dummy[i] == 0 && strides_self[i] != 0) {
data += strides_bytes[i] * (sizes[i]-1);
data += strides_bytes[i] * (sizes[i] - 1);
strides_bytes[i] *= -1;
}
}
@ -99,7 +102,10 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
return out_tensor;
}
Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used by CPU and MPS dispatch.
Tensor roll(
const Tensor& self,
IntArrayRef shifts,
IntArrayRef dims) { // Used by CPU and MPS dispatch.
if (dims.size() != 1 || shifts.size() != 1) {
return roll_common(self, shifts, dims);
}
@ -115,7 +121,7 @@ Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used
if (start < 0) {
start = start + size;
}
auto t0 = self.narrow(dim, start, size-start);
auto t0 = self.narrow(dim, start, size - start);
auto t1 = self.narrow(dim, 0, start);
return at::cat({std::move(t0), std::move(t1)}, dim);
}
@ -123,27 +129,38 @@ Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used
Tensor rot90(const Tensor& self, int64_t k, IntArrayRef dims) {
const int64_t total_dims = self.dim(), total_rot_dims = dims.size();
TORCH_CHECK(total_rot_dims == 2,
"expected total rotation dims == 2, but got dims = ", total_rot_dims);
TORCH_CHECK(
total_rot_dims == 2,
"expected total rotation dims == 2, but got dims = ",
total_rot_dims);
TORCH_CHECK(total_dims >= 2,
"expected total dims >= 2, but got total dims = ", total_dims);
TORCH_CHECK(
total_dims >= 2,
"expected total dims >= 2, but got total dims = ",
total_dims);
TORCH_CHECK(dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims,
"expected rotation dims to be different, but got dim0 = ", dims[0],
" and dim1 = ", dims[1]);
TORCH_CHECK(
dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims,
"expected rotation dims to be different, but got dim0 = ",
dims[0],
" and dim1 = ",
dims[1]);
// check range of dims
TORCH_CHECK(dims[0] < total_dims && dims[0] >= -total_dims,
"Rotation dim0 out of range, dim0 = ", dims[0]);
TORCH_CHECK(
dims[0] < total_dims && dims[0] >= -total_dims,
"Rotation dim0 out of range, dim0 = ",
dims[0]);
TORCH_CHECK(dims[1] < total_dims && dims[1] >= -total_dims,
"Rotation dim1 out of range, dim1 = ", dims[1]);
TORCH_CHECK(
dims[1] < total_dims && dims[1] >= -total_dims,
"Rotation dim1 out of range, dim1 = ",
dims[1]);
// handle modulo with negative k
k = (4 + (k % 4)) % 4;
switch(k) {
switch (k) {
case 1:
return self.flip({dims[1]}).transpose_(dims[0], dims[1]);
case 2:
@ -181,7 +198,8 @@ std::vector<Tensor> atleast_1d(TensorList tensors) {
auto transform_lambda = [](const Tensor& input) -> Tensor {
return at::native::atleast_1d(input);
};
std::transform(tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
std::transform(
tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
return result;
}
@ -202,7 +220,8 @@ std::vector<Tensor> atleast_2d(TensorList tensors) {
auto transform_lambda = [](const Tensor& input) -> Tensor {
return at::native::atleast_2d(input);
};
std::transform(tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
std::transform(
tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
return result;
}
@ -226,7 +245,8 @@ std::vector<Tensor> atleast_3d(TensorList tensors) {
auto transform_lambda = [](const Tensor& input) -> Tensor {
return at::native::atleast_3d(input);
};
std::transform(tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
std::transform(
tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
return result;
}

View File

@ -10,16 +10,21 @@
namespace at::native {
static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) {
static inline Tensor roll_common(
const Tensor& self,
IntArrayRef shifts,
IntArrayRef dims) {
TORCH_CHECK(!shifts.empty(), "`shifts` required");
if (dims.empty() && shifts.size() == 1) {
auto flattened = self.contiguous().view(self.numel());
return roll(flattened, shifts[0], 0).view(self.sizes());
}
TORCH_CHECK(
shifts.size() == dims.size(),
"shifts and dimensions must align. shifts: ", shifts.size(), ", dims:", dims.size()
);
shifts.size() == dims.size(),
"shifts and dimensions must align. shifts: ",
shifts.size(),
", dims:",
dims.size());
AT_ASSERT(dims.size() > 1);
auto tail_shifts = shifts.slice(1);
auto tail_dims = dims.slice(1);
@ -27,4 +32,4 @@ static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArra
return at::roll(first_dim_rolled, tail_shifts, tail_dims);
}
} // namespace at::native
} // namespace at::native

View File

@ -63,13 +63,9 @@ void binary_cross_entropy_backward_out_kernel(Tensor& grad_input, const Tensor&
namespace at::native {
Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor loss = at::empty_like(input);
return at::native::binary_cross_entropy_out_cuda(
input, target, weight, reduction, loss);
input, target, weight_opt, reduction, loss);
}
Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& loss) {
@ -122,13 +118,9 @@ Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target,
}
Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
Tensor grad_input = at::empty_like(input);
return at::native::binary_cross_entropy_backward_out_cuda(
grad, input, target, weight, reduction, grad_input);
grad, input, target, weight_opt, reduction, grad_input);
}
Tensor& binary_cross_entropy_backward_out_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& grad_input) {

View File

@ -75,8 +75,6 @@ struct ReduceConfig {
static constexpr int BLOCK_Y = 1;
static constexpr int CTA = 2;
static constexpr int input_vec_size = 4;
ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs)
: element_size_bytes(element_size_bytes)
, num_inputs(num_inputs)
@ -286,7 +284,6 @@ struct ReduceJitOp {
//TODO for now arg_t is always opmath_t of the input, later we'll need to change it
using arg_t = at::opmath_type<scalar_t>;
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
//TODO - ReduceJitOp will probably need to be changed for reductions that need full functor,
//not just wrapper
arg_t ident;
@ -336,7 +333,7 @@ struct ReduceJitOp {
}
};
template <typename scalar_t, typename ops_t, typename index_t, typename out_scalar_t=scalar_t, int vt0=4>
template <typename scalar_t, typename ops_t, typename index_t, typename out_scalar_t=scalar_t, int vt0=4, int input_vec_size=vt0>
struct ReduceOp {
using traits = function_traits<decltype(&ops_t::reduce)>;
using arg_t = typename std::decay<typename traits::template arg<0>::type>::type;
@ -348,8 +345,6 @@ struct ReduceOp {
std::is_convertible_v<arg_t, out_scalar_t>
&& std::is_convertible_v<out_scalar_t, arg_t>;
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
ops_t ops;
arg_t ident;
ReduceConfig config;
@ -996,7 +991,7 @@ int get_output_vec_size(const TensorIterator &iter) {
return vec_size;
}
template<typename arg_t, typename scalar_t, int vt0>
template<typename arg_t, typename scalar_t, int vt0, int input_vec_size=vt0>
ReduceConfig setReduceConfig(const TensorIterator& iter){
// Start by assuming that each thread handles a single output and all
// the inputs for that output.
@ -1063,12 +1058,16 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// threads with different threadIdx.x are independent and will produce results for different outputs.
// In such case, values in each loaded vector always correspond to different outputs.
if (fastest_moving_stride == sizeof(scalar_t)) {
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) {
#ifdef USE_ROCM
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1) {
#else
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) {
#endif
// Case 1: "vectorize along input"
// Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
// we should avoid vectorization.
config.vectorize_input = true;
dim0 /= config.input_vec_size;
dim0 /= input_vec_size;
} else if (!reduction_on_fastest_striding_dimension) {
// Case 2: "vectorize along output"
config.output_vec_size = get_output_vec_size<scalar_t>(iter);
@ -1123,7 +1122,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
// Control the number of threadblocks by adjusting the maximum number of
// threads per multi-processor. These numbers better reflect the maximum
// theoretical achievable threads per MP for the reduction operation.
if (iter.ndim() == 1)
if (iter.ndim() == 1 || iter.ndim() == 3)
max_threads_per_mp = 512;
if (iter.ndim() == 2)
max_threads_per_mp = 256;
@ -1169,7 +1168,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
return config;
};
template <typename scalar_t, typename out_scalar_t, int vt0=4, typename ops_t, typename ident_t=double>
template <typename scalar_t, typename out_scalar_t, int vt0=4, int input_vec_size=vt0, typename ops_t, typename ident_t=double>
inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0,
AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) {
AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1);
@ -1221,7 +1220,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
for (auto& sub_iter : iter.with_32bit_indexing()) {
int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
gpu_reduce_kernel<scalar_t, out_scalar_t, vt0>(sub_iter, ops, ident,
gpu_reduce_kernel<scalar_t, out_scalar_t, vt0, input_vec_size>(sub_iter, ops, ident,
acc_buf_ptr, sub_iter_base_idx);
}
return;
@ -1238,7 +1237,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
}
char* acc_data = acc_buf_ptr->get_acc_slice(out_data);
ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0>(iter);
ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0, input_vec_size>(iter);
at::DataPtr buffer;
at::DataPtr semaphores;
if (config.should_global_reduce()) {
@ -1253,7 +1252,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
AT_ASSERT(can_use_32bit_indexing);
auto output_calc = make_output_calculator<uint32_t>(iter);
auto input_calc = make_input_calculator<uint32_t>(iter);
auto reduce = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0>(
auto reduce = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0, input_vec_size>(
ops,
config,
input_calc,

View File

@ -13,6 +13,20 @@ namespace at::native {
template <typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t>
struct sum_functor {
void operator()(TensorIterator& iter) {
#ifdef USE_ROCM
// Half and BFloat16 can be packed in groups of up to 8 elements and
// can use *_DWORDX4 instructions to achieve that.
const bool is_16_bits =
( (std::is_same<at::Half, scalar_t>::value) ||
(std::is_same<at::BFloat16, scalar_t>::value) );
if (is_16_bits) {
gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(
iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
return a + b;
}));
return;
}
#endif
gpu_reduce_kernel<scalar_t, out_t>(
iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
return a + b;

View File

@ -190,13 +190,7 @@ Tensor layer_norm_symint(
c10::SymIntArrayRef normalized_shape, const std::optional<Tensor>& weight_opt /* optional */, const std::optional<Tensor>& bias_opt /* optional */,
double eps,
bool /* cudnn_enable, deprecated */) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
const Tensor& bias = *bias_maybe_owned;
return std::get<0>(at::native_layer_norm_symint(input, normalized_shape, weight, bias, eps));
return std::get<0>(at::native_layer_norm_symint(input, normalized_shape, weight_opt, bias_opt, eps));
}
DEFINE_DISPATCH(LayerNormKernel);

View File

@ -54,7 +54,7 @@ at::Tensor quantized_convolution(
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
c10::optional<at::Tensor> bias,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
@ -63,15 +63,15 @@ at::Tensor quantized_convolution(
at::Tensor output,
double inv_output_scale,
int64_t output_zero_point,
c10::optional<at::Tensor> accum,
std::optional<at::Tensor> accum,
double accum_scale,
int64_t accum_zero_point,
c10::optional<c10::ScalarType> output_dtype,
c10::optional<std::string_view> binary_attr,
c10::optional<at::Scalar> binary_alpha,
c10::optional<std::string_view> unary_attr,
torch::List<c10::optional<at::Scalar>> unary_scalars,
c10::optional<std::string_view> unary_algorithm) {
std::optional<c10::ScalarType> output_dtype,
std::optional<std::string_view> binary_attr,
std::optional<at::Scalar> binary_alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm) {
Attr attr =
Attr(/*q_scale=*/1.0 / inv_output_scale, /*zp=*/output_zero_point);

View File

@ -114,7 +114,7 @@ at::Tensor quantized_convolution(
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
c10::optional<at::Tensor> bias,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
@ -123,14 +123,14 @@ at::Tensor quantized_convolution(
at::Tensor output,
double inv_output_scale,
int64_t output_zero_point,
c10::optional<at::Tensor> accum,
std::optional<at::Tensor> accum,
double accum_scale,
int64_t accum_zero_point,
c10::optional<c10::ScalarType> output_dtype,
c10::optional<std::string_view> binary_attr,
c10::optional<at::Scalar> binary_alpha,
c10::optional<std::string_view> unary_attr,
torch::List<c10::optional<at::Scalar>> unary_scalars,
c10::optional<std::string_view> unary_algorithm);
std::optional<c10::ScalarType> output_dtype,
std::optional<std::string_view> binary_attr,
std::optional<at::Scalar> binary_alpha,
std::optional<std::string_view> unary_attr,
torch::List<std::optional<at::Scalar>> unary_scalars,
std::optional<std::string_view> unary_algorithm);
} // namespace at::native::onednn

View File

@ -31,17 +31,17 @@ class QConvoneDNNXPU final {
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
c10::optional<at::Tensor> bias,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double inv_output_scale,
int64_t output_zero_point,
c10::optional<c10::ScalarType> output_dtype,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<c10::optional<at::Scalar>> scalars,
c10::optional<std::string_view> algorithm) {
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
if (act.dim() == 3 || act.dim() == 5) {
TORCH_CHECK(
attr == "none",

View File

@ -372,7 +372,6 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
using namespace at::native::mps;
using namespace mps;
bool is3DConv = grad_output_t.dim() == 5;
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) {
// On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16
for (auto elem : grad_output_t.sizes()) {
@ -417,36 +416,29 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
assert(0 && "Check should have been done earlier\n");
}
MPSShape* gradOutputShape = getMPSShape(grad_output_t, memory_format);
MPSShape* mps_input_shape = getMPSShape(input_size);
NSString* ns_shape_key = [[gradOutputShape valueForKey:@"description"] componentsJoinedByString:@","];
string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t}) + ":" + string([ns_shape_key UTF8String]);
getTensorsStringKey({grad_output_t, weight_t});
} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t}) + ":" + string([ns_shape_key UTF8String]);
getTensorsStringKey({grad_output_t, weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* gradOutputTensor =
mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(grad_output_t), gradOutputShape);
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
MPSGraphTensor* gradOutputTensorTranspose = gradOutputTensor;
if (is_channels_last) {
gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
}
MPSGraphTensor* gradInputTensor;
MPSShape* weightOutputShape = mps::getMPSShape(weight_t);
// Depthwise conv is input feature channels = groups. So I in OIHW has to be 1.
bool isDepthwiseConv = ((groups > 1 && (weightOutputShape[1].intValue == 1)) && gradOutputShape.count >= 4 &&
bool isDepthwiseConv = ((groups > 1 && (weightOutputShape[1].intValue == 1)) && grad_output_t.ndimension() >= 4 &&
weightOutputShape.count >= 4 && !is_channels_last);
if (is3DConv) {
@ -462,7 +454,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
padding[1],
padding[0],
groups);
gradInputTensor = [mpsGraph convolution3DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
gradInputTensor = [mpsGraph convolution3DDataGradientWithIncomingGradientTensor:gradOutputTensor
weightsTensor:weightTensor
outputShape:mps_input_shape
forwardConvolutionDescriptor:conv3dDescriptor_
@ -484,7 +476,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
withDimension:-4
name:nil];
gradInputTensor =
[mpsGraph depthwiseConvolution3DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
[mpsGraph depthwiseConvolution3DDataGradientWithIncomingGradientTensor:gradOutputTensor
weightsTensor:weightTransposeTensor
outputShape:mps_input_shape
descriptor:depthWiseConv3dDescriptor_
@ -501,7 +493,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
at::MemoryFormat::Contiguous,
groups);
gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensor
weightsTensor:weightTensor
outputShape:mps_input_shape
forwardConvolutionDescriptor:conv2dDescriptor_
@ -513,7 +505,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
newCachedGraph->gradInputTensor_ = gradInputTensor;
});
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);

View File

@ -385,19 +385,15 @@ Tensor quantized_batch_norm(
double eps,
double output_scale,
int64_t output_zero_point) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
const Tensor& bias = bias_opt.value_or(Tensor());
Tensor qy;
// TODO: this should arguably support 3d as well
qy = q_batch_norm2d_impl<false>(
return q_batch_norm_impl<false>(
qx,
weight.defined() ? std::make_optional(weight) : std::nullopt,
bias.defined() ? std::make_optional(bias) : std::nullopt,
mean, var, eps, output_scale, output_zero_point);
return qy;
weight_opt,
bias_opt,
mean,
var,
eps,
output_scale,
output_zero_point);
}
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {

View File

@ -931,8 +931,8 @@ static at::Tensor linear_int8_with_onednn_weight(
std::string_view& unary_post_op_algorithm) {
using ideep::tensor;
const int64_t dim = input.dim();
TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte,
"qlinear with mkldnn tensor: data type of input should be uint8 (unsigned char).");
TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte || input.scalar_type() == c10::ScalarType::Char,
"qlinear with mkldnn tensor: data type of input should be uint8 or int8 (unsigned char or char).");
TORCH_CHECK(onednn_weight.scalar_type() == c10::ScalarType::Char,
"qlinear with mkldnn tensor: data type of weight should be int8 (char).");
TORCH_CHECK(
@ -1021,7 +1021,8 @@ static at::Tensor linear_int8_with_onednn_weight(
empty_tensor;
// Create onednn primitive
auto src_desc = tensor::desc(src_dims, ideep::data_type::u8, ideep::format_tag::any);
auto src_dtype = input.scalar_type() == c10::kByte ? ideep::data_type::u8 : ideep::data_type::s8;
auto src_desc = tensor::desc(src_dims, src_dtype, ideep::format_tag::any);
auto weights_desc = packed_weight.get_desc();
auto dst_dtype = dst.get_data_type();
auto dst_desc = tensor::desc(dst_dims, dst_dtype, ideep::format_tag::any);
@ -1118,12 +1119,14 @@ namespace at::native {
torch::List<std::optional<at::Scalar>> post_op_args,
std::string_view post_op_algorithm) {
#if AT_MKLDNN_ENABLED()
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1,
"onednn int8 linear: act scale/zp size should be 1");
// act_zero_point.numel() == 0 for symmetric quantization
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1,
"onednn int8 linear: act scale/zp size should be 1/<=1");
static std::optional<at::Tensor> other = std::nullopt;
static const std::string_view binary_post_op = "none";
int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0;
return linear_int8_with_onednn_weight(
act, act_scale.item().toDouble(), act_zero_point.item().toLong(),
act, act_scale.item().toDouble(), act_zp,
onednn_weight, weight_scales, weight_zero_points,
bias, output_scale, output_zero_point, output_dtype,
other, /*other scale*/1.0, /*other zp*/0,
@ -1154,10 +1157,12 @@ namespace at::native {
torch::List<std::optional<at::Scalar>> unary_post_op_args,
std::string_view unary_post_op_algorithm) {
#if AT_MKLDNN_ENABLED()
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1,
"onednn int8 linear: act scale/zp size should be 1");
// act_zero_point.numel() == 0 for symmetric quantization
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1,
"onednn int8 linear: act scale/zp size should be 1/<=1");
int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0;
return linear_int8_with_onednn_weight(
act, act_scale.item().toDouble(), act_zero_point.item().toLong(),
act, act_scale.item().toDouble(), act_zp,
onednn_weight, weight_scales, weight_zero_points,
bias, output_scale, output_zero_point, output_dtype,
other, other_scale, other_zero_point,

View File

@ -561,8 +561,8 @@ namespace {
bool expected = std::isnan(val);
CACHE_ALIGN c10::Half actual_vals[vHalf::size()];
vHalf(val).isnan().store(actual_vals);
for (int jj = 0; jj < vHalf::size(); ++jj) {
EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_vals[jj]) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
for (auto actual_val : actual_vals) {
EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_val) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
}
}
}
@ -1046,7 +1046,7 @@ namespace {
mask[idx] = (VT)0;
}
else {
int64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
}
if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
@ -1315,8 +1315,8 @@ namespace {
ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
float scale = generator_sc.get();
int32_t zero_point_val = generator.get();
float scale_zp_premul = -(scale * zero_point_val);
auto zero_point_val = generator.get();
float scale_zp_premul = -(scale * static_cast<float>(zero_point_val));
vfloat vf_scale = vfloat{scale};
vfloat vf_zp = vfloat{static_cast<float>(zero_point_val)};
vfloat vf_scale_zp = vfloat{scale_zp_premul};
@ -1657,18 +1657,16 @@ namespace {
TEST(HalfConversionTest, HalfFloat) {
float f32s[100];
for (const auto i : c10::irange(100)) {
f32s[i] = i + 0.3;
f32s[i] = static_cast<float>(i + 0.3);
}
uint16_t u16;
float x;
for (const auto i : c10::irange(100)) {
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
!defined(__APPLE__)
u16 = at::vec::float2half_scalar(f32s[i]);
x = at::vec::half2float_scalar(u16);
uint16_t u16 = at::vec::float2half_scalar(f32s[i]);
float x = at::vec::half2float_scalar(u16);
#else
u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
x = c10::detail::fp16_ieee_to_fp32_value(u16);
uint16_t u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
float x = c10::detail::fp16_ieee_to_fp32_value(u16);
#endif
EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
@ -1697,7 +1695,7 @@ namespace {
VT v_pinf = static_cast<VT>(*(float *)&infBits);
values[index] = v_pinf;
auto vec_pinf = vec::loadu(values);
int negInfBits = 0xFF800000;
unsigned int negInfBits = 0xFF800000;
VT v_ninf = static_cast<VT>(*(float *)&negInfBits);
values[index] = v_ninf;
auto vec_ninf = vec::loadu(values);
@ -1779,8 +1777,8 @@ namespace {
const auto expected = static_cast<float>(val);
CACHE_ALIGN float actual_vals[vfloat::size()];
at::vec::convert<float>(vBFloat16(val)).store(actual_vals);
for (int jj = 0; jj < vfloat::size(); ++jj) {
EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_vals[jj]))
for (auto actual_val : actual_vals) {
EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_val))
<< "convert-to-float failure for bf16 bit pattern "
<< std::hex << ii << std::dec;
}
@ -1794,20 +1792,20 @@ namespace {
#define TEST_MASK_LOAD(dst_t, mask_t, mask_n) \
do { \
CACHE_ALIGN dst_t x[mask_n * size]; \
CACHE_ALIGN dst_t y[mask_n * size]; \
CACHE_ALIGN dst_t ref[mask_n * size]; \
auto seed = TestSeed(); \
dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
ValueGen<dst_t> generator(generator_min, dst_t(100), seed); \
for (const auto i : c10::irange(mask_n * size)) { \
x[i] = generator.get(); \
} \
auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
constexpr int dst_size = at::vec::Vectorized<dst_t>::size(); \
constexpr int dst_n = mask_n * size / dst_size; \
constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size; \
if constexpr(dst_n * dst_size >= mask_n * size) { \
CACHE_ALIGN dst_t x[mask_n * size]; \
CACHE_ALIGN dst_t y[mask_n * size]; \
CACHE_ALIGN dst_t ref[mask_n * size]; \
auto seed = TestSeed(); \
dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
ValueGen<dst_t> generator(generator_min, dst_t(100), seed); \
for (const auto i : c10::irange(mask_n * size)) { \
x[i] = generator.get(); \
} \
auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
auto x_vec = vec_mask.template loadu<dst_t, rnd_n>(x); \
x_vec.store(y); \
for (const auto i : c10::irange(mask_n * size)) { \

View File

@ -0,0 +1,9 @@
# Instructions on how to make a new compile time benchmark
1. Make a new benchmark file in /benchmarks/dynamo/pr_time_benchmarks/benchmarks/, e.g. https://github.com/pytorch/pytorch/blob/0b75b7ff2b8ab8f40e433a52b06a671d6377997f/benchmarks/dynamo/pr_time_benchmarks/benchmarks/add_loop.py (a minimal sketch follows this list)
2. cd into the pr_time_benchmarks directory `cd benchmarks/dynamo/pr_time_benchmarks`
3. Run `PYTHONPATH=./ python benchmarks/[YOUR_BENCHMARK].py a.txt`
4. (Optional) Flip a flag that you know will change the benchmark and run again, this time writing to `b.txt`: `PYTHONPATH=./ python benchmarks/[YOUR_BENCHMARK].py b.txt`
5. Compare `a.txt` and `b.txt` located within the `benchmarks/dynamo/pr_time_benchmarks` folder to make sure things look as you expect
6. Check in your new benchmark file and submit a new PR
7. In a few days, if your benchmark is stable, bug Laith Sakka to enable running your benchmark on all PRs. If you're a Meta employee, you can find the dashboard here: internalfb.com/intern/unidash/dashboard/pt2_diff_time_metrics
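
For step 1, a benchmark file is just a `BenchmarkBase` subclass plus a small `main()`. Below is a minimal sketch with a hypothetical category name and a trivial compiled function; the `float_args` benchmark added in this diff (next file) is a complete real example. The sketch assumes it is run from `benchmarks/dynamo/pr_time_benchmarks` with `PYTHONPATH=./` so that `benchmark_base` resolves.

```python
import sys

from benchmark_base import BenchmarkBase  # resolved via PYTHONPATH=./ (step 3)

import torch


class Benchmark(BenchmarkBase):
    def __init__(self):
        # Hypothetical category name; backend/device describe what gets compiled.
        super().__init__(category="my_benchmark", backend="inductor", device="cpu")

    def name(self):
        return self.category()

    def description(self):
        return "Minimal template: compile and run a trivial function."

    def _prepare_once(self):
        torch.manual_seed(0)

    def _prepare(self):
        torch._dynamo.reset()  # measure a cold compile on every iteration

    def _work(self):
        @torch.compile(backend="inductor")
        def f(x):
            return x + 1

        f(torch.arange(3))


def main():
    result_path = sys.argv[1]  # a.txt / b.txt from steps 3-4
    Benchmark().enable_compile_time_instruction_count().collect_all().append_results(
        result_path
    )


if __name__ == "__main__":
    main()
```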

View File

@ -0,0 +1,47 @@
import sys

from benchmark_base import BenchmarkBase

import torch
from torch._inductor.utils import fresh_inductor_cache


class Benchmark(BenchmarkBase):
    def __init__(self):
        super().__init__(
            category="float_args",
            backend="inductor",
            device="cpu",
        )

    def name(self):
        return f"{self.category()}"

    def description(self):
        return "Benchmark to measure recompilations with float arguments."

    def _prepare_once(self):
        torch.manual_seed(0)

    def _prepare(self):
        torch._dynamo.reset()

    def _work(self):
        @torch.compile(backend="inductor")
        def f(x, y):
            return x + y

        with fresh_inductor_cache():
            for i in range(8):
                f(torch.arange(3), i * 2.5)


def main():
    result_path = sys.argv[1]

    Benchmark().enable_compile_time_instruction_count().collect_all().append_results(
        result_path
    )


if __name__ == "__main__":
    main()

View File

@ -6,27 +6,27 @@ add_loop_eager_dynamic,compile_time_instruction_count,5703000000,0.025
add_loop_inductor,compile_time_instruction_count,29510000000,0.015
add_loop_inductor,compile_time_instruction_count,32220000000,0.015
add_loop_inductor_dynamic_gpu,compile_time_instruction_count,43280000000,0.025
add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44500000000,0.025
add_loop_inductor_gpu,compile_time_instruction_count,25690000000,0.015
add_loop_inductor_gpu,compile_time_instruction_count,27320000000,0.015
basic_modules_ListOfLinears_eager,compile_time_instruction_count,1033000000,0.015
basic_modules_ListOfLinears_eager,compile_time_instruction_count,1018000000,0.015
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,20810000000,0.015
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,21760000000,0.015
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.015
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17810000000,0.015
@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1669000000,0.02
sum_floordiv_regression,compile_time_instruction_count,1113000000,0.015
sum_floordiv_regression,compile_time_instruction_count,1033000000,0.015
@ -50,7 +50,7 @@ aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2018000000
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5843000000,0.015
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5796000000,0.015
@ -62,4 +62,4 @@ aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3863000000,
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10410000000,0.015
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10330000000,0.015

1 add_loop_eager compile_time_instruction_count 3066000000 0.015
6 basic_modules_ListOfLinears_eager compile_time_instruction_count 1033000000 1018000000 0.015
7 basic_modules_ListOfLinears_inductor compile_time_instruction_count 20810000000 21760000000 0.015
8 basic_modules_ListOfLinears_inductor_gpu_force_shape_pad compile_time_instruction_count 17020000000 17810000000 0.015
9 basic_modules_ListOfLinears_inductor_gpu compile_time_instruction_count 17260000000 0.2
10 update_hint_regression compile_time_instruction_count 1669000000 0.02
11 sum_floordiv_regression compile_time_instruction_count 1113000000 1033000000 0.015
12 symint_sum compile_time_instruction_count 3293000000 0.015
13 aotdispatcher_inference_nosubclass_cpu compile_time_instruction_count 2018000000 0.015
14 aotdispatcher_inference_subclass_cpu compile_time_instruction_count 5843000000 5796000000 0.015
15 aotdispatcher_partitioner_cpu compile_time_instruction_count 9095000000 0.015
16 aotdispatcher_training_nosubclass_cpu compile_time_instruction_count 3863000000 0.015
17 aotdispatcher_training_subclass_cpu compile_time_instruction_count 10410000000 10330000000 0.015

View File

@ -353,7 +353,7 @@ void testStaticRuntime(
size_t new_managed_bytes =
memory_planner ? memory_planner->total_managed() : 0;
if (check_resize && new_managed_bytes >= 0) {
if (check_resize) {
EXPECT_GE(new_managed_bytes, managed_bytes);
}

View File

@ -123,7 +123,7 @@ inline constexpr crc64_t crc64(const char* str, size_t size) {
return crc64_t{detail::crc64impl(0, str, size)};
}
inline constexpr crc64_t crc64(c10::string_view str) {
inline constexpr crc64_t crc64(std::string_view str) {
return crc64(str.data(), str.size());
}
} // namespace c10::util

View File

@ -92,7 +92,7 @@ size_t ReplaceAll(std::string& s, std::string_view from, std::string_view to) {
std::string::size_type last_pos = 0u;
std::string::size_type cur_pos = 0u;
std::string::size_type write_pos = 0u;
const c10::string_view input(s);
const std::string_view input(s);
if (from.size() >= to.size()) {
// If the replacement string is not larger than the original, we

View File

@ -188,7 +188,6 @@ class BlockingCounter {
// returns false.
bool DecrementCount() {
const auto count_value = count_.fetch_sub(1, std::memory_order_relaxed) - 1;
TORCH_DCHECK_GE(count_value, 0);
if (count_value == 0) {
std::lock_guard<std::mutex> g(mutex_);
cond_.notify_one();

View File

@ -414,6 +414,9 @@ function(torch_compile_options libname)
$<$<COMPILE_LANGUAGE:CXX>:${private_compile_options}>)
if(USE_CUDA)
foreach(option IN LISTS private_compile_options)
if("${option}" STREQUAL "-Wextra-semi")
continue()
endif()
target_compile_options(${libname} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler ${option}>)
endforeach()
endif()

View File

@ -1,7 +1,7 @@
/* styles needed for the Google Search button */
.pytorch-left-menu-search input[type=text] {
background-image: none;
.gsc-overflow-hidden {
overflow: visible !important;
}
.gsc-control-cse {

View File

@ -10,7 +10,9 @@ torch.accelerator
device_count
is_available
current_accelerator
set_device_index
set_device_idx
current_device_index
current_device_idx
set_stream
current_stream

View File

@ -305,6 +305,7 @@ coverage_ignore_functions = [
"node_arg_is_weight",
"return_arg_list",
# torch.ao.quantization.pt2e.graph_utils
"bfs_trace_with_node_process",
"find_sequential_partitions",
"get_equivalent_types",
"update_equivalent_types_dict",

View File

@ -199,15 +199,8 @@ the model. For example:
stage_index,
num_stages,
device,
input_args=example_input_microbatch,
)
The ``PipelineStage`` requires an example argument ``input_args`` representing
the runtime input to the stage, which would be one microbatch worth of input
data. This argument is passed through the forward method of the stage module to
determine the input and output shapes required for communication.
When composing with other Data or Model parallelism techniques, ``output_args``
may also be required, if the output shape/dtype of the model chunk will be
affected.
@ -421,7 +414,7 @@ are subclasses of ``PipelineScheduleMulti``.
Logging
*******
You can turn on additional logging using the `TORCH_LOGS` environment variable from [`torch._logging`](https://pytorch.org/docs/main/logging.html#module-torch._logging):
You can turn on additional logging using the `TORCH_LOGS` environment variable from `torch._logging <https://pytorch.org/docs/main/logging.html#module-torch._logging>`_:
* `TORCH_LOGS=+pp` will display `logging.DEBUG` messages and all levels above it.
* `TORCH_LOGS=pp` will display `logging.INFO` messages and above.
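
For example, here is a minimal sketch of launching a run with these logs enabled; the script name `train_pipeline.py` is a placeholder, and this is simply the programmatic equivalent of `TORCH_LOGS=+pp python train_pipeline.py`:

```python
import os
import subprocess
import sys

# "+pp" selects logging.DEBUG and above for pipeline-parallel logs; "pp" selects INFO and above.
env = {**os.environ, "TORCH_LOGS": "+pp"}
subprocess.run([sys.executable, "train_pipeline.py"], env=env, check=True)
```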

View File

@ -508,7 +508,7 @@ API Example::
import torch
from torch.ao.quantization.quantize_pt2e import prepare_pt2e
from torch._export import capture_pre_autograd_graph
from torch.export import export_for_training
from torch.ao.quantization.quantizer import (
XNNPACKQuantizer,
get_symmetric_quantization_config,
@ -535,7 +535,7 @@ API Example::
# Step 1. program capture
# NOTE: this API will be updated to torch.export API in the future, but the captured
# result should mostly stay the same
m = capture_pre_autograd_graph(m, *example_inputs)
m = export_for_training(m, *example_inputs).module()
# we get a model with aten ops
# Step 2. quantization

Some files were not shown because too many files have changed in this diff.