mirror of https://github.com/pytorch/pytorch.git
synced 2025-11-16 07:24:54 +08:00

Update (base update)
[ghstack-poisoned]
This commit is contained in:
@@ -203,7 +203,9 @@ if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
 fi

 if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
-  export USE_CUDA=0
+  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+    export USE_CUDA=1
+  fi
   export USE_ASAN=1
   export REL_WITH_DEB_INFO=1
   export UBSAN_FLAGS="-fno-sanitize-recover=all"
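Net effect of this hunk: ASAN builds no longer force USE_CUDA=0, so CUDA jobs now build with CUDA enabled under ASAN. A minimal sketch of how the new branch resolves (the BUILD_ENVIRONMENT value below is hypothetical):

    # Hypothetical job name matching both *-clang*-asan* and *cuda*.
    BUILD_ENVIRONMENT=linux-jammy-cuda12.4-clang12-asan
    if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
      if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
        export USE_CUDA=1   # the old code exported USE_CUDA=0 here unconditionally
      fi
      export USE_ASAN=1
    fi
    echo "USE_CUDA=${USE_CUDA:-unset}"   # -> USE_CUDA=1 for this job name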
@@ -196,6 +196,9 @@ install_tlparse
 # ASAN test is not working
 if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
   export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=true:strict_init_order=true:detect_odr_violation=1:detect_container_overflow=0:check_initialization_order=true:debug=true
+  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+    export ASAN_OPTIONS="${ASAN_OPTIONS}:protect_shadow_gap=0"
+  fi
   export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
   export PYTORCH_TEST_WITH_ASAN=1
   export PYTORCH_TEST_WITH_UBSAN=1
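protect_shadow_gap=0 is the usual workaround for combining ASan with CUDA: the driver maps device memory at fixed addresses that can land inside the shadow gap ASan normally protects, so on CUDA jobs the gap is left unprotected. To inspect an option string like this one, ASan's help flag can be appended when running any instrumented binary (the binary name below is a placeholder):

    # help=1 makes the ASan runtime print the flags it understands at startup.
    ASAN_OPTIONS="${ASAN_OPTIONS}:help=1" ./any_asan_instrumented_binary 2>&1 | grep protect_shadow_gap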
@@ -320,6 +323,7 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
   python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
+  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
   python test/run_test.py -i distributed/_composable/test_replicate_with_compiler.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@@ -331,11 +335,12 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_compile.py --verbose
   python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

   # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
   # with if required # gpus aren't available
-  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives --verbose
+  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
   assert_git_not_dirty
 }
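As the hunks above show, run_test.py selects test modules by path via -i/--include and forwards -k to the underlying pytest-style name filter, so single cases can be targeted. A sketch (the -k expression is illustrative):

    # Run only matmul-related cases from the new micro-pipeline TP suite.
    python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py -k matmul --verbose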
14  .github/actions/upload-test-artifacts/action.yml  (vendored)
@@ -28,7 +28,7 @@ runs:
       run: |
         # Remove any previous test jsons if they exist
         rm -f test-jsons-*.zip
-        zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+        zip -r "test-jsons-${FILE_SUFFIX}.zip" test/test-reports -i '*.json'

   - name: Zip test reports for upload
     if: runner.os != 'Windows' && !inputs.use-gha
@@ -38,7 +38,7 @@ runs:
       run: |
         # Remove any previous test reports if they exist
         rm -f test-reports-*.zip
-        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' -i '*.csv'
+        zip -r "test-reports-${FILE_SUFFIX}.zip" test/test-reports -i '*.xml' -i '*.csv'

   - name: Zip usage log for upload
     if: runner.os != 'Windows' && !inputs.use-gha
@@ -53,8 +53,8 @@ runs:
         if [ -f 'usage_log.txt' ]; then
           zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt'
         fi
-        if ls test/**/*.log 1> /dev/null 2>&1; then
-          zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
+        if find "test/test-reports" -name "*.log" 2>/dev/null | grep -q .; then
+          zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log'
         fi

   - name: Zip debugging artifacts for upload
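All three unix steps rely on zip's -i include filter: zip recurses from the named directory but only stores paths matching the pattern, which is why narrowing the root from test to test/test-reports shrinks the archives. A quick illustration with throwaway files (names invented):

    mkdir -p test/test-reports
    touch test/test-reports/results.xml test/test-reports/scratch.txt
    zip -r demo.zip test/test-reports -i '*.xml'   # stores results.xml, skips scratch.txt
    unzip -l demo.zip                              # confirm only the .xml was archived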
@@ -77,7 +77,7 @@ runs:
       FILE_SUFFIX: ${{ inputs.file-suffix }}
     run: |
       # -ir => recursive include all files in pattern
-      7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+      7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\test-reports\*.json'

   - name: Zip test reports for upload
     if: runner.os == 'Windows' && !inputs.use-gha
@@ -86,7 +86,7 @@ runs:
       FILE_SUFFIX: ${{ inputs.file-suffix }}
     run: |
       # -ir => recursive include all files in pattern
-      7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' -ir'!test\*.csv'
+      7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\test-reports\*.xml' -ir'!test\test-reports\*.csv'

   - name: Zip usage log for upload
     if: runner.os == 'Windows' && !inputs.use-gha
@@ -96,7 +96,7 @@ runs:
       FILE_SUFFIX: ${{ inputs.file-suffix }}
     run: |
       # -ir => recursive include all files in pattern
-      7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
+      7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\test-reports\*.log'

   # S3 upload
   - name: Store Test Downloaded JSONs on S3
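On the Windows runners, 7-Zip's -ir switch is the recursive analogue of zip's -i: the pattern after ! is matched while walking subdirectories, so the same narrowing to test\test-reports applies. The filter can be checked outside the workflow (archive name invented):

    7z a demo.zip -ir'!test\test-reports\*.log'   # recursively add only .log files under test\test-reports
    7z l demo.zip                                 # list the archive to confirm what matched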
@@ -459,7 +459,7 @@ def generate_wheels_matrix(
                     ".", "_"
                 ),
                 "pytorch_extra_install_requirements": (
-                    PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]
+                    PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.4"]
                     if os != "linux" and gpu_arch_type != "xpu"
                     else ""
                 ),
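Each entry in PYTORCH_EXTRA_INSTALL_REQUIREMENTS carries a PEP 508 environment marker, so the pinned CUDA 12.4 wheels are only resolved on Linux x86_64 and are skipped everywhere else. One entry, passed directly to pip, shows the mechanism:

    # On Linux x86_64 this installs the wheel; on macOS or aarch64 pip reports
    # that the marker does not match the environment and installs nothing.
    pip install "nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"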
2  .github/workflows/docker-builds.yml  (vendored)
@@ -123,7 +123,7 @@ jobs:
           IMAGE_NAME: ${{ matrix.docker-image-name }}
         with:
           shell: bash
-          timeout_minutes: 15
+          timeout_minutes: 30
           max_attempts: 5
           retry_wait_seconds: 90
           command: |
8  .github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml  (generated, vendored)
@@ -65,7 +65,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_9-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cpu-aarch64-test: # Testing
@@ -185,7 +185,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cpu-aarch64-test: # Testing
@@ -305,7 +305,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cpu-aarch64-test: # Testing
@@ -425,7 +425,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cpu-aarch64-test: # Testing
10  .github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml  (generated, vendored)
@@ -64,7 +64,7 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       build_name: manywheel-py3_9-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cpu-s390x-test: # Testing
@@ -133,7 +133,7 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       build_name: manywheel-py3_10-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cpu-s390x-test: # Testing
@@ -202,7 +202,7 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       build_name: manywheel-py3_11-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cpu-s390x-test: # Testing
@@ -271,7 +271,7 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       build_name: manywheel-py3_12-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cpu-s390x-test: # Testing
@@ -340,7 +340,7 @@ jobs:
       ALPINE_IMAGE: "docker.io/s390x/alpine"
       build_name: manywheel-py3_13-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cpu-s390x-test: # Testing
10  .github/workflows/generated-macos-arm64-binary-wheel-nightly.yml  (generated, vendored)
@@ -46,7 +46,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -162,7 +162,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -278,7 +278,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -394,7 +394,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -510,7 +510,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
32  .github/workflows/generated-windows-binary-wheel-nightly.yml  (generated, vendored)
@ -55,7 +55,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -322,7 +322,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda
|
||||
SKIP_ALL_TESTS: 1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -591,7 +591,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda
|
||||
SKIP_ALL_TESTS: 1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -860,7 +860,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda
|
||||
SKIP_ALL_TESTS: 1
|
||||
DESIRED_PYTHON: "3.9"
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -1393,7 +1393,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
DESIRED_PYTHON: "3.10"
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
@ -1660,7 +1660,7 @@ jobs:
|
||||
GPU_ARCH_TYPE: cuda
|
||||
SKIP_ALL_TESTS: 1
|
||||
DESIRED_PYTHON: "3.10"
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1929,7 +1929,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2198,7 +2198,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2731,7 +2731,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2998,7 +2998,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3267,7 +3267,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3536,7 +3536,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4069,7 +4069,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4336,7 +4336,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4605,7 +4605,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4874,7 +4874,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
29
.github/workflows/periodic.yml
vendored
@ -385,3 +385,32 @@ jobs:
build-environment: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build.outputs.test-matrix }}

linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
use_split_build: true
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}

linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build-test:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build
- target-determination
with:
timeout-minutes: 360
build-environment: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
7
.github/workflows/pull.yml
vendored
@ -280,11 +280,12 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}
secrets: inherit
28
.github/workflows/trunk.yml
vendored
@ -288,31 +288,3 @@ jobs:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}

linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
use_split_build: true
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
]}

linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build-test:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build
- target-determination
with:
timeout-minutes: 360
build-environment: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
@ -241,7 +241,9 @@ exclude_patterns = [
'c10/util/*inl.h',
'c10/test/**/*.h',
'third_party/**/*',
'torch/csrc/api/**',
'torch/csrc/api/include/torch/nn/modules/common.h',
'torch/csrc/api/include/torch/linalg.h',
'torch/csrc/api/include/torch/nn/pimpl-inl.h',
'torch/csrc/autograd/generated/**',
'torch/csrc/distributed/**/*',
'torch/csrc/dynamo/eval_frame.h',

@ -1230,87 +1232,6 @@ exclude_patterns = [
'torch/fft/__init__.py',
'torch/func/__init__.py',
'torch/futures/__init__.py',
'torch/fx/__init__.py',
'torch/fx/_compatibility.py',
'torch/fx/_symbolic_trace.py',
'torch/fx/annotate.py',
'torch/fx/config.py',
'torch/fx/experimental/__init__.py',
'torch/fx/experimental/accelerator_partitioner.py',
'torch/fx/experimental/const_fold.py',
'torch/fx/experimental/debug.py',
'torch/fx/experimental/graph_gradual_typechecker.py',
'torch/fx/experimental/merge_matmul.py',
'torch/fx/experimental/meta_tracer.py',
'torch/fx/experimental/migrate_gradual_types/__init__.py',
'torch/fx/experimental/migrate_gradual_types/constraint.py',
'torch/fx/experimental/migrate_gradual_types/constraint_generator.py',
'torch/fx/experimental/migrate_gradual_types/constraint_transformation.py',
'torch/fx/experimental/migrate_gradual_types/operation.py',
'torch/fx/experimental/migrate_gradual_types/transform_to_z3.py',
'torch/fx/experimental/migrate_gradual_types/util.py',
'torch/fx/experimental/migrate_gradual_types/z3_types.py',
'torch/fx/experimental/normalize.py',
'torch/fx/experimental/optimization.py',
'torch/fx/experimental/partitioner_utils.py',
'torch/fx/experimental/refinement_types.py',
'torch/fx/experimental/rewriter.py',
'torch/fx/experimental/schema_type_annotation.py',
'torch/fx/experimental/unification/__init__.py',
'torch/fx/experimental/unification/core.py',
'torch/fx/experimental/unification/dispatch.py',
'torch/fx/experimental/unification/match.py',
'torch/fx/experimental/unification/more.py',
'torch/fx/experimental/unification/multipledispatch/__init__.py',
'torch/fx/experimental/unification/multipledispatch/conflict.py',
'torch/fx/experimental/unification/multipledispatch/core.py',
'torch/fx/experimental/unification/multipledispatch/dispatcher.py',
'torch/fx/experimental/unification/multipledispatch/utils.py',
'torch/fx/experimental/unification/multipledispatch/variadic.py',
'torch/fx/experimental/unification/unification_tools.py',
'torch/fx/experimental/unification/utils.py',
'torch/fx/experimental/unification/variable.py',
'torch/fx/experimental/unify_refinements.py',
'torch/fx/graph.py',
'torch/fx/graph_module.py',
'torch/fx/interpreter.py',
'torch/fx/node.py',
'torch/fx/operator_schemas.py',
'torch/fx/passes/__init__.py',
'torch/fx/passes/annotate_getitem_nodes.py',
'torch/fx/passes/backends/__init__.py',
'torch/fx/passes/backends/cudagraphs.py',
'torch/fx/passes/dialect/__init__.py',
'torch/fx/passes/dialect/common/__init__.py',
'torch/fx/passes/dialect/common/cse_pass.py',
'torch/fx/passes/fake_tensor_prop.py',
'torch/fx/passes/graph_drawer.py',
'torch/fx/passes/graph_manipulation.py',
'torch/fx/passes/infra/__init__.py',
'torch/fx/passes/infra/partitioner.py',
'torch/fx/passes/infra/pass_base.py',
'torch/fx/passes/infra/pass_manager.py',
'torch/fx/passes/net_min_base.py',
'torch/fx/passes/operator_support.py',
'torch/fx/passes/param_fetch.py',
'torch/fx/passes/pass_manager.py',
'torch/fx/passes/reinplace.py',
'torch/fx/passes/shape_prop.py',
'torch/fx/passes/split_module.py',
'torch/fx/passes/split_utils.py',
'torch/fx/passes/splitter_base.py',
'torch/fx/passes/tests/__init__.py',
'torch/fx/passes/tests/test_pass_manager.py',
'torch/fx/passes/tools_common.py',
'torch/fx/passes/utils/__init__.py',
'torch/fx/passes/utils/common.py',
'torch/fx/passes/utils/fuser_utils.py',
'torch/fx/passes/utils/matcher_utils.py',
'torch/fx/passes/utils/source_matcher_utils.py',
'torch/fx/proxy.py',
'torch/fx/subgraph_rewriter.py',
'torch/fx/tensor_type.py',
'torch/fx/traceback.py',
'torch/linalg/__init__.py',
'torch/monitor/__init__.py',
'torch/nested/__init__.py',
@ -136,7 +136,7 @@ inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
checkDeviceType("CPU_tensor_apply", tensors, kCPU);
checkLayout("CPU_tensor_apply", tensors, kStrided);
if (!_all_equal_numel(tensors))
AT_ERROR(_all_equal_numel_error(tensors));
TORCH_CHECK(false, _all_equal_numel_error(tensors));
// An empty tensor has no elements
for (auto& t : tensors)
if (t.numel() == 0)
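The hunk above is the first instance of the tree-wide migration this commit applies: AT_ERROR(msg) was an unconditional throw, and TORCH_CHECK(false, msg) expresses the same failure through the one canonical checking macro. A minimal sketch of the idiom (the two helper functions below are hypothetical, not part of this diff):

#include <c10/util/Exception.h>
#include <cstdint>

void check_positive(int64_t n) {
  // Conditional form: throws c10::Error with the message when the predicate fails.
  TORCH_CHECK(n > 0, "expected a positive size, but got ", n);
}

void unreachable_branch() {
  // Unconditional form, the drop-in replacement for AT_ERROR(...).
  TORCH_CHECK(false, "this code path should never be reached");
}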
@ -12,11 +12,11 @@
namespace at {

static cpu_fixed_malloc(void*, ptrdiff_t) {
AT_ERROR("attempting to resize a tensor view of an external blob");
TORCH_CHECK(false, "attempting to resize a tensor view of an external blob");
}

static cpu_fixed_realloc(void*, void*, ptrdiff_t) {
AT_ERROR("attempting to resize a tensor view of an external blob");
TORCH_CHECK(false, "attempting to resize a tensor view of an external blob");
}

static cpu_fixed_free(void* state, void* allocation) {
@ -189,7 +189,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
double_normal_sample = std::optional<double>(legacy_pod->normal_y);
}
} else {
AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
TORCH_CHECK(false, "Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
" or a CPUGeneratorImplState of size ", size_current,
" but found the input RNG state size to be ", new_state_size);
}
@ -43,19 +43,9 @@ class TORCH_API Context {

if (device_type == at::kCPU) {
return at::detail::getDefaultCPUGenerator();
} else if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());
} else if (device_type == at::kMPS) {
return at::detail::getMPSHooks().getDefaultMPSGenerator();
} else if (device_type == at::kXPU) {
return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index());
} else if (device_type == at::kIPU) {
return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index());
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks().getDefaultGenerator(
device.index());
} else {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
return getAcceleratorHooksInterface(device_type)
.getDefaultGenerator(device.index());
}
}
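Beyond the error-macro swap, this hunk also collapses the per-backend if/else chain into a single dispatch through the accelerator hooks interface. A rough sketch of that shape, assuming a registry keyed by device type (every name below is hypothetical, not the actual ATen API):

#include <map>
#include <stdexcept>

struct HooksInterface {
  virtual ~HooksInterface() = default;
  virtual int getDefaultGenerator(int device_index) const = 0;
};

// One registered interface per backend, instead of one branch per backend.
std::map<int, const HooksInterface*>& hooksRegistry() {
  static std::map<int, const HooksInterface*> r;
  return r;
}

const HooksInterface& getHooksFor(int device_type) {
  auto it = hooksRegistry().find(device_type);
  if (it == hooksRegistry().end())
    throw std::runtime_error("device type not enabled");
  return *it->second;
}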
@ -77,8 +67,10 @@ class TORCH_API Context {
} else if (device_type == at::kHIP) {
return at::detail::getHIPHooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
TORCH_CHECK(
false,
c10::DeviceTypeName(device_type),
" device type not an accelerator.");
}
}
@ -349,18 +341,28 @@ class TORCH_API Context {

// Preserved for BC
void lazyInitCUDA() {
TORCH_WARN_DEPRECATION(
"lazyInitCUDA is deprecated. Please use lazyInitDevice(at::kCUDA) instead.")
lazyInitDevice(at::kCUDA);
}
void lazyInitHIP() {
TORCH_WARN_DEPRECATION(
"lazyInitHIP is deprecated. Please use lazyInitDevice(at::kHIP) instead.")
lazyInitDevice(at::kHIP);
}
void lazyInitXPU() {
TORCH_WARN_DEPRECATION(
"lazyInitXPU is deprecated. Please use lazyInitDevice(at::kXPU) instead.")
lazyInitDevice(at::kXPU);
}
void lazyInitMTIA() {
TORCH_WARN_DEPRECATION(
"lazyInitMTIA is deprecated. Please use lazyInitDevice(at::kMTIA) instead.")
lazyInitDevice(at::kMTIA);
}
void lazyInitPrivateUse1() {
TORCH_WARN_DEPRECATION(
"lazyInitPrivateUse1 is deprecated. Please use lazyInitDevice(at::kPrivateUse1) instead.")
lazyInitDevice(at::kPrivateUse1);
}
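These lazyInit shims all follow the same backward-compatibility pattern: keep the old entry point, warn at runtime, and forward to the new generic API. A compact sketch of the pattern (names here are hypothetical):

#include <iostream>

void lazyInitDevice(int device_type) {
  // new generic initialization path
}

void lazyInitLegacy() {
  // old-style entry point kept for BC: warn, then forward
  std::cerr << "lazyInitLegacy is deprecated; use lazyInitDevice instead\n";
  lazyInitDevice(/*device_type=*/0);
}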
@ -13,10 +13,12 @@ namespace at {
TORCH_API ScalarType toScalarType(const DLDataType& dtype);
TORCH_API DLManagedTensor* toDLPack(const Tensor& src);
TORCH_API Tensor fromDLPack(DLManagedTensor* src);
C10_DEPRECATED_MESSAGE("Please migrate to a non-const variant")
inline Tensor fromDLPack(const DLManagedTensor* src) {

[[deprecated("Please migrate to a non-const variant")]] inline Tensor fromDLPack(
const DLManagedTensor* src) {
return fromDLPack(const_cast<DLManagedTensor*>(src));
}

TORCH_API Tensor
fromDLPack(DLManagedTensor* src, std::function<void(void*)> deleter);
TORCH_API DLDataType getDLDataType(const Tensor& t);
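This file shows the other recurring migration in the commit: the C10_DEPRECATED_MESSAGE macro gives way to the standard C++14 [[deprecated]] attribute, which every supported compiler now understands. A self-contained sketch (old_name/new_name are hypothetical):

inline int new_name() { return 42; }

[[deprecated("Please call new_name() instead")]]
inline int old_name() {
  // forwards to the replacement so existing callers keep working
  return new_name();
}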
@ -55,7 +55,8 @@ TORCH_API void record_kernel_function_dtype(std::string name);
do { \
if constexpr (!at::should_include_kernel_dtype( \
at_dispatch_name, enum_type)) { \
AT_ERROR( \
TORCH_CHECK( \
false, \
"dtype '", \
toString(enum_type), \
"' not selected for kernel tag ", \
@ -103,23 +104,23 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
return s;
}

C10_DEPRECATED_MESSAGE(
[[deprecated(
"passing at::DeprecatedTypeProperties to an AT_DISPATCH macro is deprecated, "
"pass an at::ScalarType instead")
inline at::ScalarType scalar_type(const at::DeprecatedTypeProperties& t) {
"pass an at::ScalarType instead")]] inline at::ScalarType
scalar_type(const at::DeprecatedTypeProperties& t) {
return t.scalarType();
}

C10_DEPRECATED_MESSAGE(
[[deprecated(
"AT_DISPATCH_ALL_TYPES_AND_HALF is deprecated, "
"use AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ...) instead")
inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF() {}
"use AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, ...) instead")]] inline void
deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF() {}

C10_DEPRECATED_MESSAGE(
[[deprecated(
"AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX is deprecated, "
"use AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, ...) "
"instead")
inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
"instead")]] inline void
deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}

} // namespace detail
@ -220,7 +221,8 @@ inline void deprecated_AT_DISPATCH_ALL_TYPES_AND_HALF_AND_COMPLEX() {}
switch (_st) { \
__VA_ARGS__ \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
'"', \
at_dispatch_name, \
"\" not implemented for '", \
@ -78,7 +78,7 @@ inline void check_defined(
const char* api_name) {
for (auto& t : tensors) {
if (!t.get().defined()) {
AT_ERROR(api_name, "(...) called with an undefined Tensor");
TORCH_CHECK(false, api_name, "(...) called with an undefined Tensor");
}
}
}
@ -231,6 +231,7 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor
}
}

// NOLINTNEXTLINE(performance-unnecessary-value-param)
Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) {
// It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can.
// For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i

@ -452,6 +453,7 @@ Tensor FunctionalInverses::chunk_inverse(const at::Tensor & base, const at::Tens
return split_with_sizes_inverse(base, mutated_view, inverse_return_mode, mutated_view_idx, split_sizes, dim);
}

// NOLINTNEXTLINE(performance-unnecessary-value-param)
Tensor FunctionalInverses::narrow_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, int dim, c10::SymInt start, c10::SymInt length) {
if (inverse_return_mode == InverseReturnMode::AlwaysView) {
// NB: assumes mutated_view is a narrowed view of base.
@ -33,7 +33,7 @@ inline void infer_size_impl(
} else if (shape[dim] >= 0) {
newsize *= shape[dim];
} else {
AT_ERROR("invalid shape dimension ", shape[dim]);
TORCH_CHECK(false, "invalid shape dimension ", shape[dim]);
}
}
@ -45,15 +45,15 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
}

void set_size(int64_t dim, int64_t new_size) override {
AT_ERROR("opaque tensors do not have set_size");
TORCH_CHECK(false, "opaque tensors do not have set_size");
}

void set_stride(int64_t dim, int64_t new_stride) override {
AT_ERROR("opaque tensors do not have set_stride");
TORCH_CHECK(false, "opaque tensors do not have set_stride");
}

void set_storage_offset(int64_t storage_offset) override {
AT_ERROR("opaque tensors do not have set_storage_offset");
TORCH_CHECK(false, "opaque tensors do not have set_storage_offset");
}

#ifdef DEBUG
@ -23,7 +23,8 @@
case kSparseBsc: \
return __VA_ARGS__(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse compressed tensor layout but got ", \
the_layout); \

@ -42,7 +43,8 @@
case kSparseBsc: \
return (COLUMN_DIM_ACTION)(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse compressed tensor layout but got ", \
the_layout); \

@ -61,7 +63,8 @@
case kSparseBsc: \
return (BLOCK_ACTION)(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse compressed tensor layout but got ", \
the_layout); \

@ -77,7 +80,8 @@
case kSparseBsr: \
return (ROW_DIM_ACTION)(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse row compressed tensor layout but got ", \
the_layout); \

@ -93,7 +97,8 @@
case kSparseBsc: \
return (COL_DIM_ACTION)(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse column compressed tensor layout but got ", \
the_layout); \

@ -108,7 +113,8 @@
case kSparseCsc: \
return (ACTION)(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse compressed (non-block) tensor layout but got ", \
the_layout); \

@ -123,7 +129,8 @@
case kSparseBsc: \
return (ACTION)(); \
default: \
AT_ERROR( \
TORCH_CHECK( \
false, \
NAME, \
" expected sparse compressed block tensor layout but got ", \
the_layout); \
@ -57,13 +57,13 @@ void SparseTensorImpl::release_resources() {
}

void SparseTensorImpl::set_size(int64_t dim, int64_t new_size) {
AT_ERROR("sparse tensors do not have set_size");
TORCH_CHECK(false, "sparse tensors do not have set_size");
}
void SparseTensorImpl::set_stride(int64_t dim, int64_t new_stride) {
AT_ERROR("sparse tensors do not have set_stride");
TORCH_CHECK(false, "sparse tensors do not have set_stride");
}
void SparseTensorImpl::set_storage_offset(int64_t storage_offset) {
AT_ERROR("sparse tensors do not have set_storage_offset");
TORCH_CHECK(false, "sparse tensors do not have set_storage_offset");
}
#ifdef DEBUG
bool SparseTensorImpl::has_storage() const {
@ -155,7 +155,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
}
oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
<< " to be on GPU (while checking arguments for " << c << ")";
AT_ERROR(oss.str());
TORCH_CHECK(false, oss.str());
}
TORCH_CHECK(
t1->get_device() == t2->get_device(),

@ -200,7 +200,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
}
oss << "; but got " << t->toString()
<< " instead (while checking arguments for " << c << ")";
AT_ERROR(oss.str());
TORCH_CHECK(false, oss.str());
}
}
@ -36,7 +36,8 @@ inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
for (const auto i : c10::irange(tensors.size())) {
const auto& expr = tensors[i];
if (expr.layout() != Layout::Strided) {
AT_ERROR(
TORCH_CHECK(
false,
"Expected dense tensor but got ",
expr.layout(),
" for sequence element ",

@ -48,7 +49,8 @@ inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
"'");
}
if (expr.device().type() != device_type) {
AT_ERROR(
TORCH_CHECK(
false,
"Expected object of device type ",
device_type,
" but got device type ",

@ -62,7 +64,8 @@ inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
"'");
}
if (expr.scalar_type() != scalar_type) {
AT_ERROR(
TORCH_CHECK(
false,
"Expected object of scalar type ",
scalar_type,
" but got scalar type ",

@ -96,7 +99,8 @@ std::array<int64_t, N> check_intlist(
return res;
}
if (list.size() != N) {
AT_ERROR(
TORCH_CHECK(
false,
"Expected a list of ",
N,
" ints but got ",
@ -149,7 +149,7 @@ Banned functions
*******************************/

static Tensor binary_cross_entropy_banned(const Tensor &, const Tensor &, const std::optional<Tensor>&, int64_t) {
AT_ERROR("torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
TORCH_CHECK(false, "torch.nn.functional.binary_cross_entropy and torch.nn.BCELoss are unsafe to autocast.\n"
"Many models use a sigmoid layer right before the binary cross entropy layer.\n"
"In this case, combine the two layers using torch.nn.functional.binary_cross_entropy_with_logits\n"
"or torch.nn.BCEWithLogitsLoss. binary_cross_entropy_with_logits and BCEWithLogits are\n"
@ -23,36 +23,37 @@ TORCH_API bool is_autocast_cache_enabled();
TORCH_API void set_autocast_cache_enabled(bool enabled);

// deprecated CUDA-specific autocast APIs
C10_DEPRECATED_MESSAGE(
"at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.")
TORCH_API inline bool is_enabled() {
[[deprecated(
"at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.")]] TORCH_API inline bool
is_enabled() {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
"() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.")
return is_autocast_enabled(at::kCUDA);
}
C10_DEPRECATED_MESSAGE(
"at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.")
TORCH_API inline void set_enabled(bool enabled) {
[[deprecated(
"at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.")]] TORCH_API inline void
set_enabled(bool enabled) {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
"(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.")
set_autocast_enabled(at::kCUDA, enabled);
}
C10_DEPRECATED_MESSAGE(
"at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.")
TORCH_API inline at::ScalarType get_autocast_gpu_dtype() {
[[deprecated(
"at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.")]] TORCH_API inline at::
ScalarType
get_autocast_gpu_dtype() {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
"() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.")
return get_autocast_dtype(at::kCUDA);
}
C10_DEPRECATED_MESSAGE(
"at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.")
TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
[[deprecated(
"at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.")]] TORCH_API inline void
set_autocast_gpu_dtype(at::ScalarType dtype) {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@ -61,11 +62,10 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
}

#define DECLARE_DEPRECATED_AUTOCAST_APIS(name, device_type) \
C10_DEPRECATED_MESSAGE( \
[[deprecated( \
"at::autocast::is_" #name \
"_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \
") instead.") \
TORCH_API inline bool is_##name##_enabled() { \
") instead.")]] TORCH_API inline bool is_##name##_enabled() { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \

@ -74,11 +74,11 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
return is_autocast_enabled(device_type); \
} \
\
C10_DEPRECATED_MESSAGE( \
[[deprecated( \
"at::autocast::set_" #name \
"_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \
", enabled) instead.") \
TORCH_API inline void set_##name##_enabled(bool enabled) { \
", enabled) instead.")]] TORCH_API inline void \
set_##name##_enabled(bool enabled) { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \

@ -87,11 +87,11 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
set_autocast_enabled(device_type, enabled); \
} \
\
C10_DEPRECATED_MESSAGE( \
[[deprecated( \
"at::autocast::get_autocast_" #name \
"_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(" #device_type \
") instead.") \
TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \
") instead.")]] TORCH_API inline at::ScalarType \
get_autocast_##name##_dtype() { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \

@ -100,11 +100,11 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
return get_autocast_dtype(device_type); \
} \
\
C10_DEPRECATED_MESSAGE( \
[[deprecated( \
"at::autocast::set_autocast_" #name \
"_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \
", dtype) instead.") \
TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \
", dtype) instead.")]] TORCH_API inline void \
set_autocast_##name##_dtype(at::ScalarType dtype) { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@ -211,7 +211,7 @@ inline at::ScalarType prioritize(
|
||||
const Tensor& nextArg,
|
||||
c10::DeviceType device_type = c10::DeviceType::CUDA) {
|
||||
if (current == at::kDouble) {
|
||||
AT_ERROR("promote type is double in at::autocast::prioritize");
|
||||
TORCH_CHECK(false, "promote type is double in at::autocast::prioritize");
|
||||
return current;
|
||||
}
|
||||
at::ScalarType lower_precision_fp =
|
||||
@ -225,7 +225,8 @@ inline at::ScalarType prioritize(
|
||||
} else if (current == lower_precision_fp && next == lower_precision_fp) {
|
||||
return lower_precision_fp;
|
||||
} else {
|
||||
AT_ERROR("Unexpected floating ScalarType in at::autocast::prioritize");
|
||||
TORCH_CHECK(
|
||||
false, "Unexpected floating ScalarType in at::autocast::prioritize");
|
||||
return current;
|
||||
}
|
||||
} else {
|
||||
|
||||
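A minimal standalone sketch of the macro-to-attribute migration applied in the autocast hunks above, using hypothetical old_api/new_api names rather than anything from this diff: the compiler-specific C10_DEPRECATED_MESSAGE macro gives way to the standard C++14 [[deprecated]] attribute, which any conforming compiler understands without extra macro plumbing.

    // Hypothetical example, not PyTorch code: [[deprecated]] replaces the
    // C10_DEPRECATED_MESSAGE macro used before this change.
    #include <iostream>

    [[deprecated("use new_api() instead")]] inline int old_api() {
      return 1;
    }

    inline int new_api() {
      return 2;
    }

    int main() {
      // Compiles, but emits a deprecation warning at this call site.
      std::cout << old_api() + new_api() << '\n';
      return 0;
    }

The attribute form also composes with macro-generated declarations, which is why the DECLARE_DEPRECATED_AUTOCAST_APIS block above can splice ")]]" directly into the line that carries the return type.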
@ -95,11 +95,9 @@ struct uniform_int_distribution {
template <typename T>
struct uniform_real_distribution {

C10_HOST_DEVICE inline uniform_real_distribution(T from, T to) {
C10_HOST_DEVICE inline uniform_real_distribution(T from, T to) : from_(from), to_(to) {
TORCH_CHECK_IF_NOT_ON_CUDA(from <= to);
TORCH_CHECK_IF_NOT_ON_CUDA(to - from <= std::numeric_limits<T>::max());
from_ = from;
to_ = to;
}

template <typename RNG>
@ -186,10 +184,8 @@ DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(float);
template <typename T>
struct normal_distribution {

C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) {
C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) : mean(mean_in), stdv(stdv_in) {
TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in >= 0, "stdv_in must be positive: ", stdv_in);
mean = mean_in;
stdv = stdv_in;
}

template <typename RNG>
@ -236,9 +232,8 @@ template <> struct DiscreteDistributionType<double> { using type = double; };
template <typename T>
struct bernoulli_distribution {

C10_HOST_DEVICE inline bernoulli_distribution(T p_in) {
C10_HOST_DEVICE inline bernoulli_distribution(T p_in) : p(p_in) {
TORCH_CHECK_IF_NOT_ON_CUDA(p_in >= 0 && p_in <= 1);
p = p_in;
}

template <typename RNG>
@ -257,9 +252,8 @@ struct bernoulli_distribution {
template <typename T>
struct geometric_distribution {

C10_HOST_DEVICE inline geometric_distribution(T p_in) {
C10_HOST_DEVICE inline geometric_distribution(T p_in) : p(p_in) {
TORCH_CHECK_IF_NOT_ON_CUDA(p_in > 0 && p_in < 1);
p = p_in;
}

template <typename RNG>
@ -317,10 +311,8 @@ struct cauchy_distribution {
template <typename T>
struct lognormal_distribution {

C10_HOST_DEVICE inline lognormal_distribution(T mean_in, T stdv_in) {
C10_HOST_DEVICE inline lognormal_distribution(T mean_in, T stdv_in) : mean(mean_in), stdv(stdv_in) {
TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in > 0);
mean = mean_in;
stdv = stdv_in;
}

template<typename RNG>

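The distribution constructors above are rewritten to initialize members in the member-initializer list instead of assigning them in the body. A standalone sketch of the difference, with made-up struct names:

    // Made-up example; `before` and `after` are not types from this diff.
    #include <stdexcept>

    struct before {
      double mean, stdv;
      before(double mean_in, double stdv_in) {
        // members were default-initialized first, then assigned
        if (stdv_in < 0) throw std::invalid_argument("stdv must be >= 0");
        mean = mean_in;
        stdv = stdv_in;
      }
    };

    struct after {
      double mean, stdv;
      after(double mean_in, double stdv_in) : mean(mean_in), stdv(stdv_in) {
        // members are already initialized; the body only validates
        if (stdv_in < 0) throw std::invalid_argument("stdv must be >= 0");
      }
    };

Besides skipping the default-construct-then-assign round trip, the initializer-list form is the only option once a member is const or a reference, so it is the more future-proof idiom for these header-only structs.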
@ -263,9 +263,8 @@ public:
// Can't put this directly into the macro function args because of commas
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>

// Old name for `GenericPackedTensorAccessor`
template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X)
using PackedTensorAccessor [[deprecated("Old name for `GenericPackedTensorAccessor`")]] = AT_X;

#undef AT_X


@ -28,7 +28,7 @@ struct TORCH_API EnumType : public NamedType {
std::move(enum_names_values),
std::move(cu)));
default:
AT_ERROR(
TORCH_CHECK(false,
"Cannot create Enum with value type '",
value->str(),
"', only int, float and string are supported");

@ -55,7 +55,7 @@ inline void FunctionSchema::checkAndNormalizeInputs(
inputs.push_back(*argument.default_value());
continue;
}
AT_ERROR(
TORCH_CHECK(false,
name(),
"() is missing value for argument '",
argument.name(),

@ -756,7 +756,7 @@ IValueComparator getLessThanComparator(const IValue& v) {
torch::jit::Function* lt_func =
checkObjectSortSchema(v.type()->expect<ClassType>(), why_not);
if (!lt_func) {
AT_ERROR(why_not.str());
TORCH_CHECK(false, why_not.str());
}

return [lt_func](const IValue& a, const IValue& b) {
@ -772,7 +772,7 @@ IValueComparator getLessThanComparator(const IValue& v) {
};
}

AT_ERROR("IValues of type: ", v.tagKind(), " are not comparable");
TORCH_CHECK(false, "IValues of type: ", v.tagKind(), " are not comparable");
}

IValueComparator getGreaterThanComparator(const IValue& v) {
@ -967,7 +967,7 @@ IValue IValue::deepcopy(
copy = *this;
} break;
default: {
AT_ERROR("Can't deepcopy IValue with tag: ", tagKind());
TORCH_CHECK(false, "Can't deepcopy IValue with tag: ", tagKind());
}
}
// NB: this doesn't work if an object contains itself, and it may
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
}
err << ". Please define serialization methods via def_pickle() for "
"this class.";
AT_ERROR(err.str());
TORCH_CHECK(false, err.str());
}
object->setSlot(i, slots_[i].deepcopy(memo, device));
}

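The AT_ERROR(...) to TORCH_CHECK(false, ...) substitution repeated through these files folds unconditional errors into the same macro used for conditional checks. A simplified stand-in, not PyTorch's actual implementation, showing how such a macro composes its message and throws:

    // Simplified stand-in for the TORCH_CHECK idiom; MY_CHECK and check_fail
    // are hypothetical names, and the real macro throws c10::Error rather
    // than std::runtime_error.
    #include <sstream>
    #include <stdexcept>

    template <typename... Args>
    [[noreturn]] void check_fail(const Args&... args) {
      std::ostringstream ss;
      (ss << ... << args);  // concatenate all message pieces (C++17 fold)
      throw std::runtime_error(ss.str());
    }

    #define MY_CHECK(cond, ...)     \
      do {                          \
        if (!(cond)) {              \
          check_fail(__VA_ARGS__);  \
        }                           \
      } while (0)

    int demo(int tag) {
      MY_CHECK(tag >= 0, "Can't handle tag: ", tag);  // conditional form
      MY_CHECK(false, "unreachable tag: ", tag);      // unconditional, as in this diff
      return tag;  // never reached
    }

With one macro for both cases there is a single code path for message formatting and error construction, which is presumably why the redundant AT_ERROR spelling is being retired.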
@ -809,12 +809,9 @@ struct TORCH_API IValue final {
IValue(c10::Dict<Key, Value> v);

template <class Key, class Value>
/// \cond
/// DOXYGEN_CANNOT_HANDLE_CONSTRUCTORS_WITH_MACROS_SO_EXCLUDE_THIS_LINE_FROM_DOXYGEN
C10_DEPRECATED_MESSAGE(
"IValues based on std::unordered_map<K, V> are slow and deprecated. Please use c10::Dict<K, V> instead.")
/// \endcond
IValue(std::unordered_map<Key, Value> v);
[[deprecated(
"IValues based on std::unordered_map<K, V> are slow and deprecated. Please use c10::Dict<K, V> instead.")]]
IValue(std::unordered_map<Key, Value> v);

template <class T, enable_if_ivalue_constructible<T> = nullptr>
IValue(std::optional<T> v);
@ -1163,7 +1160,7 @@ struct TORCH_API IValue final {
// this value different (e.g. using NaN boxing), and this would make it more
// costly to determine the tag for all types vs just determining if something
// is a particular type. Instead we want clients to use the `isX` methods when
// possible. If for perf. reasons you really, absolutely, must have a jump
// possible. If for performance reasons you really, absolutely, must have a jump
// table, then we can revisit this.
enum class Tag : uint32_t {
#define DEFINE_TAG(x) x,

@ -863,6 +863,19 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
Future& operator=(const Future&) = delete;
Future& operator=(Future&&) = delete;

// Destructor
// Explicitly destroy events under device guard, otherwise it can lead to
// extra context being created on device 0. Reason: python garbage collector
// calls this destructor, but python GC does not have a device context, so a
// "default" one (usually on device 0) could be created when we go down the
// line of event destroy.
~Future() override {
while (!events_.empty()) {
c10::OptionalDeviceGuard deviceGuard(events_.back().device());
events_.pop_back();
}
}

struct TORCH_API FutureError final : public std::exception {
explicit FutureError(std::string&& error_msg_)
: error_msg(std::move(error_msg_)) {}
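The new Future destructor above makes each event's own device current while that event is destroyed. The same RAII idea in a self-contained sketch, with hypothetical DeviceGuard and Event types standing in for c10::OptionalDeviceGuard and the real event class:

    // Hypothetical toy types; only the guard-before-destroy pattern mirrors
    // the diff above.
    #include <vector>

    static int g_current_device = 0;  // stand-in for the driver's current device

    struct DeviceGuard {
      int prev_;
      explicit DeviceGuard(int d) : prev_(g_current_device) { g_current_device = d; }
      ~DeviceGuard() { g_current_device = prev_; }
    };

    struct Event {
      int device = 0;
      ~Event() { /* must run while `device` is current */ }
    };

    struct Holder {
      std::vector<Event> events_;
      ~Holder() {
        while (!events_.empty()) {
          DeviceGuard guard(events_.back().device);  // pin the event's device
          events_.pop_back();  // Event destructor runs under the guard
        }
      }
    };

Destroying each event under a guard scoped to that event means a thread with no device context (such as a garbage-collector thread) never lets the driver lazily create a context on device 0.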
@ -1758,8 +1771,8 @@ struct _fake_type {};
template <class Elem>
// TODO this is deprecated but we don't throw a warning because a lot of ops in
// native_functions.yaml still return std::vector.
// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
// and deprecated. Please use torch::List<T> instead.")
// [[deprecated("IValues based on std::vector<T> are potentially slow
// and deprecated. Please use torch::List<T> instead.")]]
std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
// We need to do a deep copy of the vector because there might be other
// references to this same IValue that also use the list. We can't just
@ -1895,8 +1908,8 @@ c10::Dict<Key, Value> generic_to(
}

template <typename K, typename V>
C10_DEPRECATED_MESSAGE(
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
[[deprecated(
"IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")]]
std::unordered_map<K, V> generic_to(
IValue ivalue,
_fake_type<std::unordered_map<K, V>>) {

@ -938,7 +938,7 @@ struct TORCH_API DictType : public SharedType {
case TypeKind::DeviceObjType:
return DictTypePtr(new DictType(std::move(key), std::move(value)));
default:
AT_ERROR(
TORCH_CHECK(false,
"Cannot create dict for key type '",
key->str(),
"', only int, float, complex, Tensor, device and string keys are supported");

@ -585,7 +585,7 @@ struct TORCH_API Type {
virtual TypePtr createWithContained(
// NOLINTNEXTLINE(performance-unnecessary-value-param)
std::vector<TypePtr> /*contained_types*/) const {
AT_ERROR(
TORCH_CHECK(false,
"type with contained types did not overload createWithContained: ",
str());
}

@ -562,7 +562,7 @@ public:
}

template<class Lambda>
C10_DEPRECATED_MESSAGE("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.")
[[deprecated("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.")]]
// enable_if: only enable it if Lambda is actually a functor but not a stateless lambda
std::enable_if_t<guts::is_functor<Lambda>::value && !guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {

@ -21,7 +21,7 @@ class Operation {
public:
template <typename F,
std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
[[deprecated("Please use void(Stack&) to register operator instead.")]]
// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
raw(&stack);

@ -629,7 +629,7 @@ MatchTypeReturn matchTypeVariables(
}
}

AT_ERROR("Unhandled free variable container: ", formal->repr_str());
TORCH_CHECK(false, "Unhandled free variable container: ", formal->repr_str());
}

// change return types like List[List[t]] into List[List[int]]

@ -34,7 +34,7 @@ static rocblas_operation hipOperationToRocOperation(hipblasOperation_t op)
case HIPBLAS_OP_C:
return rocblas_operation_conjugate_transpose;
}
AT_ERROR("HIPBLAS_STATUS_INVALID_ENUM");
TORCH_CHECK(false, "HIPBLAS_STATUS_INVALID_ENUM");
}
static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error)
{
@ -57,7 +57,7 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error)
case rocblas_status_internal_error:
return HIPBLAS_STATUS_INTERNAL_ERROR;
}
AT_ERROR("HIPBLAS_STATUS_INVALID_ENUM");
TORCH_CHECK(false, "HIPBLAS_STATUS_INVALID_ENUM");
}
// hipblas does not have hipblasSetMathMode
#define hipblasSetMathMode(handle, flags) HIPBLAS_STATUS_SUCCESS
@ -116,7 +116,7 @@ static cublasOperation_t _cublasOpFromChar(char op) {
case 'C':
return CUBLAS_OP_C;
}
AT_ERROR(
TORCH_CHECK(false,
"_cublasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
}


@ -165,9 +165,9 @@ constexpr const char* _cusolver_backend_suggestion = \
[[maybe_unused]] CUresult get_error_str_err = \
at::globalContext().getNVRTC().cuGetErrorString(__err, &err_str); \
if (get_error_str_err != CUDA_SUCCESS) { \
AT_ERROR("CUDA driver error: unknown error"); \
TORCH_CHECK(false, "CUDA driver error: unknown error"); \
} else { \
AT_ERROR("CUDA driver error: ", err_str); \
TORCH_CHECK(false, "CUDA driver error: ", err_str); \
} \
} \
} while (0)
@ -178,7 +178,7 @@ constexpr const char* _cusolver_backend_suggestion = \
do { \
CUresult __err = EXPR; \
if (__err != CUDA_SUCCESS) { \
AT_ERROR("CUDA driver error: ", static_cast<int>(__err)); \
TORCH_CHECK(false, "CUDA driver error: ", static_cast<int>(__err)); \
} \
} while (0)

@ -198,9 +198,9 @@ constexpr const char* _cusolver_backend_suggestion = \
nvrtcResult __err = EXPR; \
if (__err != NVRTC_SUCCESS) { \
if (static_cast<int>(__err) != 7) { \
AT_ERROR("CUDA NVRTC error: ", at::globalContext().getNVRTC().nvrtcGetErrorString(__err)); \
TORCH_CHECK(false, "CUDA NVRTC error: ", at::globalContext().getNVRTC().nvrtcGetErrorString(__err)); \
} else { \
AT_ERROR("CUDA NVRTC error: NVRTC_ERROR_BUILTIN_OPERATION_FAILURE"); \
TORCH_CHECK(false, "CUDA NVRTC error: NVRTC_ERROR_BUILTIN_OPERATION_FAILURE"); \
} \
} \
} while (0)

@ -103,7 +103,7 @@ void CUDAHooks::init() const {
#endif
}

const Generator& CUDAHooks::getDefaultCUDAGenerator(DeviceIndex device_index) const {
const Generator& CUDAHooks::getDefaultGenerator(DeviceIndex device_index) const {
return at::cuda::detail::getDefaultCUDAGenerator(device_index);
}

@ -300,7 +300,7 @@ long CUDAHooks::versionCuDNN() const {
#if AT_CUDNN_ENABLED()
return CUDNN_VERSION;
#else
AT_ERROR("Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
#endif
}

@ -408,7 +408,7 @@ double CUDAHooks::batchnormMinEpsilonCuDNN() const {
#if AT_CUDNN_ENABLED()
return CUDNN_BN_MIN_EPSILON;
#else
AT_ERROR(
TORCH_CHECK(false,
"Cannot query CUDNN_BN_MIN_EPSILON if ATen_cuda is not built with CuDNN");
#endif
}

@ -22,7 +22,8 @@ struct CUDAHooks : public at::CUDAHooksInterface {
void init() const override;
Device getDeviceFromPtr(void* data) const override;
bool isPinnedPtr(const void* data) const override;
const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override;
const Generator& getDefaultGenerator(
DeviceIndex device_index = -1) const override;
bool hasCUDA() const override;
bool hasMAGMA() const override;
bool hasCuDNN() const override;

@ -310,7 +310,7 @@ static hipblasOperation_t _hipblasOpFromChar(char op) {
case 'C':
return HIPBLAS_OP_C;
}
AT_ERROR(
TORCH_CHECK(false,
"_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
}

@ -323,7 +323,7 @@ static char _charFromhipblasOp(hipblasOperation_t op) {
case HIPBLAS_OP_C:
return 'C';
}
AT_ERROR(
TORCH_CHECK(false,
"_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`");
}


@ -130,7 +130,7 @@ static rocblas_operation _rocblasOpFromChar(char op) {
case 'C':
return rocblas_operation_conjugate_transpose;
}
AT_ERROR(
TORCH_CHECK(false,
"_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
}


@ -197,15 +197,15 @@ class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
|
||||
this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmOp<T>>());
|
||||
|
||||
#ifdef USE_ROCM
|
||||
static const char *env_rocblas = std::getenv("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
|
||||
if (env_rocblas == nullptr || strcmp(env_rocblas, "1") == 0) {
|
||||
static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
|
||||
if (!env_rocblas.has_value() || env_rocblas.value()) {
|
||||
for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps<T>()) {
|
||||
this->RegisterOp(std::move(name), std::move(op));
|
||||
}
|
||||
}
|
||||
|
||||
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
|
||||
static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (!env_hipblaslt.has_value() || env_hipblaslt.value()) {
|
||||
// disallow tuning of hipblaslt with c10::complex
|
||||
if constexpr (
|
||||
!std::is_same_v<T, c10::complex<float>> &&
|
||||
@ -230,8 +230,8 @@ class GemmAndBiasTunableOp : public TunableOp<GemmAndBiasParams<T>, StreamTimer>
|
||||
this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmAndBiasOp<T>>());
|
||||
|
||||
#ifdef USE_ROCM
|
||||
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
|
||||
static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (!env_hipblaslt.has_value() || env_hipblaslt.value()) {
|
||||
// disallow tuning of hipblaslt with c10::complex
|
||||
if constexpr (
|
||||
!std::is_same_v<T, c10::complex<float>> &&
|
||||
@ -256,15 +256,15 @@ class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>
|
||||
this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmStridedBatchedOp<T>>());
|
||||
|
||||
#ifdef USE_ROCM
|
||||
static const char *env_rocblas = std::getenv("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
|
||||
if (env_rocblas == nullptr || strcmp(env_rocblas, "1") == 0) {
|
||||
static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
|
||||
if (!env_rocblas.has_value() || env_rocblas.value()) {
|
||||
for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps<T>()) {
|
||||
this->RegisterOp(std::move(name), std::move(op));
|
||||
}
|
||||
}
|
||||
|
||||
static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
|
||||
static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
|
||||
if (!env_hipblaslt.has_value() || env_hipblaslt.value()) {
|
||||
// disallow tuning of hipblaslt with c10::complex
|
||||
if constexpr (
|
||||
!std::is_same_v<T, c10::complex<float>> &&
|
||||
|
||||
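The TunableOp constructors above swap raw std::getenv plus strcmp for c10::utils::check_env. A standalone sketch of the semantics this assumes: the helper returns an empty optional when the variable is unset and otherwise reports whether it is set to a truthy value.

    // Standalone re-implementation sketch; check_env here is a local
    // function, not the c10 one, and "truthy" is approximated as the
    // string "1".
    #include <cstdlib>
    #include <cstring>
    #include <optional>

    std::optional<bool> check_env(const char* name) {
      const char* value = std::getenv(name);
      if (value == nullptr) {
        return std::nullopt;  // variable not set at all
      }
      return std::strcmp(value, "1") == 0;  // set: report truthiness
    }

    bool rocblas_enabled() {
      // "enabled unless explicitly disabled", matching the call sites above
      const auto env = check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
      return !env.has_value() || env.value();
    }

Collapsing the null check and the string comparison into one optional keeps the three nearly identical call sites from drifting apart.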
@ -113,7 +113,7 @@ _cudnn_rnn_cast_reflatten(const Tensor & input,
|
||||
batch_sizes,
|
||||
dropout_state);
|
||||
#else // AT_CUDNN_ENABLED()
|
||||
AT_ERROR("autocast::_cudnn_rnn_cast_reflatten: ATen not compiled with cuDNN support");
|
||||
TORCH_CHECK(false, "autocast::_cudnn_rnn_cast_reflatten: ATen not compiled with cuDNN support");
|
||||
return {Tensor{}, Tensor{}, Tensor{}, Tensor{}, Tensor{}}; // never reached, placates the compiler
|
||||
#endif // AT_CUDNN_ENABLED()
|
||||
}
|
||||
|
||||
@ -1,9 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Generator.h>
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/Stream.h>
|
||||
#include <c10/core/Allocator.h>
|
||||
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
|
||||
|
||||
namespace at {
|
||||
|
||||
// AcceleratorHooksInterface is a shared interface provided by all
|
||||
@ -58,7 +62,18 @@ struct TORCH_API AcceleratorHooksInterface {
|
||||
virtual Device getDeviceFromPtr(void* data) const {
|
||||
TORCH_CHECK(false, "Backend doesn't support getDeviceFromPtr()");
|
||||
}
|
||||
|
||||
virtual const Generator& getDefaultGenerator(
|
||||
C10_UNUSED DeviceIndex device_index = -1) const {
|
||||
TORCH_CHECK(false, "Backend doesn`t support getDefaultGenerator()");
|
||||
}
|
||||
|
||||
virtual Generator getNewGenerator(
|
||||
C10_UNUSED DeviceIndex device_index = -1) const {
|
||||
TORCH_CHECK(false, "Backend doesn`t support getNewGenerator()");
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace at
|
||||
|
||||
C10_DIAGNOSTIC_POP()
|
||||
|
||||
@ -6,16 +6,13 @@
|
||||
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
// Forward-declares at::Generator and at::cuda::NVRTC
|
||||
// NB: Class must live in `at` due to limitations of Registry.h.
|
||||
namespace at {
|
||||
struct Generator;
|
||||
|
||||
// Forward-declares at::cuda::NVRTC
|
||||
namespace cuda {
|
||||
struct NVRTC;
|
||||
} // namespace cuda
|
||||
} // namespace at
|
||||
|
||||
// NB: Class must live in `at` due to limitations of Registry.h.
|
||||
namespace at {
|
||||
|
||||
#ifdef _MSC_VER
|
||||
constexpr const char* CUDA_HELP =
|
||||
@ -69,8 +66,8 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
||||
TORCH_CHECK(false, "Cannot initialize CUDA without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
virtual const Generator& getDefaultCUDAGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const {
|
||||
const Generator& getDefaultGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Cannot get default CUDA generator without ATen_cuda library. ",
|
||||
|
||||
@ -1,19 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/GeneratorImpl.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
#include <c10/util/Registry.h>
|
||||
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace at {
|
||||
class Context;
|
||||
}
|
||||
|
||||
// NB: Class must live in `at` due to limitations of Registry.h.
|
||||
namespace at {
|
||||
|
||||
@ -30,8 +24,9 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
|
||||
TORCH_CHECK(false, "Cannot initialize HIP without ATen_hip library.");
|
||||
}
|
||||
|
||||
virtual std::unique_ptr<c10::GeneratorImpl> initHIPGenerator(Context*) const {
|
||||
AT_ERROR("Cannot initialize HIP generator without ATen_hip library.");
|
||||
const Generator& getDefaultGenerator(
|
||||
C10_UNUSED DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(false, "Cannot initialize HIP without ATen_hip library.");
|
||||
}
|
||||
|
||||
virtual bool hasHIP() const {
|
||||
@ -47,11 +42,7 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
|
||||
}
|
||||
|
||||
Allocator* getPinnedMemoryAllocator() const override {
|
||||
AT_ERROR("Pinned memory requires HIP.");
|
||||
}
|
||||
|
||||
virtual void registerHIPTypes(Context*) const {
|
||||
AT_ERROR("Cannot registerHIPTypes() without ATen_hip library.");
|
||||
TORCH_CHECK(false, "Pinned memory requires HIP.");
|
||||
}
|
||||
|
||||
virtual int getNumGPUs() const {
|
||||
@ -59,7 +50,7 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
|
||||
}
|
||||
|
||||
bool hasPrimaryContext(DeviceIndex device_index) const override {
|
||||
AT_ERROR("Cannot check primary context without ATen_hip library.");
|
||||
TORCH_CHECK(false, "Cannot check primary context without ATen_hip library.");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Generator.h>
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
@ -9,7 +8,7 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
struct TORCH_API IPUHooksInterface: AcceleratorHooksInterface {
|
||||
struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface {
|
||||
~IPUHooksInterface() override = default;
|
||||
|
||||
void init() const override {
|
||||
@ -21,16 +20,14 @@ struct TORCH_API IPUHooksInterface: AcceleratorHooksInterface {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual const Generator& getDefaultIPUGenerator(
|
||||
DeviceIndex device_index [[maybe_unused]] = -1) const {
|
||||
AT_ERROR(
|
||||
"Cannot get the default IPU generator: the IPU backend is not "
|
||||
"available.");
|
||||
const Generator& getDefaultGenerator(
|
||||
C10_UNUSED DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library.");
|
||||
}
|
||||
|
||||
virtual Generator newIPUGenerator(DeviceIndex device_index [[maybe_unused]] = -1) const {
|
||||
AT_ERROR(
|
||||
"Cannot create a new IPU generator: the IPU backend is not available.");
|
||||
Generator getNewGenerator(
|
||||
DeviceIndex device_index [[maybe_unused]] = -1) const override {
|
||||
TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library.");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -2,9 +2,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <ATen/core/Generator.h>
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Registry.h>
|
||||
|
||||
@ -31,7 +31,8 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
|
||||
virtual bool isOnMacOSorNewer(unsigned major = 13, unsigned minor = 0) const {
|
||||
FAIL_MPSHOOKS_FUNC(__func__);
|
||||
}
|
||||
virtual const Generator& getDefaultMPSGenerator() const {
|
||||
const Generator& getDefaultGenerator(
|
||||
C10_UNUSED DeviceIndex device_index = -1) const override {
|
||||
FAIL_MPSHOOKS_FUNC(__func__);
|
||||
}
|
||||
virtual Allocator* getMPSDeviceAllocator() const {
|
||||
|
||||
@ -1,18 +1,20 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Generator.h>
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
#include <c10/core/Allocator.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/Storage.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
|
||||
|
||||
namespace at {
|
||||
|
||||
struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
|
||||
~PrivateUse1HooksInterface() override = default;
|
||||
virtual const at::Generator& getDefaultGenerator(
|
||||
c10::DeviceIndex device_index) const {
|
||||
|
||||
const at::Generator& getDefaultGenerator(
|
||||
c10::DeviceIndex device_index) const override {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDefaultGenerator`.");
|
||||
@ -24,17 +26,17 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
|
||||
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
|
||||
}
|
||||
|
||||
virtual bool isPinnedPtr(const void* data) const override {
|
||||
bool isPinnedPtr(const void* data) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual Allocator* getPinnedMemoryAllocator() const override {
|
||||
Allocator* getPinnedMemoryAllocator() const override {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
|
||||
}
|
||||
|
||||
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
|
||||
bool hasPrimaryContext(DeviceIndex device_index) const override {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`.");
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Registry.h>
|
||||
|
||||
#include <ATen/core/Generator.h>
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
|
||||
@ -32,17 +31,17 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
|
||||
TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
|
||||
}
|
||||
|
||||
virtual Generator getXPUGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const {
|
||||
TORCH_CHECK(false, "Cannot get XPU generator without ATen_xpu library.");
|
||||
}
|
||||
|
||||
virtual const Generator& getDefaultXPUGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const {
|
||||
const Generator& getDefaultGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(
|
||||
false, "Cannot get default XPU generator without ATen_xpu library.");
|
||||
}
|
||||
|
||||
Generator getNewGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(false, "Cannot get XPU generator without ATen_xpu library.");
|
||||
}
|
||||
|
||||
virtual DeviceIndex getNumGPUs() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
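Across the hooks headers above, per-backend getters (getDefaultCUDAGenerator, getDefaultMPSGenerator, getDefaultXPUGenerator, and so on) collapse into one virtual getDefaultGenerator on the shared AcceleratorHooksInterface. A toy sketch of the shape of that refactor, with simplified names:

    // Toy model of the refactor; these are simplified stand-ins, not the
    // real ATen interfaces.
    #include <stdexcept>

    struct Generator {};

    struct AcceleratorHooks {
      virtual ~AcceleratorHooks() = default;
      // Single device-generic entry point; backends override it.
      virtual const Generator& getDefaultGenerator(int device_index = -1) const {
        throw std::runtime_error("backend doesn't support getDefaultGenerator()");
      }
    };

    struct CudaHooks : AcceleratorHooks {
      Generator default_gen_;
      // Was a CUDA-specific getter; now just an override.
      const Generator& getDefaultGenerator(int /*device_index*/ = -1) const override {
        return default_gen_;
      }
    };

    const Generator& default_generator(const AcceleratorHooks& hooks) {
      return hooks.getDefaultGenerator();  // works for any registered backend
    }

Device-generic callers can then be written once against the base interface instead of switching on the backend type.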
@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif

// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>

#ifdef __cplusplus

@ -224,7 +224,7 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes
// but shape inference is not possible.
if (self.sym_numel() == 0) {
if (num_classes <= 0) {
AT_ERROR("Can not infer total number of classes from empty tensor.");
TORCH_CHECK(false, "Can not infer total number of classes from empty tensor.");
} else {
shape.emplace_back(num_classes);
return at::empty_symint(shape, self.options());

@ -103,7 +103,7 @@ template<
// optional cannot be used in a template, otherwise we would use it here.
int maybe_keepdim_arg_pos
>
void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
static void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
const auto& schema = op.schema();
const auto num_returns = schema.returns().size();
const auto num_arguments = schema.arguments().size();
@ -357,21 +357,21 @@ static std::tuple<Tensor, std::optional<int64_t>> searchsorted_batch_rule(
// B<...>D, B<...>V -> no change
if (buckets_bdim.has_value() && self_bdim.has_value()) {
auto self_ = moveBatchDimToFront(self, self_bdim);
auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
return std::make_tuple(std::move(result), 0);
}
// B<...>D, <...>V -> B<...>D, B<...>V
if (buckets_bdim.has_value() && !self_bdim.has_value()) {
auto self_ = moveBatchDimToFront(self, self_bdim);
self_ = ensure_has_bdim(self_, self_bdim.has_value(), buckets.size(0));
auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
return std::make_tuple(std::move(result), 0);
}
// <...>D, B<...>V -> <...>D, <...>(BV)
if (!buckets_bdim.has_value() && self_bdim.has_value()) {
auto bdim_size = self.size(*self_bdim);
auto self_ = reshape_dim_into(*self_bdim, -1, self);
auto result = at::searchsorted(buckets, self_, out_int32, right, std::move(side), sorter_);
auto result = at::searchsorted(buckets, self_, out_int32, right, side, sorter_);
result = reshape_dim_outof(-1, bdim_size, result);
return std::make_tuple(result, result.dim() - 2);
}
@ -382,7 +382,7 @@ static std::tuple<Tensor, std::optional<int64_t>> searchsorted_batch_rule(
if (buckets_bdim.has_value() && self_bdim.has_value()) {
auto self_ = moveBatchDimToFront(self, self_bdim);
auto self_view_ = self_logical_rank == 0 ? self_.unsqueeze(-1) : self_.flatten(1);
auto result = at::searchsorted(buckets, self_view_, out_int32, right, std::move(side), sorter_);
auto result = at::searchsorted(buckets, self_view_, out_int32, right, side, sorter_);
result = self_logical_rank == 0 ? result.squeeze(-1) : result.view(self_.sizes());
return std::make_tuple(std::move(result), 0);
}
@ -391,13 +391,13 @@ static std::tuple<Tensor, std::optional<int64_t>> searchsorted_batch_rule(
auto bdim_size = buckets.size(*buckets_bdim);
auto self_ = ensure_has_bdim(self, false, bdim_size);
auto self_view_ = self_logical_rank == 0 ? self_.unsqueeze(-1) : self_.flatten(1);
auto result = at::searchsorted(buckets, self_view_, out_int32, right, std::move(side), sorter_);
auto result = at::searchsorted(buckets, self_view_, out_int32, right, side, sorter_);
result = self_logical_rank == 0 ? result.squeeze(-1) : result.view(self_.sizes());
return std::make_tuple(std::move(result), 0);
}
// D, B* -> no change
if (!buckets_bdim.has_value() && self_bdim.has_value()) {
auto result = at::searchsorted(buckets, self, out_int32, right, std::move(side), sorter_);
auto result = at::searchsorted(buckets, self, out_int32, right, side, sorter_);
return std::make_tuple(std::move(result), self_bdim);
}
TORCH_INTERNAL_ASSERT(false);

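The searchsorted hunks above stop wrapping `side` in std::move at call sites that appear more than once in the function. A minimal sketch, unrelated to PyTorch types, of why moving from a variable that later code may still read is a hazard:

    // Minimal illustration; `side` and `consume` are made-up names.
    #include <iostream>
    #include <optional>
    #include <string>

    void consume(std::optional<std::string> side) {
      if (side) std::cout << *side << '\n';
    }

    int main() {
      std::optional<std::string> side = "left";
      consume(std::move(side));  // `side` is now moved-from
      consume(std::move(side));  // may observe an empty, unspecified value

      std::optional<std::string> side2 = "right";
      consume(side2);  // copy: safe to use again
      consume(side2);  // still prints "right"
      return 0;
    }

Passing `side` without the move at every branch, as the diff now does, keeps each call site independent of the others.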
@ -16,7 +16,7 @@ at::Tensor& metal_copy_(at::Tensor& self, const at::Tensor& src) {
|
||||
if (p) {
|
||||
return p->metal_copy_(self, src);
|
||||
}
|
||||
AT_ERROR("Metal backend was not linked to the build");
|
||||
TORCH_CHECK(false, "Metal backend was not linked to the build");
|
||||
}
|
||||
} // namespace at::metal
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ miopen_rnn(const Tensor & input_r,
|
||||
fn_dropout_state_opt);
|
||||
|
||||
#else
|
||||
AT_ERROR("autocast::miopen_rnn: ATen not compiled with ROCm enabled");
|
||||
TORCH_CHECK(false, "autocast::miopen_rnn: ATen not compiled with ROCm enabled");
|
||||
return {Tensor{}, Tensor{}, Tensor{}, Tensor{}, Tensor{}}; // placate the compiler
|
||||
#endif
|
||||
|
||||
|
||||
@ -19,7 +19,8 @@ struct MPSHooks : public at::MPSHooksInterface {
|
||||
bool isOnMacOSorNewer(unsigned major, unsigned minor) const override;
|
||||
|
||||
// MPSGeneratorImpl interface
|
||||
const Generator& getDefaultMPSGenerator() const override;
|
||||
const Generator& getDefaultGenerator(
|
||||
DeviceIndex device_index = -1) const override;
|
||||
|
||||
// MPSStream interface
|
||||
void deviceSynchronize() const override;
|
||||
|
||||
@ -59,7 +59,7 @@ Allocator* MPSHooks::getMPSDeviceAllocator() const {
|
||||
return at::mps::GetMPSAllocator();
|
||||
}
|
||||
|
||||
const Generator& MPSHooks::getDefaultMPSGenerator() const {
|
||||
const Generator& MPSHooks::getDefaultGenerator([[maybe_unused]] DeviceIndex device_index) const {
|
||||
return at::mps::detail::getDefaultMPSGenerator();
|
||||
}
|
||||
|
||||
|
||||
@ -189,7 +189,7 @@ void MPSProfiler::initialize() {
|
||||
currentSigint.sa_flags = SA_RESTART;
|
||||
sigfillset(¤tSigint.sa_mask);
|
||||
if (sigaction(SIGINT, ¤tSigint, &previousSigint) == -1) {
|
||||
AT_ERROR("Cannot install SIGINT handler for MPSProfiler.");
|
||||
TORCH_CHECK(false, "Cannot install SIGINT handler for MPSProfiler.");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -207,7 +207,7 @@ void MPSProfiler::StartTrace(const std::string& mode, bool waitUntilCompleted) {
|
||||
} else if (token == "event") {
|
||||
m_profile_options |= ProfileOptions::ALL_SIGNPOST_EVENTS;
|
||||
} else {
|
||||
AT_ERROR("Invalid Signpost trace mode: ", token);
|
||||
TORCH_CHECK(false, "Invalid Signpost trace mode: ", token);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -654,7 +654,7 @@ bool MPSProfiler::isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isEx
|
||||
isInfoLoggingEnabled = (m_log_options & LogOptions::CPU_FALLBACK_INFO);
|
||||
break;
|
||||
default:
|
||||
AT_ERROR("invalid profiling info type");
|
||||
TORCH_CHECK(false, "invalid profiling info type");
|
||||
}
|
||||
if (!isInfoLoggingEnabled) {
|
||||
return false;
|
||||
@ -685,7 +685,7 @@ void MPSProfiler::emitSignpostEvent(SignpostTypes signpost_type,
|
||||
os_signpost_event_emit(m_os_log_events, signpost_id, kEvtSignpostCPUFallbacksStr, "%s", msg);
|
||||
break;
|
||||
default:
|
||||
AT_ERROR("unknown SignpostType in MPS profiler");
|
||||
TORCH_CHECK(false, "unknown SignpostType in MPS profiler");
|
||||
}
|
||||
}
|
||||
|
||||
@ -709,7 +709,7 @@ void MPSProfiler::beginSignpostInterval(SignpostTypes signpost_type,
|
||||
os_signpost_interval_begin(m_os_log_intervals, signpost_id, kIntSignpostCPUFallbacksStr, "%s", msg);
|
||||
break;
|
||||
default:
|
||||
AT_ERROR("unknown SignpostType in MPS profiler");
|
||||
TORCH_CHECK(false, "unknown SignpostType in MPS profiler");
|
||||
}
|
||||
}
|
||||
|
||||
@ -728,7 +728,7 @@ void MPSProfiler::endSignpostInterval(SignpostTypes signpost_type, os_signpost_i
|
||||
os_signpost_interval_end(m_os_log_intervals, signpost_id, kIntSignpostCPUFallbacksStr);
|
||||
break;
|
||||
default:
|
||||
AT_ERROR("unknown SignpostType in MPS profiler");
|
||||
TORCH_CHECK(false, "unknown SignpostType in MPS profiler");
|
||||
}
|
||||
}
|
||||
|
||||
@ -750,7 +750,7 @@ MPSProfiler::SignpostTypes MPSProfiler::getSignpostType(BaseInfo::Type infoType)
|
||||
case BaseInfo::Type::CPU_FALLBACK:
|
||||
return SignpostTypes::CPU_FALLBACK;
|
||||
default:
|
||||
AT_ERROR("invalid profiling info type");
|
||||
TORCH_CHECK(false, "invalid profiling info type");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1624,7 +1624,7 @@ Tensor inverse(const Tensor& A) {
|
||||
template<typename scalar_t>
|
||||
static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, Tensor& infos) {
|
||||
#if !AT_BUILD_WITH_LAPACK()
|
||||
AT_ERROR("cholesky_solve: LAPACK library not found in compilation");
|
||||
TORCH_CHECK(false, "cholesky_solve: LAPACK library not found in compilation");
|
||||
#else
|
||||
char uplo = upper ? 'U' : 'L';
|
||||
|
||||
|
||||
@ -168,7 +168,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
|
||||
ss << arg_name << " should be greater than zero but got (";
|
||||
std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
|
||||
ss << args.back() << ")" << " (while checking arguments for " << c << ")";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -719,7 +719,7 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
separator = " x ";
|
||||
}
|
||||
|
||||
AT_ERROR("Calculated padded input size per channel: (", input_ss.str(), "). "
|
||||
TORCH_CHECK(false, "Calculated padded input size per channel: (", input_ss.str(), "). "
|
||||
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
|
||||
}
|
||||
} else { // transposed
|
||||
@ -1304,7 +1304,7 @@ ConvBackend _select_conv_backend(
|
||||
}
|
||||
|
||||
// Error out if no suitable backend was found.
|
||||
AT_ERROR("unsupported ConvNd parameters");
|
||||
TORCH_CHECK(false, "unsupported ConvNd parameters");
|
||||
}
|
||||
|
||||
// Selects a backend for convolution based on the inputs and params.
|
||||
|
||||
@ -262,7 +262,7 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
false, "DispatchStub: missing kernel for ", device_type);
|
||||
return nullptr;
|
||||
case ErrorType::DeviceNotSupported:
|
||||
AT_ERROR("DispatchStub: unsupported device type", device_type);
|
||||
TORCH_CHECK(false, "DispatchStub: unsupported device type", device_type);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -81,7 +81,7 @@ Tensor embedding_sparse_backward(
|
||||
|
||||
// TODO: implement scale_grad_by_freq
|
||||
if (scale_grad_by_freq) {
|
||||
AT_ERROR(
|
||||
TORCH_CHECK(false,
|
||||
"embedding_backward: scale_grad_by_freq not supported with sparse gradients");
|
||||
}
|
||||
|
||||
|
||||
@ -104,7 +104,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
|
||||
int64_t dim1 = height;
|
||||
for (const auto i : c10::irange(1, nDims)) {
|
||||
if (self.size(i) != dim1) {
|
||||
AT_ERROR("all dimensions of input must be of equal length");
|
||||
TORCH_CHECK(false, "all dimensions of input must be of equal length");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -269,7 +269,7 @@ inline double _get_epsilon(const ScalarType& sc_type) {
|
||||
case at::ScalarType::Double:
|
||||
return std::numeric_limits<double>::epsilon();
|
||||
default:
|
||||
AT_ERROR("This function doesn't handle types other than float and double");
|
||||
TORCH_CHECK(false, "This function doesn't handle types other than float and double");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -136,7 +136,7 @@ static void max_unpooling3d_shape_check(
|
||||
if (gradOutput.defined()) {
|
||||
if (oT != gradOutput.size(dimt) || oH != gradOutput.size(dimh) ||
|
||||
oW != gradOutput.size(dimw)) {
|
||||
AT_ERROR(
|
||||
TORCH_CHECK(false,
|
||||
"Inconsistent gradOutput size. oT= ",
|
||||
oT,
|
||||
", oH= ",
|
||||
|
||||
@ -85,7 +85,7 @@ static inline void slow_conv_transpose2d_shape_check(
|
||||
check_dim_size(bias, 1, 0, weight.size(1));
|
||||
}
|
||||
} else if (!weight_nullable) {
|
||||
AT_ERROR("weight tensor is expected to be non-nullable");
|
||||
TORCH_CHECK(false, "weight tensor is expected to be non-nullable");
|
||||
}
|
||||
|
||||
int ndim = input.dim();
|
||||
@ -112,7 +112,7 @@ static inline void slow_conv_transpose2d_shape_check(
|
||||
(dilation_width * (kernel_width - 1) + 1) + output_padding_width;
|
||||
|
||||
if (output_width < 1 || output_height < 1) {
|
||||
AT_ERROR(
|
||||
TORCH_CHECK(false,
|
||||
"Given input size per channel: (",
|
||||
input_height,
|
||||
" x ",
|
||||
|
||||
@ -107,7 +107,7 @@ static inline void slow_conv_transpose3d_shape_check(
|
||||
check_dim_size(bias, 1, 0, weight.size(1));
|
||||
}
|
||||
} else if (!weight_nullable) {
|
||||
AT_ERROR("weight tensor is expected to be non-nullable");
|
||||
TORCH_CHECK(false, "weight tensor is expected to be non-nullable");
|
||||
}
|
||||
|
||||
int ndim = input.dim();
|
||||
@ -142,7 +142,7 @@ static inline void slow_conv_transpose3d_shape_check(
|
||||
output_padding_width;
|
||||
|
||||
if (output_depth < 1 || output_width < 1 || output_height < 1) {
|
||||
AT_ERROR(
|
||||
TORCH_CHECK(false,
|
||||
"Given input size per channel: (",
|
||||
input_depth,
|
||||
" x ",
|
||||
|
||||
@ -573,12 +573,12 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
|
||||
if (running_mean.defined()) {
|
||||
check_dims_match_num_input_features("running_mean", num_features, running_mean.sym_numel());
|
||||
} else if (!training) {
|
||||
AT_ERROR("running_mean must be defined in evaluation mode");
|
||||
TORCH_CHECK(false, "running_mean must be defined in evaluation mode");
|
||||
}
|
||||
if (running_var.defined()) {
|
||||
check_dims_match_num_input_features("running_var", num_features, running_var.sym_numel());
|
||||
} else if (!training) {
|
||||
AT_ERROR("running_var must be defined in evaluation mode");
|
||||
TORCH_CHECK(false, "running_var must be defined in evaluation mode");
|
||||
}
|
||||
if (weight.defined()) {
|
||||
check_dims_match_num_input_features("weight", num_features, weight.sym_numel());
|
||||
|
||||
@ -34,7 +34,7 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) {
|
||||
// but shape inference is not possible.
|
||||
if (self.numel() == 0) {
|
||||
if (num_classes <= 0) {
|
||||
AT_ERROR("Can not infer total number of classes from empty tensor.");
|
||||
TORCH_CHECK(false, "Can not infer total number of classes from empty tensor.");
|
||||
} else {
|
||||
shape.push_back(num_classes);
|
||||
return at::empty(shape, self.options());
|
||||
|
||||
@ -51,7 +51,7 @@ std::tuple<Tensor, Tensor> _pack_padded_sequence(const Tensor& _input, const Ten
|
||||
// NB: enforce_sorted is implemented at a Python level, but the sortedness
|
||||
// check lives here. If enforce_sorted=False then this error should never
|
||||
// get called.
|
||||
AT_ERROR("`lengths` array must be sorted in decreasing order when "
|
||||
TORCH_CHECK(false, "`lengths` array must be sorted in decreasing order when "
|
||||
"`enforce_sorted` is True. You can pass `enforce_sorted=False` "
|
||||
"to pack_padded_sequence and/or pack_sequence to sidestep this "
|
||||
"requirement if you do not need ONNX exportability.");
|
||||
|
||||
@ -83,7 +83,7 @@ Tensor repeat_interleave_symint(
|
||||
repeats.sym_size(0), " and input.size(", dim.value(), ") = ", input.sym_size(dim.value())
|
||||
);
|
||||
} else {
|
||||
AT_ERROR("repeats must be 0-dim or 1-dim tensor");
|
||||
TORCH_CHECK(false, "repeats must be 0-dim or 1-dim tensor");
|
||||
}
|
||||
|
||||
auto ret = input.index_select(
|
||||
|
||||
@ -881,12 +881,12 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
|
||||
if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected a tensor of floating point or complex values";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (self.dim() > 2 || self.dim() < 1) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected a 1D or 2D tensor";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
Tensor input = self;
|
||||
if (self.dim() == 1) {
|
||||
@ -911,24 +911,24 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected 0 < n_fft < " << len
|
||||
<< ", but got n_fft=" << win_length;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (hop_length <= 0) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected hop_length > 0, but got hop_length=" << hop_length;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (win_length <= 0 || win_length > n_fft) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected 0 < win_length <= n_fft, but got win_length="
|
||||
<< win_length;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (window.defined() && (window.dim() != 1 || window.size(0) != win_length)) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected a 1D window tensor of size equal to win_length="
|
||||
<< win_length << ", but got window with size " << window.sizes();
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
#undef REPR
|
||||
auto window_ = window;
|
||||
@ -1063,17 +1063,17 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
|
||||
if (input.numel() == 0) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": input tensor cannot be empty.";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (input_dim != 3 && input_dim != 4) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected a tensor with 3 or 4 dimensions, but got " << input_dim;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (input.size(-1) != 2) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected the last dimension to be 2 (corresponding to real and imaginary parts), but got " << self.size(-1);
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
|
||||
const bool onesided = onesidedOpt.value_or(fft_size != n_fft);
|
||||
@ -1081,32 +1081,32 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
|
||||
if (n_fft / 2 + 1 != fft_size) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected the frequency dimension (3rd to the last) of the input tensor to match n_fft / 2 + 1 when onesided=True, but got " << fft_size;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
} else {
|
||||
if (n_fft != fft_size) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected the frequency dimension (3rd to the last) of the input tensor to match n_fft when onesided=False, but got " << fft_size;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
if (!(0 < hop_length && hop_length <= win_length)) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected 0 < hop_length <= win_length";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
|
||||
if (!(0 < win_length && win_length <= n_fft)) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": expected 0 < win_length <= n_fft";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
if (window.defined()) {
|
||||
if (window.dim() != 1 || window.size(0) != win_length) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << ": Invalid window shape. window has to be 1D and length of `win_length`";
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
@ -1175,7 +1175,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
|
||||
if (at::is_scalar_tensor_true(window_envelop_lowest)) {
|
||||
std::ostringstream ss;
|
||||
REPR(ss) << "window overlap add min: " << window_envelop_lowest;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
|
||||
y = (y / window_envelop); // size: (channel, expected_output_signal_len)
|
||||
|
||||
@ -63,7 +63,7 @@ inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size,
|
||||
std::ostringstream ss;
|
||||
ss << "expected real signal size " << expected_size << " is incompatible "
|
||||
<< "with onesided complex frequency size " << complex_size;
|
||||
AT_ERROR(ss.str());
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -26,19 +26,19 @@ Tensor _bincount_cpu_template(
|
||||
const Tensor& weights,
|
||||
int64_t minlength) {
|
||||
if (minlength < 0) {
|
||||
AT_ERROR("minlength should be >= 0");
|
||||
TORCH_CHECK(false, "minlength should be >= 0");
|
||||
}
|
||||
if (self.dim() == 1 && self.numel() == 0) {
|
||||
return at::zeros({minlength}, kLong);
|
||||
}
|
||||
if (self.dim() != 1 || *self.min().data_ptr<input_t>() < 0) {
|
||||
AT_ERROR("bincount only supports 1-d non-negative integral inputs.");
|
||||
TORCH_CHECK(false, "bincount only supports 1-d non-negative integral inputs.");
|
||||
}
|
||||
|
||||
// Ensure max_val < 2 ^ 63 - 1 (9223372036854775807)
|
||||
auto max_val = *self.max().data_ptr<input_t>();
|
||||
if (max_val >= std::numeric_limits<int64_t>::max()) {
|
||||
AT_ERROR(
|
||||
TORCH_CHECK(false,
|
||||
"maximum value of input overflowed, it should be < ",
|
||||
std::numeric_limits<int64_t>::max(),
|
||||
" but got ",
|
||||
@ -48,7 +48,7 @@ Tensor _bincount_cpu_template(
|
||||
|
||||
bool has_weights = weights.defined();
|
||||
if (has_weights && (weights.dim() != 1 || weights.size(0) != self.size(0))) {
|
||||
AT_ERROR("weights should be 1-d and have the same length as input");
|
||||
TORCH_CHECK(false, "weights should be 1-d and have the same length as input");
|
||||
}
|
||||
|
||||
Tensor output;
|
||||
|
||||
@ -588,7 +588,7 @@ Tensor to_dense_backward(const Tensor& grad, const Tensor& input_, std::optional
|
||||
case kMkldnn:
|
||||
return grad.to_mkldnn(input_.scalar_type());
|
||||
default:
|
||||
AT_ERROR("to_dense_backward: Unsupported input layout: ", input_layout);
|
||||
TORCH_CHECK(false, "to_dense_backward: Unsupported input layout: ", input_layout);
|
||||
return Tensor{};
|
||||
}
|
||||
}
|
||||
@ -928,23 +928,23 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self,
|
||||
|
||||
auto layout_from_valid = layout_from == kStrided || layout_from == kSparse || at::sparse_csr::is_sparse_compressed(layout_from);
|
||||
if (!layout_from_valid) {
|
||||
AT_ERROR(funcname, ": unexpected source layout ", layout_from);
|
||||
TORCH_CHECK(false, funcname, ": unexpected source layout ", layout_from);
|
||||
}
|
||||
|
||||
if (layout_from == kStrided) {
|
||||
if (sparse_dim == 0 && self.dim() > 0) {
|
||||
AT_ERROR(funcname, ": sparse_dim argument must be in >0 when self.dim()>0");
|
||||
TORCH_CHECK(false, funcname, ": sparse_dim argument must be in >0 when self.dim()>0");
|
||||
}
|
||||
if (sparse_dim < 0 || sparse_dim > self.dim()) {
|
||||
AT_ERROR(funcname, ": sparse_dim argument must be in [0,", self.dim(), "] range, but ", sparse_dim, " is given");
|
||||
TORCH_CHECK(false, funcname, ": sparse_dim argument must be in [0,", self.dim(), "] range, but ", sparse_dim, " is given");
|
||||
}
|
||||
} else if (layout_from == kSparse) {
|
||||
if (sparse_dim != self.sparse_dim()) {
|
||||
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", kSparse, " with sparse_dim argument !=self.sparse_dim() is not supported");
|
||||
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", kSparse, " with sparse_dim argument !=self.sparse_dim() is not supported");
|
||||
}
|
||||
} else if (at::sparse_csr::is_sparse_compressed(layout_from)) {
|
||||
if (sparse_dim != 2) {
|
||||
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", kSparse, " with sparse_dim argument !=2 is not supported");
|
||||
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", kSparse, " with sparse_dim argument !=2 is not supported");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -956,40 +956,40 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self,
|
||||
|
||||
auto layout_from_valid = layout_from == kStrided || layout_from == kSparse || at::sparse_csr::is_sparse_compressed(layout_from);
|
||||
if (!layout_from_valid) {
|
||||
AT_ERROR(funcname, ": unexpected source layout ", layout_from);
|
||||
TORCH_CHECK(false, funcname, ": unexpected source layout ", layout_from);
|
||||
}
|
||||
auto layout_to_valid = layout_to == kStrided || layout_to == kSparse || at::sparse_csr::is_sparse_compressed(layout_to);
|
||||
if (!layout_to_valid) {
|
||||
AT_ERROR(funcname, ": unexpected source layout ", layout_from);
|
||||
TORCH_CHECK(false, funcname, ": unexpected source layout ", layout_from);
}

if (layout_from == kSparse && layout_to != kSparse) {
if (self.sparse_dim() != 2) {
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", layout_to, " for input tensors with sparse_dim()!=2 is not supported");
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", layout_to, " for input tensors with sparse_dim()!=2 is not supported");
}
}

if ((layout_from == kSparseCsr || layout_from == kSparseCsc) &&
(layout_to == kSparseBsr || layout_to == kSparseBsc)) {
if (sparse_csr::numBatchDimensions(self) > 0) {
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", layout_to, " for batched inputs is not supported");
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", layout_to, " for batched inputs is not supported");
}
}

if (blocksize.has_value()) {
if (blocksize.value().size() != 2) {
AT_ERROR(funcname, ": blocksize needs to be a tuple of size 2, but got ", blocksize.value().size());
TORCH_CHECK(false, funcname, ": blocksize needs to be a tuple of size 2, but got ", blocksize.value().size());
}
auto blocksize_to = *blocksize;
if (blocksize_to[0] <= 0 || blocksize_to[1] <= 0) {
AT_ERROR(funcname, ": blocksize needs to be positive, but got ", blocksize_to);
TORCH_CHECK(false, funcname, ": blocksize needs to be positive, but got ", blocksize_to);
}

if (layout_to == kSparseBsr || layout_to == kSparseBsc) {
if (layout_from == kSparseBsr || layout_from == kSparseBsc) {
auto blocksize_from = at::sparse_csr::getBlockSize(self);
if (!(blocksize_to == blocksize_from)) {
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", layout_to, " with blocksize changed from ", blocksize_from, " to ", blocksize_to, " is not supported");
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", layout_to, " with blocksize changed from ", blocksize_from, " to ", blocksize_to, " is not supported");
}
} else {
auto dense_dim = (layout_from == kStrided) ? dense_dim_opt.value_or(0) : self.dense_dim();

@ -997,35 +997,35 @@ void _to_sparse_check_arguments(const std::string& funcname, const Tensor& self,

auto sparse_col_dim = -(dense_dim + 1);
if ((self.size(sparse_row_dim) % blocksize_to[0] != 0) ||
(self.size(sparse_col_dim) % blocksize_to[1] != 0)) {
AT_ERROR(funcname, ": tensor sparse size (", self.size(sparse_row_dim), ",", self.size(sparse_row_dim), ") must be divisible by given blocksize (", blocksize_to[0], ",", blocksize_to[1], ")");
TORCH_CHECK(false, funcname, ": tensor sparse size (", self.size(sparse_row_dim), ",", self.size(sparse_row_dim), ") must be divisible by given blocksize (", blocksize_to[0], ",", blocksize_to[1], ")");
}
}
} else {
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", layout_to, " with blocksize argument given is not supported");
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", layout_to, " with blocksize argument given is not supported");
}
} else {
if ((layout_to == kSparseBsr || layout_to == kSparseBsc) &&
!(layout_from == kSparseBsr && layout_from == kSparseBsc)) {
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", layout_to, " without blocksize argument given is not supported");
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", layout_to, " without blocksize argument given is not supported");
}
}

if (dense_dim_opt.has_value()) {
if (layout_from != kStrided) {
AT_ERROR(funcname, ": conversion from ", layout_from, " to ", layout_to, " with dense_dim argument given is not supported");
TORCH_CHECK(false, funcname, ": conversion from ", layout_from, " to ", layout_to, " with dense_dim argument given is not supported");
}

auto dense_dim = *dense_dim_opt;
if (layout_to == kSparse) {
if (dense_dim == self.dim() && self.dim() > 0) {
AT_ERROR(funcname, ": dense_dim argument must be !=self.dim() when self.dim()>0");
TORCH_CHECK(false, funcname, ": dense_dim argument must be !=self.dim() when self.dim()>0");
}
if (dense_dim < 0 || dense_dim > self.dim()) {
AT_ERROR(funcname, ": dense_dim argument must be in [0,", self.dim(), "] range, but ", dense_dim, " is given");
TORCH_CHECK(false, funcname, ": dense_dim argument must be in [0,", self.dim(), "] range, but ", dense_dim, " is given");
}
} else {
if (dense_dim < 0 || dense_dim > self.dim() - 2) {
AT_ERROR(funcname, ": dense_dim argument must be in [0,", self.dim() - 2, "] range, but ", dense_dim, " is given");
TORCH_CHECK(false, funcname, ": dense_dim argument must be in [0,", self.dim() - 2, "] range, but ", dense_dim, " is given");
}
}
}
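
Every hunk in this file applies the same mechanical substitution: the deprecated AT_ERROR macro becomes an explicit TORCH_CHECK(false, ...). The two are interchangeable because AT_ERROR is itself defined in terms of TORCH_CHECK; the sketch below is a close paraphrase of the definition in c10/util/Exception.h (omitting an MSVC expansion workaround), not a verbatim quote, so the swap merely removes one layer of deprecated indirection:

// Close paraphrase of the AT_ERROR definition in c10/util/Exception.h.
#define AT_ERROR(...) \
  do { \
    TORCH_CHECK(false, ::c10::str(__VA_ARGS__)); \
  } while (false)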

@ -1129,7 +1129,7 @@ Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::op
break;
}

AT_ERROR("dense_to_sparse_with_mask: ", self.layout(), " to ", layout_to, " conversion not supported");
TORCH_CHECK(false, "dense_to_sparse_with_mask: ", self.layout(), " to ", layout_to, " conversion not supported");
return Tensor{};
}

@ -1181,7 +1181,7 @@ Tensor dense_to_sparse(const Tensor& self, std::optional<c10::Layout> layout, Op
break;
}

AT_ERROR("dense_to_sparse: ", self.layout(), " to ", layout_to, " conversion not supported");
TORCH_CHECK(false, "dense_to_sparse: ", self.layout(), " to ", layout_to, " conversion not supported");
return Tensor{};
}

@ -1440,7 +1440,7 @@ Tensor sparse_compressed_to_sparse_csr(const Tensor& self, std::optional<int64_t
return sparse_compressed_to_flipped(self, std::nullopt, "to_sparse_csr");
}

AT_ERROR("sparse_compressed_to_sparse_csr: expected SparseCsr or SparseCsc layout but got ", self.layout());
TORCH_CHECK(false, "sparse_compressed_to_sparse_csr: expected SparseCsr or SparseCsc layout but got ", self.layout());
return Tensor{};
}

@ -1453,7 +1453,7 @@ Tensor sparse_compressed_to_sparse_csc(const Tensor& self, std::optional<int64_t
return sparse_compressed_to_flipped(self, std::nullopt, "to_sparse_csc");
}

AT_ERROR("sparse_compressed_to_sparse_csc: expected SparseCsr or SparseCsc layout but got ", self.layout());
TORCH_CHECK(false, "sparse_compressed_to_sparse_csc: expected SparseCsr or SparseCsc layout but got ", self.layout());
return Tensor{};
}

@ -1828,7 +1828,7 @@ Tensor sparse_compressed_to_sparse_bsr(const Tensor& self, IntArrayRef blocksize
return self.to_sparse_csr(dense_dim_opt).to_sparse_bsr(blocksize);
}

AT_ERROR("sparse_compressed_to_sparse_bsr: expected SparseCsr, SparseCsc, SparseBsr or SparseBsc layout but got ", self.layout());
TORCH_CHECK(false, "sparse_compressed_to_sparse_bsr: expected SparseCsr, SparseCsc, SparseBsr or SparseBsc layout but got ", self.layout());
return Tensor{};
}

@ -1850,14 +1850,14 @@ Tensor sparse_compressed_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize
return self.to_sparse_csc(dense_dim_opt).to_sparse_bsc(blocksize);
}

AT_ERROR("sparse_compressed_to_sparse_bsc: expected SparseCsr, SparseCsc, SparseBsr or SparseBsc layout but got ", self.layout());
TORCH_CHECK(false, "sparse_compressed_to_sparse_bsc: expected SparseCsr, SparseCsc, SparseBsr or SparseBsc layout but got ", self.layout());
return Tensor{};
}

Tensor sparse_coo_to_sparse(const Tensor& self, const int64_t sparse_dim) {
_to_sparse_check_arguments("sparse_coo_to_sparse", self, sparse_dim);

AT_ERROR("sparse_coo_to_sparse: ", self.layout(), " to ", kSparse, " conversion not supported");
TORCH_CHECK(false, "sparse_coo_to_sparse: ", self.layout(), " to ", kSparse, " conversion not supported");
return Tensor{};
}

@ -1927,7 +1927,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, std::optional<c10::Layout
break;
}

AT_ERROR("sparse_compressed_to_sparse: ", self.layout(), " to ", layout_to, " conversion not supported");
TORCH_CHECK(false, "sparse_compressed_to_sparse: ", self.layout(), " to ", layout_to, " conversion not supported");
return Tensor{};
}

@ -1951,7 +1951,7 @@ Tensor sparse_coo_to_sparse(const Tensor& self, std::optional<c10::Layout> layou
break;
}

AT_ERROR("sparse_coo_to_sparse: ", self.layout(), " to ", layout_to, " conversion not supported");
TORCH_CHECK(false, "sparse_coo_to_sparse: ", self.layout(), " to ", layout_to, " conversion not supported");
return Tensor{};
}
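
Each of these conversion fallbacks pairs the always-throwing check with a trailing return Tensor{};. The return is unreachable and exists only to satisfy compilers that do not see through the macro to the throw. A minimal sketch of the stub shape (the function name here is hypothetical, not from this diff):

#include <ATen/core/Tensor.h>
#include <c10/util/Exception.h>

// Hypothetical fallback: TORCH_CHECK(false, ...) always throws c10::Error,
// so the trailing return only silences missing-return warnings.
at::Tensor to_unsupported_layout(const at::Tensor& self) {
  TORCH_CHECK(false, "conversion from ", self.layout(), " is not supported");
  return at::Tensor{};
}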

@ -101,7 +101,7 @@ bool cudnn_is_acceptable(const Tensor& self) {

Tensor & detach_(Tensor & self) {
// this just exists to give us a hook in VariableType and an entry in Declarations.yaml
//AT_ERROR("detach_ is not implemented for Tensor");
//TORCH_CHECK(false, "detach_ is not implemented for Tensor");
return self;
}

@ -83,11 +83,11 @@ void cpu_max_unpool(

if (optional_error_index) {
if constexpr (is_3d) {
AT_ERROR("Found an invalid max index: ", optional_error_index.value(),
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width);
} else {
AT_ERROR("Found an invalid max index: ", optional_error_index.value(),
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width);
}

@ -151,7 +151,7 @@ void cpu_max_unpool_channels_last(
});

if (optional_error_index) {
AT_ERROR("Found an invalid max index: ", optional_error_index.value(),
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
}

@ -223,12 +223,12 @@ void cpu_max_unpool_backward(

if (optional_error_index) {
if (is_3d) {
AT_ERROR("invalid max index ", optional_error_index.value(),
TORCH_CHECK(false, "invalid max index ", optional_error_index.value(),
", odepth= ", output_depth,
", owidth= ", output_width,
", oheight= ", output_height);
} else {
AT_ERROR("invalid max index ", optional_error_index.value(),
TORCH_CHECK(false, "invalid max index ", optional_error_index.value(),
", owidth= ", output_width,
", oheight= ", output_height);
}
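
Note that these unpooling kernels do not raise from inside the parallelized loop: they record the first bad index in a std::optional and check it once after the loop, presumably to avoid throwing from inside a parallel region. A standalone sketch of that pattern, with a plain loop standing in for the parallel one:

#include <cstdint>
#include <optional>
#include <c10/util/Exception.h>

// Sketch: remember the first invalid index, report it after the loop.
void check_indices(const int64_t* indices, int64_t n, int64_t limit) {
  std::optional<int64_t> optional_error_index;
  for (int64_t i = 0; i < n; ++i) {
    if (indices[i] < 0 || indices[i] >= limit) {
      optional_error_index = indices[i];
      break;
    }
  }
  if (optional_error_index) {
    TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
                " (valid range is [0, ", limit, "))");
  }
}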

@ -180,19 +180,10 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa

static bool getDisableAddmmCudaLt() {
static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT");
// When DISABLE_ADDMM_CUDA_LT is unset the default is TRUE on
// AMD architectures otherwise it is FALSE.
#ifdef USE_ROCM
if (env_value != nullptr && strcmp(env_value, "0") == 0) {
return false;
}
return true;
#else
if (env_value != nullptr && strcmp(env_value, "1") == 0) {
return true;
}
return false;
#endif
}

#ifdef USE_ROCM
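
getDisableAddmmCudaLt reads the DISABLE_ADDMM_CUDA_LT environment variable and folds in a per-platform default: per the comment above, the cuBLASLt path is disabled by default on AMD architectures and enabled elsewhere. A minimal standalone sketch of this kind of cached environment toggle (helper and variable names are illustrative, not from this diff):

#include <cstdlib>
#include <cstring>

// Sketch: read a "0"/"1" toggle from the environment, falling back to a default.
static bool env_flag(const char* name, bool default_value) {
  const char* value = std::getenv(name);
  if (value == nullptr) {
    return default_value;
  }
  return std::strcmp(value, "1") == 0;
}

// Cached so the environment is consulted only once per process.
static const bool disable_addmm_lt = env_flag("DISABLE_ADDMM_CUDA_LT", false);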

@ -325,14 +316,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
}
self__sizes = self_->sizes();
} else {
#if defined(USE_ROCM)
useLtInterface = !disable_addmm_cuda_lt &&
result.dim() == 2 && result.is_contiguous() &&
isSupportedHipLtROCmArch(self.device().index()) &&
(scalar_type == at::ScalarType::Float ||
scalar_type == at::ScalarType::Half ||
scalar_type == at::ScalarType::BFloat16);
#endif
self_ = c10::MaybeOwned<Tensor>::borrowed(self);
self__sizes = self_->sizes();
TORCH_CHECK(result.dim() == 2, "tensors must be 2-D");

@ -66,7 +66,7 @@ static inline void CUFFT_CHECK(cufftResult error)
if (error != CUFFT_SUCCESS) {
std::ostringstream ss;
ss << "cuFFT error: " << _cudaGetErrorEnum(error);
AT_ERROR(ss.str());
TORCH_CHECK(false, ss.str());
}
}
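
CUFFT_CHECK formats its message through a std::ostringstream before handing it to the check macro. Since TORCH_CHECK already concatenates any number of message arguments, the stream is not strictly necessary; a minimal sketch of the flattened form, with stand-ins for cufftResult and _cudaGetErrorEnum:

#include <c10/util/Exception.h>

// Stand-in status type and name helper for illustration only.
enum class FftStatus { Success, AllocFailed };
inline const char* fft_status_name(FftStatus s) {
  return s == FftStatus::Success ? "CUFFT_SUCCESS" : "CUFFT_ALLOC_FAILED";
}

// Sketch: let TORCH_CHECK assemble the message itself.
static inline void FFT_CHECK(FftStatus status) {
  TORCH_CHECK(status == FftStatus::Success, "cuFFT error: ", fft_status_name(status));
}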

@ -462,7 +462,7 @@ Tensor _embedding_bag_dense_backward_cuda(const Tensor &grad_, const Tensor &ind
padding_idx);

default:
AT_ERROR(
TORCH_CHECK(false,
"Unknown mode for embedding_bag_backward_cuda ", mode);
}
}

@ -45,10 +45,10 @@ __device__ inline int get_interval(accscalar_t sample,

template <typename scalar_t>
__global__ void fractional_max_pool2d_out_cuda_frame(
PackedTensorAccessor<scalar_t, 4> output,
PackedTensorAccessor<int64_t, 4> indices,
PackedTensorAccessor<const scalar_t, 4> input,
PackedTensorAccessor<const scalar_t, 3> samples,
GenericPackedTensorAccessor<scalar_t, 4> output,
GenericPackedTensorAccessor<int64_t, 4> indices,
GenericPackedTensorAccessor<const scalar_t, 4> input,
GenericPackedTensorAccessor<const scalar_t, 3> samples,
int poolSizeH, int poolSizeW) {

using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;

@ -102,9 +102,9 @@ __global__ void fractional_max_pool2d_out_cuda_frame(

template <typename scalar_t>
__global__ void fractional_max_pool2d_backward_out_cuda_frame(
PackedTensorAccessor<scalar_t, 4> gradInput,
PackedTensorAccessor<const scalar_t, 4> gradOutput,
PackedTensorAccessor<const int64_t, 4> indices) {
GenericPackedTensorAccessor<scalar_t, 4> gradInput,
GenericPackedTensorAccessor<const scalar_t, 4> gradOutput,
GenericPackedTensorAccessor<const int64_t, 4> indices) {
// Output (h, w) point that this thread is responsible for
int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
int plane = blockIdx.y;
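
These kernel signatures replace the deprecated PackedTensorAccessor spelling with GenericPackedTensorAccessor, the underlying template whose pointer-traits and index-type parameters default to DefaultPtrTraits and int64_t, so GenericPackedTensorAccessor<scalar_t, 4> names the same type the old alias produced. On the host side such accessors typically come from the packed_accessor64 API; a minimal sketch, assuming a float tensor of rank 4:

#include <ATen/ATen.h>

// Sketch: packed_accessor64<T, N>() yields a
// GenericPackedTensorAccessor<T, N, at::DefaultPtrTraits, int64_t>,
// which can be passed by value into a CUDA kernel.
void pass_accessor(const at::Tensor& t) {
  auto acc = t.packed_accessor64<float, 4>();  // dtype/rank are checked when created
  (void)acc.size(0);
}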

@ -8,6 +8,7 @@
// ROCm 6.3 is planned to have these functions, but until then here they are.
#if defined(USE_ROCM) && ROCM_VERSION >= 60201
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>

__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && \

@ -267,7 +267,7 @@ static void max_unpooling3d_shape_check(
if (gradOutput.defined()) {
if (oT != gradOutput.size(dimt) || oH != gradOutput.size(dimh) ||
oW != gradOutput.size(dimw)) {
AT_ERROR(
TORCH_CHECK(false,
"Inconsistent gradOutput size. oT= ",
oT,
", oH= ",

@ -447,7 +447,7 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
nInputRows = self.size(dimh);

if (oheight != grad_output.size(dimh) || owidth != grad_output.size(dimw)) {
AT_ERROR(
TORCH_CHECK(false,
"Inconsistent gradOutput size. output height: ",
oheight,
", output width= ",

@ -164,7 +164,7 @@ mixed_dtypes_linear_dispatch_bias_activation(
ElementInputB,
fastertransformer::EpilogueOpNoBias>(input, weight, scale, bias);
}
AT_ERROR("mixed_dtypes_linear_dispatch_bias_activation: Activation \"",
TORCH_CHECK(false, "mixed_dtypes_linear_dispatch_bias_activation: Activation \"",
activation, "\" is not supported");
return Tensor{};
}

@ -185,7 +185,7 @@ mixed_dtypes_linear_dispatch_bias_activation(
ElementInputB,
fastertransformer::EpilogueOpBiasSilu>(input, weight, scale, bias);
}
AT_ERROR("mixed_dtypes_linear_dispatch_bias_activation: Activation \"",
TORCH_CHECK(false, "mixed_dtypes_linear_dispatch_bias_activation: Activation \"",
activation, "\" is not supported");
return Tensor{};
}

@ -198,7 +198,7 @@ _mixed_dtypes_linear(const Tensor& input, const Tensor& weight,
const std::optional<Tensor>& bias_opt,
const std::optional<c10::string_view> activation_opt) {
#if defined(USE_ROCM) || defined(_MSC_VER) || (defined(CUDA_VERSION) && CUDA_VERSION < 11080)
AT_ERROR("_mixed_dtypes_linear: not compiled for this platform");
TORCH_CHECK(false, "_mixed_dtypes_linear: not compiled for this platform");
return Tensor{};
#else
const auto bias = bias_opt.has_value() ? *bias_opt : Tensor{};

@ -88,7 +88,7 @@ static inline void slow_conv_transpose2d_shape_check(
check_dim_size(bias, 1, 0, weight.size(1));
}
} else if (!weight_nullable) {
AT_ERROR("weight tensor is expected to be non-nullable");
TORCH_CHECK(false, "weight tensor is expected to be non-nullable");
}

int ndim = input.dim();

@ -115,7 +115,7 @@ static inline void slow_conv_transpose2d_shape_check(
(dilation_width * (kernel_width - 1) + 1) + output_padding_width;

if (output_width < 1 || output_height < 1) {
AT_ERROR(
TORCH_CHECK(false,
"Given input size per channel: (",
input_height,
" x ",

@ -106,7 +106,7 @@ static inline void slow_conv_transpose3d_shape_check(
check_dim_size(bias, 1, 0, weight.size(1));
}
} else if (!weight_nullable) {
AT_ERROR("weight tensor is expected to be non-nullable");
TORCH_CHECK(false, "weight tensor is expected to be non-nullable");
}

int ndim = input.dim();

@ -140,7 +140,7 @@ static inline void slow_conv_transpose3d_shape_check(
(dilation_width * (kernel_width - 1) + 1) + output_padding_width;

if (output_depth < 1 || output_width < 1 || output_height < 1) {
AT_ERROR(
TORCH_CHECK(false,
"Given input size per channel: (",
input_depth,
" x ",

@ -184,7 +184,7 @@ struct KthValueLauncher {
int64_t slice_size) {
dim3 grid;
if (!getGridFromTiles(num_slices, grid)) {
AT_ERROR("slices are too many");
TORCH_CHECK(false, "slices are too many");
}

dim3 block(std::min(

@ -221,7 +221,7 @@ struct MedianLauncher {
int64_t slice_size) {
dim3 grid;
if (!getGridFromTiles(num_slices, grid)) {
AT_ERROR("slices are too many");
TORCH_CHECK(false, "slices are too many");
}

dim3 block(std::min(

@ -12,10 +12,10 @@ namespace at::native {
// sparse, sparse, sparse, dense, real, real -> sparse
Tensor& _sspaddmm_out_only_sparse_cuda(const Tensor& self,
const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Tensor& result) {
AT_ERROR("tensor.sspaddmm(...) can only be called on sparse tensors");
TORCH_CHECK(false, "tensor.sspaddmm(...) can only be called on sparse tensors");
}
Tensor& _sspaddmm_out_cuda(const Tensor& self,
const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Tensor& result) {
AT_ERROR("NYI: CUDA sspaddmm is not implemented");
TORCH_CHECK(false, "NYI: CUDA sspaddmm is not implemented");
}
} // namespace at::native

@ -251,7 +251,7 @@ Tensor _bincount_cuda_template(
const Tensor& weights,
int64_t minlength) {
if (minlength < 0) {
AT_ERROR("minlength should be >= 0");
TORCH_CHECK(false, "minlength should be >= 0");
}
if (self.dim() == 1 && self.numel() == 0) {
return at::zeros(

@ -264,12 +264,12 @@ Tensor _bincount_cuda_template(
if (self.dim() != 1 ||
(!std::is_same_v<input_t, uint8_t> &&
*self.min().cpu().const_data_ptr<input_t>() < 0)) {
AT_ERROR("bincount only supports 1-d non-negative integral inputs.");
TORCH_CHECK(false, "bincount only supports 1-d non-negative integral inputs.");
}

bool has_weights = weights.defined();
if (has_weights && (weights.dim() != 1 || weights.size(0) != self.size(0))) {
AT_ERROR("weights should be 1-d and have the same length as input");
TORCH_CHECK(false, "weights should be 1-d and have the same length as input");
}

const int64_t nbins =

@ -312,7 +312,7 @@ Tensor _histc_cuda_template(
at::acc_type<input_t, /*is_cuda=*/true> min,
at::acc_type<input_t, /*is_cuda=*/true> max) {
if (nbins <= 0) {
AT_ERROR("bins must be > 0");
TORCH_CHECK(false, "bins must be > 0");
}
Tensor output = at::zeros(
{nbins},

@ -387,7 +387,7 @@ Tensor _histc_cuda(
const Scalar& min,
const Scalar& max) {
if (self.scalar_type() == ScalarType::Half) {
AT_ERROR("HalfTensor is not supported");
TORCH_CHECK(false, "HalfTensor is not supported");
}
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage

@ -37,8 +37,8 @@ __global__ void upsample_bilinear2d_out_frame(
const accscalar_t rheight,
const accscalar_t rwidth,
const bool align_corners,
const PackedTensorAccessor<const scalar_t, 4> idata,
PackedTensorAccessor<scalar_t, 4> odata) {
const GenericPackedTensorAccessor<const scalar_t, 4> idata,
GenericPackedTensorAccessor<scalar_t, 4> odata) {
int index = threadIdx.x + blockIdx.x * blockDim.x;

const int batchsize = idata.size(0);

@ -1158,7 +1158,7 @@ REGISTER_CUDA_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
template <typename scalar_t>
static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, int64_t& info) {
#if !AT_MAGMA_ENABLED()
AT_ERROR("cholesky_solve: MAGMA library not found in "
TORCH_CHECK(false, "cholesky_solve: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
#else
magma_uplo_t uplo = upper ? MagmaUpper : MagmaLower;

@ -1476,7 +1476,7 @@ template <typename scalar_t>
static void apply_lu_factor_looped_magma(const Tensor& input, const Tensor& pivots, const Tensor& infos, bool compute_pivots) {
#if !AT_MAGMA_ENABLED()
// This should never be thrown if the calling functions are correct.
AT_ERROR("linalg.lu_factor: PyTorch was not compiled with MAGMA support.");
TORCH_CHECK(false, "linalg.lu_factor: PyTorch was not compiled with MAGMA support.");
#else
// magmaLu and magmaLuNoPiv require infos and pivots tensor to be on CPU
// the data is later copied back to the appropriate output tensor

@ -1677,7 +1677,7 @@ REGISTER_CUDA_DISPATCH(lu_factor_stub, &lu_factor);
template <typename scalar_t>
static void apply_triangular_solve_batched_magma(const Tensor& A, const Tensor& b, bool left, bool upper, TransposeType transpose, bool unitriangular) {
#if !AT_MAGMA_ENABLED()
AT_ERROR("triangular_solve: MAGMA library not found in "
TORCH_CHECK(false, "triangular_solve: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
#else
magma_uplo_t uplo = upper ? MagmaUpper : MagmaLower;

@ -2106,7 +2106,7 @@ static void apply_svd_magma(const Tensor& A,
const Tensor& Vh,
const Tensor& info) {
#if !AT_MAGMA_ENABLED()
AT_ERROR("linalg.svd: MAGMA library not found in "
TORCH_CHECK(false, "linalg.svd: MAGMA library not found in "
"compilation. Please rebuild with MAGMA.");
#else
using value_t = typename c10::scalar_value_type<scalar_t>::type;
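
All four MAGMA code paths share the same compile-time fallback: when AT_MAGMA_ENABLED() is false, the function body reduces to an unconditional failure so the error surfaces at the call site in a build without MAGMA. A minimal sketch of the pattern, with a stand-in feature macro in place of the real build flag:

#include <c10/util/Exception.h>

#define HAVE_SOLVER_BACKEND() 0  // stand-in for AT_MAGMA_ENABLED()

static void apply_factorization() {
#if !HAVE_SOLVER_BACKEND()
  // Compiled-out path: keep the symbol defined, fail loudly if reached.
  TORCH_CHECK(false,
      "factorization: backend library not found in compilation. "
      "Please rebuild with the backend.");
#else
  // ... real implementation using the backend ...
#endif
}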

@ -59,7 +59,7 @@ struct MAGMAQueue {
static inline magma_int_t magma_int_cast(int64_t value, const char* varname) {
auto result = static_cast<magma_int_t>(value);
if (static_cast<int64_t>(result) != value) {
AT_ERROR("magma: The value of ", varname, "(", (long long)value,
TORCH_CHECK(false, "magma: The value of ", varname, "(", (long long)value,
") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)");
}
return result;
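
magma_int_cast guards the int64_t to magma_int_t narrowing by casting and round-tripping: if converting back does not reproduce the input, the value did not fit. The same idiom, self-contained with int32_t standing in for magma_int_t:

#include <cstdint>
#include <c10/util/Exception.h>

// Sketch: checked narrowing from int64_t to int32_t.
static inline int32_t checked_narrow(int64_t value, const char* varname) {
  auto result = static_cast<int32_t>(value);
  TORCH_CHECK(static_cast<int64_t>(result) == value,
              "The value of ", varname, " (", value,
              ") is too large to fit into int32_t (", sizeof(int32_t), " bytes)");
  return result;
}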

@ -25,7 +25,8 @@ Tensor cudnn_affine_grid_generator_forward(
int64_t C,
int64_t H,
int64_t W) {
AT_ERROR(
TORCH_CHECK(
false,
"cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support");
}

@ -35,7 +36,8 @@ Tensor cudnn_affine_grid_generator_backward(
int64_t C,
int64_t H,
int64_t W) {
AT_ERROR(
TORCH_CHECK(
false,
"cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support");
}
Some files were not shown because too many files have changed in this diff