Summary: When HOPs live out of tree, breaking changes to the HOP API become impossible, yet HOP implementations are deeply entwined with PyTorch internals. Move the HOP into the PyTorch tree so that such changes remain possible.
Test Plan: sandcastle and oss ci
Differential Revision: D60674861
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132526
Approved by: https://github.com/SherlockNoMad
Summary: The historical default here is "1", i.e., no parallel compilation. In order to prepare for rolling out the subprocess-based parallel compile, I had previously modified this code to allow parallelism when worker_start_method="subprocess". I realize this probably isn't the best rollout strategy: rather than opting all internal usages into both a) parallel compile _and_ b) a new implementation of parallel compile, let's put the default back to "1" and then roll out the new parallel-compile implementation only to those usages that have already opted in by explicitly setting compile_threads > 1.
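For reference, a minimal sketch of what such an explicit opt-in looks like (illustrative values; assumes the usual inductor config knobs `compile_threads` and `worker_start_method`):
```python
import torch._inductor.config as inductor_config

# Illustrative opt-in only; the historical default of 1 means no parallel compilation.
inductor_config.compile_threads = 8                  # opt in to parallel compilation
inductor_config.worker_start_method = "subprocess"   # use the subprocess-based implementation
```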
Differential Revision: [D60686105](https://our.internmc.facebook.com/intern/diff/D60686105)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132540
Approved by: https://github.com/c00w
Summary:
# Problem
`TORCH_WARN` can cause massive log spam.
I captured the logs before and after adding this change.
*Before:*
* The log file size was ~61.15 MB (61148028 bytes).
*After:*
* The log file size was ~56.44 MB (56444057 bytes).
# Context
Looks like we tried to land this change earlier but it was reverted:
* D59413413
* Reverted https://github.com/pytorch/pytorch/pull/130047 on behalf of https://github.com/clee2000 due to broke test_overrides.py::TestTorchFunctionWarning::test_warn_on_invalid_torch_function
# Testing Update
`test_warn_on_invalid_torch_function` would fail because the warning would not be called on the handling of the second torch function class since `TORCH_WARN_ONCE` stops repeats globally.
Updated the test so that it runs separate programs. (I was not able to actually run the test; could someone help me with that?)
Test Plan: Need help with this...
Differential Revision: D60561181
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132374
Approved by: https://github.com/ezyang
Summary:
Suggested in https://github.com/pytorch/pytorch/issues/128394.
If there's an autocast context manager, the predispatch (strict) graph can look something like:
```
class <lambda>(torch.nn.Module):
    def forward(self, x: "f32[1]"):
        ...
        _enter_autocast = torch.amp.autocast_mode._enter_autocast('cuda', torch.bfloat16, True, None)
        mm: "f32[8, 8]" = torch.ops.aten.mm.default(rand, rand_1); rand = rand_1 = None
        _exit_autocast = torch.amp.autocast_mode._exit_autocast(_enter_autocast); _enter_autocast = None
        return (mm_1,)
```
But the operator `torch.amp.autocast_mode._enter_autocast` is not a valid ATen op. We remove these nodes by turning autocast into a higher-order operator and making a submodule for the blocks between `_enter_autocast` and `_exit_autocast`.
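For context, a minimal sketch (assuming a CUDA build; illustrative only) of the kind of module whose predispatch export graph contains these `_enter_autocast`/`_exit_autocast` nodes before this pass converts them into the higher-order op:
```python
import torch

class M(torch.nn.Module):
    def forward(self, x, y):
        # the autocast context manager shows up as _enter_autocast/_exit_autocast nodes
        with torch.autocast("cuda", torch.bfloat16):
            return torch.mm(x, y)

ep = torch.export.export(M(), (torch.randn(8, 8), torch.randn(8, 8)))
print(ep)  # graph containing the autocast region
```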
Some potential follow-up improvements:
1) Merge some of the duplicated logic with `replace_set_grad_with_hop_pass.py`.
2) Check the current autocast status (anything enabled? dtype?) and do not create a submodule if the autocast args match the current autocast status.
Test Plan:
CI
```
parsh --build-flags fbcode//mode/dev-nosan fbcode//caffe2/test:test_export
run_tests("test_predispatch_autocast")
```
Reviewed By: angelayi
Differential Revision: D60206382
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131914
Approved by: https://github.com/angelayi
Summary: Fixes T192448049. The module calls form an unusual call stack for the nodes: https://www.internalfb.com/phabricator/paste/view/P1507230978. This is currently not supported by the unflattener and needs some extra design to make it work.
Test Plan: buck2 run 'fbcode//mode/opt' torchrec/distributed/tests:test_pt2 -- --filter-text "test_sharded_quant_fpebc_non_strict_export"
Reviewed By: zhxchen17
Differential Revision: D60528900
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132437
Approved by: https://github.com/Skylion007
Add functional support for torch.addmm with CK backend. See also #125453
# Implementation details
1. It turns out we can use the same template for addmm and matmul; essentially, matmul is addmm with an empty bias (see the sketch after this list)
2. The Python generator in CK was updated to generate the shared cpp template. The pip package can be installed via `pip install git+https://github.com/rocm/composable_kernel@add-addmm` and will be merged into the `develop` branch after this PR lands, to avoid breaking the current matmul
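As a quick illustration of that equivalence (purely numerical, independent of the CK backend):
```python
import torch

a = torch.randn(4, 8)
b = torch.randn(8, 16)
bias = torch.zeros(16)

# addmm(bias, a, b) == a @ b + bias, so with a zero bias it reduces to a plain matmul
torch.testing.assert_close(torch.addmm(bias, a, b), a @ b)
```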
# Testing
`pytest test/inductor/test_ck_backend.py -k addmm`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130576
Approved by: https://github.com/chenyang78
Noticed a hang where the stuck thread was blocked on a cudaHostUnregister
call, probably due to an internal CUDA deadlock caused by something
else, but it was holding the GIL at the time and blocked other Python
threads.
As far as I can tell, the cudart APIs do not require the GIL to be held, nor are
they marked as thread-unsafe.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132520
Approved by: https://github.com/LucasLLC, https://github.com/kirtiteja
Migrates usages of deprecated APIs in NumPy-2.0 per [numpy-2.0 migration guide](https://numpy.org/devdocs/numpy_2_0_migration_guide.html#numpy-2-0-migration-guide).
I did a grep on the old API usages (see list below); these were only referenced in test files under `test/torch_np/numpy_tests/**/*.py`.
Specifically, migrates the usages of the following APIs (a quick before/after sketch follows the list):
1. `np.sctypes` → Access dtypes explicitly instead
2. `np.float_` → `np.float64`
3. `np.complex_` → `np.complex128`
4. `np.longcomplex` → `np.clongdouble`
5. `np.unicode_` → `np.str_`
6. `np.product` → `np.prod`
7. `np.cumproduct` → `np.cumprod`
8. `np.alltrue` → `np.all`
9. `np.sometrue` → `np.any`
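A quick before/after sketch of the kind of mechanical change involved (illustrative values only):
```python
import numpy as np

x = np.arange(4)

# NumPy 1.x spellings (removed in 2.0)      NumPy 2.0 spellings
# np.product(x), np.alltrue(x >= 0)     ->  np.prod(x), np.all(x >= 0)
total = np.prod(x)
all_nonneg = np.all(x >= 0)
f = np.float64(1.5)     # instead of np.float_
c = np.complex128(1j)   # instead of np.complex_
```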
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131909
Approved by: https://github.com/rgommers, https://github.com/Skylion007, https://github.com/atalman
Fixes https://github.com/pytorch/pytorch/issues/132016.
Right now, if you run an op for which DTensor has no sharding prop rule, **and** that op accepts non-trivial pytrees of input tensors as arguments, DTensor can end up infinite-looping before it has the chance to error out about the missing sharding prop rule.
This PR doesn't fix the underlying problem, but adds rules for the culprit ops (the missing foreach ops)
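For illustration, this is the kind of list-of-tensors ("non-trivial pytree") argument structure the foreach ops take; with DTensor inputs and no sharding rule, dispatch could previously loop forever instead of raising:
```python
import torch

xs = [torch.ones(2), torch.ones(3), torch.ones(4)]
ys = [torch.full((2,), 2.0), torch.full((3,), 2.0), torch.full((4,), 2.0)]

# foreach ops take lists of tensors rather than individual tensors as arguments
out = torch._foreach_add(xs, ys)
print([t.tolist() for t in out])
```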
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132066
Approved by: https://github.com/wanchaol
Summary:
When a user sets config.profiler_mark_wrapper_call, RECORD_FUNCTION annotations are added to the code. This requires importing the header <ATen/record_function.h>, but the conditional for doing so didn't check
config.profiler_mark_wrapper_call.
Test Plan:
This case is already covered in test_profiler_mark_wrapper_call.
```
(pytorch-3.10) [gabeferns@devvm2252.cco0 ~/pytorch (missing-profile-include)]$ TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k CpuTests.test_profiler_mark_wrapper_call_cpu
stats [('calls_captured', 1), ('unique_graphs', 1)]
inductor [('fxgraph_cache_miss', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.
----------------------------------------------------------------------
Ran 1 test in 8.080s
OK
```
Fixes https://github.com/pytorch/pytorch/issues/131339
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132419
Approved by: https://github.com/jgong5, https://github.com/desertfire
Summary:
Reland #124969 by backing out D60397377 "Back out "[1/2] PT2 Inductor ComboKernels - Foreach cases (#124969)""
The original diff D54134695 was reverted because of failure of ads nightly cogwheel tests.
The root cause: the logic for generating the mask in the Triton kernel needed an update after a recent refactoring of triton.py. This diff includes the fix for the root cause.
See D54134695 or #124969 for more details.
Test Plan:
Originally failed tests
f585704630
f585733786
Diff patched:
f586664028
f586663820
Differential Revision: D60458597
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132182
Approved by: https://github.com/Yuzhen11
Fixes the Inductor max-autotune mode failures of the below models:
- GPT2ForSequenceClassification
- PegasusForConditionalGeneration
- XGLMForCausalLM
- hf_GPT2
- tnt_s_patch16_224
```log
File "/pytorch/torch/_inductor/index_propagation.py", line 329, in statically_true
evaluated = self.shape_env._maybe_evaluate_static(
File "/pytorch/torch/fx/experimental/symbolic_shapes.py", line 1499, in wrapper
return fn_cache(self, *args, **kwargs)
File "/pytorch/torch/fx/experimental/symbolic_shapes.py", line 4539, in _maybe_evaluate_static
vr = var_ranges[k]
torch._dynamo.exc.BackendCompilerFailed: backend='compile_fx_wrapper' raised:
KeyError: m_start
```
The `_maybe_evaluate_static` call in `IndexPropagation` may fail. This PR adds a try/except following the approach in `torch/_inductor/sizevars.py`, by adding a common utility function.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132128
Approved by: https://github.com/jgong5, https://github.com/jansel
Summary:
feikou observed big numerical gaps when using the math backend on AMD and NV GPUs. It's mainly because we are not using higher-precision FP32 for the intermediate accumulated/materialized parts.
Since the math backend is expected to be slower anyway, and we expect it to generate the correct reference result, it is worth upcasting FP16/BF16 inputs to FP32, doing the computation in FP32/TF32, and then downcasting the FP32 output back to FP16/BF16.
Differential Revision: D58710805
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128922
Approved by: https://github.com/xw285cornell, https://github.com/drisspg
Need to revert due to internal hangs: S437700
This reverts commit b6c1490cc02316ffe85e5ae74651d80f0158ba64.
Revert "[dynamo] implement IteratorVariable and polyfill fallbacks for enumerate (#131725)"
This reverts commit 2576dbbc35d66e8e9ed6cb12216ccc424cb87ec3.
Revert "[dynamo] add itertools repeat/count bytecode reconstruction (#131716)"
This reverts commit 35b4de32fafc5ad024c20ef1275711bffc557ae9.
Revert "[dynamo] add lazy IteratorVariable implementations for map and zip (#131413)"
This reverts commit 7d282d87550787d8269593093519c2ad7c5032cd.
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132528
Approved by: https://github.com/ZainRizvi
#### Issue
ScriptObject was previously treated as a normal attribute by the converter. This PR lifts it to a constant and converts it directly to a GetAttr fx node. A ScriptObject can also trigger `CallMethod`, and this PR adds that support as well.
#### Test Plan
Add test case for ScriptObject.
`pytest test/export/test_converter.py -s -k test_convert_script_object`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130952
Approved by: https://github.com/angelayi
Before setting up the float8 numeric parity test, I have to set up a regular TP numeric parity test, preferably testing 10 iterations.
This PR sets a baseline for TP numerics; I can verify fp8 on top of it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132543
Approved by: https://github.com/tianyu-l
ghstack dependencies: #132350
Some sympy Functions aren't supported by sympy_interp(); we can't turn them into FX nodes, so currently the runtime asserts CSE pass avoids CSE'ing any expression containing a sympy Function. https://github.com/pytorch/pytorch/pull/132325 started tracking unsupported functions, so we switch the check to that to be more precise. We also check for and skip unsupported functions when adding asserts; previously we only did the check for CSE, and not when adding new expressions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132457
Approved by: https://github.com/avikchaudhuri
Summary:
This is a reland attempt of [#131431](https://github.com/pytorch/pytorch/pull/131431), as, in its original form, the PR has caused issues internally.
We currently don't support some of the `triton.autotune` arguments when compiling user-written Triton kernels with PT2. In this PR, we're adding a flag to circumvent that. This is to unblock internal compilation in some cases. The flag comes with docs explaining why it is not a good idea to set it.
Test Plan:
```
python test/inductor/test_triton_kernels.py -k test_triton_kernel_autotune_with_unsupported_args
...
----------------------------------------------------------------------
Ran 3 tests in 3.636s
OK
```
Differential Revision: D60701839
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132562
Approved by: https://github.com/chenyang78
python_code(verbose=True) (or print_readable()) generates a string with the code representing the fx graph, with extra annotations indicating the size or stride of the tensor. Currently, it only shows sizes/strides for FakeTensors provided in metadata. For subclass tensors like NestedTensor, the outer class (provided in the node metadata) will be a non-FakeTensor and the inner tensors will be fake. This PR expands the conditional to show sizes/strides for all tensors, not just FakeTensors.
Testing: I ran the test script below with `TORCH_LOGS=+dynamo` and found in the logs the graph shown below; we see that the input nested tensor has sizes and strides associated with it. Also, I stacked a diff on top of this one that forces the readable graph to be generated whenever PT2 is in use in tests, which should hopefully find any issues; https://github.com/pytorch/pytorch/pull/132195 shows no significant failures except for preexisting failures.
test script:
```python
import torch
def fn(x):
    return x.cos()

nt = torch.nested.nested_tensor_from_jagged(
    torch.randn(10, 10),
    torch.tensor([0, 1, 3, 6, 10]),
)
torch.compile(fn)(nt)
```
logs excerpt:
```
[0/0] [__graph_code] TRACED GRAPH
[0/0] [__graph_code] ===== __compiled_fn_1 =====
[0/0] [__graph_code] /data/users/dberard/pytorch/torch/fx/_lazy_graph_module.py class GraphModule(torch.nn.M
[0/0] [__graph_code] def forward(self, L_x_: "f32[4, zf1, 10][10*zf1, 10, 1]cpu", zf1: "Sym(zf1)"):
[0/0] [__graph_code] l_x_ = L_x_
[0/0] [__graph_code]
[0/0] [__graph_code] # File: /data/users/dberard/scripts/nt_print_graph.py:4 in fn, code: return x.c
[0/0] [__graph_code] cos: "f32[4, zf1, 10][10*zf1, 10, 1]cpu" = l_x_.cos(); l_x_ = None
[0/0] [__graph_code] return (cos,)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132192
Approved by: https://github.com/Chillee
This is already represented in trunk.yml so it seems a bit redundant to include this level of testing in pull.yml.
I've been observing a large spike in our usage of `g3.4xlarge` which seems to correspond to these builds in particular, so I'm removing them from `pull.yml` since they are already covered in `trunk.yml`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132537
Approved by: https://github.com/ZainRizvi, https://github.com/malfet
Summary:
- Moves logging functionality into the `torch/_export/db/logging.py` file.
- Adds a check in `_dynamo/eval_frame.py` for optional inputs and errors out with `UnsupportedError`.
- Changes the case name of `torch_sym_int` to `unsupported_operator`.
- Checks whether the case name is registered in exportdb; if so, we give a link to the case in exportdb.
- TODO: add test
Test Plan:
CI
Running the example in https://pytorch.org/docs/main/generated/exportdb/index.html#optional-input gives the following error logging:
```
E0730 10:53:33.687000 4155538 torch/_dynamo/eval_frame.py:1086] Parameter y is optional with a default value of tensor([[-0.1633, 1.2414, -0.1071],
E0730 10:53:33.687000 4155538 torch/_dynamo/eval_frame.py:1086] [-0.1936, -0.9425, -0.0824]])
E0730 10:53:33.688000 4155538 torch/export/_trace.py:1043] See optional_input in exportdb for unsupported case. https://pytorch.org/docs/main/generated/exportdb/index.html#optional-input
......
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/389acaeb40d57230/tutorials/pytorch/nntest/__torchtest__/torchtest#link-tree/torch/_dynamo/eval_frame.py", line 1091, in produce_matching
raise Unsupported(
torch._dynamo.exc.Unsupported: Tracing through optional input is not supported yet
```
It also logs an `export.error.classified` event in Scuba.
Reviewed By: zhxchen17
Differential Revision: D60427208
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132420
Approved by: https://github.com/zhxchen17
This PR introduces a new sanity check for the public API tests in `.ci/pytorch/test.sh`.
* Validates two public API tests:
1. Ensures `test_correct_module_names` fails when a new file OR an existing file adds an invalid public API function (e.g. one whose `__module__` is unset).
2. Ensures `test_modules_can_be_imported` fails when a module underneath `torch/` cannot be imported.
* Runs this in CI just before the pre-existing FC / BC checks.
I've verified that re-introducing the bug that #131386 fixed causes the new check to fail.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131390
Approved by: https://github.com/albanD
Summary:
#### Description
Add support for aten::append with a python function that returns a new list with the appended element. We then update the `fx_node` in the `name_to_node` mapping.
aten::append contributed by Jiashen Cao <jiashenc@meta.com>
Fix conversion for csr_ranker_test
```
model_name: csr_ranker_test_4.ptl
has_ts_model: True
has_sample_inputs: True
ops_maybe_missing_meta: set()
script_objects: set()
ts_can_run: True
ts_run_exception: None
can_convert: True
convert_exception: None
ep_result_correct: True
ep_run_exception: None
can_package: True
package_exception: None
sigmoid_can_run: False
sigmoid_run_exception: RuntimeError('not for symbolics')
sigmoid_result_correct: None
```
Test Plan:
test_aten_add_t
test_aten_append_t
test_aten_to_dtype_with_mutating_storage
buck2 run mode/opt sigmoid/inference/ts_migration:main -- --mode test_one --model_name csr_ranker_test
Differential Revision: D60635893
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132529
Approved by: https://github.com/jiashenC
Internally there's a model that's using memory_budget with the partitioner, and using custom triton kernels. The partitioner fails when encountering the triton ops because they don't have `meta["val"]`. This PR adds `meta["val"]` to these fx graph nodes and then adds handling for `meta["val"]` being a dict in the partitioner.
Differential Revision: [D60627813](https://our.internmc.facebook.com/intern/diff/D60627813)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132466
Approved by: https://github.com/zou3519
ghstack dependencies: #132356
Inserts send/recv ops where needed in a compute-only pipeline schedule.
Any F or B action will require a recv op for its input and a send op
for its output, except for at the ends of the pipeline.
To avoid hangs caused by mixed-up orderings of sends/recvs across ranks,
we pick one compute action at a time and insert both its send op (on
that rank's schedule), and the matching recv op for the recipient stage
(on the schedule for the rank for that stage).
TODO
Currently ignores a couple of edge cases
- ignores batching (which is an optimization)
- ignores cases where a stage sends to another stage on the same rank,
and should skip the send/recv and directly access memory
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130378
Approved by: https://github.com/H-Huang
ghstack dependencies: #129810
Adds fsdp unshard/reshard ops to a compute-only schedule.
Operates on one pp-rank's schedule at a time, since there is no
cross-pp-rank coordination needed for FSDP. (Unshard/Reshard is across
DP ranks within a PP group).
Uses a heuristic based on examining the next N stages to run compute
operations on this rank, evicting (resharding) and fetching (unsharding)
ahead of time to give unshard operations a chance to overlap with
compute and PP comms.
- this heuristic has not been validated and may not be optimal
Makes the assumption that it's fine to add the UNSHARD/RESHARD actions
to the schedule regardless of whether FSDP will actually be used.
- this way, users do not have to tell us at PP schedule creation time if
they plan to use FSDP or DDP
- it is trivial to implement UNSHARD/RESHARD as no-ops inside the
runtime, if FSDP is not detected on the stage module
TODO
- also add FSDP's reduce-scatter? or is it sufficient to leave this
handled by PipelineStage at 'last backward' time
- validate 'next N stages' heuristic and expose an API if needed
- add an e2e test
Co-authored-by: Howard Huang <howardhuang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129810
Approved by: https://github.com/kwen2501, https://github.com/H-Huang
Summary:
as title.
torch._higher_order_ops.auto_functionalize.auto_functionalized is a Python FQN which should NOT be used to talk to the backends; we should use the standard FQN torch.ops.higher_order.auto_functionalized instead.
Test Plan: buck test mode/opt caffe2/test:test_export -- -r test_custom_op_auto_functionalize_pre_dispatch
Differential Revision: D60468759
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132171
Approved by: https://github.com/SherlockNoMad
The functorch partitioners use network flow to split the joint graph into a forward and backward graph. Internally, we've found that upgrading to networkx 2.8.8 (from 2.5) results in some hard-to-debug failures (internal reference: https://fburl.com/workplace/jrqwagdm), and I'm told that there's interest in removing the Python dependency.
So this PR introduces a C++ implementation that mirrors the API provided by networkx. We'll need to add python bindings and do some additional testing to verify correctness.
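For context, a tiny sketch of the networkx max-flow/min-cut style API the partitioner relies on and that the C++ implementation mirrors (toy graph, illustrative only):
```python
import networkx as nx

G = nx.DiGraph()
G.add_edge("source", "a", capacity=1.0)
G.add_edge("a", "sink", capacity=1.0)
G.add_edge("source", "sink", capacity=2.0)

# minimum_cut returns the cut value and the two node partitions
cut_value, (reachable, non_reachable) = nx.minimum_cut(G, "source", "sink")
print(cut_value)  # 3.0
```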
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132188
Approved by: https://github.com/Chillee
Summary:
It looks like there are several places in AotCodeCompiler that write files in a way that aren't safe for concurrency. There's a filelock to cope with that, but it seems like the lock path isn't quite robust enough to prevent races. We have an internal stress test failing when executing multiple concurrent versions of the test. It seems as though there's some variability in the content we write to the cpp file, which means we can get a different 'key' across different runs. The lock path includes that key in the lock path name, but the path for the "consts_path" is computed separately. Therefore, I see things like this:
- The computed 'key' is `cp5tgbuxuegvg5g2j7oi6u74nkf3v7mx5w3qzl6qbedtmw5tq77z`
- The lock_path (based on the key) is: `/tmp/torchinductor_slarsen/locks/cp5tgbuxuegvg5g2j7oi6u74nkf3v7mx5w3qzl6qbedtmw5tq77z.lock`
- The cpp path (which also includes the key) is: `/tmp/torchinductor_slarsen/cenzkqfnhu53mrhrdhzjtnblzyma2hgmeo7hai5yqsxzirdavurh/cp5tgbuxuegvg5g2j7oi6u74nkf3v7mx5w3qzl6qbedtmw5tq77z.cpp`
- The consts_path (not based on the key) is: `/tmp/torchinductor_slarsen/cenzkqfnhu53mrhrdhzjtnblzyma2hgmeo7hai5yqsxzirdavurh/cifbshkqkbsurzldsyi2vl5bsnhvejmavys4kktpwrzmpo4ysuoy.bin`
So we have different test instances using different lock paths but touching the same consts_path, and therefore stomping on each other's consts_path. To fix, include the key in the consts_path.
Test Plan: Ran internal stress test. Repro'd failure and verified this change fixes it.
Differential Revision: D60552021
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132343
Approved by: https://github.com/desertfire
Summary:
We observed that many nodes introduced during the split-cat and batch-fusion pattern optimizations did not have example-value metadata, which causes problems in our follow-up pattern optimizations, so we add all the missing values.
We also fix bugs in some meta updates and a corner-case bug in the old pattern, which caused problems in the follow-up pattern optimization.
We delete the merge_stack_tahn_unbind_pass pattern, which was designed for the cmf model; it can be replaced by the more advanced pattern we added, so we remove it for easier maintenance.
Test Plan:
# unit test
```
buck2 test //caffe2/test/inductor:split_cat_fx_passes
```
Test UI: https://www.internalfb.com/intern/testinfra/testrun/15481123762720165
Network: Up: 230KiB Down: 702KiB (reSessionID-756346bf-6da3-4fa0-8d03-1b4fd61e0a7a)
Jobs completed: 30. Time elapsed: 7:23.9s.
Cache hits: 20%. Commands: 5 (cached: 1, remote: 0, local: 4)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 1. Build failure 0
```
buck2 test @mode/opt pytorch/diff_train_tests/ads/optimus:local_pt2_runner
```
Network: Up: 1.3GiB Down: 84MiB (reSessionID-ff135cdd-e42c-4ab5-8217-907ada465f01)
Jobs completed: 61. Time elapsed: 21:56.5s.
Cache hits: 0%. Commands: 39 (cached: 0, remote: 0, local: 39)
Tests finished: Pass 8. Fail 0. Fatal 0. Skip 0. Build failure 0
# benchmark
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run @mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode batch-split --model_type "ig_ctr" --flow_id 584880697
```
Counter({'pattern_matcher_nodes': 752, 'pattern_matcher_count': 732, 'normalization_pass': 328, 'normalization_aten_pass': 12, 'scmerge_cat_removed': 5, 'scmerge_cat_added': 4, 'scmerge_split_removed': 3, 'unbind_stack_pass': 3, 'batch_tanh': 2, 'scmerge_split_sections_removed': 2, 'scmerge_split_added': 2, 'optimize_cat_inputs_pass': 1, 'unbind_cat_to_view_pass': 1, 'fxgraph_cache_miss': 1})
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132297
Approved by: https://github.com/jackiexu1992
Summary:
Fixes T197371132.
Previously, we called copy.deepcopy to avoid mutating the original signature. However, this causes errors when the signature references a FakeScriptObject that in turn references a real torch.ScriptObject, failing with "The tensor has a non-zero number of elements, but its data is not allocated yet."
We therefore just change it to a shallow copy. This should be good enough for guarding the signature.
Test Plan: buck2 run 'fbcode//mode/opt' torchrec/distributed/tests:test_pt2 -- --filter-text "test_sharded_quant_ebc_non_strict_export"
Differential Revision: D60476839
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132181
Approved by: https://github.com/BoyuanFeng
Define the `TORCH_ONNX_USE_EXPERIMENTAL_LOGIC` flag to allow enabling the new torch.onnx logic while hiding it during migration and testing. The actual logic migration will happen afterwards.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132299
Approved by: https://github.com/titaiwangms
Enable exception chaining of the BackendCompilerFailed exception in call_user_compiler. This prevents the original exception and traceback, which are often the most useful for debugging, from being discarded.
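The mechanism is standard Python exception chaining; a minimal sketch of the difference (hypothetical exception type, not the actual dynamo code):
```python
class BackendCompilerFailed(Exception):  # stand-in for the real exception class
    pass

def compile_backend():
    raise RuntimeError("shape error in scatter op")

try:
    compile_backend()
except Exception as e:
    # `raise ... from e` keeps the original exception and traceback as __cause__,
    # so both tracebacks are printed instead of only the wrapper's.
    raise BackendCompilerFailed("backend='inductor' raised") from e
```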
Example output without the patch
> Traceback (most recent call last):
> [Traceback from test_slice_scatter_issue122291 to raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(]
> [Trace back from call_user_compiler to _inplace_generalized_scatter raise RuntimeError]
> torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
> RuntimeError: shape error in scatter op, can not broadcast torch.Size([16, 2]) to torch.Size([16, 6])
> Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
Example output with the patch
> Traceback (most recent call last):
> [Traceback from_inplace_generalized_scatter to raise error_type(message_evaluated)]
> RuntimeError: expand: attempting to expand a dimension of length 2!
> The above exception was the direct cause of the following exception:
> Traceback (most recent call last):
> [Traceback from call_user_compiler to _inplace_generalized_scatter raise RuntimeError]
> RuntimeError: shape error in scatter op, can not broadcast torch.Size([16, 2]) to torch.Size([16, 6])
> The above exception was the direct cause of the following exception:
> Traceback (most recent call last):
> [Traceback from test_slice_scatter_issue122291 to raise BackendCompilerFailed(self.compiler_fn, e) with e]
> RuntimeError: shape error in scatter op, can not broadcast torch.Size([16, 2]) to torch.Size([16, 6])
> Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131186
Approved by: https://github.com/jansel
Summary: This makes it so that stress tests in separate processes on the same machine don't clobber each other's directories. InductorTestCase will automatically make a fresh tmpdir for each unit test.
Test Plan:
```
buck2 test -j 18 'fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --exact 'caffe2/test/dynamo:test_dynamo - test_aot_autograd_cache.py::AOTAutogradCacheTests::test_nn_module_with_params_global_constant' --run-disabled --stress-runs 10 --record-results
```
Now passes
Differential Revision: D60604811
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132432
Approved by: https://github.com/masnesral
Fixes #130087
This patch provides a built-in id() implementation for TensorVariable when id() is called on tensors like module parameters. Calling id() on intermediate tensors is not supported.
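A minimal sketch of the now-supported pattern (illustrative module; not taken from the test suite):
```python
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(4))

    def forward(self, x):
        # id() on a module parameter (a tensor with stable identity) is traceable;
        # id() on an intermediate tensor is still unsupported.
        if id(self.w) == id(self.w):
            return x + self.w
        return x - self.w

out = torch.compile(M())(torch.randn(4))
```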
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130100
Approved by: https://github.com/anijain2305
https://github.com/pytorch/pytorch/pull/130422 caused the test `test.inductor.test_aot_inductor.AOTInductorTestABICompatibleCuda.test_fp8_abi_compatible_cuda` to fail (unclear why it was not run in GitHub) with `torch/csrc/inductor/aoti_torch/c/shim.h:390:34: note: candidate function not viable: requires 9 arguments, but 6 were provided`. We suspect that the kernel produced by the lowering function, which is no longer a fallback choice, has a schema issue at codegen. Fp8 is not used through AOTI currently and it is difficult to revert the PR (BE week), so we'll skip the test temporarily while making the new lowering compatible with AOTI.
Testing: the failed test on internal diff is now skipped.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132453
Approved by: https://github.com/henrylhtsang
Summary: Currently suggested fixes pick a map from symbols to user variables. However it is possible that many user variables point to the same symbol, and some may be preferred over others. Thus we dump this info as well.
Test Plan: updated test
Sample error with new format:
```
Could not guard on data-dependent expression u2 >= 0 (unhinted: u2 >= 0). (Size-like symbols: none)
<snip>
The following call raised this error:
File "test/export/test_export.py", line 1950, in forward
return r.view(items[0], items[2])
To fix the error, insert one of the following checks before this call:
1. torch._check(items[2] >= 0)
2. torch._check(items[2] < 0)
(These suggested fixes were derived by replacing `u2` with items[2] in u2 >= 0 and its negation.)
```
Differential Revision: D60574478
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132393
Approved by: https://github.com/BoyuanFeng
Context:
We are planning to make a BC breaking change to `torch.load` by flipping the default for `weights_only` from `False` --> `True` in a future release. With `weights_only=True`, a custom unpickler is used that limits what can be loaded to state_dicts containing tensors (there is also a way for the user to allowlist specific things to be loaded). The goal of this is to attempt to prevent remote execution of arbitrary code when using `torch.load`.
To my understanding, in export, `torch.load` is used internally to load arbitrary objects, so we should set `weights_only=False` here to prevent the flip from breaking export.
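For context, a small sketch of the behavior difference being guarded against (illustrative file name):
```python
import torch

torch.save({"w": torch.randn(2, 2)}, "ckpt.pt")

# weights_only=True uses a restricted unpickler limited to tensors/state_dicts
# (plus explicitly allowlisted types); weights_only=False allows arbitrary objects,
# which is what export relies on internally.
safe_state = torch.load("ckpt.pt", weights_only=True)
full_state = torch.load("ckpt.pt", weights_only=False)
```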
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132348
Approved by: https://github.com/angelayi
Summary:
Skip the warning if the fake script object doesn't implement a fake method for:
1. `__obj_flatten__`: for the real script object only.
2. `__set_state__` and `__get_state__`: for serialization; we don't expect them to be used during tracing.
Test Plan: Existing tests.
Reviewed By: angelayi
Differential Revision: D60478460
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132306
Approved by: https://github.com/angelayi
mvlgamma backward trips DEBUG=1 asserts when trying to construct an empty tensor with `layout=torch.jagged`. This happens due to passing `self.options()` to `arange()` in `mvlgamma_backward()`. Fix in this PR unconditionally constructs `arange()` with the strided layout.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132422
Approved by: https://github.com/albanD
# Motivation
This PR intends to support ABI=0 build for XPU backend.
# Additional Context
The major change is adding a compilation option `-D__INTEL_PREVIEW_BREAKING_CHANGES` for the host compiler (gcc) and `-fpreview-breaking-changes` for the XPU device kernel compiler (icpx). Why?
Because we use
- gcc to compile host code and link the SYCL runtime, so we need to pass `-D__INTEL_PREVIEW_BREAKING_CHANGES` to tell the host compiler to invoke the ABI-neutral API included in SYCL; and
- icpx to compile device kernel code and link the SYCL runtime, so we need to pass `-fpreview-breaking-changes` to tell the device kernel compiler to build ABI-neutral code. Besides,
- `libsycl-preview.so` is an ABI-neutral library but `libsycl.so` is not.
This PR depends on https://github.com/pytorch/pytorch/pull/131643.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130110
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/albanD
Summary:
AOTAutogradCache currently only checks the local directory instead of both local and remote when saving/loading from the cache, so if remote cache is turned on, it will cache miss.
Disable remote caching for now on these tests: when I work on remote caching compatibility, I'll re-enable them here.
Test Plan:
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/dynamo:test_dynamo -- --exact 'caffe2/test/dynamo:test_dynamo - test_aot_autograd_cache.py::AOTAutogradCacheTests::test_nn_module_with_params_global_constant' --run-disabled
passes
Differential Revision: D60588615
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132409
Approved by: https://github.com/masnesral
Summary:
Occasionally we run into a partition that looks like this for Add:
```
SourcePartition(nodes=[_constant2, add_2], source=<built-in function add>, input_nodes=[x], output_nodes=[_constant2, add_2], params=[_constant2])
```
In this case we are adding a constant to an input and reusing the constant later down the line. This causes our constant to be an output in our SourcePartition. Then the assumption that:
```
add_node = add_partition.output_nodes[0]
```
will not necessarily hold. As a result, we must check that the output node is indeed a call_function node and not a constant.
Test Plan: buck test mode/dev-nosan //executorch/backends/xnnpack/test:test_xnnpack_ops -- test_qs8_add_constant
Differential Revision: D60413221
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132092
Approved by: https://github.com/jerryzh168
Python's set iteration order is non-deterministic. We recently ran into an internal failure that did not fail consistently.
See repro here: P1453035092.
Now, with these changes, it does consistently fail. In follow-ups we could also consider adding a lint rule for uses of either set() or set literals.
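A small illustration of the underlying issue (hash randomization makes the iteration order of a set of strings vary across interpreter runs, while a dict keeps insertion order):
```python
# Run with different hash seeds and the set order can differ:
#   PYTHONHASHSEED=1 python -c "print(list({'relu', 'add', 'mul'}))"
#   PYTHONHASHSEED=2 python -c "print(list({'relu', 'add', 'mul'}))"

# dict.fromkeys gives a deterministic, insertion-ordered "ordered set"
ops = dict.fromkeys(["relu", "add", "mul"])
print(list(ops))  # always ['relu', 'add', 'mul']
```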
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130004
Approved by: https://github.com/oulgen
Summary:
In D60024830 I attempted to define these overloads, but gated the implementation on the wrong macros. Namely I used `__CUDACC__` instead of `__HIPCC__` (facepalm).
It might be worth merging this with the nvidia case via typedefs (e.g. `typedef __hip_bfloat16 __gpu_bfloat16` and `typedef __nv_bfloat16 __gpu_bfloat16`), but that seems like an entirely new paradigm for torch, so I'll punt that change to the future so we can focus on supporting `BFloat16(__hip_bfloat16)` here
Test Plan: CI
Differential Revision: D60362079
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132234
Approved by: https://github.com/houseroad
In pdb, it's pretty common to print `FSDPParamGroup` and `FSDPParam`, so make sure they are human-readable.
Printing `FSDPParam` in pdb:
```
FSDPParam(fqn=layers.6._checkpoint_wrapped_module.attention.wq.weight, orig_size=torch.Size([128, 256]))
```
Printing `FSDPParamGroup` in pdb:
```
FSDPParamGroup(fqn=layers.6)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132350
Approved by: https://github.com/awgu
Summary:
A bunch of issues around support for sympy functions like `TruncToInt` and `ToFloat` are uncovered by https://github.com/pytorch/pytorch/issues/131897. This PR addresses only one of them (as the title suggests). Another issue is deserialization, filed as a task: T197567691.
However, the most important issue is that adding runtime assertions is broken right now: specifically, sympy_interp with `PythonReferenceAnalysis` currently doesn't work because the implementations of some of these sympy functions in `PythonReferenceAnalysis` (or falling through to its base class) do not expect proxies. This means things like `math.trunc`, `math.floor`, `round`, etc. don't work, and this can be easily repro'd by using them inside `torch._check`. According to ezyang, these implementations need to point to new torch functions that can expect proxies (see how minimum and maximum are implemented, e.g.).
Test Plan: added test (original repro provided)
Differential Revision: D60540951
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132325
Approved by: https://github.com/ezyang
Fixes#132196
Let's say we have:
- op(x, y) that mutates both x and y
- new_x, new_y = functional_op(x, y) is the functional variant
If we are presented with functional_op(x, x), we must not reinplace
this into op(x, x), because then it would be writing to the same Tensor.
Instead, it's OK to reinplace one of them and to clone the other:
```
>>> y = x.clone()
>>> op(x, y)
```
This also applies if we have views: functional_op(x, x[0])
should not reinplace into op(x, x[0]).
The fix is to avoid reinplacing an arg if a view of it already has been
reinplaced.
Test Plan:
- new and existing tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132238
Approved by: https://github.com/oulgen, https://github.com/eellison
This PR introduces changes to AutoHeuristic that allow one to learn a heuristic as a decision tree. I used this to learn a heuristic for mixed_mm on A100 that consistently performs better than the default choice (https://github.com/pytorch/pytorch/blob/main/torch/_inductor/kernel/mm.py#L402).
This is how the results look like:
Explanation of columns:
**wrong_max_spdup**: In the worst case, how much better would the best choice have been
**wrong_gman_spdup**: For inputs where the heuristic is wrong, how much better is the best choice on average (geomean)
**max_spdup_default**: Highest speedup achieved by the learned heuristic over the default choice
**gman_spdup_default**: Geomean speedup achieved by the learned heuristic over the default choice
**max_slowdown_default**: If the default choice is better than the choice predicted by the learned heuristic, how much is it better in the worst case
**non_default_preds**: Number of times the learned heuristic predicted a choice that is not the default choice
**default_better**: Number of times the default choice is better than the choice made by the heuristic
```
set crit max_depth min_samples_leaf correct wrong unsure total wrong_max_spdup wrong_gman_spdup max_spdup_default gman_spdup_default max_slowdown_default non_default_preds default_better
train entropy 5 0.01 2376 740 323 3439 1.855386 1.063236 11.352318 3.438279 1.022164 3116 2
test entropy 5 0.01 563 183 71 817 1.622222 1.060897 10.084181 3.507741 1.017039 746 2
```
While the number of wrong predictions is high, on average the best choice is only around 6% better. What is important is that the choice predicted by the learned heuristic performs better than the default choice.
I evaluated my heuristic on gpt-fast `meta-llama/Llama-2-7b-chat-hf` with int8 weight quantization. To get the `tuned_mixed_mm` to trigger, I had to replace `F.linear()` in https://github.com/pytorch-labs/gpt-fast/blob/main/quantize.py#L355 with `torch.matmul(input, self.weight.t().to(dtype=input.dtype))` because the mixed_mm pattern does not match if there is a transpose between a cast and the matmul.
|batch size|prompt length| fallback | heuristic | speedup |
|----------|-------------|------------:|------------:|--------:|
| 1 | 7 | 75.31 tok/s | 148.83 tok/s| 1.97 |
| 1 | 11 | 75.99 tok/s | 148.15 tok/s| 1.94 |
| 4 | 7 | 103.48 tok/s | 472.00 tok/s| 4.56 |
| 4 | 11 | 103.56 tok/s | 371.36 tok/s| 3.58 |
| 8 | 7 | 201.92 tok/s | 813.44 tok/s| 4.02 |
| 8 | 11 | 201.76 tok/s | 699.36 tok/s| 3.46 |
Currently, the heuristic only applies to the following inputs:
- m <= 128, k >= 1024, n >= 1024 (For these sizes, one of the triton kernels wins in most cases, but the heuristic still has to be careful to not choose a config that performs worse than the fallback)
- k % 256 == 0 (If k is not a multiple of the block size, some choices perform extremely bad. In one case one config, that usually performs very well, was 130x slower.)
- mat1 not transposed
- mat2 transposed (In some cases, it was hard for the learned heuristic to detect some cases where it
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131613
Approved by: https://github.com/eellison
ghstack dependencies: #131610, #131611
This fixes a few instances where we assumed indexing expressions were
non-negative. This is not valid when we have more complicated
expressions involving masking e.g. pointwise cat.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131761
Approved by: https://github.com/ezyang
Summary: ET sets the length limit of a string input variable to 8192 characters. However, the node process_group::init has more than 8192 characters for an Ads 128-rank job. This DIFF temporarily removes this limit, so ET can capture the complete information of the process group.
Test Plan: buck2 test mode/opt caffe2/test:test_profiler_cuda -- profiler.test_execution_trace.TestExecutionTrace
Reviewed By: sanrise
Differential Revision: D60341306
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132169
Approved by: https://github.com/sraikund16, https://github.com/sanrise
This PR introduces a script that can be used to collect data for mixed_mm to learn a heuristic with AutoHeuristic. This PR also includes the following things:
- Move pad_mm-related AutoHeuristic files into a subdirectory.
- Introduce an interface, benchmark_runner.py, that can be subclassed to add new scripts that run benchmarks in order to collect data with AutoHeuristic (see gen_data_pad_mm.py and gen_data_mixed_mm.py).
The idea behind the interface is that, in the end, it hopefully makes it easier to collect data for new optimizations, and thus makes it easier to learn a heuristic.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131611
Approved by: https://github.com/eellison
ghstack dependencies: #131610
Summary:
Implement a callback-based dynamic counter with pluggable backends.
The backend API and integration are similar to WaitCounter's. Note that this counter should only be used with C++ callbacks, since making it safe to use with GIL-requiring callbacks would be pretty challenging and may defeat the whole purpose of this counter (since the duration of the callback can no longer be guaranteed).
Test Plan: unit test
Differential Revision: D60464055
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132166
Approved by: https://github.com/asiab4
This PR mostly refactors by putting code into utils files so that it can be shared between codecache.py and compile_fx.py. It then changes compile_fx so that:
- When saving to FXGraphCache, we save onto the CompiledFXGraph all the necessary metadata for running post compile steps (realigning inputs, cudagraphification).
- When loading from FXGraphCache, we use the saved information directly, instead of calculating them from scratch.
What this does is make it so that `FXGraphCache.load()` is a perfect cache on compile_fx_inner, in that it **returns exactly what compile_fx_inner returns**. This also makes it possible for AOTAutogradCache, given a key to the fx graph cache and example inputs, to get back the full return value of compile_fx_inner.
## What's a post compile step?
We define a **post-compile** to be the set of actions that need to run after FXGraphCache either loads from the cache or misses and runs compilation. These steps include:
- Setting the tracing context's output strides
- Running cudagraphs if enabled
- Maybe realign inputs if cudagraphs didn't run
To run these steps, we save all the necessary metadata in CompiledFxGraph, and use them on a cache hit to reconstruct the object.
## Splitting cudagraphs work into pre/post compile
Cudagraphs does a lot of work on the input graph module to determine if cudagraphs can be enabled. This is the code that involves cudagraph_tests and stack traces. This will work in a world where we have access to the input graph module, but with AOTAutograd warm start, we won't have access to that information anymore. Therefore we can split cudagraphs work into two parts: on a cache miss (and therefore a full compile), we do the cudagraphs testing work, and save cudagraph_fail_reasons into the cache. Then on a cache hit, we know whether or not we can run cudagraphs, and if we can't, we can emit the correct error messages.
Implementation notes:
- We save `fx_kwargs` directly onto the CompiledFXGraph. `fx_kwargs` is already, by definition, part of the cache key, so this is safe to do when it comes to cache correctness.
- ^ Why do we do above even though FXGraphCache.load takes fx_kwargs as an argument? Because AOTAutogradCache **doesn't** have access to fx_kwargs: they're annoyingly encoded in the functools.partial() of the fw_compiler, so *only* inductor knows about these options. They're fully captured by the AOTAutogradCache key (since every key to fx_kwargs is either a global config, or a field that's deterministic based on an input graph module), but their values are still needed to run cudagraphs/postprocessing. Therefore, it's easier/safer to store it on the cached result.
- Willing to hear other approaches here if we think saving these extra fields is not reasonable, though I can't think of another way to do this that's less complicated to explain.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130572
Approved by: https://github.com/eellison
**Background:** NJT utilizes a `jagged_unary_pointwise()` fallback that historically has assumed blindly that the first arg is an NJT. This assumption breaks certain ops; for example `pow(scalar, Tensor)` has an NJT as the second arg.
This PR expands `jagged_unary_pointwise()` and the associated schema validation logic to handle an NJT in args other than the first position.
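A small sketch of the kind of call this enables (the NJT appears as the second argument):
```python
import torch

values = torch.randn(10, 3)
offsets = torch.tensor([0, 2, 5, 10])
nt = torch.nested.nested_tensor_from_jagged(values, offsets)

# pow(scalar, Tensor): the nested tensor is not the first argument here
out = torch.pow(2.0, nt)
```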
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131937
Approved by: https://github.com/soulitzer
ghstack dependencies: #131898, #131704
# Motivation
This PR intends to enhance the codegen to allow generate codes for XPU backend.
XPU operators currently need to be registered by hand. Developers have no chance to take advantage of shared code to handle tensor meta setting (like strides, proxy output, structured kernels). Manually porting code is error-prone and may lead to high maintenance effort.
We utilize the backend_whitelist argument in `gen.py` to generate XPU needed headers and source codes.
# Usage
XPU ops live in `third_party/torch-xpu-ops`; the codegen process is triggered before the compilation of `torch-xpu-ops`.
We use the following commands to generate XPU operators
` python -m torchgen.gen --source-path path/to/yaml/of/xpu --install-dir build/xpu --per-operator-headers --static-dispatch-backend --backend-whitelist=XPU`
The diff lies at `backend-whitelist=XPU`. The backend-whitelist key is an existent argument in torchgen.
The inputs to `gen.py` are code templates and an operators yaml. We share the same templates as `aten`. A simplified yaml lives in `third_party/torch-xpu-ops`, which only includes the supported XPU operators. This yaml is a copy-and-modify of `native_functions.yaml`. No extra entries are added; the format is the same as the one in `aten`.
# Result
All operators headers are generated in `build/xpu/ATen/ops` independently, which would not affect operators declared/defined by CPU/CUDA or any other backend. XPU operators only include headers in this folder.
# Verification
* In `third-party/torch-xpu-ops`, we migrate all supported kernels to structured kernels style, where they are registered through `REGISTER_XPU_DISPATCH` or `TORCH_IMPL_FUNC`, and we have UT verification based on `test_ops.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130082
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/atalman
ghstack dependencies: #130019
As observed while working on this fix (https://github.com/pytorch/pytorch/pull/130994), 128 threads per block seems quite low. This PR increases the default to improve performance, and also slightly refactors the code to replace the hard-coded 128 for better maintainability.
By increasing the default max threads per block from 128 to 256, I saw the "CUDA total" time for `aten::index_select` drop from 44.820ms to 33.608ms when profiling the embedding script below:
```
import torch
from torch import profiler

input = torch.randint(low=0, high=16032, size=[131072], device="cuda")
w = torch.randn([16032, 16384], device="cuda")
with profiler.profile(record_shapes=True) as prof:
    x = torch.nn.functional.embedding(input, w)
```
I tested with the default from 128 to 256, 512, 1024 on several different types of devices, and observed "CUDA total" time dropping even more and more latency improvement as the number increases. Below is one example of latency improvement ratio:
| max threads per block | latency improvement |
|-----------------------|---------------------|
| 128 | 1x |
| 256 | 1.33x |
| 512 | 1.44x |
| 1024 | 1.49x |
Using 512 as the new default max for non-mi300x to be conservative, which is 1.44x faster than using 128 with the above profiling script.
Using 1024 for mi300x is 1.61x faster than using 128 with the same profiling script, and using 512 is 1.57x faster.
Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131713
Approved by: https://github.com/jeffdaily, https://github.com/syed-ahmed, https://github.com/malfet
Summary:
Basic pybind integration for WaitCounter providing a guard API.
Also fixes broken copy/move constructor in WaitGuard (it wasn't really used with the macro-based C++ API).
Test Plan: unit test
Reviewed By: asiab4
Differential Revision: D60463979
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132167
Approved by: https://github.com/asiab4
get_plain_tensors() should result in a DFS of the leaves.
The error was that plain tensors (leaves) on the same level were returned before the subclasses' plain tensors, even if the subclasses come earlier in the "flatten" list.
Original issue from AO: https://github.com/pytorch/ao/issues/515
Test: TBD, need to make an asymmetric subclass with dense tensors and subclasses
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132096
Approved by: https://github.com/bdhirsh
Modify the existing `layer normalization` operator in PyTorch, invoked by `torch.layer_norm`, to allow for reductions along the jagged dimension of a nested tensor. The function originally had a basic implementation for reducing along 1 non-ragged dimension. This diff, which uses the `aten` padding operator, enables PyTorch users to invoke `torch.nn.functional.layer_norm` on a nested tensor when reducing along the ragged dimension, e.g. `*` in a `(B, *, M)` or `(B, *, M, N)` nested tensor.
Write unit tests based on the `softmax` jagged operator to verify the accuracy of the ragged reduction implementation for `torch.nn.functional.layer_norm`. Add unit tests to verify error handling for unsupported features.
Note that this implementation is limited to nested tensors with `ragged_idx == 1`, i.e. the ragged dimension is not transposed. The layer normalization operator also requires an operation on a 2-dimensional layer; for nested tensors with 4 or more dimensions, I flatten the extra dimensions, then unflatten them after performing layer normalization.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132172
Approved by: https://github.com/davidberard98
ghstack dependencies: #132170
Modify the existing `softmax` operator in PyTorch, invoked by `torch.softmax`, to allow for reductions along the jagged dimension of a nested tensor. The function originally had a basic implementation for reducing along 1 non-ragged dimension. This diff, which uses the aten padding operator, enables PyTorch users to invoke `torch.softmax` on a nested tensor when reducing along the ragged dimension, e.g. `*` in a `(B, *, M)` nested tensor.
Write unit tests based on the `sum` and `mean` jagged operators to verify the accuracy of the ragged reduction implementation for `torch.softmax`. Add unit tests to verify error handling for unsupported features in `NestedTensor` `torch.softmax`.
Note that this implementation is limited to nested tensors with `ragged_idx == 1`, i.e. the ragged dimension is not transposed. In addition, the `softmax` operator is required to take in as input an integer for the reduction dimension `dim`, requiring new unit tests heavily inspired by the `sum` and `mean` jagged operator unit tests. `Softmax` also allows for reducing along the batch dimension.
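For illustration, here is a minimal sketch of the newly supported call (the construction below assumes the jagged layout with `ragged_idx == 1`; the shapes are arbitrary):
```
import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 8), torch.randn(5, 8), torch.randn(3, 8)],
    layout=torch.jagged,
)  # shape (B, *, M) with dim 1 ragged

out = torch.softmax(nt, dim=1)  # reduce along the ragged (*) dimension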
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132170
Approved by: https://github.com/davidberard98
Add similar semantics for creating a buffer object similar to creating a parameter. This is done by introducing a new Buffer class that can be used for type disambiguation. The underlying functionality of registering a buffer remains the same as the register_buffer method has not been changed. The persistent parameter in the Buffer type is to indicate whether a buffer object should be persistent or not. Other non-test changes have to do with getting the new Buffer type recognized by inductor and dynamo. Remaining changes are test changes to make sure that the Buffer type can be used as a drop in replacement for register_buffer as it just leads to register_buffer being called. The addition of this new functionality still allows for normal tensors to be used as buffers so these changes are intended to be backwards compatible.
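A minimal usage sketch of the new type (assuming it is exposed as `torch.nn.Buffer`): assigning a `Buffer` as a module attribute registers it exactly as `register_buffer` would.
```
import torch
import torch.nn as nn

class MyModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.running_mean = nn.Buffer(torch.zeros(4))               # persistent by default
        self.scratch = nn.Buffer(torch.zeros(4), persistent=False)  # excluded from state_dict

m = MyModule()
print([name for name, _ in m.named_buffers()])  # ['running_mean', 'scratch']
print("scratch" in m.state_dict())              # False: non-persistent
```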
Fixes #35735
Co-authored-by: Mikayla Gawarecki <mikaylagawarecki@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125971
Approved by: https://github.com/albanD, https://github.com/anijain2305, https://github.com/mlazos
Original issue: https://github.com/pytorch/pytorch/issues/114338
Reland of: https://github.com/pytorch/pytorch/pull/128016
Summary from previous PR:
We assume only two possible, mutually exclusive scenarios:
1. Running the compiled region for training (any of the inputs has requires_grad): produced differentiable outputs should have requires_grad.
2. Running the compiled region for inference (none of the inputs has requires_grad): all outputs do not have requires_grad.
Even if the user runs the region under no_grad() but has an input Tensor with requires_grad, we take the training scenario (1).
With the current state that means:
1/ needs_autograd should not check torch.is_grad_enabled(), only whether any of the inputs requires_grad
2/ if needs_autograd => trace_joint (we are in training scenario 1) => always run the compiled region under enable_grad()
Changes in the partitioner?
Inference and training graphs differed in their return container (list vs. tuple).
The changes in the partitioner unify this to always return a tuple.
As a result, there are some changes in test_aotdispatch.py for graph contents (list -> tuple).
Why was it reverted?
There was a regression of the hf_Reformer model on inference.
```
TORCHINDUCTOR_FX_GRAPH_CACHE=0 python benchmarks/dynamo/torchbench.py --performance --inference --bfloat16 --backend inductor --device cuda --only hf_Reformer --cold-start-latency --use-eval-mode
```
One of the compiled graphs contained outputs which are aliases of inputs that are nn.Parameter(requires_grad=True).
Even though the torchbench inference benchmarks run inside `torch.no_grad()`, alias ops (specifically `expand` for hf_Reformer) preserve requires_grad.
As a result we started compiling a training graph instead of an inference graph.
Fix for view ops:
If we have outputs that are aliases of inputs that require grad, those outputs requiring grad is not a reason to generate a training graph.
This is handled in aot_autograd.py, where output_and_mutation_safe is calculated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128890
Approved by: https://github.com/bdhirsh
**Summary**
I created functions that reduce repeated code in the console and JSON APIs, which also improves their readability for future developers.
**Test Plan**
1. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_json_dump
2. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_operation_tracing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132070
Approved by: https://github.com/XilunWu
`register_sharding` is an experimental API that allows users to register sharding strategies for an operator when the tensor inputs and outputs are :class:`DTensor`s. It can be useful when: (1) there doesn't exist a default sharding strategy for ``op``, e.g. when `op` is a custom operator that is not supported by `DTensor`; (2) when users would like to overwrite default sharding strategies of existing operators.
Here's an example:
```
@register_sharding(aten._softmax.default)
def custom_softmax_sharding(x, dim, half_to_float):
    softmax_dim = dim if dim >= 0 else dim + x.ndim
    acceptable_shardings = []

    all_replicate = ([Replicate()], [Replicate(), None, None])
    acceptable_shardings.append(all_replicate)

    for sharding_dim in range(x.ndim):
        if sharding_dim != softmax_dim:
            all_sharded = (
                [Shard(sharding_dim)],
                [Shard(sharding_dim), None, None],
            )
            acceptable_shardings.append(all_sharded)

    return acceptable_shardings
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131108
Approved by: https://github.com/wanchaol
**Summary**
If a `global buffer` has been replaced by a `local buffer`, we add this `global buffer` to `removed_buffers` to avoid unnecessary allocation. However, a special case is when this `global buffer` can reuse a previous buffer. We didn't handle this case previously, which caused a functional failure in f151f25c0b/torch/_inductor/codegen/wrapper.py (L440)
In this PR, we resolve the issue by not adding this global buffer to `V.kernel.inplace_update_buffers` when the buffer has been marked as `removed`.
**Test Plan**
```
python test/inductor/test_cpu_repro.py -k test_local_buffer_with_line_reuse
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132018
Approved by: https://github.com/jgong5, https://github.com/peterbell10
Add the Inductor lowering for `torch._scaled_mm`, whose API was last updated in https://github.com/pytorch/pytorch/pull/128683.
The lowering does:
- for tensor-wise scaling, auto-tune between the default ATen kernel (cuBLAS) and Triton kernel configurations.
- for row-wise scaling, auto-tune between the default ATen kernel (CUTLASS kernel added in https://github.com/pytorch/pytorch/pull/125204) and Triton kernel configurations.
The Triton kernel template is based on 3ad9031d02 (D56337896) by @choutim, without using SPLIT_K, and that of mm `torch/_inductor/kernel/mm.py`
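As a hedged sketch of exercising the new lowering (shapes, scales, and an fp8-capable GPU are assumptions), one can compile a call to `torch._scaled_mm` under max-autotune:
```
import torch

def scaled_mm(a, b, scale_a, scale_b):
    return torch._scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)

compiled = torch.compile(scaled_mm, mode="max-autotune")

a = torch.randn(256, 256, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(256, 256, device="cuda").to(torch.float8_e4m3fn).t()  # mat2 must be column-major
scale_a = torch.tensor(1.0, device="cuda")  # tensor-wise scales
scale_b = torch.tensor(1.0, device="cuda")
out = compiled(a, b, scale_a, scale_b)
```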
## Testing:
- Logging shows max-autotune tuning (`AUTOTUNE scaled_mm`) for both tensor-wise and row-wise scaling when called with the two scaling types.
- Row-wise scaling allows operator fusion between preceding pointwise/reduction op and amax/cast:
- output code Evaluating m=256, n=256, k=256, fusion_case='pointwise', scaling_mode='row'
- P1477224245 - 2 kernels
- output code Evaluating m=2048, n=256, k=2048, fusion_case='reduction', scaling_mode='row'
- P1477227340 - 2 kernels
- UT `python test/inductor/test_fp8.py -- TestFP8Lowering`
## Benchmarking
Eager/compiled tensor-wise/row-wise scaling for various shapes:
https://docs.google.com/spreadsheets/d/1VfWEVuyrwoWysfbS0_u2VHJ-PsdWkF1qIsiD60AzTes/edit?gid=2113587669#gid=2113587669
- Some of the “compiled” cases are slightly slower than “eager”. It’s because max-autotune selected the ATen kernel in the compiled case, and I think the discrepancy is variance.
Eager/compiled tensor-wise/row-wise scaling with pointwise/reduction preceding op for various shapes:
https://docs.google.com/spreadsheets/d/1Nv07NrdffQIoDeMjo9E0V-E-EYrEN0WysO_bn1bc6ns/edit?gid=1715488446#gid=1715488446
## Questions for reviewers:
- Should the type of the accumulator `ACC_TYPE` always be in float32? If not, where is this type set (output layout?)?
## Todo:
- Make the Triton template use the improved persistent kernel version (https://github.com/pytorch/FBGEMM/pull/2735 by @htyu)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130422
Approved by: https://github.com/ipiszy
This PR utilizes the info from the existing OpInfo database `op_db` to contribute to general NJT testing.
* New tests in `TestNestedTensorOpInfo`
* `test_forward()` - compares forward output to an unbind-based reference
* `test_backward()` - compares forward output and grads to an unbind-based reference
* `test_forward_compile()` - compares forward compile output (`backend="aot_eager_decomp_partition"`) to eager
* `test_backward_compile()` - compares forward compile output (`backend="aot_eager_decomp_partition"`) and grads to eager
* To avoid adding a bunch of NJT-specific stuff to the `OpInfo` structure, this PR translates `op_db` -> a NJT-specific `njt_op_db`.
* `UnaryUfuncInfo`s utilize a new `sample_inputs_unary_njt_pointwise()` which iterates through a comprehensive list of NJTs: contiguous / non-contiguous, dims 2, 3, and 4, transposed / not, etc.
* `BinaryUfuncInfo`s utilize a new `sample_inputs_binary_njt_pointwise()` which iterates through a comprehensive list of NJTs: contiguous / non-contiguous, dims 2, 3, and 4, transposed / not, etc.
* `ReductionOpInfo`s utilize a new `sample_inputs_njt_reduction()` which covers full reductions, reductions over the jagged dim, and reductions over the non-jagged dim
* Several xfails were added to get things passing
TODO (future PRs):
* Pass non-contiguous / non-contiguous with holes NJTs (maybe we should have separate tests for these? most ops don't support NJTs with holes today)
* Mixed (NT, T), (T, NT) inputs for binary ops
* Handle other types of OpInfos (beyond unary pointwise, binary pointwise, and reduction) manually by writing sample_inputs_funcs
* Address all xfails via fixes
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131704
Approved by: https://github.com/soulitzer
ghstack dependencies: #131898
Summary:
There are some issues with dim order creation. T194410923 has a detailed illustration.
One reason is that the `is_contiguous` function may sometimes produce an ambiguous memory format result (some tensors might be both channels_last and contiguous at the same time), and dim order generation relies on that memory format result underneath as a shortcut.
To mitigate the issue, we make dim order take the shortcut if and only if the tensor belongs to a single memory format. Otherwise, we still recalculate it.
Test Plan: CI
Differential Revision: D60056793
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131366
Approved by: https://github.com/ezyang
Try to unblock https://github.com/pytorch/pytorch/issues/131991
- `nn.init.orthogonal_` uses `tensor.new`, which is the legacy factory function. We change this to `tensor.new_empty` (empty is okay since it will be immediately followed by `.normal_()` to fill the tensor) so that it preserves `DTensor`-ness.
- `nn.init.orthogonal_` uses QR decomposition (`aten.linalg_qr.default`) and `torch.diag` (calling into `aten.diagonal_copy.default`). For simplicity, we use naive replicate strategies for now. `aten.diagonal_copy.default` could do something more sophisticated for sharded inputs, but I would rather defer that to later due to the complexity. For `orthogonal_` support specifically, since the result of the QR decomp will be replicated, the input to `aten.diagonal_copy.default` will be replicated.
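A hedged usage sketch of the now-working call (the mesh setup and running under `torchrun` with 2 ranks are assumptions; the imports reflect the `torch.distributed._tensor` namespace used elsewhere in this log):
```
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed._tensor import distribute_tensor, Replicate

mesh = init_device_mesh("cuda", (2,))
w = distribute_tensor(torch.empty(8, 8), mesh, [Replicate()])
nn.init.orthogonal_(w)  # stays a DTensor; QR/diagonal fall back to replicate strategies
```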
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132104
Approved by: https://github.com/albanD, https://github.com/wanchaol
This was causing some terrible error messages, e.g.:
```
# printing directly: cudaError.???
# casting to int first: 712
Traceback (most recent call last):
File "/data/users/lpasqualin/fbsource/fbcode/scripts/lpasqualin/playground.py", line 15, in <module>
main()
File "/data/users/lpasqualin/fbsource/fbcode/scripts/lpasqualin/playground.py", line 11, in main
_create_cpu_state_dict(sd, share_memory=True, pin_memory=True)
File "/home/lpasqualin/pytorch/torch/distributed/_state_dict_utils.py", line 436, in _create_cpu_state_dict
ret = _iterate_state_dict(
^^^^^^^^^^^^^^^^^^^^
File "/home/lpasqualin/pytorch/torch/distributed/_state_dict_utils.py", line 143, in _iterate_state_dict
ret = {
^
File "/home/lpasqualin/pytorch/torch/distributed/_state_dict_utils.py", line 144, in <dictcomp>
key: _iterate_state_dict(
^^^^^^^^^^^^^^^^^^^^
File "/home/lpasqualin/pytorch/torch/distributed/_state_dict_utils.py", line 125, in _iterate_state_dict
ret = tensor_func(iter_object, pg, device, companion_obj)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/lpasqualin/pytorch/torch/distributed/_state_dict_utils.py", line 428, in tensor_func
succ == 0
AssertionError: Pinning shared memory failed with error-code: cudaError.???
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132089
Approved by: https://github.com/Skylion007
Summary: Currently, running explain with TORCH_LOGS enabled will cause duplicate logging because explain uses the exact same code path for conversion. This PR disables logging when running explain, and moves all logging to convert() to prevent logging from __init__ when we are just using explain.
Test Plan: Manual testing with attached outputs.
Reviewed By: SherlockNoMad, angelayi
Differential Revision: D60199007
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132082
Approved by: https://github.com/ydwu4
I didn't test this path when creating the orchestrator. This PR fixes
that path to work in the capture_triton path. The problem is that we are
handling a value that is an int (in the capture_triton path) and a
ConstantVariable (in the Dynamo triton path) so we abstract that out in
the orchestrator.
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132143
Approved by: https://github.com/oulgen
**Background**: `TestCase.assertEqual()` is commonly used during test case validation. Historically, to support NSTs, the logic was written to compare two nested tensors by unbinding them and comparing their components. This logic applied to NJTs as well, which in practice meant that two NJTs with different nested ints in their shapes could compare equal if their components were equal.
This PR changes the above logic so that NJTs are no longer unbound during comparison, allowing them to receive full shape validation. This makes `TestCase.assertEqual()` stricter for NJTs, requiring them to have the same nested ints in their shapes to compare equal.
Note that some tests rely on the old, looser behavior. To address this, the PR introduces a base `NestedTensorTestCase` that defines a helper function `assertEqualIgnoringNestedInts()` so that these tests can explicitly opt in to the looser comparison behavior.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131898
Approved by: https://github.com/soulitzer
Summary:
The previous logic adds skipped files when the file is imported, which happens at a very early stage. However, we could set skip_torchrec at a later stage (e.g., in APS, we set it during trainer execution). In that case, the skip logic would still take effect since the skipped files had already been added.
So in this diff, we revise the logic so that it adapts to changes of skip_torchrec at later stages.
Test Plan:
Tested on APS models:
buck2 run mode/opt //aps_models/ads/icvr:icvr_launcher_live -- mode=local_ig_fm_uhm_mini model_name=ig_fm_one_sparse_benchmark features=ig_fm_one_sparse_benchmark model=ig_fm_one_sparse_benchmark training.pipeline_type=pt2
commit: 2fb485d9e
torchrec related paths were not skipped.
Differential Revision: D59779153
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130783
Approved by: https://github.com/yanboliang
There are some substantive changes. Instead of recording the *next* instruction in the speculation log, I record the *current* instruction. I think this is more intuitive, we always call speculation at the beginning of executing an instruction, so logically, the entry is associated with the current instruction. (Note that self.instruction_pointer is next instruction, as conventionally we increment IP before calling speculate).
The cosmetic change is to also pass in the Instruction corresponding to the IP and print it, and beef up the error message, including notes about the previous instruction that was run before it failed (this is typically the critical instruction).
At time of submission, this test case triggered the error:
```
diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py
index 5ade17856e1..60ef89be346 100644
--- a/test/distributed/test_dynamo_distributed.py
+++ b/test/distributed/test_dynamo_distributed.py
@@ -844,6 +844,39 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
for r in res[1:]:
self.assertEqual(res[0], r)
+ @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
+ @config.patch(enable_compiler_collectives=True)
+ def test_compiler_collectives_automatic_dynamic_speculation_divergence(self):
+ with _dynamo_dist_per_rank_init(self.rank, self.world_size):
+ torch._dynamo.utils.clear_compilation_metrics()
+
+ # TODO: This should be possible to do inside the function, but
+ device = f"cuda:{self.rank}"
+
+ @torch.compile()
+ def f(x, y):
+ zx = x.shape
+ zy = y.shape
+ return x.sum() + y.sum()
+
+ if self.rank == 0:
+ dataloader = [4, 4]
+ else:
+ dataloader = [3, 4]
+
+ for data in dataloader:
+ f(
+ torch.randn(data, device=self.rank),
+ torch.randn(data, device=self.rank),
+ )
+
+ metrics = torch._dynamo.utils.get_compilation_metrics()
+ # Number of compiles same on all nodes
+ res = [None] * self.world_size
+ torch.distributed.all_gather_object(res, len(metrics))
+ for r in res[1:]:
+ self.assertEqual(res[0], r)
+
@requires_nccl()
```
although I plan to fix this soon.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131982
Approved by: https://github.com/anijain2305, https://github.com/mlazos, https://github.com/jansel
This PR fixes a bug in `test_correct_module_names` introduced in #130497. It also addresses post-fix test failures in:
* `torch/ao/quantization/__init__.py` - set the correct `__module__` for several public API helpers
* `torch/library.py` - add `register_vmap` to `__all__`
* `torch/nn/attention/flex_attention.py` - make `round_up_to_multiple` private by prepending an underscore
* `torch/storage.py` - introduce `__all__` to avoid `Self` being re-exported as a public API
* `torch/distributed/pipelining/schedules.py` - add `ZeroBubbleAlgorithm` to `__all__`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131386
Approved by: https://github.com/albanD
In `_create_chunk_sharded_tensor`, `_get_remote_device_str` is used. By default it uses the node count to determine the device instance. For HPU, we need to use the current device to get the device instance.
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132120
Approved by: https://github.com/awgu
Summary:
There are two kinds of exceptions:
Case #1:
```
static input data pointer changed.
input name: primals_2. data pointer changed from 140315748992000 to 140315748993536. input stack trace: File "/dev/shm/uid-30083/c0899c70-seed-nspid4026535598_cgpid16622182-ns-4026535192/caffe2/test/inductor/test_cudagraph_trees.py", line 1826, in forward
return self.static_tensor + x + self.goo(x)
File "/dev/shm/uid-30083/c0899c70-seed-nspid4026535598_cgpid16622182-ns-4026535192/caffe2/test/inductor/test_cudagraph_trees.py", line 1816, in forward
return self.linear(x)
input name: primals_3. data pointer changed from 140315748990976 to 140315748993024. input stack trace: File "/dev/shm/uid-30083/c0899c70-seed-nspid4026535598_cgpid16622182-ns-4026535192/caffe2/test/inductor/test_cudagraph_trees.py", line 1825, in forward
self.static_tensor.add_(torch.ones((2, 2), device="cuda"))
```
Case #2:
```
static input data pointer changed.
input name: primals_2. data pointer changed from 139852509086720 to 139852509088256. input stack trace: None
input name: primals_3. data pointer changed from 139852509085696 to 139852509087744. input stack trace: File "/dev/shm/uid-30083/f61ee184-seed-nspid4026560782_cgpid769179-ns-4026560865/caffe2/test/inductor/test_cudagraph_trees.py", line 1825, in forward
self.static_tensor.add_(torch.ones((2, 2), device="cuda"))
```
The current implementation only covered case #2.
Test Plan: https://www.internalfb.com/intern/testinfra/testrun/15481123762274476
Differential Revision: D60340212
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132043
Approved by: https://github.com/BoyuanFeng
**Summary**
Previously, we used `data_type_propagation` at the start of `codegen` to deduce the data type of each node and save this information in `node.meta[OptimizationContext.key]`. Then, we used this node metadata to update the cppcsevar data type in `update_on_args`. However, this method is not always correct. For example, in the codegen of `indirect_indexing` (see [here](096dc444ce/torch/_inductor/codegen/common.py (L1844))), we insert nodes on the fly and reuse the node of `indirect_indexing` to set the `cppcsevar` data type. In this PR, we plan to enhance the `cppcsevar` data type deduction:
- We will deduce the `cppcsevar` data type in `update_on_args` by reusing the code in `data_type_propagation`.
- To align the data type of scalar and vector variables, we previously always cast the scalar to the vector's data type. This caused a data type misalignment between `codegen` and `data_type_propagation`. We should use the same data type promotion logic to align the data types of scalar and vector variables.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130827
Approved by: https://github.com/jgong5, https://github.com/jansel
Summary: This code was overly complex and is confusing some guards - basically if a result cached tensor isn't a view there's no reason to be messing with its storage.
Test Plan: unit tests pass
Differential Revision: D60387821
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132050
Approved by: https://github.com/oulgen
These OSS changes are part of a larger MTIA diff. The OSS part is a simple refactor that makes it easier to query max block sizes by the prefix of the grid dimension, e.g. `"X"`, as opposed to having to use separate functions for `get_xmax()`, `get_ymax()`, etc.
Differential Revision: D60195669
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131730
Approved by: https://github.com/eellison
Taking inspiration from `GraphModule.print_readable` (aka I copied its [code](17b45e905a/torch/fx/graph_module.py (L824))), I added a `print_readable` to the unflattened module, because it's kind of nontrivial to print the contents of this module.
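A hedged usage sketch (the module and inputs here are illustrative, not from the test below):
```
import torch

class M(torch.nn.Module):
    def forward(self, x):
        return x * 2

ep = torch.export.export(M(), (torch.randn(2, 3),))
unflattened = torch.export.unflatten(ep)
unflattened.print_readable()
```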
Example print from `python test/export/test_unflatten.py -k test_unflatten_nested`
```
class UnflattenedModule(torch.nn.Module):
    def forward(self, x: "f32[2, 3]"):
        # No stacktrace found for following nodes
        rootparam: "f32[2, 3]" = self.rootparam
        # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:99 in forward, code: x = x * self.rootparam
        mul: "f32[2, 3]" = torch.ops.aten.mul.Tensor(x, rootparam); x = rootparam = None
        # No stacktrace found for following nodes
        foo: "f32[2, 3]" = self.foo(mul); mul = None
        bar: "f32[2, 3]" = self.bar(foo); foo = None
        return (bar,)

    class foo(torch.nn.Module):
        def forward(self, mul: "f32[2, 3]"):
            # No stacktrace found for following nodes
            child1param: "f32[2, 3]" = self.child1param
            nested: "f32[2, 3]" = self.nested(mul); mul = None
            # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:79 in forward, code: return x + self.child1param
            add: "f32[2, 3]" = torch.ops.aten.add.Tensor(nested, child1param); nested = child1param = None
            return add

        class nested(torch.nn.Module):
            def forward(self, mul: "f32[2, 3]"):
                # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:67 in forward, code: return x / x
                div: "f32[2, 3]" = torch.ops.aten.div.Tensor(mul, mul); mul = None
                return div

    class bar(torch.nn.Module):
        def forward(self, add: "f32[2, 3]"):
            # No stacktrace found for following nodes
            child2buffer: "f32[2, 3]" = self.child2buffer
            # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:87 in forward, code: return x - self.child2buffer
            sub: "f32[2, 3]" = torch.ops.aten.sub.Tensor(add, child2buffer); add = child2buffer = None
            return sub
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128617
Approved by: https://github.com/zhxchen17, https://github.com/pianpwk
```
# Mode to emulate pytorch eager numerics for lower precision (fp16, bf16)
# Pytorch eager computes bf16/fp16 by upcasting inputs to fp32 and downcasting after
# For multiple, fused pointwise nodes, inductor will elide the intermediary upcasts and downcasts
# Typically this should be closer to fp64 ref numerics. However, it can be useful for debugging
# to emulate the eager numerics.
```
We add extra upcasts and downcasts for pointwise nodes that correspond to casts that existed in the original user program (excluding pointwise nodes that are emitted during decomposition). Since this is mostly for debugging, I added this information in the `meta` so that this mode does not have unintended side effects like changing pattern matching.
In theory there could also be some other casts with fused reduction -> reduction, although I haven't seen this in practice as much; it could be done as a follow-up. Note: this only works with the CUDA backend right now.
This mode was sufficient to eliminate compile differences from https://fb.workplace.com/groups/385893200869952/posts/464263173032954/?comment_id=465199259606012&reply_comment_id=465676792891592.
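A hedged sketch of enabling the mode (the config name below is an assumption, not quoted from this PR text):
```
import torch
import torch._inductor.config as inductor_config

# Assumed flag name for the eager-numerics emulation mode described above.
inductor_config.emulate_precision_casts = True

@torch.compile
def f(x, y):
    return (x + y).relu() * y

f(torch.randn(8, device="cuda", dtype=torch.bfloat16),
  torch.randn(8, device="cuda", dtype=torch.bfloat16))
```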
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131595
Approved by: https://github.com/shunting314, https://github.com/bdhirsh, https://github.com/jansel
Modify the existing `layer normalization` operator in PyTorch, invoked by `torch.layer_norm`, to allow for reductions along the jagged dimension of a nested tensor. The function originally had a basic implementation for reducing along 1 non-ragged dimension. This diff, which uses the `aten` padding operator, enables PyTorch users to invoke `torch.nn.functional.layer_norm` on a nested tensor when reducing along the ragged dimension, e.g. `*` in a `(B, *, M)` or `(B, *, M, N)` nested tensor.
Write unit tests based on the `softmax` jagged operator to verify the accuracy of the ragged reduction implementation for `torch.nn.functional.layer_norm`. Add unit tests to verify error handling for unsupported features.
Note that this implementation is limited to nested tensors with `ragged_idx == 1`, i.e. the ragged dimension is not transposed. The layer normalization operator also requires an operation on a 2-dimensional layer; for nested tensors with 4 or more dimensions, I flatten the extra dimensions, then unflatten them after performing layer normalization.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131519
Approved by: https://github.com/davidberard98
ghstack dependencies: #131518
Add a new label `ci-test-showlocals` and add it to the test config filter.
If the PR is labeled with `ci-test-showlocals` or "ci-test-showlocals" is
present in a PR comment, the test config filter will set an environment
variable `TEST_SHOWLOCALS`. Then `pytest` will show local variables on
failures for better debugging.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131981
Approved by: https://github.com/malfet
ghstack dependencies: #131151
------
As per the title, add argument `--locals` for `unittest` and `--showlocals --tb=long` for `pytest` in CI.
Some failures cannot be reproduced on the local machine but exist on cloud CI. This change allows us to investigate the test failure more easily.
Example output: https://github.com/pytorch/pytorch/actions/runs/9961546996/job/27523888353?pr=130710#step:20:3361
```text
/opt/conda/envs/py_3.8/lib/python3.8/site-packages/sympy/core/function.py:307:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
cls = FloorDiv, base = -1.00000000000000, divisor = -1.00000000000000
@classmethod
def eval(cls, base, divisor):
# python test/test_dynamic_shapes.py -k TestDimConstraints.test_dim_constraints_solve_full
# Assert triggered by inequality solver
# assert base.is_integer, base
# assert divisor.is_integer, divisor
# We don't provide the same error message as in Python because SymPy
# makes it difficult to check the types.
if divisor.is_zero:
raise ZeroDivisionError("division by zero")
if base in (int_oo, -int_oo, sympy.oo, -sympy.oo) and divisor in (
int_oo,
-int_oo,
sympy.oo,
-sympy.oo,
):
return sympy.nan
if base is sympy.nan or divisor is sympy.nan:
return sympy.nan
if base.is_zero:
return sympy.S.Zero
if base.is_integer and divisor == 1:
return base
if base.is_integer and divisor == -1:
return sympy.Mul(base, -1)
if (
isinstance(base, sympy.Number)
and isinstance(divisor, sympy.Number)
and (
base in (int_oo, -int_oo, sympy.oo, -sympy.oo)
or divisor in (int_oo, -int_oo, sympy.oo, -sympy.oo)
)
):
r = float(base) / float(divisor)
if r == math.inf:
return int_oo
elif r == -math.inf:
return -int_oo
elif math.isnan(r):
return sympy.nan
else:
return sympy.Integer(math.floor(r))
if isinstance(base, sympy.Integer) and isinstance(divisor, sympy.Integer):
return sympy.Integer(int(base) // int(divisor))
if isinstance(base, FloorDiv):
return FloorDiv(base.args[0], base.args[1] * divisor)
# Expands (x + y) // b into x // b + y // b.
# This only works if floor is an identity, i.e. x / b is an integer.
for term in sympy.Add.make_args(base):
quotient = term / divisor
if quotient.is_integer and isinstance(divisor, sympy.Integer):
# NB: this is correct even if the divisor is not an integer, but it
# creates rational expressions that cause problems with dynamic
# shapes.
return FloorDiv(base - term, divisor) + quotient
try:
gcd = sympy.gcd(base, divisor)
if gcd != 1:
> return FloorDiv(
sympy.simplify(base / gcd), sympy.simplify(divisor / gcd)
)
base = -1.00000000000000
cls = FloorDiv
divisor = -1.00000000000000
gcd = 1.00000000000000
quotient = 1.00000000000000
term = -1.00000000000000
/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/utils/_sympy/functions.py:159:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
args = (FloorDiv, -1.00000000000000, -1.00000000000000), kwargs = {}
@wraps(func)
def wrapper(*args, **kwargs):
try:
> retval = cfunc(*args, **kwargs)
E RecursionError: maximum recursion depth exceeded in comparison
E
E To execute this test, run the following from the base repo dir:
E python test/test_sympy_utils.py -k TestValueRanges.test_binary_ref_fn_floordiv_dtype_float
E
E This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
args = (FloorDiv, -1.00000000000000, -1.00000000000000)
cfunc = <functools._lru_cache_wrapper object at 0x7fc5303173a0>
func = <function Function.__new__ at 0x7fc530317280>
kwargs = {}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131151
Approved by: https://github.com/ezyang
https://github.com/pytorch/pytorch/pull/126586 tried to reset dynamo before each unit test. That PR got reverted a couple of times because we saw post-land test failures that we didn't see before merge. This PR only resets dynamo before each test in `test_ops_gradients.py` to make it easier to land.
Eventually, after we reset dynamo in each individual test file, we can move the change to the base class (TestCase) and remove the change in individual test files.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131397
Approved by: https://github.com/zou3519
ghstack dependencies: #131551, #131388, #131372
https://github.com/pytorch/pytorch/pull/126586 tried to reset dynamo before each unit test. That PR got reverted a couple of times because we saw post-land test failures that we didn't see before merge. This PR only resets dynamo before each test in `test_module.py` to make it easier to land.
Eventually, after we reset dynamo in each individual test file, we can move the change to the base class (TestCase) and remove the change in individual test files.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131372
Approved by: https://github.com/zou3519
ghstack dependencies: #131551, #131388
Made the following changes:
- mutates_args is now keyword-only and mandatory. This is to align with
torch.library.custom_op (which makes it mandatory because it's easy to
miss)
- op_name is now keyword-only. This helps the readability of the API
- updated all usages of infer_schema
This change is not BC-breaking because we introduced
torch.library.infer_schema a couple of days ago.
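A minimal sketch of the updated call (the example function is illustrative):
```
import torch
from torch.library import infer_schema

def mysin(x: torch.Tensor) -> torch.Tensor:
    return torch.sin(x)

# mutates_args is now keyword-only and mandatory; op_name is keyword-only.
print(infer_schema(mysin, mutates_args=()))  # e.g. "(Tensor x) -> Tensor"
```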
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130705
Approved by: https://github.com/yushangdi
ghstack dependencies: #131777
On Windows, _triton.py creates a confusing error (`RuntimeError: Should never be installed`), as triton is not supported on Windows. This is not caught by the current PyTorch exception handling. This pull request adds new exception handling for the runtime error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132006
Approved by: https://github.com/oulgen
https://github.com/pytorch/pytorch/pull/126586 tried to reset dynamo before each unit test. That PR got reverted a couple of times because we saw post-land test failures that we didn't see before merge. This PR only resets dynamo before each test in `test_torch.py` to make it easier to land.
Eventually, after we reset dynamo in each individual test file, we can move the change to the base class (TestCase) and remove the change in individual test files.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131388
Approved by: https://github.com/zou3519
ghstack dependencies: #131551
Fix the compilation error:
```cpp
/tmp/tmpywg34bca/tg/ctg7wbli6pvydsjr2xsxamdbamkquhlincuky3dzopa3ilrxqdwt.cpp:401:24: error: cannot convert ‘at::Tensor’ to ‘const bfloat16*’ {aka ‘const c10::BFloat16*’}
401 | cpp_fused_div_mm_0(arg2_1, constant2, _frozen_param1, buf1);
| ^~~~~~
| |
| at::Tensor
```
The generated code after the fix will be:
```cpp
cpp_fused_div_mm_0((bfloat16*)(arg2_1.data_ptr()), (bfloat16*)(constant2.data_ptr()), (bfloat16*)(_frozen_param1.data_ptr()), (bfloat16*)(buf1.data_ptr()));
```
Multiple changes are required for ABI compatible mode. Separate it into a follow-up PR in this ghstack: https://github.com/pytorch/pytorch/pull/131841
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129557
Approved by: https://github.com/leslie-fang-intel
This fixes a few instances where we assumed indexing expressions were
non-negative. This is not valid when we have more complicated
expressions involving masking e.g. pointwise cat.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131761
Approved by: https://github.com/ezyang
# Motivation
Structured codegen is beneficial for decoupling tensor meta setting from kernel implementation. At present, XPU operators need to handle tensor metas in a hand-written way.
We plan to leverage the codegen system to auto-generate structured operators. This PR facilitates `DispatchStub` support for Intel GPUs. Based on that, XPU operators will be able to register kernel functors to operator stubs.
This is a prerequisite of PR #130082, where we will modify the codegen system to generate the XPU-needed source files and headers.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130019
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/albanD
Previously, using `_MaskPartial` with multiple embeddings had the following issues:
1. Suppose an `nn.Embedding` has shape `[vocab_size, emb_size]`. When there are more than one embeddings, sharing the same `vocab_size` but with different `emb_size`s. Then they would not share `OpStrategy` since each, when involved in computation, would have different `OpSchema`; however, there would be cache hit for redistribute (specifically `_gen_transform_infos` in `torch/distributed/_tensor/_redistribute.py` when doing `Replicate` -> `_MaskPartial`) as the `_MaskPartial` only has `vocab_size` as `logical_dim_size` but not `emb_size` as attribute. This cache hit is undesirable and would cause trouble when doing all-reduce/reduce-scatter on the new `_MaskPartial` in a separate `OpStrategy`. The error was reported in #130725. In this PR, we introduce `offset_shape` to represent the embedding's full shape to avoid cache hit from embeddings of different shapes.
2. The second issue is when we have two `nn.Embedding`s `emb1` and `emb2` with the same shape. There will be cache hit not only in `_gen_transform_infos`, but also in `OpStrategy` generation. Previously, if we sequentially do `Replicate` -> `_MaskPartial` for both `emb1` `emb2` and then sequentially do reduction on the `_MaskPartial` of `emb1`, it would destroy the `MaskBuffer` and `emb2` would hit error. This PR adds a `refcount` for the `MaskBuffer` so that it can be properly shared by multiple `nn.Embedding`s.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131264
Approved by: https://github.com/wanchaol
We're currently under-counting mutations from ExternKernel since they use `NoneLayout` which doesn't have an associated shape and dtype. Instead, we can get that information from the buffer being mutated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131645
Approved by: https://github.com/jansel
Summary:
The `test_model_modified_weights` in `test_aot_inductor.py` has been failing internally for a while. The behavior leading to the test failure was that, after updating the eager model's weights and recompiling the (CPU) model with AOTI, the output of the model was identical to the one before the weights were updated.
The root cause is here in Python:
8927fc209f/test/inductor/test_aot_inductor_utils.py (L69-L71)
which, in turn, instantiates the `Runner` object in C++ relying on `dlopen` for loading the *.so. The problem is that a repeated `dlopen` call does not reload the library from the same path unless `dlclose` is called in between the two `dlopen` calls. There is a `dlclose` in the `Runner`'s destructor, but it's not called, likely due to the way the loaded `runner` gets closed over in Python:
8927fc209f/test/inductor/test_aot_inductor_utils.py (L83-L94)
Here we add copying the *.so file to a unique temporary path right before loading it into a `runner` to avoid the `dlopen` staleness described above. This fixes the `test_model_modified_weights` and, hopefully, will help avoiding similar errors in the future tests.
Test Plan: Tested internally.
Differential Revision: D60348165
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131994
Approved by: https://github.com/chenyang78
When a collective can be hidden through either simple overlapping or micro-pipeline TP, we prefer simple overlapping to avoid the overhead associated with decomposition. If `reorder_for_compute_comm_overlap` is enabled, we identify collectives that can be hidden through simple overlapping and exclude them from micro-pipeline TP candidates.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131410
Approved by: https://github.com/weifengpy
This PR enables the Inductor compute/comm reordering passes to Traceable FSDP2 to achieve overlap. Note that the overlap is not maximally optimized yet and the follow-up work will be done in subsequent PRs.
Test commands:
- `pytest -rA test/distributed/test_compute_comm_reordering.py::TestComputeCommReorderingMultiProc`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131614
Approved by: https://github.com/yifuwang
ghstack dependencies: #131510
This PR creates these `GroupedSchedulerNode`s:
- One for each all-gather code block (cast + copy-in + all-gather)
- One for each all-gather-wait code block (all-gather-wait + copy-out)
- One for each reduce-scatter code block (copy-in + reduce-scatter)
- One for each reduce-scatter-wait code block (reduce-scatter-wait)
This serves two goals:
- Prevent outside ops from being fused into these op groups, in order to have more predicable memory usage.
- Make it easier to specify the dependency e.g. from `i+1` all-gather group node to the `i` all-gather-wait group node, to enforce FSDP2 comm ordering (i.e. "serialization of comms").
The actual "reorder-for-FSDP-compute-comm-overlap" PR will come next.
Test commands:
- `pytest -rA test/distributed/test_compute_comm_reordering.py::TestComputeCommReorderingMultiProc`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131510
Approved by: https://github.com/yifuwang
Modify the existing `softmax` operator in PyTorch, invoked by `torch.softmax`, to allow for reductions along the jagged dimension of a nested tensor. The function originally had a basic implementation for reducing along 1 non-ragged dimension. This diff, which uses the aten padding operator, enables PyTorch users to invoke `torch.softmax` on a nested tensor when reducing along the ragged dimension, e.g. `*` in a `(B, *, M)` nested tensor.
Write unit tests based on the `sum` and `mean` jagged operators to verify the accuracy of the ragged reduction implementation for `torch.softmax`. Add unit tests to verify error handling for unsupported features in `NestedTensor` `torch.softmax`.
Note that this implementation is limited to nested tensors with `ragged_idx == 1`, i.e. the ragged dimension is not transposed. In addition, the `softmax` operator is required to take in as input an integer for the reduction dimension `dim`, requiring new unit tests heavily inspired by the `sum` and `mean` jagged operator unit tests. `Softmax` also allows for reducing along the batch dimension.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131518
Approved by: https://github.com/davidberard98
## Motivation
This refactor aligns our testing methodology with the Flash Attention upstream repository while addressing several key issues:
1. **Standardized comparison**: We now compare fused kernels against float64 references, using the maximum of a calculated tolerance (based on same-precision math implementation) or standard float32 `atol`.
2. **Reduced redundancy**: Utilizing the same tensors for both same-precision math and fused kernel runs eliminates duplication.
3. **Improved maintainability**: The new approach simplifies tolerance adjustments across all affected tests.
4. **Consistency**: Standardizing tensor comparisons ensures a more uniform and reliable testing suite.
These changes collectively simplify our testing code, improve its maintainability, and provide a more robust framework for validating our attention mechanisms.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131743
Approved by: https://github.com/jainapurva, https://github.com/jbschlosser
Changes:
1. Switch `AotCodeCompiler` to the new cpp_builder.
2. Only use `deprecated_cpp_compile_command` for `fb_code`, because I can no longer debug it without access to the Meta-internal environment.
3. Add `TODO` comments so that a Meta employee can help continue this work.
4. Due to item 3, only the `fb_code` use of `deprecated_cpp_compile_command` remains to be fixed, so let's remove `validate_new_cpp_commands`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130127
Approved by: https://github.com/jgong5, https://github.com/jansel
This synchronizes lf-canary-scale-config and lf-scale-config with the ones in test-infra.
This really needs some automatic validation to prevent it from drifting out of sync over and over again (coming soon...).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131955
Approved by: https://github.com/malfet
This PR fixes a bug in `test_correct_module_names` introduced in #130497. It also addresses post-fix test failures in:
* `torch/ao/quantization/__init__.py` - set the correct `__module__` for several public API helpers
* `torch/library.py` - add `register_vmap` to `__all__`
* `torch/nn/attention/flex_attention.py` - make `round_up_to_multiple` private by prepending an underscore
* `torch/storage.py` - introduce `__all__` to avoid `Self` being re-exported as a public API
* `torch/distributed/pipelining/schedules.py` - add `ZeroBubbleAlgorithm` to `__all__`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131386
Approved by: https://github.com/albanD
Summary: CPU CI nodes failed to find valid VecISA because importing torch under the default pytorch directory will fail with the following msg, so switch cwd to a tmp directory.
```
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/var/lib/jenkins/workspace/torch/__init__.py", line 66, in <module>
from torch.torch_version import __version__ as __version__
File "/var/lib/jenkins/workspace/torch/torch_version.py", line 4, in <module>
from torch.version import __version__ as internal_version
ModuleNotFoundError: No module named 'torch.version'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131812
Approved by: https://github.com/eellison, https://github.com/malfet
Closes #129507
This makes two changes to the sort kernel:
1. Use int16 for the indices since we only operate on small dims anyway
2. Instead of passing an explicit mask, we pass the rnumel and imply the
mask from that which saves an additional reduction in the sort
kernel's inner loop.
In my benchmarks, this gives enough of a perf improvement to bump up the
max rblock to 512.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131719
Approved by: https://github.com/eellison
We automatically generate FakeTensor support for them (the FakeTensor
kernel for a triton kernel is "return None"). The same thing should
apply to the meta kernel.
Tests:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131896
Approved by: https://github.com/oulgen
Previously, FlopCounterMode would ignore any custom ops registered
through `register_flop_formula`. The problem was:
- register_flop_formula(target) requires target to be an OpOverloadPacket.
- register_flop_formula used register_decomposition to populate its registry
- register_decomposition decomposes the OpOverloadPacket into OpOverload before
putting it into the registry
- FlopCounterMode ignores OpOverloads in its registry (it assumes the
registry is a dictionary mapping OpOverloadPacket to flop formula).
register_decomposition is too heavy of a hammer, plus this isn't a
decomposition, so I changed the registration mechanism.
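A hedged sketch of the intended usage after this change (the custom op and its flop formula are illustrative):
```
import math
import torch
from torch.utils.flop_counter import FlopCounterMode, register_flop_formula

@torch.library.custom_op("mylib::foo", mutates_args=())
def foo(x: torch.Tensor) -> torch.Tensor:
    return x.clone()

@register_flop_formula(torch.ops.mylib.foo)  # an OpOverloadPacket, as required
def foo_flop(x_shape, *args, out_shape=None, **kwargs) -> int:
    return math.prod(x_shape)  # assume one flop per input element

with FlopCounterMode(display=False) as mode:
    foo(torch.randn(4, 4))
print(mode.get_total_flops())  # 16
```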
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131777
Approved by: https://github.com/Chillee
Implemented by extending `collections.abc.MutableSet` and backing it with a dictionary, which is ordered. From collections.abc.MutableSet:
```
A mutable set is a finite, iterable container.
This class provides concrete generic implementations of all
methods except for __contains__, __iter__, __len__,
add(), and discard().
```
In addition to implementing those methods I also had to define some methods of python's set which were not implemented in MutableSet.
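For illustration, a minimal sketch of the approach (class and method names here are illustrative, not the exact ones added in this PR):
```
from collections.abc import MutableSet

class OrderedSet(MutableSet):
    def __init__(self, iterable=()):
        self._dict = dict.fromkeys(iterable)  # dicts preserve insertion order

    def __contains__(self, item):
        return item in self._dict

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def add(self, item):
        self._dict[item] = None

    def discard(self, item):
        self._dict.pop(item, None)

s = OrderedSet([3, 1, 2])
s.add(1)
print(list(s))  # [3, 1, 2] -- insertion order preserved
```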
I reused the tests from Python's standard library. A few tests didn't pass because of edge-case behavior that is not necessary to reimplement:
- support self-referencing repr
- erroring when an member's `__eq__` function would modify the set itself
- MutableSet supports Iterables as inputs, but not sequences (pretty rare..)
- Some specifics of exact equivalent type errors being thrown
- [The protocol for automatic conversion to immutable](https://docs.python.org/2/library/sets.html#protocol-for-automatic-conversion-to-immutable)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130003
Approved by: https://github.com/aorenste
Reland https://github.com/pytorch/pytorch/pull/126704
#### Fixes the issue where the type of `nn.Module._state_dict_hooks` was changed in that PR, which was problematic:
Instead of using a `Tuple[Callable, bool]` to track whether the private `_register_state_dict_hook` or the public `register_state_dict_post_hook` API was used to register the hook (and toggling the behavior accordingly), I set an attribute on the Callable in the private API, which is never cleaned up.
If a callable previously registered using the private API is registered via the public API, a RuntimeError will be raised.
#### Copied from previous PR description
Fixes https://github.com/pytorch/pytorch/issues/75287 and https://github.com/pytorch/pytorch/issues/117437
- `nn.Module._register_state_dict_hook` --> add public `nn.Module.register_state_dict_post_hook`
- Add a test as this API was previously untested
- `nn.Module._register_load_state_dict_pre_hook` --> add public `nn.Module.register_load_state_dict_pre_hook` (remove the `with_module` flag, default it to `True`)
~- For consistency with optimizer `load_state_dict_pre_hook` raised by @janeyx99, allow the pre-hook to return a new `state_dict`~
- For the issue raised by https://github.com/pytorch/pytorch/issues/117437 regarding the `_register_state_dict_hook` semantic of returning a new state_dict only being respected for the root module (private hook only):
- Document this for private `_register_state_dict_hook`
- Remove this for the public `register_state_dict_post_hook`
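A small usage sketch of the new public API, assuming the post-hook keeps the `(module, state_dict, prefix, local_metadata)` signature of the private hook (the buffer name here is just an example):
```python
import torch
from torch import nn

def drop_scratch(module, state_dict, prefix, local_metadata):
    # Mutate the state_dict in place; e.g. drop an illustrative buffer entry.
    state_dict.pop(prefix + "scratch", None)

m = nn.Linear(2, 2)
m.register_buffer("scratch", torch.zeros(1))
m.register_state_dict_post_hook(drop_scratch)
print(m.state_dict().keys())  # 'scratch' no longer appears
```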
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131690
Approved by: https://github.com/albanD
Implements the donated buffer feature and adds unit tests. A donated buffer is a saved tensor that is not aliased with forward inputs, fw_outputs (except saved tensors), or bw_outputs. We detect donated buffers during `aot_dispatch_autograd` and store them in `ViewAndMutationMetadata`, so that they can be accessed in inductor.
Fixes #129496
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130580
Approved by: https://github.com/bdhirsh
BE task T195600898 (internal).
The 3 tests
```
test_non_contiguous_input_mm
test_non_contiguous_input_bmm
test_non_contiguous_input_addmm
```
had the following error in TestX:
```
self.assertTrue(torch.allclose(ref, act, atol=1e-2, rtol=1e-2))
AssertionError: False is not true
```
The tolerance comparing eager and compiled results is too small, perhaps because of a Triton update that changed numerics:
```
Mismatched elements: 25 / 38597376 (0.0%)
Greatest absolute difference: 0.015625 at index (3771, 509) (up to 0.01 allowed)
Greatest relative difference: 9.375 at index (13687, 48) (up to 0.01 allowed)
```
Change the absolute tolerance from 0.01 to 0.02. Also switch to `torch.testing.assert_close`, which prints out the greatest absolute/relative difference (as shown above) when the assertion fails.
`test_non_contiguous_input_mm_plus_mm` has a different problem; there we just switch to `torch.testing.assert_close` to be uniform with the other tests.
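For illustration, the relaxed comparison looks roughly like this (the values here are made up):
```python
import torch

ref = torch.zeros(4)
act = ref + 0.015  # within the new 0.02 absolute tolerance
# assert_close reports the greatest absolute/relative mismatch on failure,
# unlike a bare torch.allclose + assertTrue check.
torch.testing.assert_close(act, ref, atol=2e-2, rtol=2e-2)
```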
Test commands:
```
python test/inductor/test_max_autotune.py -k TestMaxAutotune.test_non_contiguous_input_mm
python test/inductor/test_max_autotune.py -k TestMaxAutotune.test_non_contiguous_input_addmm
python test/inductor/test_max_autotune.py -k TestMaxAutotune.test_non_contiguous_input_bmm
```
Internal stress tests pass now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131822
Approved by: https://github.com/shunting314
Summary:
Fixes https://github.com/pytorch/pytorch/issues/130379.
The original error is that the verifier finds the placeholder nodes' meta["val"] missing in the subgraph of the WrapSetGradEnabled hop.
In this PR, we fix it by re-ordering replace_set_grad_with_hop_pass to run after the lift_constant_tensor pass, because only after lifting constants do all the constant attrs have meta["val"].
Test Plan: buck2 test test:test_export -- -r "test_setgrad_lifted_tensor"
Differential Revision: D60244935
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131787
Approved by: https://github.com/yushangdi
This PR enables AutoHeuristic for kernel choice selection, where the feedback cannot be provided immediately when AutoHeuristic is called, but only after autotuning has happened. The steps are the following:
1. When the AutoHeuristic constructor is called, AutoHeuristic registers a function in select_algorithm.py.
2. After autotuning in select_algorithm.py has happened, and there is an entry in autoheuristic_registry, select_algorithm provides the autotuning results to AutoHeuristic, which stores them.
I enabled AutoHeuristic for mixed_mm to have an example to test it on. We probably want to add more context, and also add an augment_context function; I will add support for this in another PR.
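A generic sketch of this deferred-feedback pattern (names and data are hypothetical, not the actual inductor code):
```python
# The heuristic registers a callback keyed by the decision it was asked about;
# the autotuner invokes that callback once real timings are available.
_registry = {}

def register_feedback(key, callback):
    _registry[key] = callback

def report_autotune_results(key, timings):
    callback = _registry.pop(key, None)
    if callback is not None:
        callback(timings)

register_feedback("mixed_mm/shape_123", lambda t: print("collected", t))
report_autotune_results("mixed_mm/shape_123", {"choice_a": 0.12, "choice_b": 0.09})
```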
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131610
Approved by: https://github.com/eellison
Summary: Previously, when folding BN into conv, we rely on DCE
to clean up the unused BN node from the graph. This works if
the model is already in eval mode, but fails if the model is
still in train mode because DCE doesn't remove nodes with
potential side effects (in this case `_native_batch_norm_legit`).
This required users to move the model to eval mode before calling
convert in order to get a properly DCE'd graph.
To solve this, we manually erase the BN node after folding
instead of relying on DCE. This relaxes the ordering constraints
between `move_exported_model_to_eval` and `convert_pt2e`.
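A simplified FX sketch of the idea (hypothetical helper; the real pass also has to handle the BN op's tuple output and its getitem users):
```python
import torch.fx as fx

def erase_folded_bn(graph: fx.Graph, bn_node: fx.Node, conv_node: fx.Node) -> None:
    # Redirect users of the BN node to the folded conv output, then erase the
    # BN node explicitly instead of waiting for DCE (which skips ops with
    # potential side effects when the model is still in train mode).
    bn_node.replace_all_uses_with(conv_node)
    graph.erase_node(bn_node)
    graph.lint()
```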
Test Plan:
python test/test_quantization.py TestQuantizePT2EQAT_ConvBn1d.test_fold_bn_erases_bn_node
python test/test_quantization.py TestQuantizePT2EQAT_ConvBn2d.test_fold_bn_erases_bn_node
Reviewers: jerryzh168, yushangdi
Subscribers: jerryzh168, yushangdi, supriyar
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131651
Approved by: https://github.com/yushangdi
Summary: This is experimental work. Depending on the performance stability and benchmark coverage on A10g, we may consider using A10g for manually-triggered per-PR performance comparisons instead of exhausting expensive A100 instances.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131816
Approved by: https://github.com/huydhn
Summary: Pretty straightforward. ROCm 6.2.0 changed the `__hip_bfloat16` API (see [this PR](481912a1fd)), so we gate the impl on the `__BF16_HOST_DEVICE__` macro to support both older and newer versions of ROCm.
Test Plan: CI
Differential Revision: D60024830
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131359
Approved by: https://github.com/houseroad
https://github.com/pytorch/pytorch/issues/105290
The problem in the original flow is that:
(1) the user calls `torch.mul(complex_tensor, complex_scalar)`
(2) python arg parser wraps the complex scalar in a `scalar_tensor`, and dispatches to `aten.mul.Tensor(self, scalar_other)`
(3) autograd sees `aten.mul.Tensor`, calls `scalar_other.conj()` [here](https://github.com/pytorch/pytorch/blob/main/torch/csrc/autograd/FunctionsManual.cpp#L597)
(4) during proxy tensor tracing, this gets dispatched to `aten._conj(scalar_tensor)`
(5) when we hit __torch_dispatch__, the scalar_tensor is converted back into a plain python scalar
(6) we error during tracing, because in `FunctionalTensorMode.__torch_dispatch__` we try to redispatch on `aten._conj.default(plain_python_scalar)`, and this overload does not accept python scalars.
My attempted fix in this PR is to update `TensorBase::conj()` to check if the current tensor is a scalar tensor (wrapped number), and if so, manually:
(1) convert the scalar tensor back into a scalar
(2) call scalar.conj() directly
(3) convert the result back into a wrapped tensor
This avoids having to go through python entirely in the tracing case (which is fine, because these scalar tensors are constants that we can const-prop during tracing anyway).
Notably, I did **not** add e.g. a new `aten._conj.Scalar` overload. This would not actually fix the problem, since the bug is that we call `aten._conj.default(python_scalar)` directly. We would also need to muck with all `__torch_dispatch__` call sites to know to convert python scalars back into tensors directly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131482
Approved by: https://github.com/zou3519, https://github.com/ezyang
ghstack dependencies: #131403
Fixes https://github.com/pytorch/pytorch/issues/121353
Our handling of `.data` in dynamo today basically just converts `y = x.data` into `y = x.detach()`. The semantics of these two ops are not quite the same (see the sketch after the two points below), because:
(1) any future mutations on `x.data` will be fully ignored by autograd
(2) any mutations on `x.detach()` will bump x's version counter
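A minimal eager-mode sketch of this difference:
```python
import torch

x = torch.randn(3, requires_grad=True)
v = x._version
x.detach().add_(1)      # mutation through .detach() bumps x's version counter
assert x._version == v + 1

y = torch.randn(3, requires_grad=True)
v = y._version
y.data.add_(1)          # mutation through .data is invisible to autograd,
assert y._version == v  # so the version counter is not bumped
```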
The linked model does a `.data` mutation that is hidden from autograd in eager, but ends up erroring during AOTDispatcher tracing.
I updated dynamo's handling so that:
(1) when dynamo sees a call to `getattr(tensor, "data")` and calls `.detach()` we set a flag on the returned `TensorVariable` indicating it came from `.data`
(2) on any tensor method that we call with an input `TensorVariable` with this flag turned on, we proxy autograd's `preserve_version_counter` logic into the graph, to properly reset the VC after the op is run.
One thing to note is that I don't actually do this on every op that we pass the tensor to: I only do it for tensor methods that appear to be mutations (by checking for a trailing underscore). My thought was that:
(1) I didn't want to do this for **every** op that you pass `y` into, since that will e.g. triple the number of nodes in the graph, and could cause compile time regressions if you use .data
(2) this situation is pretty rare in general, and I'm hoping that "tensor method mutations" cover most reasonable mutation cases. If we manage to miss a case, you will get a loud error during tracing anyway, so there is not a safety issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131403
Approved by: https://github.com/anijain2305, https://github.com/zou3519
Looks like in the halide codegen refactor, the range tree codegen was
split out from initialize_range_tree into its own function, but
triton_split_scan.py wasn't updated to reflect this change.
The result was that the codegen got invoked twice, which is benign but makes
the kernel harder to read.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131669
Approved by: https://github.com/Chillee
Fixes https://github.com/pytorch/pytorch/issues/130750.
Repro of lazy/eager `map` discrepancy without `islice`:
```python
def fn(a, b):
    y = 1

    def f(x):
        nonlocal y
        y += 1
        return x

    l = list(zip([a, b], map(f, [1, 2, 3, 4])))
    return a + y
```
The major change is that we implement `MapVariable` and `ZipVariable` based on `IteratorVariable`. Before, `map` and `zip` were being traced by immediately unpacking the result as a `TupleVariable`, which is wrong in cases such as the example above.
`MapVariable`s are not allowed to be unpacked while `ZipVariable`s can only be unpacked if all of its iterables can also be unpacked.
We also add new `[has_]force_unpack_var_sequence` methods to `VariableTracker` for the case where it is safe to unpack the entire sequence lazily, e.g., when building a list from a map (i.e. `list(map(f, ...))`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131413
Approved by: https://github.com/anijain2305
Brian debugged the difference in output type between the inference and train graphs.
The partitioner sometimes returned a list as the output type.
After this PR it will always return a tuple.
New graphs may land in tests between the time this PR's CI jobs finish and the time it lands.
This could be easily fixed with a fast-forward fix using:
```
EXPECTTEST_ACCEPT=1 python test/test.py
```
Adding ciflows/periodic to minimize this probability
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131759
Approved by: https://github.com/ezyang, https://github.com/bdhirsh
Summary: Found this "cannot find -ltorch: No such file or directory" issue when collecting AOTI CPU perf for the dashboard. Debugging on the CI machine revealed two problems: 1) no valid VEC_ISA was picked; 2) when 1) happens, the libtorch path is not added to the linker path.
This PR fixes the second problem. A later PR will fix the first problem, but somehow finding the right VEC_ISA causes a performance regression, which needs more investigation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131791
Approved by: https://github.com/zou3519, https://github.com/chenyang78
Suggests fixes for data-dependent errors in non-strict export.
Any data-dependent error has an unresolved condition on unbacked symints. A mechanizable strategy for fixing such errors, which this PR enables, is to "bash" them using `torch._check()`s. For each error we suggest using `torch._check()` on the condition or its negation. The user selects and copy-pastes the suggested fix and continues.
For example, here's an existing data-dependent error message with the suffix following `<snip>...</snip>` added by this PR:
```
Could not guard on data-dependent expression Eq(u2, u1) (unhinted: Eq(u2, u1)). (Size-like symbols: u1)
<snip>...</snip>
User code:
File "test/export/test_export.py", line 1944, in forward
return r.view(items[0], items[2])
Suggested fixes (please choose one of the following):
1. torch._check(items[2] == r.shape[1])
2. torch._check(items[2] != r.shape[1])"
```
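For illustration, copy-pasting suggested fix 1 into the user code would look roughly like this (sketch only, mirroring the example above):
```python
import torch

def forward(r, items):
    # Suggested fix 1: assert the data-dependent condition so export can
    # resolve the guard instead of erroring.
    torch._check(items[2] == r.shape[1])
    return r.view(items[0], items[2])
```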
Tests in this PR illustrate this workflow, by taking common examples of data-dependent errors and bashing them until success, purely based on suggested fixes. In particular, we test this workflow on the "puzzlers" in https://www.internalfb.com/intern/anp/view/?id=5330476 (thanks @ezyang).
In terms of implementation, we focus on non-strict mode, where we can intercept torch function calls to install a handler that walks up the stack from the error, finding the closest non-torch frame and inspecting its locals for symints appearing in the error. The suggested fixes then access these symints through the local variables so that they can be (a) easily understood by the user (b) directly added to the code.
Implementing this idea in strict mode is follow-up work—we have already investigated what it would take, and decided to separate it out of this PR for reasons described next.
It's not too hard to map symints to locals in Dynamo (although it needs to happen elsewhere, i.e., intercepting torch function calls won't work). However, unfortunately this doesn't seem to be enough; the graph modules created by Dynamo when going through AOTAutograd can raise further data-dependent errors in some cases, and thus we need yet another mechanism to map symints to locals for graph modules, via captured source-level metadata and FX node walking. This latter component will require some care to build properly, or we might conclude it is altogether unnecessary and fix Dynamo instead.
Differential Revision: D56867432
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125378
Approved by: https://github.com/ezyang
Add support for transposed, non-contiguous `NestedTensor`s, where `ragged_idx > 1`, for the aten operators `sum` and `mean`. This diff enables reducing along the jagged dimension for non-contiguous `NestedTensor`s, transposed between non-batch dimensions as well as between a ragged and a non-batch dimension. For example, users can now reduce a `NestedTensor` of shape `(B, M, *, N)` along `*` or `(B, N, M, *)` along `*`.
Parametrize existing unit tests and add new unit tests verifying the accuracy of implementations on `NestedTensor`s that transpose between 2 non-batch dimensions as well as between a ragged and a non-batch dimension.
Differential Revision: [D59847927](https://our.internmc.facebook.com/intern/diff/D59847927/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131517
Approved by: https://github.com/davidberard98
Summary:
Dynamo doesn't track whether buffers are `persistent`. This led to some ugly code where we would mark buffers as always persistent when creating signatures, then later check whether the buffers were not in the state dict to infer whether they were non-persistent, and use this to fix up the signature.
This PR instead defines a utility to look up all the non-persistent buffers registered inside a module (this information is recorded in a private `_non_persistent_buffers_set` module attribute), and uses it to (a) correctly set the persistent flag on buffers when creating signatures (b) transfer this information to a Dynamo-traced graph module, which then causes non-persistent buffers to (correctly) not show up in the state dict.
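A hypothetical helper illustrating the lookup described above (not necessarily the exact utility added in this PR):
```python
import torch
from torch import nn

def non_persistent_buffer_names(mod: nn.Module) -> set:
    # Walk all submodules and collect the fully-qualified names recorded in
    # each module's private _non_persistent_buffers_set.
    names = set()
    for prefix, submod in mod.named_modules():
        for name in getattr(submod, "_non_persistent_buffers_set", ()):
            names.add(f"{prefix}.{name}" if prefix else name)
    return names

class M(nn.Module):
    def __init__(self):
        super().__init__()
        self.inner = nn.Linear(2, 2)
        self.inner.register_buffer("scratch", torch.zeros(1), persistent=False)

print(non_persistent_buffer_names(M()))  # {'inner.scratch'}
```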
Test Plan: existing tests + new case with non-persistent buffer in nested module
Differential Revision: D60224656
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131756
Approved by: https://github.com/zhxchen17, https://github.com/ydwu4
After a recent refactoring of inductor, `.users` are now associated with buffers instead of scheduler nodes.
In `debug.py`, one such usage of `.users` is not updated accordingly, and the change here fixes that.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131796
Approved by: https://github.com/yf225
Persistent kernels are sometimes able to remove intermediate buffers that would
otherwise be needed for the non-persistent reduction kernel. This makes
multi kernel's codegen more complicated as it needs to drop these extra
arguments at runtime after selecting the correct kernel to run.
Instead, this PR updates the persistent kernel's `must_keep_buffers` so that these
buffers aren't dropped during codegen, and both kernels have the same signature.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127724
Approved by: https://github.com/shunting314
ghstack dependencies: #131044
This makes TCPStore `wait` timeout print actually useful info instead of a generic `Socket Timeout` message on timeout.
Bonus:
* fix weirdness where `connect_timeout` only supported seconds unlike the rest of our timeouts (thus the minimum timeout was 1s)
* Fixed tests that used a 10s timeout (test_store now only takes 20s instead of 40s)
Ex:
```
DistStoreError: wait timeout after 100ms, keys: /the_key
```
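An illustrative sketch (host/port are hypothetical):
```python
from datetime import timedelta
import torch.distributed as dist

store = dist.TCPStore("127.0.0.1", 29500, world_size=1, is_master=True)
try:
    store.wait(["/the_key"], timedelta(milliseconds=100))
except Exception as e:  # DistStoreError in recent builds
    # Now reports something like "wait timeout after 100ms, keys: /the_key"
    # instead of a generic "Socket Timeout".
    print(e)
```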
Test plan:
```
python test/distributed/test_store.py
python test/distributed/test_c10d_gloo.py -v -k timeout
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131808
Approved by: https://github.com/kurman
Inductor would like a way to have activations that do not escape the backward graph marked as "donated", so we can re-use their memory during memory planning here: https://github.com/pytorch/pytorch/pull/130580
For this to be safe though, we need to know at runtime that autograd does not plan to retain the current autograd graph (either for another call to .backward() later, or if double backward is being used). In the linked PR, the current plan is to error when we detect this situation, and ask the user to turn off the donated buffer config (although if/once we get to the point of always delaying backward compilation to runtime, we can just wait until we know the runtime value to compile).
There isn't a way to know if the currently running backward is run with `retain_graph=True` from python - @soulitzer helped me figure out where to grab it so I added a python binding for it under `ctx.is_retain_graph()`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131038
Approved by: https://github.com/soulitzer
## What is a sympy fn str arg?
It's a string such as `sqrt` that also happens to name a real sympy function (e.g. `sympy.sqrt`).
## Crash
```
torch/_inductor/sizevars.py", line 468, in symbolic_hint
expr = self.simplify(expr) # where expr is 'sqrt'
torch/_inductor/sizevars.py", line 66, in simplify
return sympy.expand(expr).xreplace(self.replacements)
sympy/core/function.py", line 2816, in expand
return sympify(e).expand(deep=deep, modulus=modulus, **hints)
AttributeError: 'function' object has no attribute 'expand'
```
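The underlying sympy behavior can be reproduced outside inductor:
```python
import sympy

# sympify("sqrt") resolves to sympy's sqrt *function* rather than an Expr,
# so .expand() fails exactly as in the traceback above.
sympy.expand("sqrt")  # AttributeError: 'function' object has no attribute 'expand'
```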
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131253
Approved by: https://github.com/desertfire