rebase

benchmarking
Script for consolidation of sharded safetensor files
2025-10-30 11:44:59 +08:00 · 2025-05-30 14:30:12 -07:00 · 2025-05-30 14:27:37 -07:00 · 2025-05-30 14:18:51 -07:00 · 2025-05-30 10:40:32 -07:00 · 2025-05-30 10:40:30 -07:00
945 changed files with 28838 additions and 10325 deletions
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -31,33 +31,47 @@ def build_ArmComputeLibrary() -> None:
        "build=native",
    ]
    acl_install_dir = "/acl"
-    acl_checkout_dir = "ComputeLibrary"
-    os.makedirs(acl_install_dir)
-    check_call(
-        [
-            "git",
-            "clone",
-            "https://github.com/ARM-software/ComputeLibrary.git",
-            "-b",
-            "v25.02",
-            "--depth",
-            "1",
-            "--shallow-submodules",
-        ]
-    )
+    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
+    if os.path.isdir(acl_install_dir):
+        shutil.rmtree(acl_install_dir)
+    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
+        check_call(
+            [
+                "git",
+                "clone",
+                "https://github.com/ARM-software/ComputeLibrary.git",
+                "-b",
+                "v25.02",
+                "--depth",
+                "1",
+                "--shallow-submodules",
+            ]
+        )

    check_call(
-        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
-        + acl_build_flags,
+        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
        cwd=acl_checkout_dir,
    )
-    for d in ["arm_compute", "include", "utils", "support", "src"]:
+    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


-def update_wheel(wheel_path, desired_cuda) -> None:
+def replace_tag(filename) -> None:
+    with open(filename) as f:
+        lines = f.readlines()
+    for i, line in enumerate(lines):
+        if line.startswith("Tag:"):
+            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
+            print(f"Updated tag from {line} to {lines[i]}")
+            break
+
+    with open(filename, "w") as f:
+        f.writelines(lines)
+
+
+def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    """
-    Update the cuda wheel libraries
+    Package the cuda wheel libraries
    """
    folder = os.path.dirname(wheel_path)
    wheelname = os.path.basename(wheel_path)
@ -88,30 +102,19 @@ def update_wheel(wheel_path, desired_cuda) -> None:
        "/usr/lib64/libgfortran.so.5",
        "/acl/build/libarm_compute.so",
        "/acl/build/libarm_compute_graph.so",
+        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_lapack_core.so.0",
+        "/usr/local/lib/libnvpl_blas_core.so.0",
    ]
-    if enable_cuda:
+
+    if "128" in desired_cuda:
        libs_to_copy += [
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-        if "126" in desired_cuda:
-            libs_to_copy += [
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
-                "/usr/local/cuda/lib64/libcufile.so.0",
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            ]
-        elif "128" in desired_cuda:
-            libs_to_copy += [
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
-                "/usr/local/cuda/lib64/libcufile.so.0",
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            ]
-    else:
-        libs_to_copy += [
-            "/opt/OpenBLAS/lib/libopenblas.so.0",
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
+            "/usr/local/cuda/lib64/libcufile.so.0",
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
        ]
+
    # Copy libraries to unzipped_folder/a/lib
    for lib_path in libs_to_copy:
        lib_name = os.path.basename(lib_path)
@ -120,6 +123,13 @@ def update_wheel(wheel_path, desired_cuda) -> None:
            f"cd {folder}/tmp/torch/lib/; "
            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
        )
+
+    # Make sure the wheel is tagged with manylinux_2_28
+    for f in os.scandir(f"{folder}/tmp/"):
+        if f.is_dir() and f.name.endswith(".dist-info"):
+            replace_tag(f"{f.path}/WHEEL")
+            break
+
    os.mkdir(f"{folder}/cuda_wheel")
    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
    shutil.move(
@ -194,8 +204,10 @@ if __name__ == "__main__":
    ).decode()

    print("Building PyTorch wheel")
-    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
-    os.system("cd /pytorch; python setup.py clean")
+    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    # MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
+    if enable_cuda:
+        build_vars = "MAX_JOBS=5 " + build_vars

    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
@ -242,6 +254,6 @@ if __name__ == "__main__":
        print("Updating Cuda Dependency")
        filename = os.listdir("/pytorch/dist/")
        wheel_path = f"/pytorch/dist/{filename[0]}"
-        update_wheel(wheel_path, desired_cuda)
+        package_cuda_wheel(wheel_path, desired_cuda)
    pytorch_wheel_name = complete_wheel("/pytorch/")
    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -1,7 +1,7 @@
 ARG CUDA_VERSION=12.4
 ARG BASE_TARGET=cuda${CUDA_VERSION}
 ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete
-FROM amd64/almalinux:8 as base
+FROM amd64/almalinux:8.10-20250519 as base

 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
@ -11,6 +11,8 @@ ARG DEVTOOLSET_VERSION=11

 RUN yum -y update
 RUN yum -y install epel-release
+# Install glibc-langpack-en to make sure the en_US.UTF-8 locale is available
+RUN yum -y install glibc-langpack-en
 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -109,8 +109,8 @@ case "$tag" in
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.1
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.8
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
@ -121,30 +121,6 @@ case "$tag" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.13
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
@ -156,8 +132,8 @@ case "$tag" in
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.6.3
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
@ -168,8 +144,8 @@ case "$tag" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.6.3
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
@ -180,8 +156,8 @@ case "$tag" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.6.3
+  pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-0bcc8265e677e5321606a3311bf71470f14456a8
+b0e26b7359c147b8aa0af686c20510fb9b15990a
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-96316ce50fade7e209553aba4898cd9b82aab83b
+c8757738a7418249896224430ce84888e8ecdd79
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -183,9 +183,9 @@ function prune_126 {

 function install_128 {
  CUDNN_VERSION=9.8.0.87
-  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
-  # install CUDA 12.8.0 in the same container
-  install_cuda 12.8.0 cuda_12.8.0_570.86.10_linux
+  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
+  # install CUDA 12.8.1 in the same container
+  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 $CUDNN_VERSION
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -31,8 +31,7 @@ pip_install \
 pip_install coloredlogs packaging

 pip_install onnxruntime==1.18.1
-pip_install onnx==1.17.0
-pip_install onnxscript==0.2.2 --no-deps
+pip_install onnxscript==0.2.6 --no-deps
 # required by onnxscript
 pip_install ml_dtypes

--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -5,7 +5,9 @@ ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV LANGUAGE=C.UTF-8

-ARG DEVTOOLSET_VERSION=13
+# There is a bugfix in gcc >= 14 for the interaction between precompiled headers and s390x vectorization.
+# With earlier gcc versions, test/inductor/test_cpu_cpp_wrapper.py will fail.
+ARG DEVTOOLSET_VERSION=14
 # Installed needed OS packages. This is to support all
 # the binary builds (torch, vision, audio, text, data)
 RUN yum -y install epel-release
@ -58,7 +60,8 @@ RUN yum install -y \
  libxslt-devel \
  libxml2-devel \
  openssl-devel \
-  valgrind
+  valgrind \
+  ninja-build

 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@ -103,9 +106,6 @@ CMD ["/bin/bash"]
 # install test dependencies:
 # - grpcio requires system openssl, bundled crypto fails to build
 RUN dnf install -y \
-  protobuf-devel \
-  protobuf-c-devel \
-  protobuf-lite-devel \
  hdf5-devel \
  python3-h5py \
  git
@ -129,6 +129,9 @@ RUN pip3 install flatbuffers && \
  git clone https://github.com/microsoft/onnxruntime && \
  cd onnxruntime && git checkout v1.21.0 && \
  git submodule update --init --recursive && \
-  ./build.sh --config Release --parallel 0 --enable_pybind --build_wheel --enable_training --enable_training_apis --enable_training_ops --skip_tests --allow_running_as_root && \
+  ./build.sh --config Release --parallel 0 --enable_pybind \
+  --build_wheel --enable_training --enable_training_apis \
+  --enable_training_ops --skip_tests --allow_running_as_root \
+  --compile_no_warning_as_error && \
  pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \
  cd .. && /bin/rm -rf ./onnxruntime
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -93,7 +93,7 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.14.0
+mypy==1.15.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
 #Pinned versions: 1.14.0
@ -166,10 +166,10 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:

-protobuf==3.20.2
-#Description:  Google’s data interchange format
-#Pinned versions: 3.20.1
-#test that import: test_tensorboard.py
+protobuf==5.29.4
+#Description:  Google's data interchange format
+#Pinned versions: 5.29.4
+#test that import: test_tensorboard.py, test/onnx/*

 psutil
 #Description: information on running processes and system utilization
@ -337,12 +337,12 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:

-onnx==1.17.0
-#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
+onnx==1.18.0
+#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:

-onnxscript==0.2.2
+onnxscript==0.2.6
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -15,6 +15,10 @@ sphinxext-opengraph==0.9.1
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.9.1

+sphinx_sitemap==2.6.0
+#Description: This is used to generate sitemap for PyTorch docs
+#Pinned versions: 2.6.0
+
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.3.0
+3.3.1
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -18,12 +18,10 @@ retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }

-PLATFORM="manylinux2014_x86_64"
+PLATFORM=""
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    retry yum install -q -y zip openssl
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
    PLATFORM="manylinux_2_28_x86_64"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
@ -36,6 +34,9 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then

    retry apt-get update
    retry apt-get -y install zip openssl
+else
+    echo "Unknown OS: '$OS_NAME'"
+    exit 1
 fi

 # We use the package name to test the package by passing this to 'pip install'
@ -79,8 +80,6 @@ if [[ -e /opt/openssl ]]; then
    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
 fi

-
-
 mkdir -p /tmp/$WHEELHOUSE_DIR

 export PATCHELF_BIN=/usr/local/bin/patchelf
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -36,10 +36,8 @@ if [[ -n "$DESIRED_CUDA" ]]; then
    if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
        CUDA_VERSION=${DESIRED_CUDA}
    else
-        # cu90, cu92, cu100, cu101
-        if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
-            CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
-        elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
+        # cu126, cu128 etc...
+        if [[ ${#DESIRED_CUDA} -eq 5 ]]; then
            CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
        fi
    fi
@ -61,10 +59,6 @@ case ${CUDA_VERSION} in
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
-    12.4)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
-        ;;
    11.8)
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
@ -91,14 +85,15 @@ fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true

 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
+else
+    echo "Unknown OS: '$OS_NAME'"
+    exit 1
 fi

 DEPS_LIST=(
@ -108,26 +103,8 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

-# CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary
-# since nvidia-cusparselt-cu11 is not available in PYPI
-if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
-        DEPS_SONAME+=(
-            "libcusparseLt.so.0"
-        )
-        DEPS_LIST+=(
-            "/usr/local/cuda/lib64/libcusparseLt.so.0"
-        )
-fi

-
-# Turn USE_CUFILE off for CUDA 11.8, 12.4 since nvidia-cufile-cu11 and 1.9.0.20 are
-# not available in PYPI
-if [[ $CUDA_VERSION == "11.8" || $CUDA_VERSION == "12.4" ]]; then
-    export USE_CUFILE=0
-fi
-
-
-# CUDA_VERSION 12.4, 12.6, 12.8
+# CUDA_VERSION 12.6, 12.8
 if [[ $CUDA_VERSION == 12* ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
@ -151,6 +128,8 @@ if [[ $CUDA_VERSION == 12* ]]; then
            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
            "/usr/local/cuda/lib64/libnvrtc.so.12"
            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
+            "/usr/local/cuda/lib64/libcufile.so.0"
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1"
        )
        DEPS_SONAME+=(
            "libcudnn_adv.so.9"
@ -168,17 +147,9 @@ if [[ $CUDA_VERSION == 12* ]]; then
            "libnvToolsExt.so.1"
            "libnvrtc.so.12"
            "libnvrtc-builtins.so"
+            "libcufile.so.0"
+            "libcufile_rdma.so.1"
        )
-        if [[ $USE_CUFILE == 1 ]]; then
-            DEPS_LIST+=(
-                "/usr/local/cuda/lib64/libcufile.so.0"
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1"
-            )
-            DEPS_SONAME+=(
-                "libcufile.so.0"
-                "libcufile_rdma.so.1"
-            )
-        fi
    else
        echo "Using nvidia libs from pypi."
        CUDA_RPATHS=(
@ -194,12 +165,8 @@ if [[ $CUDA_VERSION == 12* ]]; then
            '$ORIGIN/../../cusparselt/lib'
            '$ORIGIN/../../nvidia/nccl/lib'
            '$ORIGIN/../../nvidia/nvtx/lib'
+            '$ORIGIN/../../nvidia/cufile/lib'
        )
-        if [[ $USE_CUFILE == 1 ]]; then
-            CUDA_RPATHS+=(
-                '$ORIGIN/../../nvidia/cufile/lib'
-            )
-        fi
        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
@ -214,11 +181,25 @@ if [[ $CUDA_VERSION == 12* ]]; then
    fi
 elif [[ $CUDA_VERSION == "11.8" ]]; then
    export USE_STATIC_CUDNN=0
+    # Turn USE_CUFILE off for CUDA 11.8 since nvidia-cufile-cu11 and 1.9.0.20 are
+    # not available in PYPI
+    export USE_CUFILE=0
    # Try parallelizing nvcc as well
    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
    # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
    export BUILD_BUNDLE_PTXAS=1

+    # CUDA 11.8 has to ship libcusparseLt.so.0 with the binary
+    # since nvidia-cusparselt-cu11 is not available on PyPI
+    if [[ $USE_CUSPARSELT == "1" ]]; then
+        DEPS_SONAME+=(
+            "libcusparseLt.so.0"
+        )
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcusparseLt.so.0"
+        )
+    fi
+
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling with cudnn and cublas."
        DEPS_LIST+=(
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -22,9 +22,7 @@ retry () {

 # TODO move this into the Docker images
 OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    retry yum install -q -y zip openssl
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
@ -35,6 +33,9 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
    retry apt-get update
    retry apt-get -y install zip openssl
+else
+    echo "Unknown OS: '$OS_NAME'"
+    exit 1
 fi

 # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -40,7 +40,7 @@ if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
 else
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
  print_sccache_stats
--- a/.ci/pytorch/macos-common.sh
+++ b/.ci/pytorch/macos-common.sh
@ -20,14 +20,4 @@ print_cmake_info() {
  CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
  # Print all libraries under cmake rpath for debugging
  ls -la "$CONDA_INSTALLATION_DIR/../lib"
-
-  export CMAKE_EXEC
-  # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
-  # where cmake dependencies couldn't be found. This seems to point to how conda
-  # links $CMAKE_EXEC to its package cache when cloning a new environment
-  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
-  # Adding the rpath will invalidate cmake signature, so signing it again here
-  # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
-  # with an exit code 137 otherwise
-  codesign -f -s - "${CMAKE_EXEC}" || true
 }
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -165,6 +165,7 @@ test_jit_hooks() {
 torchbench_setup_macos() {
  git clone --recursive https://github.com/pytorch/vision torchvision
  git clone --recursive https://github.com/pytorch/audio torchaudio
+  brew install jpeg-turbo libpng

  pushd torchvision
  git fetch
@ -179,7 +180,8 @@ torchbench_setup_macos() {
  git checkout "$(cat ../.github/ci_commit_pins/audio.txt)"
  git submodule update --init --recursive
  python setup.py clean
-  python setup.py develop
+  # TODO: Remove this once we figure out how to make TorchAudio find the brew-installed OpenMP
+  USE_OPENMP=0 python setup.py develop
  popd

  # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
@ -187,9 +189,8 @@ torchbench_setup_macos() {
  checkout_install_torchbench
 }

-conda_benchmark_deps() {
-  conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn
-  conda install -y -c conda-forge librosa
+pip_benchmark_deps() {
+  python -mpip install --no-input astunparse requests cython scikit-learn
 }


@ -197,7 +198,7 @@ test_torchbench_perf() {
  print_cmake_info

  echo "Launching torchbench setup"
-  conda_benchmark_deps
+  pip_benchmark_deps
  torchbench_setup_macos

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -224,7 +225,7 @@ test_torchbench_smoketest() {
  print_cmake_info

  echo "Launching torchbench setup"
-  conda_benchmark_deps
+  pip_benchmark_deps
  # shellcheck disable=SC2119,SC2120
  torchbench_setup_macos

@ -232,8 +233,8 @@ test_torchbench_smoketest() {
  mkdir -p "$TEST_REPORTS_DIR"

  local device=mps
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo)
-  local hf_models=(GoogleFnet YituTechConvBert)
+  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor)
+  local hf_models=(GoogleFnet YituTechConvBert Speech2Text2ForCausalLM)

  for backend in eager inductor; do

@ -258,10 +259,10 @@ test_torchbench_smoketest() {
        if [ "$backend" == "inductor" ]; then
          PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
            --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-            --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
+            --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true
          PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
            --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-            --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
+            --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
        fi
      done
    done
@ -289,7 +290,7 @@ test_hf_perf() {
  print_cmake_info
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
-  conda_benchmark_deps
+  pip_benchmark_deps
  torchbench_setup_macos

  echo "Launching HuggingFace training perf run"
@ -305,7 +306,7 @@ test_timm_perf() {
  print_cmake_info
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
-  conda_benchmark_deps
+  pip_benchmark_deps
  torchbench_setup_macos

  echo "Launching timm training perf run"
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -820,16 +820,7 @@ test_inductor_torchbench_smoketest_perf() {
  done
 }

-test_inductor_get_core_number() {
-  if [[ "${TEST_CONFIG}" == *aarch64* ]]; then
-    echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
-  else
-    echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
-  fi
-}
-
 test_inductor_set_cpu_affinity(){
-  #set jemalloc
  JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
  export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
  export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
@ -841,14 +832,23 @@ test_inductor_set_cpu_affinity(){
    export KMP_AFFINITY=granularity=fine,compact,1,0
    export KMP_BLOCKTIME=1
  fi
-  cores=$(test_inductor_get_core_number)
-  # Set number of cores to 16 on Aarch64 for performance runs.
+
+  # Use nproc here instead of lscpu because it takes into account cgroups slice
+  cpus=$(nproc)
+  thread_per_core=$(lscpu | grep 'Thread(s) per core:' | awk '{print $4}')
+  cores=$((cpus / thread_per_core))
+
+  # Set number of cores to 16 on aarch64 for performance runs
  if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
    cores=16
  fi
  export OMP_NUM_THREADS=$cores
-  end_core=$((cores-1))
-  export TASKSET="taskset -c 0-$end_core"
+
+  # Handle cgroups slice start and end CPU
+  start_cpu=$(python -c 'import os; print(min(os.sched_getaffinity(0)))')
+  # Leaving one physical CPU for other tasks
+  end_cpu=$(($(python -c 'import os; print(max(os.sched_getaffinity(0)))') - thread_per_core))
+  export TASKSET="taskset -c $start_cpu-$end_cpu"
 }

 test_inductor_torchbench_cpu_smoketest_perf(){
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 pytest-subtests==0.13.1
+python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1

 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.12.2.0
--- a/.ci/pytorch/windows/arm64/bootstrap_libuv.bat
+++ b/.ci/pytorch/windows/arm64/bootstrap_libuv.bat
@ -7,7 +7,7 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
 if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%

 :: activate visual studio
-call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe

 cd %DEPENDENCIES_DIR%
--- a/.ci/pytorch/windows/arm64/bootstrap_openblas.bat
+++ b/.ci/pytorch/windows/arm64/bootstrap_openblas.bat
@ -7,7 +7,7 @@ if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
 if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%

 :: activate visual studio
-call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe

 :: Clone OpenBLAS
--- a/.ci/pytorch/windows/arm64/bootstrap_tests.bat
+++ b/.ci/pytorch/windows/arm64/bootstrap_tests.bat
@ -2,7 +2,7 @@
 cd %PYTORCH_ROOT%

 :: activate visual studio
-call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe

 :: create virtual environment
--- a/.ci/pytorch/windows/arm64/build_libtorch.bat
+++ b/.ci/pytorch/windows/arm64/build_libtorch.bat
@ -21,7 +21,7 @@ if %ENABLE_APL% == 1 (
 )

 :: activate visual studio
-call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe

 :: change to source directory
--- a/.ci/pytorch/windows/arm64/build_pytorch.bat
+++ b/.ci/pytorch/windows/arm64/build_pytorch.bat
@ -21,7 +21,7 @@ if %ENABLE_APL% == 1 (
 )

 :: activate visual studio
-call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64
 where cl.exe

 :: change to source directory
--- a/.ci/pytorch/windows/arm64/smoke_test.bat
+++ b/.ci/pytorch/windows/arm64/smoke_test.bat
@ -33,7 +33,7 @@ pushd tmp
 set VC_VERSION_LOWER=14
 set VC_VERSION_UPPER=36

-call "%DEPENDENCIES_DIR%\VSBuildTools\VC\Auxiliary\Build\vcvarsall.bat" arm64
+call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" arm64

 set install_root=%CD%
 set INCLUDE=%INCLUDE%;%install_root%\include;%install_root%\include\torch\csrc\api\include
--- a/.ci/pytorch/windows/internal/vc_install_helper.bat
+++ b/.ci/pytorch/windows/internal/vc_install_helper.bat
@ -3,6 +3,8 @@ if "%VC_YEAR%" == "2022" powershell windows/internal/vs2022_install.ps1

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
+:: Please don't delete the VS2019 alternative; it is kept as a fallback in case of Windows compiler issues.
+:: Reference: https://github.com/pytorch/pytorch/issues/145702#issuecomment-2858693930
 if "%VC_YEAR%" == "2019" (
    set VC_VERSION_LOWER=16
    set VC_VERSION_UPPER=17
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -9,7 +9,7 @@ if [[ "$OS" != "windows-arm64" ]]; then
    export USE_SCCACHE=1
    export SCCACHE_BUCKET=ossci-compiler-cache
    export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-    export VC_YEAR=2019
+    export VC_YEAR=2022
 fi

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
--- a/.circleci/scripts/binary_windows_test.sh
+++ b/.circleci/scripts/binary_windows_test.sh
@ -4,7 +4,7 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"

 export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-export VC_YEAR=2019
+export VC_YEAR=2022

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
    export VC_YEAR=2022
--- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
+++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
@ -5,7 +5,7 @@ title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
 labels: "module: ci"
 ---

-> For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once
+> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once
 > created, the job will be disabled within 15 minutes. You can check the
 > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json

--- a/.github/ISSUE_TEMPLATE/release-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml
@ -0,0 +1,111 @@
+name: 🚀 Release highlight for proposed Feature
+description: Submit a Release highlight for proposed Feature
+labels: ["release-feature-request"]
+
+body:
+- type: textarea
+  attributes:
+    label: Release highlight for proposed Feature
+    description: >
+      Example: “A torch.special module, analogous to SciPy's special module.”
+- type: input
+  id: contact
+  attributes:
+    label: Point(s) of contact
+    description: How can we get in touch with you if we need more info?
+    placeholder: ex. github username
+  validations:
+    required: false
+- type: dropdown
+  attributes:
+    label: Release Mode (pytorch/pytorch features only)
+    description: |
+      If "out-of-tree", please include the GH repo name
+    options:
+      - In-tree
+      - Out-of-tree
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Out-Of-Tree Repo
+    description: >
+      please include the GH repo name
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Description and value to the user
+    description: >
+      Please provide a brief description of the feature and how it will benefit the user.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Link to design doc, GitHub issues, past submissions, etc
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: What feedback adopters have provided
+    description: >
+      Please list users/teams that have tried the feature and provided feedback. If that feedback motivated material changes (API, doc, etc..), a quick overview of the changes and the status (planned, in progress, implemented) would be helpful as well.
+  validations:
+    required: false
+- type: dropdown
+  attributes:
+    label: Plan for documentations / tutorials
+    description: |
+      Select one of the following options
+    options:
+      - Tutorial exists
+      - Will submit a PR to pytorch/tutorials
+      - Will submit a PR to a repo
+      - Tutorial is not needed
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Additional context for tutorials
+    description: >
+      Please provide a link for existing tutorial or link to a repo or context for why tutorial is not needed.
+  validations:
+    required: false
+- type: dropdown
+  attributes:
+    label: Marketing/Blog Coverage
+    description: |
+      Are you requesting feature inclusion in the release blogs?
+    options:
+      - "Yes"
+      - "No"
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Are you requesting other marketing assistance with this feature?
+    description: >
+      E.g. supplementary blogs, social media amplification, etc.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Release Version
+    description: >
+      Please include release version for marketing coverage.
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: OS / Platform / Compute Coverage
+    description: >
+      Please list the platforms supported by the proposed feature. If the feature supports all the platforms, write "all". Goal of this section is to clearly share if this feature works in all PyTorch configurations or is it limited to only certain platforms/configurations (e.g. CPU only, GPU only, Linux only, etc...)
+  validations:
+    required: false
+- type: textarea
+  attributes:
+    label: Testing Support (CI, test cases, etc..)
+    description: >
+      Please provide an overview of test coverage. This includes unit testing and integration testing, but if E2E validation testing has been done to show that the feature works for a certain set of use cases or models please mention that as well.
+  validations:
+    required: false
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -45,6 +45,7 @@ self-hosted-runner:
    - windows.g5.4xlarge.nvidia.gpu
    # Windows ARM64 runners
    - windows-11-arm64
+    - windows-11-arm64-preview
    # Organization-wide AMD-hosted runners
    # MI2xx runners
    - linux.rocm.gpu
--- a/.github/actions/reuse-old-whl/action.yml
+++ b/.github/actions/reuse-old-whl/action.yml
@ -0,0 +1,38 @@
+name: Reuse old wheel if possible
+
+description:
+  Reuse old wheel if possible
+
+inputs:
+  build-environment:
+    description: Build environment
+    required: true
+  run-id:
+    description: Workflow run ID
+    required: true
+  github-token:
+    description: GitHub token
+    required: true
+
+outputs:
+  reuse:
+    description: Whether the wheel is reused or not
+    value: ${{ steps.check-file-changes.outputs.reuse }}
+
+runs:
+  using: composite
+
+  steps:
+    # Assumes pytorch is already checked out with fetch depth 0 (the script runs git merge-base and git diff)
+    - name: Check file changes
+      id: check-file-changes
+      shell: bash
+      continue-on-error: true
+      env:
+        GITHUB_TOKEN: ${{ inputs.github-token }}
+      run: |
+        set -x
+        python3 ${GITHUB_ACTION_PATH}/reuse_old_whl.py \
+          --build-environment "${{ inputs.build-environment }}" \
+          --run-id "${{ inputs.run-id }}" \
+          --github-ref "${{ github.ref }}"
--- a/.github/actions/reuse-old-whl/reuse_old_whl.py
+++ b/.github/actions/reuse-old-whl/reuse_old_whl.py
@ -0,0 +1,289 @@
+import argparse
+import os
+import subprocess
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, cast, Optional
+
+import requests
+
+
+FORCE_REBUILD_LABEL = "ci-force-rebuild"
+
+
+@lru_cache
+def get_merge_base() -> str:
+    """Return the commit that the old-wheel lookup is based on.
+
+    This is the merge base of HEAD and origin/main, or HEAD^ when HEAD itself
+    is that merge base.  The result is cached for the life of the process.
+    """
+
+    def run_git(*git_args: str) -> str:
+        # git's stderr is discarded; a failing command still raises
+        # subprocess.CalledProcessError.
+        return subprocess.check_output(
+            ["git", *git_args],
+            text=True,
+            stderr=subprocess.DEVNULL,
+        ).strip()
+
+    base_sha = run_git("merge-base", "HEAD", "origin/main")
+    # Remove this when we turn this off for the main branch
+    if base_sha == get_head_sha():
+        print("Merge base is the same as HEAD, using HEAD^")
+        base_sha = run_git("rev-parse", "HEAD^")
+    print(f"Merge base: {base_sha}")
+    return base_sha
+
+
+@lru_cache
+def get_head_sha() -> str:
+    """Return the SHA of HEAD as printed by `git rev-parse HEAD` (cached)."""
+    rev_parse_cmd = ["git", "rev-parse", "HEAD"]
+    raw_output = subprocess.check_output(
+        rev_parse_cmd, text=True, stderr=subprocess.DEVNULL
+    )
+    return raw_output.strip()
+
+
+def is_main_branch() -> bool:
+    """Report whether HEAD is on the main branch; currently always False.
+
+    The real check is disabled (testing on main branch for now).  When it is
+    switched back on it is meant to log both SHAs and compare them:
+
+        get_merge_base() == get_head_sha()
+    """
+    return False
+
+
+def query_github_api(url: str, timeout: float = 30.0) -> Any:
+    """GET a GitHub REST API URL and return the decoded JSON body.
+
+    Args:
+        url: Full API URL to request.
+        timeout: Seconds to wait for the server before giving up.  Without a
+            timeout, `requests` can block indefinitely on a stalled
+            connection and hang the CI step.
+
+    Returns:
+        The parsed JSON payload.  The HTTP status is not checked here, so a
+        GitHub error payload is handed back to the caller unchanged.
+
+    Raises:
+        KeyError: if the GITHUB_TOKEN environment variable is not set.
+        requests.RequestException: on connection errors or timeouts.
+    """
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
+    }
+    response = requests.get(url, headers=headers, timeout=timeout)
+    return response.json()
+
+
+@lru_cache
+def check_labels_for_pr() -> bool:
+    """Return True if any PR containing HEAD carries FORCE_REBUILD_LABEL.
+
+    The PRs are looked up through the GitHub "commits/{sha}/pulls" endpoint;
+    the answer is cached for the life of the process.
+    """
+    sha = get_head_sha()
+    pulls = query_github_api(
+        f"https://api.github.com/repos/pytorch/pytorch/commits/{sha}/pulls"
+    )
+
+    pr_numbers = [pr["number"] for pr in pulls]
+    print(f"Found {len(pulls)} PRs for commit {sha}: {pr_numbers}")
+    for pr in pulls:
+        has_label = any(
+            label["name"] == FORCE_REBUILD_LABEL for label in pr.get("labels", [])
+        )
+        if has_label:
+            print(f"Found label {FORCE_REBUILD_LABEL} in PR {pr['number']}.")
+            return True
+    return False
+
+
+def check_issue_open() -> bool:
+    """Return True if issue #153759 is open.
+
+    That issue is the config switch for quickly forcing everyone to build.
+    """
+    issue = query_github_api(
+        "https://api.github.com/repos/pytorch/pytorch/issues/153759"
+    )
+    is_open = issue.get("state") == "open"
+    print("Issue #153759 is open." if is_open else "Issue #153759 is not open.")
+    return is_open
+
+
+def get_workflow_id(run_id: str) -> Optional[str]:
+    """Look up the ID of the workflow that the given run belongs to.
+
+    Args:
+        run_id: ID of a GitHub Actions workflow run in pytorch/pytorch.
+
+    Returns:
+        The workflow ID as a string, or None when the API response has no
+        "workflow_id" field (for example when it is an error payload).
+    """
+    url = f"https://api.github.com/repos/pytorch/pytorch/actions/runs/{run_id}"
+    run_info = cast("dict[str, Any]", query_github_api(url))
+    if "workflow_id" not in run_info:
+        print("No workflow ID found.")
+        return None
+    # GitHub reports workflow_id as a JSON number.  typing.cast() performs no
+    # conversion, so convert explicitly to honour the Optional[str] return type.
+    workflow_id = str(run_info["workflow_id"])
+    print(f"Found workflow ID for run ID {run_id}: {workflow_id}")
+    return workflow_id
+
+
+def ok_changed_file(file: str) -> bool:
+    """Return True if a change to `file` still allows reusing the old whl.
+
+    Allowed are Python files under torch/ (except torch/csrc/) and Python
+    files under test/.
+    """
+    if not file.endswith(".py"):
+        return False
+    in_torch_python = file.startswith("torch/") and not file.startswith(
+        "torch/csrc/"
+    )
+    return in_torch_python or file.startswith("test/")
+
+
+def check_changed_files(sha: str) -> bool:
+    """Return True if every file changed between `sha` and HEAD is allowed.
+
+    "Allowed" means ok_changed_file() accepts the path, i.e. the old whl can
+    still be reused.  Checking stops at the first disallowed file.
+
+    Args:
+        sha: Commit to diff against HEAD.
+
+    Raises:
+        subprocess.CalledProcessError: if `git diff` fails.
+    """
+    diff_output = subprocess.check_output(
+        ["git", "diff", "--name-only", sha, "HEAD"],
+        text=True,
+        stderr=subprocess.DEVNULL,
+    )
+    # git prints one path per line.  Split on newlines only, so that a path
+    # containing spaces is judged as one file instead of several fragments.
+    changed_files = [line for line in diff_output.splitlines() if line]
+    print(f"Checking changed files between {sha} and HEAD:")
+    for file in changed_files:
+        if not ok_changed_file(file):
+            print(f"  File {file} is not allowed to be changed.")
+            return False
+        print(f"  File {file} is allowed to be changed.")
+    return True
+
+
+def find_old_whl(workflow_id: str, build_environment: str, sha: str) -> bool:
+    """Download the artifact that was built for `sha` on main, if one exists.
+
+    Lists the runs of `workflow_id` for `sha` on the main branch and, for each
+    run, tries to fetch that run's artifacts.zip for `build_environment` from
+    the gha-artifacts S3 bucket.  The first artifact found is saved as
+    artifacts.zip in the current directory.
+
+    Args:
+        workflow_id: ID of the workflow whose runs are searched.
+        build_environment: Build environment name used in the S3 key.
+        sha: Head commit the old run must have been built from.
+
+    Returns:
+        True if an artifact was downloaded, False otherwise.
+    """
+    if build_environment is None:
+        print("BUILD_ENVIRONMENT is not set.")
+        return False
+    print(f"SHA: {sha}, workflow_id: {workflow_id}")
+
+    workflow_runs = query_github_api(
+        f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{workflow_id}/runs?head_sha={sha}&branch=main&per_page=100"
+    )
+    if workflow_runs.get("total_count", 0) == 0:
+        print("No workflow runs found.")
+        return False
+
+    # Download under a temporary name so that an interrupted transfer never
+    # leaves a truncated artifacts.zip behind.
+    partial_download = "artifacts.zip.part"
+    for run in workflow_runs.get("workflow_runs", []):
+        # Look in s3 for the old whl
+        run_id = run["id"]
+        url = f"https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/{run_id}/{build_environment}/artifacts.zip"
+        print(f"Checking for old whl at {url}")
+        try:
+            # Stream to disk instead of buffering the whole archive in memory,
+            # and bound the (connect, read) waits so a stalled connection
+            # cannot hang the job.
+            with requests.get(url, stream=True, timeout=(10, 300)) as response:
+                if response.status_code != 200:
+                    continue
+                with open(partial_download, "wb") as f:
+                    for chunk in response.iter_content(chunk_size=1024 * 1024):
+                        f.write(chunk)
+        except requests.RequestException as e:
+            print(f"Error checking for old whl: {e}")
+            if os.path.exists(partial_download):
+                os.remove(partial_download)
+            continue
+        os.replace(partial_download, "artifacts.zip")
+        print(f"Found old whl file from s3: {url}")
+        return True
+    return False
+
+
+def unzip_artifact_and_replace_files() -> None:
+    """Patch the local `torch` tree into the wheel(s) inside artifacts.zip.
+
+    Works on artifacts.zip and the `torch` directory in the current working
+    directory and shells out to `unzip`, `rsync`, `zip`, `mv` and `rm`.  Any
+    command that exits non-zero raises subprocess.CalledProcessError.  On
+    return, artifacts.zip has been rebuilt from the patched `artifacts` tree.
+    """
+    # Unzip the artifact and replace files
+    subprocess.check_output(
+        ["unzip", "-o", "artifacts.zip", "-d", "artifacts"],
+    )
+    # The original archive is deleted here and recreated at the end.
+    os.remove("artifacts.zip")
+
+    # Rename wheel into zip
+    wheel_path = Path("artifacts/dist").glob("*.whl")
+    for path in wheel_path:
+        new_path = path.with_suffix(".zip")
+        os.rename(path, new_path)
+        print(f"Renamed {path} to {new_path}")
+        print(new_path.stem)
+        # Unzip the wheel
+        subprocess.check_output(
+            ["unzip", "-o", new_path, "-d", f"artifacts/dist/{new_path.stem}"],
+        )
+        # Copy python files into the artifact
+        # (rsync is given the whole local `torch` directory with no filter, so
+        # everything under it is copied, not only .py files)
+        subprocess.check_output(
+            ["rsync", "-avz", "torch", f"artifacts/dist/{new_path.stem}"],
+        )
+
+        # Zip the wheel back
+        # The archive is created inside the extracted folder, then moved out
+        # and given the .whl name again.
+        subprocess.check_output(
+            ["zip", "-r", f"{new_path.stem}.zip", "."],
+            cwd=f"artifacts/dist/{new_path.stem}",
+        )
+        subprocess.check_output(
+            [
+                "mv",
+                f"artifacts/dist/{new_path.stem}/{new_path.stem}.zip",
+                f"artifacts/dist/{new_path.stem}.whl",
+            ],
+        )
+
+        # Remove the extracted folder
+        subprocess.check_output(
+            ["rm", "-rf", f"artifacts/dist/{new_path.stem}"],
+        )
+        # NOTE(review): `new_path` (the original wheel renamed to .zip) is not
+        # removed in this function, so it presumably ends up in the re-zipped
+        # artifact next to the patched .whl -- confirm this is intended.
+
+    # Rezip the artifact
+    subprocess.check_output(["zip", "-r", "artifacts.zip", "."], cwd="artifacts")
+    subprocess.check_output(
+        ["mv", "artifacts/artifacts.zip", "."],
+    )
+    return None
+
+
+def set_output() -> None:
+    """Publish `reuse=true` as a step output.
+
+    Appends to the file named by $GITHUB_OUTPUT when that variable is set and
+    non-empty; otherwise prints the `::set-output` workflow command.
+    """
+    output_file = os.getenv("GITHUB_OUTPUT")
+    if not output_file:
+        print("::set-output name=reuse::true")
+        return
+    with open(str(output_file), "a") as env:
+        print("reuse=true", file=env)
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line arguments of this script.
+
+    Returns:
+        A namespace with `run_id` and `build_environment` (both required
+        strings) and `github_ref` (a string, or None when not given).
+    """
+    parser = argparse.ArgumentParser(description="Check for old whl files.")
+    # --run-id identifies a workflow *run*; the workflow ID is derived from it.
+    parser.add_argument("--run-id", type=str, required=True, help="Workflow run ID")
+    parser.add_argument(
+        "--build-environment", type=str, required=True, help="Build environment"
+    )
+    parser.add_argument(
+        "--github-ref",
+        type=str,
+        help="Git ref of the triggering event, e.g. refs/heads/main",
+    )
+    return parser.parse_args()
+
+
+def can_reuse_whl(args: argparse.Namespace) -> bool:
+    """Decide whether a previously built whl can replace a fresh build.
+
+    The checks run in order and the first one that demands a rebuild ends the
+    function with False.  Returning True means find_old_whl() succeeded.
+    """
+    # The main/release branch exclusion below is switched off for now:
+    # if is_main_branch() or (
+    #     args.github_ref
+    #     and any(
+    #         args.github_ref.startswith(x)
+    #         for x in ["refs/heads/release", "refs/tags/v", "refs/heads/main"]
+    #     )
+    # ):
+    #     print("On main branch or release branch, rebuild whl")
+    #     return False
+
+    force_rebuild_requested = check_labels_for_pr()
+    if force_rebuild_requested:
+        print(f"Found {FORCE_REBUILD_LABEL} label on PR, rebuild whl")
+        return False
+
+    global_rebuild_switch_on = check_issue_open()
+    if global_rebuild_switch_on:
+        print("Issue #153759 is open, rebuild whl")
+        return False
+
+    # get_merge_base() is cached, so the value is resolved once here and
+    # reused for the artifact lookup further down.
+    merge_base = get_merge_base()
+    if not check_changed_files(merge_base):
+        print("Cannot use old whl due to the changed files, rebuild whl")
+        return False
+
+    workflow_id = get_workflow_id(args.run_id)
+    if workflow_id is None:
+        print("No workflow ID found, rebuild whl")
+        return False
+
+    found_old_whl = find_old_whl(workflow_id, args.build_environment, merge_base)
+    if not found_old_whl:
+        print("No old whl found, rebuild whl")
+        # TODO: go backwards from merge base to find more runs
+        return False
+
+    return True
+
+
+if __name__ == "__main__":
+    # Script entry point: patch and republish the old whl only when every
+    # reuse check passes; otherwise do nothing.
+    args = parse_args()
+    reuse_old_whl = can_reuse_whl(args)
+
+    if reuse_old_whl:
+        print("Reusing old whl")
+        unzip_artifact_and_replace_files()
+        set_output()
--- a/.github/actions/upload-utilization-stats/action.yml
+++ b/.github/actions/upload-utilization-stats/action.yml
@ -1,6 +1,6 @@
 name: upload-utilization-stats

-description: Upload utilization stats to artifacts
+description: Upload utilization stats to artifacts.

 inputs:
    workflow_run_id:
@ -23,6 +23,17 @@ inputs:
      type: string
      description: 'the job name of the test'
      required: True
+    local_path:
+      type: string
+      description: 'the local path to the utilization stats file'
+      required: False
+      default: ''
+    artifact_prefix:
+      type: string
+      description: |
+          'the prefix of the raw utilization data, for data stored in zip file, this is the prefix of the parent zip file'
+      default: ""
+      required: False

 runs:
  using: composite
@ -35,6 +46,8 @@ runs:
        echo "workflow_Name: ${{inputs.workflow_name}}"
        echo "job_id: ${{inputs.job_id}}"
        echo "job_name:  ${{inputs.job_name}}"
+        echo "artifact_prefix: ${{inputs.artifact_prefix}}"
+        python3 --version
    - uses: nick-fields/retry@v3.0.0
      name: Setup dependencies
      with:
@ -53,4 +66,6 @@ runs:
          --workflow-name "${{inputs.workflow_name}}" \
          --workflow-run-attempt "${{inputs.workflow_attempt}}" \
          --job-id "${{inputs.job_id}}" \
-          --job-name "${{inputs.job_name}}"
+          --job-name "${{inputs.job_name}}" \
+          --local-path "${{inputs.local_path}}" \
+          --artifact-prefix "${{inputs.artifact_prefix}}"
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ea5de17755d657508c84c4dce8970b614008adcf
+1a8f6213b0b61efc6a4862bc45b853551a93dbb6
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-373ffb19dc470f4423a3176a4133f8f4b3cdb5bd
+e03a63be43e33596f7f0a43b0f530353785e4a59
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-d23a6e1664d20707c11781299611436e1f0c104f
+966da7e46f65d6d49df3e31214470a4fe5cc8e66
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-8d9e34b352af09c81ff8df448fd27f9c4aae1382
+edc1a882d872dd7f1362e4312fd045a1d81b3355
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -398,7 +398,10 @@
  - torch/_inductor/codegen/cpp_micro_gemm.py
  - torch/_inductor/codegen/cpp_template_kernel.py
  - torch/_inductor/codegen/cpp_template.py
+  - torch/_inductor/codegen/cpp_bmm_template.py
  - torch/_inductor/codegen/cpp_gemm_template.py
+  - torch/_inductor/codegen/cpp_grouped_gemm_template.py
+  - torch/_inductor/codegen/cpp_flex_attention_template.py
  - torch/csrc/inductor/cpp_prefix.h
  - test/inductor/test_mkldnn_pattern_matcher.py
  - test/inductor/test_cpu_repro.py
@ -406,6 +409,7 @@
  - test/inductor/test_cpu_select_algorithm.py
  - aten/src/ATen/cpu/**
  - aten/src/ATen/native/quantized/cpu/**
+  - aten/src/ATen/test/vec_test_all_types.*
  - test/quantization/core/test_quantized_op.py
  - torch/ao/quantization/quantizer/x86_inductor_quantizer.py
  - test/quantization/pt2e/test_x86inductor_quantizer.py
@ -413,6 +417,7 @@
  - leslie-fang-intel
  - jgong5
  - EikanWang
+  - CaoE
  mandatory_checks_name:
  - EasyCLA
  - Lint
--- a/.github/requirements/README.md
+++ b/.github/requirements/README.md
@ -11,16 +11,6 @@ jobs, but it also allows them to be cached properly to improve CI
 reliability.

 The list of support files are as follows:
-
-* Conda:
-  * conda-env-iOS. This is used by iOS build and test jobs to setup the
-    conda environment
-  * conda-env-macOS-ARM64. This is used by MacOS (m1, arm64) build and
-    test jobs to setup the conda environment
-  * conda-env-Linux-X64. This is used by Linux buck build and test jobs
-    to setup the conda environment
 * Pip:
-  * pip-requirements-iOS.txt. This is used by iOS build and test jobs to
-    setup the pip environment
  * pip-requirements-macOS.txt. This is used by MacOS build and test jobs to
    setup the pip environment
--- a/.github/requirements/conda-env-Linux-X64.txt
+++ b/.github/requirements/conda-env-Linux-X64.txt
@ -1,8 +0,0 @@
-cmake=3.22.*
-mkl=2022.1.0
-mkl-include=2022.1.0
-ninja=1.10.2
-numpy=1.23.3
-pyyaml=6.0
-setuptools=72.1.0
-typing-extensions=4.11.0
--- a/.github/requirements/conda-env-iOS.txt
+++ b/.github/requirements/conda-env-iOS.txt
@ -1,7 +0,0 @@
-blas=1.0
-cmake=3.22.1
-ninja=1.10.2
-numpy=1.23.3
-pyyaml=6.0
-setuptools=72.1.0
-typing-extensions=4.11.0
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@ -1,22 +1,6 @@
-numpy=1.22.3
-pyyaml=6.0
-setuptools=72.1.0
-cmake=3.22.*
-typing-extensions=4.11.0
-dataclasses=0.8
-pip=22.2.2
-pillow=10.0.1
-pkg-config=0.29.2
-wheel=0.37.1
-# NB: This is intentionally held back because anaconda main doesn't
-# have updated expecttest, but you don't /need/ the updated version
-# to run the tests.  In the meantime I need to figure out how to
-# cajole anaconda into updating, or get the package from pypi instead...
-expecttest=0.1.3
-
 # Not pinning certifi so that we can always get the latest certificates
 certifi
-
-# Cross-compiling arm64 from x86-64 picks up 1.40.0 while testing on arm64
-# itself only has up to 1.39.0 from upstream conda. Both work though
-libuv>=1.39.0,<=1.40.0
+pip=23.2.1
+pkg-config=0.29.2
+setuptools=72.1.0
+wheel=0.37.1
--- a/.github/requirements/pip-requirements-iOS.txt
+++ b/.github/requirements/pip-requirements-iOS.txt
@ -1,4 +0,0 @@
-# iOS simulator requirements
-coremltools==5.0b5
-protobuf==3.20.2
-optree==0.13.0
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -1,33 +1,34 @@
 boto3==1.35.42
-hypothesis==6.56.4
+cmake==3.25.*
 expecttest==0.3.0
 fbscribelogger==0.1.7
+filelock==3.6.0
+hypothesis==6.56.4
 librosa>=0.6.2
 mpmath==1.3.0
 networkx==2.8.7
-# Use numba-0.49.1 or older on Intel Macs, but 0.56.0 on M1 machines, as older numba is not available
-numba==0.56.0; platform_machine == "arm64"
-numba<=0.49.1; platform_machine != "arm64"
+ninja==1.10.2.4
+numba==0.59.0
+numpy==1.26.4
 opt-einsum>=3.3
-psutil==5.9.1
-nvidia-ml-py==11.525.84
+optree==0.13.0
 packaging==23.1
+parameterized==0.8.1
+pillow==10.0.1
+protobuf==5.29.4
+psutil==5.9.1
 pygments==2.15.0
-pytest==7.3.2
-pytest-xdist==3.3.1
-pytest-rerunfailures==10.3
+pytest-cpp==2.3.0
 pytest-flakefinder==1.1.0
+pytest-rerunfailures==10.3
 pytest-subtests==0.13.1
-scipy==1.10.1
+pytest-xdist==3.3.1
+pytest==7.3.2
+pyyaml==6.0.2
+scipy==1.12.0
 sympy==1.13.3
+tensorboard==2.13.0
+typing-extensions==4.12.2
 unittest-xml-reporting<=3.2.0,>=2.0.0
 xdoctest==1.1.0
-filelock==3.6.0
-pytest-cpp==2.3.0
 z3-solver==4.12.2.0
-tensorboard==2.13.0
-optree==0.13.0
-# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
-# which the stringify metadata is wrong when escaping double quote
-protobuf==3.20.2
-parameterized==0.8.1
--- a/.github/scripts/docathon-label-sync.py
+++ b/.github/scripts/docathon-label-sync.py
@ -28,12 +28,12 @@ def main() -> None:
    issue = repo.get_issue(issue_number)
    issue_labels = issue.labels
    docathon_label_present = any(
-        label.name == "docathon-h1-2024" for label in issue_labels
+        label.name == "docathon-h1-2025" for label in issue_labels
    )

    # if the issue has a docathon label, add all labels from the issue to the PR.
    if not docathon_label_present:
-        print("The 'docathon-h1-2024' label is not present in the issue.")
+        print("The 'docathon-h1-2025' label is not present in the issue.")
        return
    pull_request_labels = pull_request.get_labels()
    pull_request_label_names = [label.name for label in pull_request_labels]
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -21,7 +21,7 @@ CUDA_STABLE = "12.6"
 CUDA_ARCHES_FULL_VERSION = {
    "11.8": "11.8.0",
    "12.6": "12.6.3",
-    "12.8": "12.8.0",
+    "12.8": "12.8.1",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
    "11.8": "9",
@ -72,20 +72,20 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "12.8": (
-        "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"
+        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "xpu": (
        "intel-cmplr-lib-rt==2025.1.1 | "
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -31,6 +31,9 @@ python3 -m tools.pyi.gen_pyi \
    --deprecated-functions-path "tools/autograd/deprecated.yaml"
 python3 torch/utils/data/datapipes/gen_pyi.py

+# Also check generated pyi files
+find torch -name '*.pyi' -exec git add --force -- "{}" +
+
 RC=0
 # Run lintrunner on all files
 if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
@ -41,6 +44,9 @@ if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS}
    RC=1
 fi

+# Unstage temporarily added pyi files
+find torch -name '*.pyi' -exec git restore --staged -- "{}" +
+
 # Use jq to massage the JSON lint output into GitHub Actions workflow commands.
 jq --raw-output \
    '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
--- a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner
+++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner
@ -27,6 +27,9 @@ unset ACCESS_TOKEN
 # it does one job, stops and unregisters
 registration_token=$(jq --raw-output .token "$token_file")

+# workaround for https://gitlab.com/qemu-project/qemu/-/issues/2600
+export DOTNET_EnableWriteXorExecute=0
+
 ./config.sh \
        --unattended \
        --ephemeral \
@ -44,8 +47,5 @@ rm -f "$token_file"
 # and it doesn't work for non-root user
 source venv/bin/activate

-# workaround for https://gitlab.com/qemu-project/qemu/-/issues/2600
-export DOTNET_EnableWriteXorExecute=0
-
 # Run one job.
 ./run.sh
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -939,6 +939,12 @@ class GitHubPR:
                    summary=None,
                )

+        # Making an exception for Apply lint suggestions/autoformat because the
+        # bot adds a merged label -> triggers workflow -> sometimes needs
+        # approval -> is read as failure, which results in a blocked merge, but
+        # this workflow doesn't provide mergeability info
+        self.conclusions.pop("Apply lint suggestions", None)
+
        return self.conclusions

    def get_authors(self) -> dict[str, str]:
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@ -76,7 +76,7 @@ jobs:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
    {%- if os == "windows-arm64" %}
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    {%- else %}
    {%- if branches == "nightly" %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
@ -102,23 +102,12 @@ jobs:
        run: |
          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Enable long paths
+        shell: cmd
+        run: |
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
        uses: actions/checkout@v4
        with:
          path: "pytorch"
@ -172,7 +161,7 @@ jobs:
      - !{{ config["build_name"] }}-build
      - get-label-type
 {%- if os == "windows-arm64" %}
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
 {%- else %}
 {%- if config["gpu_arch_type"] == "cuda" %}
 {%- if branches == "nightly" %}
@ -198,18 +187,11 @@ jobs:
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
+      - name: Enable long paths
        shell: cmd
        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -223,10 +205,6 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
      - name: Bootstrap Rust
        shell: cmd
        run: |
--- a/.github/workflows/_link_check.yml
+++ b/.github/workflows/_link_check.yml
@ -7,29 +7,25 @@ on:
      ref:
        type: string
        required: true
-      run-url-lint:
-        type: boolean
-        required: false
-        default: false

 jobs:
  lint-urls:
-    if: ${{ inputs.run-url-lint }}
+    if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }}
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 120
      runner: ${{ inputs.runner }}linux.2xlarge
-      docker-image: pytorch-linux-focal-linter
+      docker-image: ci-image:pytorch-linux-focal-linter
      fetch-depth: 0
      submodules: false
      ref: ${{ inputs.ref }}
      script: |
        ./scripts/lint_urls.sh $(
-          { [ "${{ github.event_name }}" = "pull_request" ] \
-              && git diff --name-only "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"; } \
-          || \
-          { [ "${{ github.event_name }}" = "push" ] \
-              && git diff --name-only "${{ github.event.before }}...${{ github.sha }}"; }
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}"
+          else
+            echo "${{ github.event.before }}" "${{ github.sha }}"
+          fi
        ) || {
          echo
          echo "URL lint failed."
@ -44,17 +40,17 @@ jobs:
    with:
      timeout: 60
      runner: ${{ inputs.runner }}linux.2xlarge
-      docker-image: pytorch-linux-focal-linter
+      docker-image: ci-image:pytorch-linux-focal-linter
      fetch-depth: 0
      submodules: false
      ref: ${{ inputs.ref }}
      script: |
        ./scripts/lint_xrefs.sh $(
-          { [ "${{ github.event_name }}" = "pull_request" ] \
-              && git diff --name-only "${{ github.event.pull_request.base.sha }}...${{ github.event.pull_request.head.sha }}"; } \
-          || \
-          { [ "${{ github.event_name }}" = "push" ] \
-              && git diff --name-only "${{ github.event.before }}...${{ github.sha }}"; }
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}"
+          else
+            echo "${{ github.event.before }}" "${{ github.sha }}"
+          fi
        ) || {
          echo
          echo "Xref lint failed."
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -74,6 +74,32 @@ on:
          Overwrite the number of jobs to use for the build
        required: false
        type: string
+      disable-monitor:
+        description: |
+          Disable utilization monitoring for build job
+        required: false
+        type: boolean
+        default: false
+      monitor-log-interval:
+        description: |
+          Set the interval for the monitor script to log utilization.
+        required: false
+        type: number
+        default: 5
+      monitor-data-collect-interval:
+        description: |
+          Set the interval for the monitor script to collect data.
+        required: false
+        type: number
+        default: 1
+
+      allow-reuse-old-whl:
+        description: |
+          If set, the build will try to pull an old wheel from s3 that was built on a
+          commit with no cpp changes from this commit
+        required: false
+        type: boolean
+        default: false

    secrets:
      HUGGING_FACE_HUB_TOKEN:
@ -132,6 +158,15 @@ jobs:
          role-session-name: gha-linux-build
          aws-region: us-east-1

+      - name: Check if can use old whl build
+        id: use-old-whl
+        uses: ./.github/actions/reuse-old-whl
+        if: ${{ inputs.allow-reuse-old-whl && github.event_name == 'push' }}
+        with:
+          build-environment: ${{ inputs.build-environment }}
+          run-id: ${{ github.run_id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@ -141,7 +176,7 @@ jobs:

      - name: Use following to pull public copy of the image
        id: print-ghcr-mirror
-        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
@ -151,7 +186,7 @@ jobs:

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -176,17 +211,38 @@ jobs:
          selected-test-configs: ${{ inputs.selected-test-configs }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

+      - name: Start monitoring script
+        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
+        shell: bash
+        continue-on-error: true
+        env:
+          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
+          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
+          WORKFLOW_NAME: ${{ github.workflow }}
+          WORKFLOW_RUN_ID: ${{github.run_id}}
+          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
+          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
+        run: |
+          mkdir -p ../../usage_logs
+          python3 -m pip install psutil==5.9.1 dataclasses_json==0.6.7
+          python3 -m tools.stats.monitor \
+          --log-interval "$MONITOR_LOG_INTERVAL" \
+          --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" \
+          > "../../usage_logs/usage_log_build_${JOB_ID}.txt" 2>&1 &
+          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
+
      - name: Download pytest cache
        uses: ./.github/actions/pytest-cache-download
        continue-on-error: true
-        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true'
        with:
          cache_dir: .pytest_cache
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
          s3_bucket: ${{ inputs.s3-bucket }}

      - name: Build
-        if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
+        if: (steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '') && steps.use-old-whl.outputs.reuse != 'true'
        id: build
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
@ -280,14 +336,23 @@ jobs:
          END_TIME=$(date +%s)
          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"

+      - name: Stop monitoring script
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
+        shell: bash
+        continue-on-error: true
+        env:
+          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        run: |
+          kill "$MONITOR_SCRIPT_PID"
+
      - name: Archive artifacts into zip
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && steps.use-old-whl.outputs.reuse != 'true'
        run: |
          zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files

      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
@ -297,13 +362,32 @@ jobs:

      - name: Store PyTorch Build Artifacts for s390x
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel'
+        if: inputs.build-generates-artifacts && (steps.build.outcome != 'skipped' || steps.use-old-whl.outputs.reuse == 'true') && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

+      - name: copy logs
+        shell: bash
+        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
+        continue-on-error: true
+        run: |
+          rm -rf ./usage_logs  # -r needed: usage_logs is a directory, plain 'rm -f' errors out if it already exists
+          mkdir -p ./usage_logs
+          cp ../../usage_logs/usage_log_build_*.txt ./usage_logs/
+
+      - name: Upload raw usage log to s3
+        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel'}}
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
+          retention-days: 14
+          if-no-files-found: warn
+          path: usage_logs/usage_log_build_*.txt
+
      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/upload-sccache-stats
@ -311,6 +395,18 @@ jobs:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          build-time: ${{ steps.build.outputs.build_time }}

+      - name: Upload utilization stats
+        if: ${{ always() && steps.build.outcome != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
+        continue-on-error: true
+        uses: ./.github/actions/upload-utilization-stats
+        with:
+          job_id: ${{ steps.get-job-id.outputs.job-id }}
+          job_name: ${{ steps.get-job-id.outputs.job-name }}
+          workflow_name: ${{ github.workflow }}
+          workflow_run_id: ${{github.run_id}}
+          workflow_attempt: ${{github.run_attempt}}
+          artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }}
+
      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -376,7 +376,7 @@ jobs:
      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
        continue-on-error: true
-        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
+        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          cache_dir: .pytest_cache
          shard: ${{ matrix.shard }}
@ -431,7 +431,7 @@ jobs:
          path: ./**/core.[1-9]*

      - name: Upload utilization stats
-        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor }}
+        if: ${{ always() && steps.test.conclusion && steps.test.conclusion != 'skipped' && !inputs.disable-monitor && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        continue-on-error: true
        uses: ./.github/actions/upload-utilization-stats
        with:
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -30,7 +30,7 @@ on:
      python-version:
        required: false
        type: string
-        default: "3.9"
+        default: "3.12"
        description: |
          The python version to be used. Will be 3.9 by default
      test-matrix:
@ -85,8 +85,9 @@ jobs:
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
          python-version: ${{ inputs.python-version }}
-          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
-          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
+          environment-file: .github/requirements/conda-env-macOS-ARM64
+          pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
+          default-packages: ""

      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
        uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -21,7 +21,7 @@ on:
      python-version:
        required: false
        type: string
-        default: "3.9"
+        default: "3.12"
        description: |
          The python version to be used. Will be 3.9 by default
      timeout-minutes:
@ -144,8 +144,9 @@ jobs:
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
          python-version: ${{ inputs.python-version }}
-          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
-          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
+          environment-file: .github/requirements/conda-env-macOS-ARM64
+          pip-requirements-file: .github/requirements/pip-requirements-macOS.txt
+          default-packages: ""

      - name: Parse ref
        id: parse-ref
@ -278,6 +279,7 @@ jobs:
          workflow_name: ${{ github.workflow }}
          workflow_run_id: ${{github.run_id}}
          workflow_attempt: ${{github.run_attempt}}
+          local_path: usage_log.txt

      - name: Clean up disk space
        if: always()
--- a/.github/workflows/assigntome-docathon.yml
+++ b/.github/workflows/assigntome-docathon.yml
@ -28,14 +28,14 @@ jobs:
                  repo: context.repo.repo,
                  issue_number: issueNumber
                });
-              const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024');
+              const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2025');
              if (hasLabel) {
                if (issue.assignee !== null) {
                  await github.rest.issues.createComment({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    issue_number: issueNumber,
-                    body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)."
+                    body: "The issue is already assigned. Please pick an opened and unassigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)."
                  });
                } else {
                  await github.rest.issues.addAssignees({
@ -46,7 +46,7 @@ jobs:
                  });
                }
              } else {
-                const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)."
+                const commmentMessage = "This issue does not have the correct label. Please pick an opened and unassigned issue with the [docathon-h1-2025 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2025)."
                await github.rest.issues.createComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -49,13 +49,11 @@ jobs:
      matrix:
        runner: [linux.12xlarge]
        docker-image-name: [
-          pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks,
-          pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks,
-          pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks,
          pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11,
-          pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
-          pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
-          pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
+          pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
+          pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
+          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
          pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9,
          pytorch-linux-focal-py3.9-clang10,
          pytorch-linux-focal-py3.11-clang10,
@ -110,18 +108,6 @@ jobs:
          always-rebuild: true
          push: true

-      - name: Push docker image to old name
-        shell: bash
-        env:
-          ECR_DOCKER_IMAGE: ${{ steps.build-docker-image.outputs.docker-image }}
-        run: |
-          # This can be deleted after people have rebased their PRs/the new name has been in main for a while
-          set -euox pipefail
-          docker_image_name="308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}"
-          foldersha=${ECR_DOCKER_IMAGE##*-}
-          docker tag "${ECR_DOCKER_IMAGE}" "${docker_image_name}:${foldersha}"
-          docker push "${docker_image_name}:${foldersha}"
-
      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -136,7 +136,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -252,7 +252,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -368,7 +368,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -484,7 +484,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -600,7 +600,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -716,7 +716,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -155,7 +155,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -269,7 +269,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
@ -882,7 +882,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -1563,7 +1563,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -2176,7 +2176,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -2789,7 +2789,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -3402,7 +3402,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
--- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
@ -50,7 +50,7 @@ jobs:
  libtorch-cpu-shared-with-deps-debug-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -77,23 +77,12 @@ jobs:
        run: |
          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Enable long paths
+        shell: cmd
+        run: |
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
        uses: actions/checkout@v4
        with:
          path: "pytorch"
@ -138,7 +127,7 @@ jobs:
    needs:
      - libtorch-cpu-shared-with-deps-debug-build
      - get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -160,18 +149,11 @@ jobs:
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
+      - name: Enable long paths
        shell: cmd
        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -185,10 +167,6 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
      - name: Bootstrap Rust
        shell: cmd
        run: |
--- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
@ -50,7 +50,7 @@ jobs:
  libtorch-cpu-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -77,23 +77,12 @@ jobs:
        run: |
          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Enable long paths
+        shell: cmd
+        run: |
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
        uses: actions/checkout@v4
        with:
          path: "pytorch"
@ -138,7 +127,7 @@ jobs:
    needs:
      - libtorch-cpu-shared-with-deps-release-build
      - get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -160,18 +149,11 @@ jobs:
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
+      - name: Enable long paths
        shell: cmd
        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -185,10 +167,6 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
      - name: Bootstrap Rust
        shell: cmd
        run: |
--- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
@ -50,7 +50,7 @@ jobs:
  wheel-py3_11-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -73,23 +73,12 @@ jobs:
        run: |
          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Enable long paths
+        shell: cmd
+        run: |
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
        uses: actions/checkout@v4
        with:
          path: "pytorch"
@ -134,7 +123,7 @@ jobs:
    needs:
      - wheel-py3_11-cpu-build
      - get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -152,18 +141,11 @@ jobs:
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
+      - name: Enable long paths
        shell: cmd
        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -177,10 +159,6 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
      - name: Bootstrap Rust
        shell: cmd
        run: |
@ -219,7 +197,7 @@ jobs:
  wheel-py3_12-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -242,23 +220,12 @@ jobs:
        run: |
          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Enable long paths
+        shell: cmd
+        run: |
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
        uses: actions/checkout@v4
        with:
          path: "pytorch"
@ -303,7 +270,7 @@ jobs:
    needs:
      - wheel-py3_12-cpu-build
      - get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -321,18 +288,11 @@ jobs:
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
+      - name: Enable long paths
        shell: cmd
        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -346,10 +306,6 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
      - name: Bootstrap Rust
        shell: cmd
        run: |
@ -388,7 +344,7 @@ jobs:
  wheel-py3_13-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -411,23 +367,12 @@ jobs:
        run: |
          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Enable long paths
+        shell: cmd
+        run: |
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
        uses: actions/checkout@v4
        with:
          path: "pytorch"
@ -472,7 +417,7 @@ jobs:
    needs:
      - wheel-py3_13-cpu-build
      - get-label-type
-    runs-on: "windows-11-arm64"
+    runs-on: "windows-11-arm64-preview"
    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -490,18 +435,11 @@ jobs:
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
+      - name: Enable long paths
        shell: cmd
        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
+          git config --system --get core.longpaths || echo "core.longpaths is not set, setting it now"
+          git config --system core.longpaths true
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -515,10 +453,6 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
      - name: Bootstrap Rust
        shell: cmd
        run: |
--- a/.github/workflows/inductor-micro-benchmark.yml
+++ b/.github/workflows/inductor-micro-benchmark.yml
@ -27,15 +27,15 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build:
-    name: cuda12.6-py3.10-gcc9-sm80
+  build:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs:
      - get-default-label-prefix
    with:
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
@ -43,13 +43,13 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-test:
-    name: cuda12.6-py3.10-gcc9-sm80
+  test:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build
+    needs: build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
    secrets: inherit
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@ -24,15 +24,15 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
-    name: cuda12.6-py3.10-gcc9-sm80
+  build:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs:
      - get-default-label-prefix
    with:
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
@ -43,14 +43,14 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
-    name: cuda12.6-py3.10-gcc9-sm80
+  test:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
      # disable monitor in perf tests for more investigation
      disable-monitor: false
      monitor-log-interval: 15
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@ -79,13 +79,13 @@ jobs:

  # NB: Keep this in sync with trunk.yml
  build:
-    name: cuda12.6-py3.10-gcc9-sm90
+    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '9.0'
      test-matrix: |
        { include: [
@ -111,12 +111,12 @@ jobs:
    secrets: inherit

  test-nightly:
-    name: cuda12.6-py3.10-gcc9-sm90
+    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 1-6'
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -128,12 +128,12 @@ jobs:
    secrets: inherit

  test-weekly:
-    name: cuda12.6-py3.10-gcc9-sm90
+    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event.schedule == '0 7 * * 0'
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -145,12 +145,12 @@ jobs:
    secrets: inherit

  test:
-    name: cuda12.6-py3.10-gcc9-sm90
+    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
    if: github.event_name == 'workflow_dispatch'
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
--- a/.github/workflows/inductor-perf-test-nightly-macos.yml
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml
@ -42,7 +42,7 @@ jobs:
      runner-type: macos-m1-stable
      build-generates-artifacts: true
      # To match the one pre-installed in the m1 runners
-      python-version: 3.9.12
+      python-version: 3.12.7
      test-matrix: |
        { include: [
          { config: "perf_smoketest", shard: 1, num_shards: 1, runner: "macos-m2-15" },
@ -56,8 +56,10 @@ jobs:
    with:
      build-environment: macos-py3-arm64-distributed
      # Same as the build job
-      python-version: 3.9.12
+      python-version: 3.12.7
      test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
-      # disable monitor in perf tests for more investigation
-      disable-monitor: true
+      disable-monitor: false
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
+
    secrets: inherit
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@ -78,14 +78,14 @@ jobs:
      opt_out_experiments: lf

  # NB: Keep this in sync with trunk.yml
-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
-    name: cuda12.6-py3.10-gcc9-sm80
+  build:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
@ -112,32 +112,32 @@ jobs:
      selected-test-configs: ${{ inputs.benchmark_configs }}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test-nightly:
-    name: cuda12.6-py3.10-gcc9-sm80
+  test-nightly:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: build
    if: github.event.schedule == '0 7 * * 1-6'
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
      disable-monitor: false
      monitor-log-interval: 15
      monitor-data-collect-interval: 4
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test-weekly:
-    name: cuda12.6-py3.10-gcc9-sm80
+  test-weekly:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: build
    if: github.event.schedule == '0 7 * * 0'
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 1440
      # disable monitor in perf tests, next step is to enable it
      disable-monitor: false
@ -145,16 +145,16 @@ jobs:
      monitor-data-collect-interval: 4
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
-    name: cuda12.6-py3.10-gcc9-sm80
+  test:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: build
    if: github.event_name == 'workflow_dispatch'
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
      disable-monitor: false
      monitor-log-interval: 15
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -29,14 +29,14 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

-  linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build:
-    name: cuda12.6-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
+  linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build:
+    name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
    uses: ./.github/workflows/_linux-build.yml
    needs: get-default-label-prefix
    with:
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
@ -58,14 +58,14 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-test:
-    name: cuda12.6-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
+  linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test:
+    name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build
+    needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build:
@ -109,15 +109,15 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp:
-    name: cuda12.6-py3.10-gcc9-sm80
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs:
      - get-default-label-prefix
    with:
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
@ -125,14 +125,14 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test-gcp:
-    name: cuda12.6-py3.10-gcc9-sm80
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp
+    needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm80
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build:
@ -170,16 +170,16 @@ jobs:
    secrets: inherit


-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
-    name: cuda12.6-py3.10-gcc9-sm86
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
+    name: cuda12.8-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
    needs: get-default-label-prefix
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+      sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
      test-matrix: |
        { include: [
          { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -195,14 +195,14 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
-    name: cuda12.6-py3.10-gcc9-sm86
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
+    name: cuda12.8-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-cpu-py3_9-gcc11-inductor-build:
--- a/.github/workflows/inductor-rocm-mi300.yml
+++ b/.github/workflows/inductor-rocm-mi300.yml
@ -38,12 +38,12 @@ jobs:
      opt_out_experiments: lf

  linux-jammy-rocm-py3_10-inductor-build:
-    name: rocm-py3.10-inductor
+    name: rocm-py3.10-inductor-mi300
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      test-matrix: |
        { include: [
@ -56,11 +56,11 @@ jobs:
    permissions:
      id-token: write
      contents: read
-    name: rocm-py3.10-inductor
+    name: rocm-py3.10-inductor-mi300
    uses: ./.github/workflows/_rocm-test.yml
    needs: linux-jammy-rocm-py3_10-inductor-build
    with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.docker-image }}
      test-matrix:  ${{ needs.linux-jammy-rocm-py3_10-inductor-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/inductor-rocm.yml
+++ b/.github/workflows/inductor-rocm.yml
@ -43,6 +43,7 @@ jobs:
          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-jammy-rocm-py3_10-inductor-test:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -26,13 +26,13 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
+  linux-jammy-cuda12_6-py3_10-gcc9-inductor-build:
    name: cuda12.6-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.6-py3.10-gcc9-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
@ -43,25 +43,26 @@ jobs:
          { config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
+  linux-jammy-cuda12_6-py3_10-gcc9-inductor-test:
    name: cuda12.6-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: linux-jammy-cuda12_6-py3_10-gcc9-inductor-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.6-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

-  linux-focal-cuda12_6-py3_12-gcc9-inductor-build:
+  linux-jammy-cuda12_6-py3_12-gcc9-inductor-build:
    name: cuda12.6-py3.12-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-focal-cuda12.6-py3.12-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.6-py3.12-gcc9-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
@ -69,16 +70,17 @@ jobs:
          { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

-  linux-focal-cuda12_6-py3_12-gcc9-inductor-test:
+  linux-jammy-cuda12_6-py3_12-gcc9-inductor-test:
    name: cuda12.6-py3.12-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_12-gcc9-inductor-build
+    needs: linux-jammy-cuda12_6-py3_12-gcc9-inductor-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.12-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_12-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_12-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.6-py3.12-gcc9-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_6-py3_12-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_6-py3_12-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-cpu-py3_12-inductor-halide-build:
@ -93,6 +95,7 @@ jobs:
        { include: [
          { config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-jammy-cpu-py3_12-inductor-halide-test:
@ -117,6 +120,7 @@ jobs:
        { include: [
          { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
@ -144,6 +148,7 @@ jobs:
          { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
          { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-jammy-cpu-py3_9-gcc11-inductor-test:
@ -156,27 +161,28 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
    secrets: inherit

-  linux-focal-cuda12_6-py3_13-gcc9-inductor-build:
+  linux-jammy-cuda12_6-py3_13-gcc9-inductor-build:
    name: cuda12.6-py3.13-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-focal-cuda12.6-py3.13-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.6-py3.13-gcc9-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
          { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

-  linux-focal-cuda12_6-py3_13-gcc9-inductor-test:
+  linux-jammy-cuda12_6-py3_13-gcc9-inductor-test:
    name: cuda12.6-py3.13-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_13-gcc9-inductor-build
+    needs: linux-jammy-cuda12_6-py3_13-gcc9-inductor-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.13-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_13-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_13-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.6-py3.13-gcc9-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_6-py3_13-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_6-py3_13-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -42,16 +42,16 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-build:
-    name: cuda12.6-py3.10-gcc9-sm86
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
+    name: cuda12.8-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+      sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
      test-matrix: |
        { include: [
          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
@ -60,16 +60,17 @@ jobs:
          { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

-  linux-focal-cuda12_6-py3_10-gcc9-inductor-test:
-    name: cuda12.6-py3.10-gcc9-sm86
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
+    name: cuda12.8-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
+    needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
    with:
-      build-environment: linux-focal-cuda12.6-py3.10-gcc9-sm86
-      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-cpu-py3_9-gcc11-inductor-build:
@ -92,6 +93,7 @@ jobs:
          { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
          { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-jammy-cpu-py3_9-gcc11-inductor-test:
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -283,6 +283,15 @@ jobs:
          # All we need to see is that it passes
          python3 torch/utils/collect_env.py

+  link-check:
+    name: Link checks
+    needs: get-label-type
+    uses: ./.github/workflows/_link_check.yml
+    with:
+      runner: ${{ needs.get-label-type.outputs.label-type }}
+      ref:    ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+    secrets: inherit
+
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@ -23,7 +23,7 @@ jobs:
      runner-type: macos-m1-stable
      build-generates-artifacts: true
      # To match the one pre-installed in the m1 runners
-      python-version: 3.9.12
+      python-version: 3.12.7
      # The runner macos-m2-14 is not a typo, it's a custom runner that is different
      # than our AWS macos-m1-14 runners
      test-matrix: |
@ -42,6 +42,7 @@ jobs:
      sync-tag: macos-py3-arm64-mps-test
      build-environment: macos-py3-arm64
      # Same as the build job
-      python-version: 3.9.12
+      python-version: 3.12.7
      test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
+      disable-monitor: false
    secrets: inherit
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@ -34,7 +34,6 @@ jobs:
    with:
      runner: ${{ needs.get-label-type.outputs.label-type }}
      ref:    ${{ github.sha }}
-      run-url-lint: true
    secrets: inherit

  docs-build:
--- a/.github/workflows/periodic-rocm-mi300.yml
+++ b/.github/workflows/periodic-rocm-mi300.yml
@ -50,12 +50,12 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}

  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      test-matrix: |
        { include: [
@ -69,13 +69,13 @@ jobs:
    permissions:
      id-token: write
      contents: read
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
    uses: ./.github/workflows/_rocm-test.yml
    needs:
      - linux-jammy-rocm-py3_10-build
      - target-determination
    with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -138,6 +138,7 @@ jobs:
          { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
        ]}
      sync-tag: asan-build
+      allow-reuse-old-whl: true
    secrets: inherit


@ -202,6 +203,7 @@ jobs:
          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-focal-py3_9-clang10-test:
@ -237,6 +239,7 @@ jobs:
          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-focal-py3_13-clang10-test:
@ -250,14 +253,14 @@ jobs:
      timeout-minutes: 600
    secrets: inherit

-  linux-focal-cuda11_8-py3_10-gcc9-build:
-    name: linux-focal-cuda11.8-py3.10-gcc9
+  linux-focal-cuda12_6-py3_10-gcc11-build-distributed:
+    name: linux-focal-cuda12.6-py3.10-gcc11-build-distributed
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda11.8-py3.10-gcc9
-      docker-image-name: ci-image:pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
+      build-environment: linux-focal-cuda12.6-py3.10-gcc11-distributed
+      docker-image-name: ci-image:pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11
      cuda-arch-list: '7.5'
      test-matrix: |
        { include: [
@ -267,17 +270,17 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda11_8-py3_10-gcc9-test:
-    name: linux-focal-cuda11.8-py3.10-gcc9
+  linux-focal-cuda12_6-py3_10-gcc11-test-distributed:
+    name: linux-focal-cuda12.6-py3.10-gcc11-test
    uses: ./.github/workflows/_linux-test.yml
    needs:
-      - linux-focal-cuda11_8-py3_10-gcc9-build
+      - linux-focal-cuda12_6-py3_10-gcc11-build-distributed
      - target-determination
    with:
      timeout-minutes: 360
-      build-environment: linux-focal-cuda11.8-py3.10-gcc9
-      docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.6-py3.10-gcc11-distributed
+      docker-image: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-build-distributed.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_6-py3_10-gcc11-build-distributed.outputs.test-matrix }}
    secrets: inherit

  linux-focal-cuda12_6-py3_10-gcc11-build:
@ -296,6 +299,7 @@ jobs:
          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-focal-cuda12_6-py3_10-gcc11-test:
@ -364,25 +368,6 @@ jobs:
      test-matrix: ${{ needs.linux-focal-py3_9-clang9-xla-build.outputs.test-matrix }}
    secrets: inherit

-  win-vs2022-cpu-py3-build:
-    # don't run build twice on main
-    if: github.event_name == 'pull_request'
-    name: win-vs2022-cpu-py3
-    uses: ./.github/workflows/_win-build.yml
-    needs: get-label-type
-    with:
-      build-environment: win-vs2022-cpu-py3
-      cuda-version: cpu
-      sync-tag: win-cpu-build
-      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-          { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-          { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-        ]}
-    secrets: inherit
-
  linux-focal-cpu-py3_10-gcc11-bazel-test:
    name: linux-focal-cpu-py3.10-gcc11-bazel-test
    uses: ./.github/workflows/_bazel-build-test.yml
@ -449,6 +434,7 @@ jobs:
          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  unstable-linux-focal-cuda12_6-py3_10-gcc11-sm89-build-xfail:
@ -469,6 +455,7 @@ jobs:
        { include: [
          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-focal-cuda12_6-py3_10-gcc11-sm89-test:
@ -507,29 +494,30 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
    secrets: inherit

-  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
-    name: cuda12.4-py3.10-gcc9-sm75
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
+    name: cuda12.8-py3.10-gcc9-sm75
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '7.5'
      test-matrix: |
        { include: [
          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
        ]}
+      allow-reuse-old-whl: true
    secrets: inherit

-  linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
-    name: cuda12.4-py3.10-gcc9-sm75
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
+    name: cuda12.8-py3.10-gcc9-sm75
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
+    needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
    with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm75
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-xpu-2025_1-py3_9-build:
--- a/.github/workflows/rocm-mi300.yml
+++ b/.github/workflows/rocm-mi300.yml
@ -38,12 +38,12 @@ jobs:

  linux-jammy-rocm-py3_10-build:
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      sync-tag: rocm-build
      test-matrix: |
@ -61,13 +61,13 @@ jobs:
    permissions:
      id-token: write
      contents: read
-    name: linux-jammy-rocm-py3.10
+    name: linux-jammy-rocm-py3.10-mi300
    uses: ./.github/workflows/_rocm-test.yml
    needs:
      - linux-jammy-rocm-py3_10-build
      - target-determination
    with:
-      build-environment: linux-jammy-rocm-py3.10
+      build-environment: linux-jammy-rocm-py3.10-mi300
      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -148,6 +148,7 @@ jobs:
          { config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
        ]}
      sync-tag: asan-build
+      allow-reuse-old-whl: true
    secrets: inherit

  linux-jammy-py3_10-clang15-asan-test:
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@ -5,6 +5,8 @@ on:
    paths:
      - .github/workflows/test-h100.yml
  workflow_dispatch:
+  schedule:
+    - cron: 0 4,10,16,22 * * *  # every 6 hours
  push:
    tags:
      - ciflow/h100/*
--- a/.github/workflows/torchbench.yml
+++ b/.github/workflows/torchbench.yml
@ -21,15 +21,15 @@ jobs:
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

-  linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp:
-    name: cuda12.4-py3.10-gcc9-sm80
+  build:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs:
      - get-default-label-prefix
    with:
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
@ -37,12 +37,12 @@ jobs:
        ]}
    secrets: inherit

-  linux-focal-cuda12_4-py3_10-gcc9-torchbench-test-gcp:
-    name: cuda12.4-py3.10-gcc9-sm80
+  test:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp
+    needs: build
    with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
-      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-torchbench-build-gcp.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image: ${{ needs.build.outputs.docker-image }}
+      test-matrix: ${{ needs.build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -86,7 +86,7 @@ jobs:
      runner-type: macos-m1-stable
      build-generates-artifacts: true
      # To match the one pre-installed in the m1 runners
-      python-version: 3.9.12
+      python-version: 3.12.7
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
@ -107,8 +107,9 @@ jobs:
    with:
      build-environment: macos-py3-arm64
      # Same as the build job
-      python-version: 3.9.12
+      python-version: 3.12.7
      test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
+      disable-monitor: false
    secrets: inherit

  win-vs2022-cpu-py3-build:
@ -118,7 +119,6 @@ jobs:
    with:
      build-environment: win-vs2022-cpu-py3
      cuda-version: cpu
-      sync-tag: win-cpu-build
      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
      test-matrix: |
        { include: [
@ -187,13 +187,13 @@ jobs:
    secrets: inherit

  # NB: Keep this in sync with inductor-perf-test-nightly.yml
-  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
-    name: cuda12.4-py3.10-gcc9-sm80
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
+    name: cuda12.8-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
    secrets: inherit

--- a/.gitignore
+++ b/.gitignore
@ -47,6 +47,7 @@ docs/source/generated/
 docs/source/compile/generated/
 log
 usage_log.txt
+usage_log*
 test-reports/
 test/*.bak
 test/**/*.bak
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -152,12 +152,12 @@ init_command = [
    'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
    'numpy==2.1.0 ; python_version >= "3.12"',
    'expecttest==0.3.0',
-    'mypy==1.14.0',
+    'mypy==1.15.0',
    'sympy==1.13.3',
    'types-requests==2.27.25',
    'types-PyYAML==6.0.7',
    'types-tabulate==0.8.8',
-    'types-protobuf==3.19.18',
+    'types-protobuf==5.29.1.20250403',
    'types-pkg-resources==0.1.3',
    'types-Jinja2==2.11.9',
    'types-colorama==0.4.6',
@ -1160,12 +1160,6 @@ exclude_patterns = [
    'torch/_inductor/autoheuristic/artifacts/**',
    # These files are all grandfathered in, feel free to remove from this list
    # as necessary
-    'test/_nvfuser/__init__.py',
-    'test/_nvfuser/test_dynamo.py',
-    'test/_nvfuser/test_python_frontend.py',
-    'test/_nvfuser/test_torchscript.py',
-    'test/delete.py',
-    'test/expect/__init__.py',
    'test/quantization/__init__.py',
    'test/quantization/core/__init__.py',
    'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py',
@ -1193,8 +1187,6 @@ exclude_patterns = [
    'test/quantization/fx/test_numeric_suite_fx.py',
    'test/quantization/fx/test_quantize_fx.py',
    'test/quantization/fx/test_subgraph_rewriter.py',
-    'test/test_fake_tensor.py',
-    'test/test_flop_counter.py',
    'test/test_function_schema.py',
    'test/test_functional_autograd_benchmark.py',
    'test/test_functional_optim.py',
@ -1324,13 +1316,6 @@ exclude_patterns = [
    'torch/_export/passes/const_prop_pass.py',
    'torch/_export/passes/functionalize_side_effectful_ops_pass.py',
    'torch/_export/passes/replace_sym_size_ops_pass.py',
-    'torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py',
-    'torch/_export/serde/__init__.py',
-    'torch/_export/serde/schema.py',
-    'torch/_export/serde/serialize.py',
-    'torch/_export/serde/upgrade.py',
-    'torch/_export/trace.py',
-    'torch/_export/verifier.py',
    'torch/testing/_internal/__init__.py',
    'torch/testing/_internal/autocast_test_lists.py',
    'torch/testing/_internal/autograd_function_db.py',
@ -1447,7 +1432,6 @@ exclude_patterns = [
    'torch/utils/throughput_benchmark.py',
    'torch/utils/viz/__init__.py',
    'torch/utils/viz/_cycles.py',
-    'torch/utils/weak.py',
 ]
 init_command = [
    'python3',
@ -1521,7 +1505,7 @@ code = 'RUFF'
 include_patterns = [
    '**/*.py',
    '**/*.pyi',
-    'torch/utils/data/*.ipynb',
+    '**/*.ipynb',
    'pyproject.toml',
 ]
 exclude_patterns = [
@ -1552,7 +1536,7 @@ init_command = [
 ]
 is_formatter = true

-# This linter prevents merge conlicts in csv files in pytorch by enforcing
+# This linter prevents merge conflicts in csv files in pytorch by enforcing
 # three lines of whitespace between entries such that unless people are modifying
 # the same line, merge conflicts should not arise in git or hg
 [[linter]]
@ -1736,3 +1720,15 @@ command = [
 include_patterns = [
    'test/**/test_*.py',
 ]
+
+# 'header_only_linter' reports on properly testing header-only APIs.
+[[linter]]
+code = 'HEADER_ONLY_LINTER'
+command = [
+    'python3',
+    'tools/linter/adapters/header_only_linter.py',
+]
+include_patterns = [
+    'torch/header_only_apis.txt',
+]
+is_formatter = false
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -985,12 +985,11 @@ endif()
 include(cmake/public/utils.cmake)
 if(NOT MSVC)
  string(APPEND CMAKE_CXX_FLAGS " -O2 -fPIC")
-  if(NOT USE_XPU)
-    # This prevents use of `c10::optional`, `c10::nullopt` etc within the codebase
-    string(APPEND CMAKE_CXX_FLAGS " -DC10_NODEPRECATED")
-    string(APPEND CMAKE_CUDA_FLAGS " -DC10_NODEPRECATED")
-    string(APPEND CMAKE_OBJCXX_FLAGS " -DC10_NODEPRECATED")
-  endif()
+
+  # This prevents use of `c10::optional`, `c10::nullopt` etc within the codebase
+  string(APPEND CMAKE_CXX_FLAGS " -DC10_NODEPRECATED")
+  string(APPEND CMAKE_CUDA_FLAGS " -DC10_NODEPRECATED")
+  string(APPEND CMAKE_OBJCXX_FLAGS " -DC10_NODEPRECATED")

  # Eigen fails to build with some versions, so convert this to a warning
  # Details at http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1459
--- a/14
+++ b/14
@ -14,6 +14,7 @@
 /torch/csrc/autograd/ @albanD @soulitzer
 /torch/autograd/ @albanD @soulitzer
 /tools/autograd/ @albanD @soulitzer
+/torch/header_only_apis.txt @janeyx99
 /torch/nn/ @albanD @jbschlosser @mikaylagawarecki
 /torch/optim/ @albanD @janeyx99
 /test/test_public_bindings.py @albanD
@ -21,6 +22,7 @@
 /test/forward_backward_compatibility/check_forward_backward_compatibility.py @larryliu0820
 /docs/source/conf.py @albanD
 /aten/src/ATen/native/tags.yaml @ezyang
+/.github/merge_rules.yaml @albanD @malfet

 # Architecture Optimization (quantization, sparsity, etc.)
 /aten/src/ATen/native/ao_sparse @salilsdesai @kimishpatel @digantdesai @jianyuh
@ -49,12 +51,12 @@ nn/qat/ @jerryzh168
 /torch/csrc/distributed/c10d/Ops.* @kwen2501

 # ONNX Export
-/torch/_dynamo/backends/onnxrt.py @wschin @xadupre
-/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 @xadupre
-/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 @xadupre
-/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 @xadupre
-/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre
-/test/onnx/  @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre
+/torch/_dynamo/backends/onnxrt.py @wschin
+/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1
+/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1
+/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1
+/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin
+/test/onnx/  @titaiwangms @shubhambhokare1 @justinchuby @wschin

 # CI
 /.ci  @pytorch/pytorch-dev-infra
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -112,8 +112,7 @@ source venv/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
  lazy.)

  ```bash
-  conda uninstall pytorch -y
-  yes | pip uninstall torch
+  pip uninstall torch
  ```

  Next run `python setup.py clean`. After that, you can install in `develop` mode again.
@ -180,14 +179,6 @@ You can use this script to check out a new nightly branch with the following:
 source venv/bin/activate  # or `& .\venv\Scripts\Activate.ps1` on Windows
 ```

-Or if you would like to re-use an existing conda environment, you can pass in
-the prefix argument (`--prefix`):
-
-```bash
-./tools/nightly.py checkout -b my-nightly-branch -p my-env
-source my-env/bin/activate  # or `& .\my-env\Scripts\Activate.ps1` on Windows
-```
-
 To install the nightly binaries built with CUDA, you can pass in the flag `--cuda`:

 ```bash
@ -289,7 +280,7 @@ dependencies as well as the nightly binaries into the repo directory.
 ### Python Unit Testing

 **Prerequisites**:
-The following packages should be installed with either `conda` or `pip`:
+The following packages should be installed with `pip`:
 - `expecttest` and `hypothesis` - required to run tests
 - `mypy` - recommended for linting
 - `pytest` - recommended to run tests more selectively
@ -497,8 +488,7 @@ pip install -r requirements.txt
 # Or if you prefer an uncontaminated global executable environment or do not want to go through the node configuration:
 # npm install katex && export PATH="$PATH:$(pwd)/node_modules/.bin"
 ```
-> Note: if you installed `nodejs` with a different package manager (e.g.,
-`conda`) then `npm` will probably install a version of `katex` that is not
+> Note: if you installed `nodejs` with a different package manager, then `npm` will probably install a version of `katex` that is not
 compatible with your version of `nodejs` and doc builds will fail.
 A combination of versions that is known to work is `node@6.13.1` and
 `katex@0.13.18`. To install the latter with `npm` you can run
@ -670,13 +660,13 @@ you run `import torch` anywhere else, the development version will be
 used).

 If you want to manage multiple builds of PyTorch, you can make use of
-[conda environments](https://conda.io/docs/using/envs.html) to maintain
+[venv environments](https://docs.python.org/3/library/venv.html) to maintain
 separate Python package environments, each of which can be tied to a
 specific build of PyTorch. To set one up:

 ```bash
-conda create -n pytorch-myfeature
-source activate pytorch-myfeature
+python -m venv pytorch-myfeature
+source pytorch-myfeature/bin/activate  # or `& .\pytorch-myfeature\Scripts\Activate.ps1` on Windows
 # if you run python now, torch will NOT be installed
 python setup.py develop
 ```
@ -754,7 +744,6 @@ same. Using ccache in a situation like this is a real time-saver.
 Before building pytorch, install ccache from your package manager of choice:

 ```bash
-conda install ccache -c conda-forge
 sudo apt install ccache
 sudo yum install ccache
 brew install ccache
@ -1046,8 +1035,7 @@ than Linux, which are worth keeping in mind when fixing these problems.

 3. If you have a Windows box (we have a few on EC2 which you can request access to) and
   you want to run the build, the easiest way is to just run `.ci/pytorch/win-build.sh`.
-   If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh` (this will avoid
-   blowing away your Conda environment.)
+   If you need to rebuild, run `REBUILD=1 .ci/pytorch/win-build.sh`.

 Even if you don't know anything about MSVC, you can use cmake to build simple programs on
 Windows; this can be helpful if you want to learn more about some peculiar linking behavior
@ -1264,7 +1252,7 @@ in the meantime there will be some separation.
 There are a few "unusual" directories which, for historical reasons,
 are Caffe2/PyTorch specific. Here they are:

- `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `conda`, `modules`,
+- `CMakeLists.txt`, `Makefile`, `binaries`, `cmake`, `modules`,
  `scripts` are Caffe2-specific. Don't put PyTorch code in them without
  extra coordination.

--- a/RELEASE.md
+++ b/RELEASE.md
@ -373,8 +373,9 @@ The patch release process takes around 4-5 weeks to complete.
  * Should the new patch release be created?
  * Timeline execution for the patch release
 3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
-4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
-5. General Availability
+4. Updating `version.txt` in the release branch to match the expected patch release version; see https://github.com/pytorch/pytorch/commit/f77213d3dae5d103a39cdaf93f21863843571e8d for an example.
+5. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
+6. General Availability

 ### Triage

--- a/10
+++ b/10
@ -144,8 +144,8 @@ new_local_repository(

 new_local_repository(
    name = "asmjit",
-    build_file = "//third_party:fbgemm/third_party/asmjit.BUILD",
-    path = "third_party/fbgemm/third_party/asmjit",
+    build_file = "//third_party:fbgemm/external/asmjit.BUILD",
+    path = "third_party/fbgemm/external/asmjit",
 )

 new_local_repository(
@ -184,6 +184,12 @@ new_local_repository(
    path = "third_party/nlohmann",
 )

+new_local_repository(
+    name = "moodycamel",
+    build_file = "//third_party:moodycamel.BUILD",
+    path = "third_party/concurrentqueue",
+)
+
 new_local_repository(
    name = "tensorpipe",
    build_file = "//third_party:tensorpipe.BUILD",
--- a/android/README.md
+++ b/android/README.md
@ -2,7 +2,9 @@

 ## Demo applications and tutorials

-Demo applications with code walk-through can be find in [this github repo](https://github.com/pytorch/android-demo-app).
+Please refer to [pytorch-labs/executorch-examples](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo) for the Android demo app based on [ExecuTorch](https://github.com/pytorch/executorch).
+
+Please join our [Discord](https://discord.com/channels/1334270993966825602/1349854760299270284) for any questions.

 ## Publishing

@ -119,8 +121,6 @@ We also have to add all transitive dependencies of our aars.
 As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.10.5'` and `'com.facebook.fbjni:fbjni-java-only:0.2.2'`, we need to add them.
 (In case of using maven dependencies they are added automatically from `pom.xml`).

-You can check out [test app example](https://github.com/pytorch/pytorch/blob/master/android/test_app/app/build.gradle) that uses aars directly.
-
 ## Linking to prebuilt libtorch library from gradle dependency

 In some cases, you may want to use libtorch from your android native build.
@ -202,7 +202,7 @@ find_library(FBJNI_LIBRARY fbjni
  NO_CMAKE_FIND_ROOT_PATH)

 target_link_libraries(${PROJECT_NAME}
-  ${PYTORCH_LIBRARY})
+  ${PYTORCH_LIBRARY}
  ${FBJNI_LIBRARY})

 ```
@ -233,8 +233,6 @@ void loadAndForwardModel(const std::string& modelPath) {

 To load torchscript model for mobile we need some special setup which is placed in `struct JITCallGuard` in this example. It may change in future, you can track the latest changes keeping an eye in our [pytorch android jni code]([https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp#L28)

-[Example of linking to libtorch from aar](https://github.com/pytorch/pytorch/tree/master/android/test_app)
-
 ## PyTorch Android API Javadoc

 You can find more details about the PyTorch Android API in the [Javadoc](https://pytorch.org/javadoc/).
--- a/android/build_test_app.sh
+++ b/android/build_test_app.sh
@ -1,30 +0,0 @@
-#!/bin/bash
-set -eux
-
-PYTORCH_DIR="$(cd $(dirname $0)/..; pwd -P)"
-PYTORCH_ANDROID_DIR=$PYTORCH_DIR/android
-
-echo "PYTORCH_DIR:$PYTORCH_DIR"
-
-source "$PYTORCH_ANDROID_DIR/common.sh"
-
-check_android_sdk
-check_gradle
-parse_abis_list "$@"
-build_android
-
-# To set proxy for gradle add following lines to ./gradle/gradle.properties:
-# systemProp.http.proxyHost=...
-# systemProp.http.proxyPort=8080
-# systemProp.https.proxyHost=...
-# systemProp.https.proxyPort=8080
-
-if [ "$CUSTOM_ABIS_LIST" = true ]; then
-  NDK_DEBUG=1 $GRADLE_PATH -PnativeLibsDoNotStrip=true -PABI_FILTERS=$ABIS_LIST -p $PYTORCH_ANDROID_DIR clean test_app:assembleDebug
-else
-  NDK_DEBUG=1 $GRADLE_PATH -PnativeLibsDoNotStrip=true -p $PYTORCH_ANDROID_DIR clean test_app:assembleDebug
-fi
-
-find $PYTORCH_ANDROID_DIR -type f -name *apk
-
-find $PYTORCH_ANDROID_DIR -type f -name *apk | xargs echo "To install apk run: $ANDROID_HOME/platform-tools/adb install -r "
--- a/android/build_test_app_custom.sh
+++ b/android/build_test_app_custom.sh
@ -1,32 +0,0 @@
-#!/bin/bash
-###############################################################################
-# This script tests the custom selective build flow for PyTorch Android, which
-# optimizes library size by only including ops used by a specific model.
-###############################################################################
-
-set -eux
-
-PYTORCH_DIR="$(cd $(dirname $0)/..; pwd -P)"
-PYTORCH_ANDROID_DIR="${PYTORCH_DIR}/android"
-BUILD_ROOT="${PYTORCH_DIR}/build_pytorch_android_custom"
-
-source "${PYTORCH_ANDROID_DIR}/common.sh"
-
-prepare_model_and_dump_root_ops() {
-  cd "${BUILD_ROOT}"
-  MODEL="${BUILD_ROOT}/MobileNetV2.pt"
-  ROOT_OPS="${BUILD_ROOT}/MobileNetV2.yaml"
-  python "${PYTORCH_ANDROID_DIR}/test_app/make_assets_custom.py"
-  cp "${MODEL}" "${PYTORCH_ANDROID_DIR}/test_app/app/src/main/assets/mobilenet2.pt"
-}
-
-# Start building
-mkdir -p "${BUILD_ROOT}"
-check_android_sdk
-check_gradle
-parse_abis_list "$@"
-prepare_model_and_dump_root_ops
-SELECTED_OP_LIST="${ROOT_OPS}" build_android
-
-# TODO: change this to build test_app instead
-$GRADLE_PATH -PABI_FILTERS=$ABIS_LIST -p $PYTORCH_ANDROID_DIR clean assembleRelease
--- a/android/settings.gradle
+++ b/android/settings.gradle
@ -3,4 +3,3 @@ include ':app', ':pytorch_android', ':pytorch_android_torchvision', ':pytorch_ho
 project(':pytorch_android_torchvision').projectDir = file('pytorch_android_torchvision')

 project(':pytorch_host').projectDir = file('pytorch_android/host')
-project(':test_app').projectDir = file('test_app/app')
--- a/android/test_app/.gitignore
+++ b/android/test_app/.gitignore
@ -1,9 +0,0 @@
-local.properties
-**/*.iml
-.gradle
-gradlew*
-gradle/wrapper
-.idea/*
-.DS_Store
-build
-.externalNativeBuild
--- a/android/test_app/app/CMakeLists.txt
+++ b/android/test_app/app/CMakeLists.txt
@ -1,38 +0,0 @@
-cmake_minimum_required(VERSION 3.5)
-set(PROJECT_NAME pytorch_testapp_jni)
-project(${PROJECT_NAME} CXX)
-set(CMAKE_CXX_STANDARD 17 CACHE STRING "The C++ standard whose features are requested to build this target.")
-set(CMAKE_VERBOSE_MAKEFILE ON)
-
-set(build_DIR ${CMAKE_SOURCE_DIR}/build)
-
-set(pytorch_testapp_cpp_DIR ${CMAKE_CURRENT_LIST_DIR}/src/main/cpp)
-message(STATUS "ANDROID_STL:${ANDROID_STL}")
-file(GLOB pytorch_testapp_SOURCES
-  ${pytorch_testapp_cpp_DIR}/pytorch_testapp_jni.cpp
-)
-
-add_library(${PROJECT_NAME} SHARED
-    ${pytorch_testapp_SOURCES}
-)
-
-file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers")
-file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}")
-
-target_compile_options(${PROJECT_NAME} PRIVATE
-  -fexceptions
-)
-
-set(BUILD_SUBDIR ${ANDROID_ABI})
-
-target_include_directories(${PROJECT_NAME} PRIVATE
-  ${PYTORCH_INCLUDE_DIRS}
-)
-
-find_library(PYTORCH_LIBRARY pytorch_jni
-  PATHS ${PYTORCH_LINK_DIRS}
-  NO_CMAKE_FIND_ROOT_PATH)
-
-target_link_libraries(${PROJECT_NAME}
-  ${PYTORCH_LIBRARY}
-  log)
--- a/android/test_app/app/build.gradle
+++ b/android/test_app/app/build.gradle
@ -1,190 +0,0 @@
-apply plugin: 'com.android.application'
-
-repositories {
-    jcenter()
-    maven {
-        url "https://oss.sonatype.org/content/repositories/snapshots"
-    }
-    flatDir {
-        dirs 'aars'
-    }
-}
-
-android {
-    configurations {
-        extractForNativeBuild
-    }
-    compileOptions {
-        sourceCompatibility 1.8
-        targetCompatibility 1.8
-    }
-    compileSdkVersion rootProject.compileSdkVersion
-    buildToolsVersion rootProject.buildToolsVersion
-    defaultConfig {
-        applicationId "org.pytorch.testapp"
-        minSdkVersion rootProject.minSdkVersion
-        targetSdkVersion rootProject.targetSdkVersion
-        versionCode 1
-        versionName "1.0"
-        ndk {
-            abiFilters ABI_FILTERS.split(",")
-        }
-        // Commented due to dependency on local copy of pytorch_android aar to aars folder
-        //externalNativeBuild {
-        //    cmake {
-        //        abiFilters ABI_FILTERS.split(",")
-        //        arguments "-DANDROID_STL=c++_shared"
-        //    }
-        //}
-        buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2q.pt\"")
-        buildConfigField("String", "LOGCAT_TAG", "@string/app_name")
-        buildConfigField("long[]", "INPUT_TENSOR_SHAPE", "new long[]{1, 3, 224, 224}")
-        buildConfigField("boolean", "NATIVE_BUILD", 'false')
-        buildConfigField("boolean", "USE_VULKAN_DEVICE", 'false')
-        buildConfigField(
-                "int",
-                "BUILD_LITE_INTERPRETER",
-                System.env.BUILD_LITE_INTERPRETER != null ? System.env.BUILD_LITE_INTERPRETER : "1"
-        )
-        addManifestPlaceholders([APP_NAME: "@string/app_name", MAIN_ACTIVITY: "org.pytorch.testapp.MainActivity"])
-    }
-    buildTypes {
-        debug {
-            minifyEnabled false
-            debuggable true
-        }
-        release {
-            minifyEnabled false
-        }
-    }
-    // Commented due to dependency on local copy of pytorch_android aar to aars folder
-    //externalNativeBuild {
-    //    cmake {
-    //        path "CMakeLists.txt"
-    //    }
-    //}
-    flavorDimensions "model", "build", "activity"
-    productFlavors {
-        mnet {
-            dimension "model"
-            applicationIdSuffix ".mnet"
-            buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet_v2.ptl\"")
-            addManifestPlaceholders([APP_NAME: "MNET"])
-            buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"")
-        }
-        // NB: This is not working atm https://github.com/pytorch/pytorch/issues/102966
-        mnetVulkan {
-            dimension "model"
-            applicationIdSuffix ".mnet_vulkan"
-            buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet_v2_vulkan.ptl\"")
-            buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true')
-            addManifestPlaceholders([APP_NAME: "MNET_VULKAN"])
-            buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"")
-        }
-        resnet18 {
-            dimension "model"
-            applicationIdSuffix ".resnet18"
-            buildConfigField("String", "MODULE_ASSET_NAME", "\"resnet18.ptl\"")
-            addManifestPlaceholders([APP_NAME: "RN18"])
-            buildConfigField("String", "LOGCAT_TAG", "\"pytorch-resnet18\"")
-        }
-        local {
-            dimension "build"
-        }
-        nightly {
-            dimension "build"
-        }
-        aar {
-            dimension "build"
-        }
-        // Commented due to dependency on local copy of pytorch_android aar to aars folder
-        //nativeBuild {
-        //    dimension "build"
-        //    buildConfigField("boolean", "NATIVE_BUILD", "true")
-        //}
-        camera {
-            dimension "activity"
-            addManifestPlaceholders([MAIN_ACTIVITY: "org.pytorch.testapp.CameraActivity"])
-        }
-        base {
-            dimension "activity"
-            sourceSets {
-                main {
-                    java {
-                        exclude 'org/pytorch/testapp/CameraActivity.java'
-                    }
-                }
-            }
-        }
-    }
-    packagingOptions {
-        doNotStrip '**.so'
-    }
-
-    // Filtering for CI
-    if (!testAppAllVariantsEnabled.toBoolean()) {
-        variantFilter { variant ->
-            def names = variant.flavors*.name
-            if (names.contains("nightly")
-                || names.contains("camera")
-                || names.contains("aar")
-                || names.contains("nativeBuild")) {
-                setIgnore(true)
-            }
-        }
-    }
-}
-
-tasks.all { task ->
-    // Disable externalNativeBuild for all but nativeBuild variant
-    if (task.name.startsWith('externalNativeBuild')
-          && !task.name.contains('NativeBuild')) {
-        task.enabled = false
-    }
-}
-
-dependencies {
-    implementation 'com.android.support:appcompat-v7:28.0.0'
-    implementation 'com.facebook.soloader:nativeloader:0.10.5'
-
-    localImplementation project(':pytorch_android')
-    localImplementation project(':pytorch_android_torchvision')
-
-    // Commented due to dependency on local copy of pytorch_android aar to aars folder
-    //nativeBuildImplementation(name: 'pytorch_android-release', ext: 'aar')
-    //nativeBuildImplementation(name: 'pytorch_android_torchvision-release', ext: 'aar')
-    //extractForNativeBuild(name: 'pytorch_android-release', ext: 'aar')
-
-    nightlyImplementation 'org.pytorch:pytorch_android:2.2.0-SNAPSHOT'
-    nightlyImplementation 'org.pytorch:pytorch_android_torchvision:2.2.0-SNAPSHOT'
-
-    aarImplementation(name:'pytorch_android', ext:'aar')
-    aarImplementation(name:'pytorch_android_torchvision', ext:'aar')
-    aarImplementation 'com.facebook.soloader:nativeloader:0.10.5'
-    aarImplementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
-
-    def camerax_version = "1.0.0-alpha05"
-    cameraImplementation "androidx.camera:camera-core:$camerax_version"
-    cameraImplementation "androidx.camera:camera-camera2:$camerax_version"
-    cameraImplementation 'com.google.android.material:material:1.0.0-beta01'
-}
-
-task extractAARForNativeBuild {
-    doLast {
-        configurations.extractForNativeBuild.files.each {
-            def file = it.absoluteFile
-            copy {
-                from zipTree(file)
-                into "$buildDir/$file.name"
-                include "headers/**"
-                include "jni/**"
-            }
-        }
-    }
-}
-
-tasks.whenTaskAdded { task ->
-  if (task.name.contains('externalNativeBuild')) {
-    task.dependsOn(extractAARForNativeBuild)
-  }
-}
--- a/android/test_app/app/src/main/AndroidManifest.xml
+++ b/android/test_app/app/src/main/AndroidManifest.xml
@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    package="org.pytorch.testapp">
-
-    <application
-        android:allowBackup="true"
-        android:label="${APP_NAME}"
-        android:supportsRtl="true"
-        android:theme="@style/AppTheme">
-
-        <activity android:name="${MAIN_ACTIVITY}">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-
-    <uses-permission android:name="android.permission.CAMERA" />
-
-    <!--
-     Permissions required by the Snapdragon Profiler to collect GPU metrics.
-    -->
-    <uses-permission android:name="android.permission.INTERNET" />
-    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
-</manifest>
--- a/android/test_app/app/src/main/assets/.gitignore
+++ b/android/test_app/app/src/main/assets/.gitignore
@ -1,3 +0,0 @@
-*
-*/
-!.gitignore
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .3.0
 .3.1