Compare commits

..

601 Commits

Author SHA1 Message Date
9220409522 Remove unused test code
ghstack-source-id: 8d6fad8d8f59a12a1711649cdd4558f23025a45c
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160823
2025-08-16 11:23:52 -07:00
2603e40be5 [inductor] TLParse tensor metadata logging + test (#160132)
Summary:
- Add TLParse artifact logging per op with output tensor shape, stride, and dtype for cross-rank aggregation.

Testing:
- Add test to verify structure and contents of tlparse artifact

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160132
Approved by: https://github.com/xmfan
ghstack dependencies: #160260
2025-08-16 16:37:18 +00:00
8fe4b3f848 [BE][CI] move MYPYSTRICT linter from lintrunner-noclang to lintrunner-mypy (#160806)
Like `MYPY`, linter `MYPYSTRICT` will need `--all-files` too.

See also:

- https://github.com/pytorch/pytorch/pull/160652#issuecomment-3193390813

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160806
Approved by: https://github.com/seemethere
2025-08-16 16:15:22 +00:00
cff6def7f4 [MTIA] add correct name for CFF in tlparse (#160599)
Differential Revision: D80201622

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160599
Approved by: https://github.com/bdhirsh
2025-08-16 14:58:03 +00:00
e444cd24d4 Remove guard_size_oblivious from default contiguity python check, and add aten.sym_is_contiguous. (#159197)
This might cause some new DDEs at call sites that do not use is_contiguous_or_false() or sym_is_contiguous(),
but we want to find those call sites and handle them properly by explicitly calling is_contiguous_or_false() instead of is_contiguous() where appropriate.
I had to fix one issue after removing the implicit size-oblivious reasoning; here is the context.

In https://github.com/pytorch/pytorch/pull/157472 we defined sym_is_contiguous as the function that computes contiguity for dynamic shapes in C++. It returns a symbolic expression representing contiguity and is guaranteed not to throw a DDE.

When callers use is_contiguous(), we do sym_is_contiguous().guard_bool().
When callers use is_contiguous_or_false(), we do sym_is_contiguous().guard_or_false().

One path that was not handled well was this one:
```
c10::SymBool TensorImpl::sym_is_contiguous_custom(
    at::MemoryFormat memory_format) const {
  if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) {
    return pyobj_slot_.load_pyobj_interpreter()->is_contiguous(
        this, memory_format);
  }

  return sym_is_contiguous_default(memory_format);
}
```
Namely, if we call sym_is_contiguous_custom and matches_python_custom(SizesStridesPolicy::CustomStrides) returns true, we used to call is_contiguous(this, memory_format).

This went through load_pyobj_interpreter and ended up calling the Python is_contiguous, which used implicit size-oblivious reasoning.
Once we removed that implicit size-oblivious reasoning, the right thing is to call
return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous(this, memory_format);
otherwise we would get a DDE even when the caller is using sym_is_contiguous.

So I had to define it for the pyinterpreter, and then override it for nested tensors.
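A minimal Python-side sketch of the two guarding behaviors described above; the dispatch helper is hypothetical, while `guard_or_false` is the real utility in torch.fx.experimental.symbolic_shapes.

```python
from torch.fx.experimental.symbolic_shapes import guard_or_false

def query_contiguity(sym_is_contig, raise_on_data_dependent: bool):
    # is_contiguous(): force a concrete answer, analogous to
    # sym_is_contiguous().guard_bool(); may raise a data-dependent error
    # for unbacked symbolic shapes.
    if raise_on_data_dependent:
        return bool(sym_is_contig)
    # is_contiguous_or_false(): analogous to
    # sym_is_contiguous().guard_or_false(); never raises and falls back
    # to False when the expression cannot be decided.
    return guard_or_false(sym_is_contig)
```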

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159197
Approved by: https://github.com/ezyang
2025-08-16 09:15:58 +00:00
a84541c73f Update transformers version automatically with Dependabot (#160635)
My proposal here is to use GitHub Dependabot to make sure that the `transformers` version used in CI is always up to date. To achieve this, this PR does two things:

1. Pin the `transformers` version for all CI jobs in a single place, `.ci/docker/ci_commit_pins/huggingface.txt`. This file is now a regular pip requirements file instead of a pinned-commit text file. There isn't any need to pin `transformers` to a specific commit, and the file already refers to the stable version `v4.54.0`.
2. Create `.github/dependabot.yml` to configure the bot to update `transformers` automatically when there is a new version. The labels set there will ensure that the right reviewers from torch.compile and Dev Infra are notified. I'm not sure how to test this out in a PR, but it feels OK to land and test this in main. If this works, we should see a PR updating `v4.54.0` to the current latest `v4.55.0`.

### Reference
https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160635
Approved by: https://github.com/ZainRizvi
2025-08-16 05:53:39 +00:00
114813ca77 Fix mypy errors: PyTreeSpec inheritance (#160652)
Fixes #160650.

I added a type-ignore comment to the `LeafSpec` class inheritance in `torch/utils/_cxx_pytree.py` to handle `PyTreeSpec` being marked as final in optree's type stubs.
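A minimal sketch of the pattern (the stand-in class and the exact mypy error code are illustrative assumptions, not the PR's literal diff):

```python
from typing import final

@final
class PyTreeSpec:  # stands in for the class that optree's stubs mark as final
    ...

# mypy flags "Cannot inherit from final class"; the comment silences it while
# keeping the runtime subclassing (which Python itself does not forbid).
class LeafSpec(PyTreeSpec):  # type: ignore[misc]
    pass
```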

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160652
Approved by: https://github.com/Skylion007
2025-08-16 05:14:11 +00:00
11b6ceb7b4 [ONNX] Default to dynamo export (#159646)
Set dynamo=True and enable fallback.

1. Implemented compatible behavior so that BytesIO objects are accepted as `f` (see the sketch below)
2. Updated tests to explicitly set dynamo=False

#151693
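A hedged sketch of the behavior described above; export defaults and the fallback path may differ by version.

```python
import io

import torch


class M(torch.nn.Module):
    def forward(self, x):
        return x.relu()


buf = io.BytesIO()                                  # BytesIO accepted as `f`
torch.onnx.export(M(), (torch.randn(2, 3),), buf, dynamo=True)
assert buf.getbuffer().nbytes > 0                   # serialized ONNX bytes
```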

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159646
Approved by: https://github.com/titaiwangms
2025-08-16 04:48:58 +00:00
fb7e60ba7a [Dynamo][Hierarchical Compile] Flatten tuple outputs in graph dedupe pass (#158811)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158811
Approved by: https://github.com/anijain2305
ghstack dependencies: #158810
2025-08-16 04:45:31 +00:00
f89186e910 [audio hash update] update the pinned audio hash (#160797)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160797
Approved by: https://github.com/pytorchbot
2025-08-16 04:26:59 +00:00
10eb83734f [vllm hash update] update the pinned vllm hash (#160699)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160699
Approved by: https://github.com/pytorchbot
2025-08-16 04:26:55 +00:00
75ea93484c [vllm test] add vllm.yml and additional package (#160698)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160698
Approved by: https://github.com/huydhn
ghstack dependencies: #160116
2025-08-16 04:24:20 +00:00
45c2c7a5fc Fix the wrong dataclasses_json monitoring dep MacOS test (#160796)
Typo fix: this should be `dataclasses_json` https://github.com/pytorch/pytorch/actions/runs/17000197828/job/48200676725#step:10:23
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160796
Approved by: https://github.com/yangw-dev
2025-08-16 04:00:31 +00:00
b74c7cd335 Add kernel stack traces tlparse dump (#160608) (#160779)
Summary:

as title

This is requested by the zoomer team so they can add stack trace information to profiler result.

Test Plan:
```
buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r  stack_traces
```

Rollback Plan:

Differential Revision: D80050233

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160779
Approved by: https://github.com/angelayi
2025-08-16 03:12:38 +00:00
b7ca502f29 [ROCm][Windows] Add hipcc compatibility flags to cpp_extension.py. (#159790)
This is a similar change to https://github.com/pytorch/pytorch/pull/153986, this time adding flags to the hipcc command under `cpp_extension.py`.

The `-Wno-ignored-attributes` flag in particular avoids about 200MB of warning spam when building torchvision, like these:
```
In file included from D:\b\vision_main\torchvision\csrc\ops\hip\deform_conv2d_kernel.hip:72:
In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ATen.h:13:
In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/Functions.h:386:
In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ops/_sparse_softmax.h:21:
D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ops/_sparse_softmax_ops.h:18:8: warning: __declspec attribute 'dllimport' is not supported [-Wignored-attributes]
   18 | struct TORCH_API _sparse_softmax_int {
      |        ^~~~~~~~~
D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\torch/headeronly/macros/Export.h:100:19: note: expanded from macro 'TORCH_API'
  100 | #define TORCH_API C10_IMPORT
      |                   ^~~~~~~~~~
D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\torch/headeronly/macros/Export.h:53:31: note: expanded from macro 'C10_IMPORT'
   53 | #define C10_IMPORT __declspec(dllimport)
      |                               ^~~~~~~~~
```

The `-fms-extensions` flag just seems beneficial to include: https://clang.llvm.org/docs/MSVCCompatibility.html.

See also this downstream issue where these changes were tested: https://github.com/ROCm/TheRock/issues/910.
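Illustrative only: the PR adds these flags inside `cpp_extension` itself for hipcc builds; this sketch shows how the same flags could be passed manually when building an extension (the extension name and source path are placeholders).

```python
from torch.utils.cpp_extension import load

ext = load(
    name="my_hip_ext",                  # hypothetical extension name
    sources=["my_hip_kernel.hip"],      # hypothetical HIP source file
    extra_cuda_cflags=[                 # forwarded to hipcc on ROCm builds
        "-Wno-ignored-attributes",      # silences the dllimport warning spam
        "-fms-extensions",              # MSVC-compatibility extensions
    ],
)
```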

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159790
Approved by: https://github.com/jeffdaily
2025-08-16 02:20:49 +00:00
7bd4cfaef4 [BE] Update nvshmem dependency to 3.3.20 (#160458)
Which is manylinux2_28 compatible, even on aarch64 platform

The archive contents and URL pattern changed quite drastically between 3.3.9 and 3.3.20, but hopefully it still works.
Package `libnvshmem_host.so.3` into the gigantic aarch64+CUDA wheel.
Should fix https://github.com/pytorch/pytorch/issues/160425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160458
Approved by: https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/atalman, https://github.com/tinglvv
2025-08-16 02:00:57 +00:00
c015e53d37 Revert "[BE] Update nvshmem dependency to 3.3.20 (#160458)"
This reverts commit e0488d9f00865fb56c931580c80e099771c6285e.

Reverted https://github.com/pytorch/pytorch/pull/160458 on behalf of https://github.com/wdvr due to need to rerun workflow generation (failing workflow-checks) ([comment](https://github.com/pytorch/pytorch/pull/160458#issuecomment-3193133706))
2025-08-16 01:47:42 +00:00
65dc4df74d unify broadcast_shapes functions and avoid duplicates (#160251)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160251
Approved by: https://github.com/jingsh, https://github.com/ColinPeppler
ghstack dependencies: #160250
2025-08-16 00:54:32 +00:00
c03809e8a5 guard_or_false cat ops (#160250)
keep existing unbacked semantics unchanged, just use guard_or_false instead of guard_size_obl

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160250
Approved by: https://github.com/ColinPeppler, https://github.com/jingsh
2025-08-16 00:54:31 +00:00
e0488d9f00 [BE] Update nvshmem dependency to 3.3.20 (#160458)
Which is manylinux2_28 compatible, even on aarch64 platform

The archive contents and URL pattern changed quite drastically between 3.3.9 and 3.3.20, but hopefully it still works.
Package `libnvshmem_host.so.3` into the gigantic aarch64+CUDA wheel.
Should fix https://github.com/pytorch/pytorch/issues/160425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160458
Approved by: https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/atalman, https://github.com/tinglvv
2025-08-16 00:50:13 +00:00
f782c790df migrate more simple gso checks (#160253)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160253
Approved by: https://github.com/bobrenjc93
2025-08-16 00:15:24 +00:00
16ce2c15fa Add python 3.14 support to linux aarch64 builds (#160788)
Related to https://github.com/pytorch/pytorch/issues/156856
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160788
Approved by: https://github.com/malfet
2025-08-16 00:03:21 +00:00
0d28d12b11 Fix typo packing libnvshmem into libtorch (#160778)
Fix typo after https://github.com/pytorch/pytorch/pull/160465
Fixes: https://github.com/pytorch/pytorch/issues/160762

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160778
Approved by: https://github.com/Camyll, https://github.com/malfet, https://github.com/ZainRizvi, https://github.com/Skylion007
2025-08-15 23:43:02 +00:00
838f22c57d Do not incorrectly chain each of the strings as iterables (#160709)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160709
Approved by: https://github.com/Skylion007, https://github.com/fduwjj
2025-08-15 23:22:24 +00:00
eqy
387fe847ab [cuDNN][SDPA] Introduce TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 (#155958)
Opt-in for now, but basically uses the variable-sequence length/ragged path for the common case of BSHD layout to avoid recompiling for different sequence lengths.

Built on top of #149282

Tested using a primitive fuzzer, seems at least as stable as default path (with recompilation) on B200 (50000+ cases tested without any failures)
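A sketch of opting in, assuming the environment variable is read before the cuDNN SDPA path is first used; shapes and dtype are illustrative.

```python
import os

os.environ["TORCH_CUDNN_SDPA_AVOID_RECOMPILE"] = "1"   # opt-in flag from this PR

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# (batch, heads, seq_len, head_dim); varying seq_len should now avoid recompiles
q, k, v = (torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16) for _ in range(3))
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)
```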

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155958
Approved by: https://github.com/drisspg
2025-08-15 21:59:18 +00:00
40311e2ec1 [AOTInductor] ABI-Compatibility for RecordFunction. (#159842)
Summary:
Previously, our implementation for RecordFunction injected ATen into
the codegen, which breaks the ABI contract for AOTInductor.

c10::IValue is added to call the full record function. Support for
more profiling info will come in later PRs.

Test Plan:
Included in commit.

Differential Revision: [D79622071](https://our.internmc.facebook.com/intern/diff/D79622071)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159842
Approved by: https://github.com/desertfire
2025-08-15 21:45:47 +00:00
8ca8b6053c [inductor][while_loop][be] improve the readability of output handling (#160374)
The logic doesn't change, but this makes it easier to read and modify.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160374
Approved by: https://github.com/zou3519
ghstack dependencies: #160548
2025-08-15 20:13:12 +00:00
ff86509a06 [map] filter none gradients and add autograd inductor tests (#160548)
Will filter the None outputs in autograd backward for other HOPs as follow-ups.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160548
Approved by: https://github.com/zou3519
2025-08-15 20:13:12 +00:00
fa75ba9303 Change IR node's stack traces to return a set of stack traces only (#160701)
Summary: There can be excessive stack trace output in TORCH_LOGS="+inductor" when a single line of code corresponds to many post-grad nodes, e.g. `self.multihead_attn(x, x, x)`. In that case, we would see the same stack trace many times in the IR node, spamming the output log. So we change it to return a set of stack traces.

Test Plan:
CI

Rollback Plan:

Differential Revision: D80310549

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160701
Approved by: https://github.com/angelayi
2025-08-15 19:31:59 +00:00
b78968b4d1 Support next(iterator, default) (#159483)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159483
Approved by: https://github.com/mlazos
ghstack dependencies: #159365, #159366, #159368
2025-08-15 19:08:21 +00:00
e5621b4d8b Fixes for collections.Counter (#159368)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159368
Approved by: https://github.com/mlazos
ghstack dependencies: #159365, #159366
2025-08-15 19:08:21 +00:00
2542e71f3f Change mutation type of MutableMappingVariable to AttributeMutationNew (#159366)
Also add MutableMappingVariable to `call_or_` / `call_ior`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159366
Approved by: https://github.com/zou3519
ghstack dependencies: #159365
2025-08-15 19:08:21 +00:00
0242d40fa5 Enable trace through the collections module (#159365)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159365
Approved by: https://github.com/zou3519
2025-08-15 19:08:21 +00:00
17de899709 Add py3.14 to macos arm64 (#160593)
Related to https://github.com/pytorch/pytorch/issues/156856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160593
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-08-15 18:52:10 +00:00
25d0d8b0a3 [inductor] Fix propagating torch.utils._sympy.functions.Identity in IndexPropagation (#155504)
Fixes https://github.com/pytorch/pytorch/issues/160535

An index may contain `torch.utils._sympy.functions.Identity`. When we call `SymPyOps.index_expr`, if the value is a sympy.Expr with Identity, `TypedExpr(value, dtype)` will fail. So when we unwrap arguments, we expand the sympy expression to unwrap Identity.

Test Plan:
buck run @mode/dev-nosan //caffe2/test/inductor:test_aot_inductor -- -r test_sym_expr_indexing

Rollback Plan:

Differential Revision: D76308640

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155504
Approved by: https://github.com/eellison
2025-08-15 18:38:23 +00:00
c6d697ff52 port 2 distributed pipeline test files for Intel GPU (#159140)
This is another PR to port distributed pipeline tests for Intel GPU; the other PR is https://github.com/pytorch/pytorch/pull/159033.
In this PR, we port two test files for Intel GPU.
We enable Intel GPU with the following methods and try our best to keep the original code style (a minimal sketch follows the list):

1. instantiate_device_type_tests()
2. skip the case on xpu due to an accuracy gap introduced by oneDNN non-determinism
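A minimal sketch of method 1, using the standard device-generic test helpers; the `allow_xpu` argument and the skip condition/reason are assumptions for illustration.

```python
import unittest

import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, run_tests


class TestPipelineOps(TestCase):
    def test_roundtrip(self, device):
        x = torch.randn(4, device=device)
        self.assertEqual(x, x.clone())

    @unittest.skipIf(torch.xpu.is_available(), "accuracy gap from oneDNN non-determinism")
    def test_strict_accuracy(self, device):
        ...


# generates per-device variants (CPU/CUDA/XPU/...) of each test
instantiate_device_type_tests(TestPipelineOps, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```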

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159140
Approved by: https://github.com/guangyey, https://github.com/d4l3k, https://github.com/H-Huang
2025-08-15 18:29:50 +00:00
30d2f98daa Revert "[cutlass backend] re-add pip cutlass path (#160180)"
This reverts commit d556586448f3caab85673c7da0978fe31c7748f7.

Reverted https://github.com/pytorch/pytorch/pull/160180 on behalf of https://github.com/atalman due to broke macos nightly ([comment](https://github.com/pytorch/pytorch/pull/160180#issuecomment-3192311552))
2025-08-15 18:00:41 +00:00
8780d28c65 raise exception in case of errors in memory reordering (#160455)
This PR introduces two checks in the memory reordering pass to catch graph issues before performing the reordering. For situations not covered by these checks, the reordering pass might fail, and an exception will be thrown in that case.

This addresses issue -- https://github.com/pytorch/pytorch/issues/159568

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160455
Approved by: https://github.com/eellison
2025-08-15 17:31:55 +00:00
da8f48d88f [associative_scan] support gen_schema for associative_scan (#158883)
In-place mutation may create an inter-loop dependency that breaks the parallelism we rely on for associative_scan, so we ban input mutations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158883
Approved by: https://github.com/zou3519
ghstack dependencies: #154193, #158965, #158863, #158864
2025-08-15 17:28:44 +00:00
cb9e2092a8 [scan] support gen_schema for scan (#158864)
We don't want to allow scan's combine_fn to mutate its inputs; the semantics of such mutation can be confusing. For example:
```python
def combine_fn(init, x):
```
If combine_fn mutates init, only the first iteration mutates init; the rest of the iterations mutate the previous carry, which is an intermediate result. This is a weird semantic, because the only observable mutation is of init, and that could be done outside of combine_fn.

If combine_fn mutates x, where x is a slice of the scanned inputs (i.e. xs), the pattern is more meaningful, but we haven't seen any use case yet. The plain-Python illustration below spells out the first case.
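A plain-Python illustration (not the scan HOP API) of why mutating the first argument inside combine_fn is only observable on `init`:

```python
def scan_like(combine_fn, init, xs):
    carry = init
    for x in xs:
        # carry is `init` only on the first iteration; afterwards it is an
        # intermediate result, so mutating it would not be observable outside.
        carry = combine_fn(carry, x)
    return carry


def combine_fn(carry, x):
    return carry + x            # pure version: no in-place mutation


print(scan_like(combine_fn, 0, [1, 2, 3]))  # 6
```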
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158864
Approved by: https://github.com/zou3519
ghstack dependencies: #154193, #158965, #158863
2025-08-15 17:28:44 +00:00
f6bf1573fc [while_loop] support gen_schema for while_loop (#158863)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158863
Approved by: https://github.com/zou3519
ghstack dependencies: #154193, #158965
2025-08-15 17:28:34 +00:00
82a18423be [BE] create an empty shape_env for check_input_alias_and_mutation_return_outputs (#158965)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158965
Approved by: https://github.com/zou3519
ghstack dependencies: #154193
2025-08-15 17:28:20 +00:00
3fe3c23d4e [cond] support gen_schema for cond (#154193)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154193
Approved by: https://github.com/zou3519
2025-08-15 17:28:13 +00:00
052c441cf4 Add logging for when inbuilt_inline_nn_modules will help with ID_MATCH guard triggered recompiles (#160592)
We add logging for when an ID_MATCH guard is added at a place where inbuilt_inline_nn_modules would inline it. This is done with the aim of tagging recompiles that could be avoided by setting the inbuilt_inline_nn_modules flag.
It will help us log and track the flag's adoption and potentially quantify the savings in the number of recompiles.
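A sketch of opting in to the flag discussed above; the config name `inline_inbuilt_nn_modules` (under torch._dynamo.config) is assumed here.

```python
import torch
import torch._dynamo as dynamo

dynamo.config.inline_inbuilt_nn_modules = True   # assumed config name

@torch.compile
def f(mod, x):
    return mod(x)

mod_a, mod_b = torch.nn.Linear(4, 4), torch.nn.Linear(4, 4)
x = torch.randn(2, 4)
f(mod_a, x)
# With inlining enabled, passing a different module instance of the same
# structure is intended to avoid an ID_MATCH-guard recompile.
f(mod_b, x)
```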

Differential Revision: D80075975

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160592
Approved by: https://github.com/anijain2305
2025-08-15 17:09:39 +00:00
b26d2a9464 [ez] Make NUMA signpost parameters JSON serializable (#160710)
# Context
Broader context in #160163.

In order for the _utils_internal version of signpost_event to do proper logging, its parameters argument needs to be JSON serializable.

# This PR
Convert `NumaOptions` to a serializable form before passing it to `signpost_event`.
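A generic sketch of the kind of conversion described; `NumaOptions` itself is internal, so this stand-in dataclass and its fields are hypothetical.

```python
import json
from dataclasses import asdict, dataclass
from enum import Enum


class AffinityMode(Enum):
    NODE = "node"


@dataclass
class NumaOptions:                       # hypothetical stand-in
    affinity_mode: AffinityMode
    explicit_cpus: tuple = ()


opts = NumaOptions(AffinityMode.NODE, (0, 1))
# asdict keeps the Enum instance, which json.dumps rejects, so flatten it
payload = {**asdict(opts), "affinity_mode": opts.affinity_mode.value}
print(json.dumps(payload))               # now safe as signpost_event parameters
```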

# Test Plan
## Automated
Added tests `$ pytest test/test_numa_binding.py`.

## Manual
See [D80317206](https://www.internalfb.com/diff/D80317206).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160710
Approved by: https://github.com/kiukchung
2025-08-15 16:52:43 +00:00
6382302990 [MPS] Add grid_sampler_3d for MPS (#160541)
This PR adds support for `grid_sampler_3d` for MPS with "bilinear" interpolation.

NOTE: "nearest" interpolation is not yet supported

Fixes #159882
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160541
Approved by: https://github.com/malfet
2025-08-15 16:19:25 +00:00
80dd05e31e Disable flaky cpp test RecordDebugHandles.Basic (#160577)
Test is flaky and sometimes hangs in CI

Here's an example of the failure:
https://github.com/pytorch/pytorch/actions/runs/16946153494/job/48027937663
```

2025-08-13T20:54:00.1223688Z ==================================== RERUNS ====================================
2025-08-13T20:54:00.1224156Z ___________________________ RecordDebugHandles.Basic ___________________________
2025-08-13T20:54:00.1224682Z [gw2] linux -- Python 3.13.5 /opt/conda/envs/py_3.13/bin/python3.13
2025-08-13T20:54:00.1225568Z Internal Error: calling /opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit for test RecordDebugHandles.Basic failed (returncode=-6):
2025-08-13T20:54:00.1226430Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1226988Z Note: Google Test filter = RecordDebugHandles.Basic-*_CUDA:*_MultiCUDA
2025-08-13T20:54:00.1227450Z [==========] Running 1 test from 1 test suite.
2025-08-13T20:54:00.1227792Z [----------] Global test environment set-up.
2025-08-13T20:54:00.1228145Z [----------] 1 test from RecordDebugHandles
2025-08-13T20:54:00.1228492Z [ RUN      ] RecordDebugHandles.Basic
2025-08-13T20:54:00.1228822Z [       OK ] RecordDebugHandles.Basic (1 ms)
2025-08-13T20:54:00.1229204Z [----------] 1 test from RecordDebugHandles (1 ms total)
2025-08-13T20:54:00.1229501Z
2025-08-13T20:54:00.1229666Z [----------] Global test environment tear-down
2025-08-13T20:54:00.1230033Z [==========] 1 test from 1 test suite ran. (1 ms total)
2025-08-13T20:54:00.1230355Z [  PASSED  ] 1 test.
2025-08-13T20:54:00.1230727Z terminate called after throwing an instance of 'std::system_error'
2025-08-13T20:54:00.1231154Z   what():  Invalid argument
2025-08-13T20:54:00.1231416Z unknown file:0: C++ failure
2025-08-13T20:54:00.1231788Z ------------------------------ Captured c++ call -------------------------------
2025-08-13T20:54:00.1232262Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1232745Z Note: Google Test filter = RecordDebugHandles.Basic-*_CUDA:*_MultiCUDA
2025-08-13T20:54:00.1233199Z [==========] Running 1 test from 1 test suite.
2025-08-13T20:54:00.1233557Z [----------] Global test environment set-up.
2025-08-13T20:54:00.1233915Z [----------] 1 test from RecordDebugHandles
2025-08-13T20:54:00.1234247Z [ RUN      ] RecordDebugHandles.Basic
2025-08-13T20:54:00.1234590Z [       OK ] RecordDebugHandles.Basic (1 ms)
2025-08-13T20:54:00.1235020Z [----------] 1 test from RecordDebugHandles (1 ms total)
2025-08-13T20:54:00.1235304Z
2025-08-13T20:54:00.1235431Z [----------] Global test environment tear-down
2025-08-13T20:54:00.1235793Z [==========] 1 test from 1 test suite ran. (1 ms total)
2025-08-13T20:54:00.1236126Z [  PASSED  ] 1 test.
2025-08-13T20:54:00.1236481Z terminate called after throwing an instance of 'std::system_error'
2025-08-13T20:54:00.1236906Z   what():  Invalid argument
2025-08-13T20:54:00.1237287Z ___________________________ RecordDebugHandles.Basic ___________________________
2025-08-13T20:54:00.1237800Z [gw2] linux -- Python 3.13.5 /opt/conda/envs/py_3.13/bin/python3.13
2025-08-13T20:54:00.1238686Z Internal Error: calling /opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit for test RecordDebugHandles.Basic failed (returncode=-6):
2025-08-13T20:54:00.1239551Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1240048Z Note: Google Test filter = RecordDebugHandles.Basic-*_CUDA:*_MultiCUDA
2025-08-13T20:54:00.1240495Z [==========] Running 1 test from 1 test suite.
2025-08-13T20:54:00.1240848Z [----------] Global test environment set-up.
2025-08-13T20:54:00.1241199Z [----------] 1 test from RecordDebugHandles
2025-08-13T20:54:00.1241542Z [ RUN      ] RecordDebugHandles.Basic
2025-08-13T20:54:00.1241871Z [       OK ] RecordDebugHandles.Basic (1 ms)
2025-08-13T20:54:00.1242249Z [----------] 1 test from RecordDebugHandles (1 ms total)
2025-08-13T20:54:00.1242503Z
2025-08-13T20:54:00.1242641Z [----------] Global test environment tear-down
2025-08-13T20:54:00.1242993Z [==========] 1 test from 1 test suite ran. (19 ms total)
2025-08-13T20:54:00.1243329Z [  PASSED  ] 1 test.
2025-08-13T20:54:00.1243697Z terminate called after throwing an instance of 'std::system_error'
2025-08-13T20:54:00.1244113Z   what():  Invalid argument
2025-08-13T20:54:00.1244392Z unknown file:0: C++ failure
2025-08-13T20:54:00.1244759Z ------------------------------ Captured c++ call -------------------------------
2025-08-13T20:54:00.1245235Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1283768Z ============== 1 failed, 568 passed, 2 rerun in 115.57s (0:01:55) ==============
```

Here's an example of the hang:
https://github.com/pytorch/pytorch/actions/runs/16942186826/job/48015238944
Logs aren't super helpful other than stating that it took a long time.  Usually this file takes <2min to run
```
2025-08-13T18:43:24.6586481Z [gw0] [ 97%] PASSED [1.4119s] ../../../../../opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit::PyTorch/LiteInterpreterDynamicTypeTestFixture::Conformance/8
2025-08-13T18:43:24.6587278Z [gw1] [ 97%] PASSED [1.4866s] ../../../../../opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit::PyTorch/LiteInterpreterDynamicTypeTestFixture::Conformance/9 Command took >30min, returning 124
2025-08-13T18:43:24.6587288Z
2025-08-13T18:43:24.6587632Z FINISHED PRINTING LOG FILE of cpp/test_jit 1/1 (test/test-reports/cpp.test_jit_1.1_c259e5a152845991_.log)
2025-08-13T18:43:24.6587639Z
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160577
Approved by: https://github.com/huydhn
2025-08-15 15:59:21 +00:00
9df07ecfbe Revert "[inductor] dont reuse buffers if it affects peak (#145883) (#159530)"
This reverts commit 3be70dc30e893b552fc0f23ca06cd8f7949b6d08.

Reverted https://github.com/pytorch/pytorch/pull/159530 on behalf of https://github.com/clee2000 due to newly added test fail internally D80316528, probably just a targets change, but also imo the tests should probably go into a testcase class from common or inductor utils.  While I'm pretty sure CI can run the globally defined ones, theres some CI related functionality that on the testcase class that CI benefits from ([comment](https://github.com/pytorch/pytorch/pull/159530#issuecomment-3191947506))
2025-08-15 15:49:04 +00:00
846963fa9b Revert "[Inductor] addmm + activation function fusion (#158137)"
This reverts commit b9d7de3a094598c3dc0dd52e57bce30eb684c9d8.

Reverted https://github.com/pytorch/pytorch/pull/158137 on behalf of https://github.com/malfet due to Broke inductor torchbench, see 663da17b62/1 ([comment](https://github.com/pytorch/pytorch/pull/158137#issuecomment-3191841298))
2025-08-15 15:34:09 +00:00
663da17b62 Update torch-xpu-ops commit pin (#160062)
Update the torch-xpu-ops commit to [77cc792cd265179745d335579d233e6d4f9a2667](77cc792cd2), which includes:

- Ensures that the XPU cache is cleared before creating tensors during the test
- Add unused variable warning
- Fix test_linalg and test_torch issue with bf32_on_and_off updates
- Fix deterministic indexing with broadcast
- Fix dist.gather with noncontiguous tensor
- Improve accuracy of index put deterministic kernel
- Add generate file rely avoid build before generate
- optimize embedding bag

Fixes #160661

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160062
Approved by: https://github.com/EikanWang
2025-08-15 15:27:24 +00:00
e299926f72 [ONNX] Fix doc typo for symbolic_multi_out (#160702)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160702
Approved by: https://github.com/justinchuby
2025-08-15 14:34:42 +00:00
bbd11c4f23 Uninstall torchao on MPS benchmark (#160724)
Fixes https://github.com/pytorch/pytorch/issues/160689

The current torchao 0.12.0 doesn't work with transformers 4.54.0 and ends up with this error:

```
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/transformers/models/albert/modeling_albert.py", line 37, in <module>
    from ...modeling_utils import PreTrainedModel
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/transformers/modeling_utils.py", line 51, in <module>
    from torchao.quantization import Int4WeightOnlyConfig
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/__init__.py", line 41, in <module>
    from torchao.quantization import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/quantization/__init__.py", line 6, in <module>
    from .autoquant import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/quantization/autoquant.py", line 11, in <module>
    from torchao.dtypes import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/__init__.py", line 1, in <module>
    from . import affine_quantized_tensor_ops
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/affine_quantized_tensor_ops.py", line 38, in <module>
    from torchao.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/uintx/__init__.py", line 7, in <module>
    from .dyn_int8_act_int4_wei_cpu_layout import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py", line 320, in <module>
    from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/fx_passes/__init__.py", line 2, in <module>
    from .int8_sdpa_fusion import _int8_sdpa_init
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/fx_passes/int8_sdpa_fusion.py", line 22, in <module>
    from ..int8_sdpa_lowering import register_int8_sdpa  # noqa: F401
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/int8_sdpa_lowering.py", line 6, in <module>
    from torch._inductor.kernel.flex_attention import construct_strides, maybe_realize
ModuleNotFoundError: No module named 'torch._inductor.kernel.flex_attention'
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160724
Approved by: https://github.com/malfet
2025-08-15 13:55:39 +00:00
eaa5d9d3d3 Introduce OpInfo test for testing export on fake device (#160694)
Summary: Prepare for the upcoming diffs for exporting on fake cuda device.

Test Plan:
test

Rollback Plan:

Differential Revision: D80304225

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160694
Approved by: https://github.com/dolpm
2025-08-15 07:26:28 +00:00
a7c75ae976 [dde] use sym_or when checking normalized shape in layer_norm (#160683)
Use `sym_eq` to check equality on a tuple of ints/SymInts.

### DDE
```
torch._dynamo.exc.UserError: Could not guard on data-dependent expression Eq(u0, u1) (unhinted: Eq(u0, u1)).  (Size-like symbols: u1, u0)

Caused by: return torch.nn.functional.layer_norm(  # test/inductor/test_unbacked_symints.py:527 in fn (_refs/__init__.py:3292 in native_layer_norm)
```
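A conceptual sketch of the fix, assuming `sym_eq` from torch.fx.experimental.symbolic_shapes accepts tuples of ints/SymInts and returns a single (Sym)Bool instead of guarding element by element.

```python
import torch
from torch.fx.experimental.symbolic_shapes import sym_eq


def check_normalized_shape(inp: torch.Tensor, normalized_shape: tuple) -> None:
    trailing = tuple(inp.shape[-len(normalized_shape):])
    # torch._check records the condition for unbacked symbols rather than
    # guarding on it, so no data-dependent error is raised here.
    torch._check(
        sym_eq(trailing, tuple(normalized_shape)),
        lambda: f"expected trailing dims {tuple(normalized_shape)}, got {trailing}",
    )
```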

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160683
Approved by: https://github.com/bobrenjc93
2025-08-15 06:56:00 +00:00
f7ad69f59c [dynamic shapes] handle Max(*,1) for inductor layout contiguity (#160578)
Differential Revision: D80214882

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160578
Approved by: https://github.com/ZixinYang, https://github.com/bobrenjc93
2025-08-15 06:10:18 +00:00
4cae9cf2df Update triton xpu commit to support python 3.14 (#160183)
Follow PR #159725
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160183
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-08-15 05:41:17 +00:00
7710800865 [3/3][ghstack][vllm ci build setup]vllm build workflow (#160116)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160116
Approved by: https://github.com/huydhn
2025-08-15 05:35:46 +00:00
aa99e0958f Separate provenance tracking to different levels (#160383)
Summary: as title. We've received requests from various parties interested in turning on provenance tracking by default. In this PR, we prepare to turn on, by default, the parts of provenance tracking that don't have too much overhead.

- Change `provenance_tracking` config to `provenance_tracking_level`
- turn on the following provenance tracking by default when `basic_provenance_tracking`=True
    - `set_kernel_post_grad_provenance_tracing` for kernels; this adds a mapping between triton kernels and post_grad nodes
    - `dump_inductor_provenance_info` if we're dumping the tlparse log
    - `get_graph_provenance_json` and dump `create_mapping_pre_post_grad_nodes`. This creates a mapping between pre_grad and post_grad nodes. Since we're not turning on provenance tracking in GraphTransformObserver by default, the mapping here may be incomplete/limited.
    - add stack trace from post grad nodes to inductor IR nodes
    - add exception swallowing for all functions above

Test Plan:
CI

Rollback Plan:

Differential Revision: D80031559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160383
Approved by: https://github.com/angelayi
2025-08-15 04:59:35 +00:00
3fc7a95176 [audio hash update] update the pinned audio hash (#160485)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160485
Approved by: https://github.com/pytorchbot
2025-08-15 04:27:49 +00:00
858fb80b9b [PT2]: Add Static Dispatch Kernel for wrapped_fbgemm_linear_fp16_weight (#160451)
Summary: Add static dispatch kernel for wrapped_fbgemm_linear_fp16_weight. This optimization should improve perf for all Ads DSNN models using Sigmoid.

Test Plan:
```
MODEL_TYPE=dpa_product_first_ctr_model
MODEL_ENTITY_ID=892669089
SNAPSHOT_ID=37
OTHER_MODEL_ENTITY_ID=892669089
OTHER_SNAPSHOT_ID=36

MODULES=(mix prepare_float_features object user)
SUFFIXES=(.predictor.local .predictor.precompute.prepare_float_features .predictor.precompute.remote_object_only .predictor.precompute.remote_request_only)

for i in "${!MODULES[@]}"; do
MODULE=${MODULES[i]}
SUFFIX=${SUFFIXES[i]}
buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkAB --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --otherNetFile=/data/users/$USER/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice "" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true
```

Before: P1900475429
I0810 19:29:22.782902 2717337 load_net_predictor_lib.cpp:1807] Average latency A: 0.0843 ms
I0810 19:29:22.782905 2717337 load_net_predictor_lib.cpp:1807] Average latency B: 0.0989 ms

After: P1900825771
I0811 15:42:34.866408 2311279 load_net_predictor_lib.cpp:1807] Average latency A: 0.0854 ms
I0811 15:42:34.866411 2311279 load_net_predictor_lib.cpp:1807] Average latency B: 0.092 ms

Still has some regression but the gap is smaller...

Rollback Plan:

Reviewed By: henryoier, muchulee8

Differential Revision: D80042054

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160451
Approved by: https://github.com/henryoier
2025-08-15 04:06:17 +00:00
55061c9602 [PT2]: Add Static Dispatch Kernel for scale_gradient (#160454)
Summary: Add Static Dispatch Kernel for scale_gradient

Test Plan:
```
MODEL_TYPE=dpa_product_first_ctr_model
MODEL_ENTITY_ID=892669089
SNAPSHOT_ID=37
OTHER_MODEL_ENTITY_ID=892669089
OTHER_SNAPSHOT_ID=36

MODULES=(mix prepare_float_features object user)
SUFFIXES=(.predictor.local .predictor.precompute.prepare_float_features .predictor.precompute.remote_object_only .predictor.precompute.remote_request_only)

for i in "${!MODULES[@]}"; do
MODULE=${MODULES[i]}
SUFFIX=${SUFFIXES[i]}
buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkAB --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --otherNetFile=/data/users/$USER/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice "" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true
```

Rollback Plan:

Reviewed By: henryoier

Differential Revision: D80062244

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160454
Approved by: https://github.com/henryoier
2025-08-15 03:42:39 +00:00
214d04833a [PT2]: Add Static Dispatch Kernel for fmod.Scalar (#160654)
Summary: Add static dispatch for torch.ops.aten.fmod.Scalar. Found this missing in user/object nets for DSNN models.

Test Plan:
```
MODEL_TYPE=dpa_product_first_ctr_model
MODEL_ENTITY_ID=892669089
SNAPSHOT_ID=36
MODULE=user
SUFFIX=.predictor.precompute.remote_request_only

buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkByOp --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice="" --benchmarkEnableProfiling=true --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true --benchmarkNumIterations=1000
```

Object tower: P1904347784
User tower: P1904348406

Rollback Plan:

Differential Revision: D80238495

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160654
Approved by: https://github.com/henryoier
2025-08-15 03:11:48 +00:00
9c5601ecc3 [NVIDIA] Refactor Family Blackwell Support codegen (#156176)
With the legacy driver (nvgpu) used for CUDA 12.9, Thor was operating with SM 10.1.
This changes to SM 11.0 when the newer driver model (OpenRM), which is intended for CUDA 13.0, is introduced.
Thor 10.1 --> 11.0
Spark 12.1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156176
Approved by: https://github.com/ezyang
2025-08-15 02:51:26 +00:00
5b9ad951f8 [BE][Docker] Do not install cuda:11.8 (#160695)
As CUDA 11.8 binaries are no longer produced by CD.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160695
Approved by: https://github.com/huydhn
2025-08-15 02:23:04 +00:00
4d5f92aa39 typing tvm.py (#160369)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160369
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365, #160366, #160367, #160368
2025-08-15 02:09:31 +00:00
39ca0ce0c8 Type backend torchxla (#160368)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160368
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365, #160366, #160367
2025-08-15 02:09:31 +00:00
d52bb67ac3 typing registry.py (#160367)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160367
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365, #160366
2025-08-15 02:09:31 +00:00
05b9b63fb6 typing inductor and placeholder backends (#160366)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160366
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365
2025-08-15 02:09:31 +00:00
453cfa5153 typing distributed.py (#160365)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160365
Approved by: https://github.com/StrongerXi
ghstack dependencies: #160362, #160363, #160364
2025-08-15 02:09:31 +00:00
9faca5f260 typing debugging.py (#160364)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160364
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363
2025-08-15 02:09:31 +00:00
6fe6dd9fdc Type cudagraphs.py (#160363)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160363
Approved by: https://github.com/StrongerXi
ghstack dependencies: #160362
2025-08-15 02:09:31 +00:00
f82c7eed84 Typing for common.py (#160362)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160362
Approved by: https://github.com/Skylion007
2025-08-15 02:09:31 +00:00
25ccc4716e [Inductor] [Triton] Apply feedback to Enable padded stride support (#160614)
Summary:
This is an issue I noticed while fixing tests for the TMA store. The triton.language.make_tensor_descriptor call hardcodes the shape information as the stride, which is not necessarily correct.

In particular, it's legal to have a stride bigger than the shape (e.g. padded to a size). A good example of this would be allocating a tensor whose stride is always a multiple of 16 and just padding the result so TMA is legal.
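A heavily hedged sketch of the distinction (the tensor-descriptor API shown here is assumed from recent Triton releases and may not match exactly):

```python
import triton
import triton.language as tl


@triton.jit
def tma_tile_kernel(src_ptr, M, N, stride_m,           # stride_m may exceed N (padding)
                    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Correct: use the tensor's real row stride, not one derived from its shape.
    desc = tl.make_tensor_descriptor(
        src_ptr,
        shape=[M, N],
        strides=[stride_m, 1],
        block_shape=[BLOCK_M, BLOCK_N],
    )
    tile = desc.load([0, 0])                            # one BLOCK_M x BLOCK_N tile
    # The bug described above corresponds to strides=[N, 1], which silently
    # breaks whenever stride_m != N (i.e. the tensor is padded).
```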

This is a redo of https://github.com/pytorch/pytorch/pull/160493 because I accidentally broke it by trying to land it internally first instead of merging through GitHub directly.

Test Plan:
Tested with `buck2 run mode/opt-split-dwarf mode/inplace -c fbcode.nvcc_arch=h100 caffe2/test/inductor:max_autotune 2>&1 | tee ~/test_logs.log` and confirmed all max autotune tests passed.

Rollback Plan:

Differential Revision: D80224578

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160614
Approved by: https://github.com/eellison
2025-08-15 02:06:14 +00:00
d387a48c38 [generator] Raise StopIteration(value) with value from the return stmt (#157152)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157152
Approved by: https://github.com/zou3519
ghstack dependencies: #157148
2025-08-15 01:42:40 +00:00
831e85104a [contextlib] Fixes for CPython contextlib tests (#157148)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157148
Approved by: https://github.com/zou3519
2025-08-15 01:42:40 +00:00
211c98859a [inductor][triton] Update triton_builtin handling after triton # 7239 (#160658)
https://github.com/triton-lang/triton/pull/7239 will search for a _semantic kwarg in the signature of the function before passing in this kwarg. To fix this in Inductor:

1. explicitly take a _semantic kwarg
2. remove the functools.wraps around the wrapper function, which was causing inspect.signature to return the signature of the wrapped function (instead of the signature of the wrapper, which does contain the _semantic arg); a minimal sketch of this mechanism follows
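The names below are illustrative, not Triton's actual internals; the point is that the caller inspects the wrapper's own signature to decide whether to pass `_semantic`, so the wrapper must expose that parameter and must not hide its signature behind functools.wraps.

```python
import inspect


def builtin(x, _semantic=None):
    return x


def make_wrapper(fn):
    # Deliberately no functools.wraps: inspect.signature must see the
    # wrapper's own parameters, including _semantic.
    def wrapper(*args, _semantic=None, **kwargs):
        return fn(*args, _semantic=_semantic, **kwargs)
    return wrapper


wrapped = make_wrapper(builtin)
assert "_semantic" in inspect.signature(wrapped).parameters
```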

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160658
Approved by: https://github.com/PaulZhang12, https://github.com/njriasan
2025-08-15 00:39:24 +00:00
dae7710bf2 [cuda][cupy] Improve cupy device placement when device is provided with explicit index (#158529)
resubmit https://github.com/pytorch/pytorch/pull/158320 , fixing a potential bug when device index is not specified explicitly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158529
Approved by: https://github.com/ezyang
2025-08-15 00:27:42 +00:00
dc194a3096 Test multiprocessing spawn timing fix (#160672)
Submitting PR to fix #160511.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160672
Approved by: https://github.com/mikaylagawarecki
2025-08-15 00:11:55 +00:00
4051b42c29 [ROCm] hipify needs specific header mappings (#160675)
Fixes #160579.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160675
Approved by: https://github.com/ScottTodd, https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-08-15 00:09:04 +00:00
eb0eaa67e1 [BE][ci] Increase frequency of cutlass backend ci (#160656)
* increase frequency from every 24 hours to every 12 hours
* automatically enable it if cutlass backend files are touched.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160656
Approved by: https://github.com/eellison
2025-08-14 23:44:55 +00:00
98373e5ad2 [doc] AOTI debugging guide (#160430)
Folded from https://discuss.pytorch.org/t/a-beginners-guide-to-debugging-aot-inductor-cuda-illegal-memory-access/222188

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160430
Approved by: https://github.com/angelayi
2025-08-14 23:42:17 +00:00
371eacb2ae [Dynamo][Hierarchical Compile] Refactor for tuple flattening (#158810)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158810
Approved by: https://github.com/StrongerXi
2025-08-14 22:45:44 +00:00
3650989e6e Revert "[cutlass] fix dictionary iteration error (#160552)"
This reverts commit 29d20d49f0b7f4e362e1cefdcdc4b5659969312c.

Reverted https://github.com/pytorch/pytorch/pull/160552 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160552#issuecomment-3189940880))
2025-08-14 21:41:28 +00:00
3be70dc30e [inductor] dont reuse buffers if it affects peak (#145883) (#159530)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159530
Approved by: https://github.com/eellison
2025-08-14 21:14:36 +00:00
47a1db823d [triton_heuristics] Optimize the triton launcher in pt2 (#160000)
Summary:

(Original author: Xu Zhao. Commandeered by David to land this since it is relatively urgent)

We observed a ~10us PT2-Triton launch overhead regression after the pin update.

Before Triton pin-update:
 {F1980557238}

After Triton pin-update:
 {F1980557240}

The root cause is that https://github.com/pytorch/pytorch/pull/145051 added `_get_args_with_constexprs` to the cubin launcher caller function, which is on the critical path.

The motivation for `_get_args_with_constexprs` was that, between Triton 3.2 and Triton 3.3, the convention for calling Triton kernels (at the level where non-static-cuda-launcher Inductor integrates) changed. Previously, the callable did not take constexpr arguments as parameters; after 3.3, it does. With pointwise/reduction kernels, we don't know the constexpr values until after autotuning occurs, so `_get_args_with_constexprs` would inject constexprs into the arguments list before calling the Triton kernel. The fix (in this PR) is to instead inject the constexpr args into the launcher string; this avoids the cost of sorting/reordering arguments, which previously occurred on every kernel execution.

Note that static_cuda_launcher.py does not require constants to be passed to the cubin launcher (e96c7c4bb0/torch/_inductor/runtime/static_cuda_launcher.py (L220)), so there is no need to pass constexprs to the generated launcher code.

The new launcher code needs to work on three cases:
- StaticallyLaunchedCudaKernel
- triton.compile.CompiledKernel
- AOTInductor

Analysis: https://docs.google.com/document/d/1PHaSmx2w59K8qpjw5_qzKWShfEgptf_Zpv_DL7YxiWU/edit?tab=t.0
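A conceptual sketch (not Inductor's actual codegen) of the optimization: format the constexpr values into the generated launcher source once, instead of reordering/injecting them on every call.

```python
def make_launcher_src(runtime_args, constexprs):
    # constexpr values are known after autotuning, so they can be baked into
    # the launcher string at codegen time...
    call_args = ", ".join(list(runtime_args) + [repr(v) for v in constexprs.values()])
    return (
        f"def launcher(kernel, {', '.join(runtime_args)}):\n"
        f"    kernel({call_args})\n"
    )


print(make_launcher_src(["x_ptr", "y_ptr", "n"], {"BLOCK": 128}))
# ...rather than calling something like _get_args_with_constexprs(args) on the
# hot path for every single launch.
```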

Test Plan:
Before:
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs

1.893x
```

```

$ buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency
  x_val    nop_python_function-walltime    nop_triton_kernel-walltime    nop_triton_compiled_kernel_run-walltime    nop_inductor_kernel-walltime    nop_inductor_kernel_cudagraph-walltime
-------  ------------------------------  ----------------------------  -----------------------------------------  ------------------------------  ----------------------------------------
      0                      0.00760921                       1.80298                                   0.623282                         5.25024                                  0.203722
     19                      0.00799885                       4.78223                                   1.00226                          5.8213                                   0.239084
average                      0.00780403                       3.29261                                   0.812769                         5.53577                                  0.221403
```

After:

```
buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency
  x_val    nop_python_function-walltime    nop_triton_kernel-walltime    nop_triton_compiled_kernel_run-walltime    nop_inductor_kernel-walltime    nop_inductor_kernel_cudagraph-walltime
-------  ------------------------------  ----------------------------  -----------------------------------------  ------------------------------  ----------------------------------------
      0                      0.00747067                       1.92589                                   0.726509                         4.35459                                  0.204205
     19                      0.00747823                       7.36852                                   1.26241                          6.28208                                  0.239278
average                      0.00747445                       4.6472                                    0.994459                         5.31834                                  0.221741
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs

1.985x
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160000
Approved by: https://github.com/jansel, https://github.com/mlazos

Co-authored-by: Xu Zhao <xzhao9@meta.com>
2025-08-14 21:04:08 +00:00
eac2d9d695 Revert "appending the pythonpath (#160219)"
This reverts commit 1d80d697a269234b47ec7ede192faf3bb9b159e3.

Reverted https://github.com/pytorch/pytorch/pull/160219 on behalf of https://github.com/clee2000 due to broke inductor? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16970222746/job/48108262003) [HUD commit link](1d80d697a2) ([comment](https://github.com/pytorch/pytorch/pull/160219#issuecomment-3189850381))
2025-08-14 20:58:14 +00:00
3fe19a7a0a [Test Fix] Delete dynamo skipfile for OpenMP test_one_thread (#160562)
Fixes #120648

During issue scrubbing I could not repro these failing tests, so I'm re-enabling them to close out the issue.

### Test
Original repro command:
```
 PYTORCH_TEST_WITH_DYNAMO=1 pytest test/test_openmp.py -v -k test_one_thread
```

Now results in
```
platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /home/lucaskabela/.conda/envs/pytorch-3.12/bin/python3.12
cachedir: .pytest_cache
hypothesis profile 'default'
rootdir: /home/lucaskabela/pytorch
configfile: pytest.ini
plugins: hypothesis-6.138.0
collected 2 items / 1 deselected / 1 selected
Running 1 items in this shard

test/test_openmp.py::TestOpenMP_ParallelFor::test_one_thread PASSED [3.6874s]                                                       [100%]

===================================================== 1 passed, 1 deselected in 6.07s =====================================================
```

And:
```
PYTORCH_TEST_WITH_DYNAMO=1 python test/test_openmp.py TestOpenMP_ParallelFor.test_one_thread
```
```
PYTORCH_TEST_WITH_DYNAMO=1 python test/test_sort_and_select.py TestSortAndSelectCPU.test_sort_overflow_cpu_int16
```

Both result in:
```
.
----------------------------------------------------------------------
Ran 1 test in 0.003s
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160562
Approved by: https://github.com/zou3519
2025-08-14 20:55:59 +00:00
4a90dc0c1f Update checkpoint warning to target PyTorch 2.9 (#160643)
Fixes #160534

Updates the warning in torch.utils.checkpoint to state that starting in PyTorch 2.9, calling checkpoint without explicitly passing use_reentrant will raise an exception. Follows the guidance from the issue discussion.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160643
Approved by: https://github.com/soulitzer
2025-08-14 20:53:17 +00:00
1fc683cf17 [Inductor] Allow indexing a flexible layout for extract_input_node_reduction_ranges (#160645)
Differential Revision: D79831747

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160645
Approved by: https://github.com/eellison
2025-08-14 20:43:35 +00:00
b9d7de3a09 [Inductor] addmm + activation function fusion (#158137)
This PR implements a post_grad pass to fuse activation(add + mm).

This was previously done similarly in #106912 but was reverted for performance reasons. It was replaced with a pass that unfuses the activation and add from addmm/addmm_activation and lets Inductor handle the fusion.

However, since then the cuBLAS team has made a lot of perf improvements here. I will update this post with more benchmarks, but preliminary benchmarks show good results.

Perf dashboard:
<img width="3371" height="1240" alt="Screenshot from 2025-08-07 13-41-35" src="https://github.com/user-attachments/assets/d44d6205-b33a-4a20-9f0f-d9db176b3738" />

ReLU works with both training and inference, but GELU only works in inference mode due to a fundamental limitation: GELU's derivative depends on its input while ReLU's doesn't. I don't think this is fixable with the current addmm_activation API.
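For reference, a sketch of the eager-mode patterns this pass targets (the rewrite to aten._addmm_activation is shown in the graphs below):

```python
import torch

bias = torch.randn(16)
x = torch.randn(8, 32)
w = torch.randn(32, 16)

# addmm followed by an activation; post_grad fuses these into _addmm_activation
y_relu = torch.relu(torch.addmm(bias, x, w))
y_gelu = torch.nn.functional.gelu(torch.addmm(bias, x, w), approximate="tanh")
```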

Graph module before and after this pass

Relu(addmm)
```
graph():
    %primals_1 : [num_users=1] = placeholder[target=primals_1]
    %primals_2 : [num_users=2] = placeholder[target=primals_2]
    %primals_3 : [num_users=2] = placeholder[target=primals_3]
    %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%primals_1, %primals_3, %primals_2), kwargs = {})
    %relu : [num_users=2] = call_function[target=torch.ops.aten.relu.default](args = (%addmm,), kwargs = {})
    %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%relu, 0), kwargs = {})
    %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%primals_3, [1, 0]), kwargs = {})
    return (relu, primals_2, le, permute_1)
graph():
    %primals_1 : [num_users=1] = placeholder[target=primals_1]
    %primals_2 : [num_users=2] = placeholder[target=primals_2]
    %primals_3 : [num_users=2] = placeholder[target=primals_3]
    %_addmm_activation_default : [num_users=2] = call_function[target=torch.ops.aten._addmm_activation.default](args = (%primals_1, %primals_3, %primals_2), kwargs = {})
    %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%_addmm_activation_default, 0), kwargs = {})
    %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%primals_3, [1, 0]), kwargs = {})
    return (_addmm_activation_default, primals_2, le, permute_1)
```
Gelu (addmm)
```
graph():
    %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
    %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
    %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
    %addmm : [num_users=4] = call_function[target=torch.ops.aten.addmm.default](args = (%arg0_1, %arg2_1, %arg1_1), kwargs = {})
    %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%addmm, %addmm), kwargs = {})
    %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %addmm), kwargs = {})
    %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_1, 0.044715), kwargs = {})
    %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%addmm, %mul_2), kwargs = {})
    %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add, 0.7978845608028654), kwargs = {})
    %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%addmm, 0.5), kwargs = {})
    %tanh : [num_users=1] = call_function[target=torch.ops.aten.tanh.default](args = (%mul_3,), kwargs = {})
    %add_1 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%tanh, 1), kwargs = {})
    %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_4, %add_1), kwargs = {})
    return (mul_5,)
graph():
    %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
    %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
    %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
    %_addmm_activation_default : [num_users=1] = call_function[target=torch.ops.aten._addmm_activation.default](args = (%arg0_1, %arg2_1, %arg1_1), kwargs = {use_gelu: True})
    return (_addmm_activation_default,)
```

Benchmark setup:
NGC pytorch 25.06 container
cublas version: 12.9.1.4
torch.compile ran with dynamic = False and max_autotune

H100
```
Testing with M=1024, N=1024, K=1024, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 0.0107 ms
Average Time per Iteration (torch compile):	 0.0296 ms

============================================================
Testing with M=2048, N=2048, K=2048, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 0.0262 ms
Average Time per Iteration (torch compile):	 0.0327 ms

============================================================
Testing with M=4096, N=4096, K=4096, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 0.1763 ms
Average Time per Iteration (torch compile):	 0.2457 ms

============================================================
Testing with M=8192, N=8192, K=8192, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 1.5280 ms
Average Time per Iteration (torch compile):	 1.9437 ms
```

A100
```
############################################################
Testing with dtype: float16
############################################################

============================================================
Testing with M=1024, N=1024, K=1024, dtype=float16
============================================================
Average Time per Iteration (cublas):	 0.0313 ms
Average Time per Iteration (torch compile):	 0.0643 ms

============================================================
Testing with M=2048, N=2048, K=2048, dtype=float16
============================================================
Average Time per Iteration (cublas):	 0.1149 ms
Average Time per Iteration (torch compile):	 0.1255 ms

============================================================
Testing with M=4096, N=4096, K=4096, dtype=float16
============================================================
Average Time per Iteration (cublas):	 0.6297 ms
Average Time per Iteration (torch compile):	 0.7547 ms

============================================================
Testing with M=8192, N=8192, K=8192, dtype=float16
============================================================
Average Time per Iteration (cublas):	 4.3821 ms
Average Time per Iteration (torch compile):	 5.0740 ms
```

Script
```py
import torch
torch.manual_seed(0)

warmup, numrun = 10, 100

sizes = [1024, 2048, 4096, 8192]
dtypes = [torch.float16, torch.bfloat16, torch.float32]

device = torch.device("cuda")

for dtype in dtypes:
    dtype_name = str(dtype).split('.')[-1]
    print(f"\n{'#'*60}")
    print(f"Testing with dtype: {dtype_name}")
    print(f"{'#'*60}")

    for size in sizes:
        M, N, K = size, size, size
        print(f"\n{'='*60}")
        print(f"Testing with M={M}, N={N}, K={K}, dtype={dtype_name}")
        print(f"{'='*60}")

        A = torch.randn(M, K, device=device, dtype=dtype)
        B = torch.randn(K, N, device=device, dtype=dtype)
        C = torch.randn(M, device=device, dtype=dtype)

        def func1():
            return torch._addmm_activation(C, A, B, use_gelu=True)

        def func2():
            return torch.nn.functional.gelu(torch.add(C, torch.mm(A, B)), approximate="tanh")

        func2_compiled = torch.compile(
            func2,
            dynamic=False,
            options={
                "force_disable_caches": True,
                "max_autotune": True,
                "max_autotune_gemm": True,
                "max_autotune_gemm_backends": "TRITON",
                "autotune_fallback_to_aten": False,
            }
        )

        for _ in range(warmup): func1()
        torch.cuda.synchronize(device=device)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        total_time_ms = 0.0
        start_event.record()
        for _ in range(numrun): func1()
        end_event.record()
        torch.cuda.synchronize(device=device)
        total_time_ms += start_event.elapsed_time(end_event)
        avg_time_ms = total_time_ms / numrun

        print(f"Average Time per Iteration (cublas):\t {avg_time_ms:.4f} ms")

        for _ in range(warmup): func2_compiled()
        torch.cuda.synchronize(device=device)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        total_time_ms = 0.0
        start_event.record()
        for _ in range(numrun): func2_compiled()
        end_event.record()
        torch.cuda.synchronize(device=device)
        total_time_ms += start_event.elapsed_time(end_event)
        avg_time_ms = total_time_ms / numrun

        print(f"Average Time per Iteration (torch compile):\t {avg_time_ms:.4f} ms")
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158137
Approved by: https://github.com/eellison
2025-08-14 20:41:38 +00:00
1028c5e2d5 [Dynamo] Add CPython default dict tests (#155263)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155263
Approved by: https://github.com/zou3519
2025-08-14 20:22:22 +00:00
19b4283884 Typo correction in variable name uninitalized_val in resize() function (#160636)
Fixes #160633

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160636
Approved by: https://github.com/mikaylagawarecki, https://github.com/Skylion007
2025-08-14 20:11:43 +00:00
8d6d324631 [Dynamo][Hierarchical-Compile] Don't allow node duplicates to be added (#160605)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160605
Approved by: https://github.com/StrongerXi
2025-08-14 20:02:10 +00:00
fdfd69bb05 Set PYTHONHOME for inductor subprocesses using torch (#160008)
This is needed for subprocesses that are trying to call back into torch functionality, i.e. anything that's also setting `PYTHONPATH`.  If they're part of an application that bundles the Python runtime, then they should use the bundled runtime to keep their view of the world consistent.

There are more `sys.executable` subprocesses in torch/ but it seems like they're fine.

Previous PR at https://github.com/pytorch/pytorch/pull/159382, but was reverted because it caused macOS jobs on GitHub to timeout.  What was happening was inductor subprocesses were scheduling C++ compilation tasks that were failing to find the Python.h header.  This was because they were running in venvs and now trying to find the CPython headers inside the venv, where the headers do not exist.  This PR gates the new behavior to internal builds only.
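A hedged sketch of the idea (not the actual Inductor code; using `sys.prefix` as the runtime prefix is an assumption):
```python
import os
import subprocess
import sys

env = dict(os.environ)
env["PYTHONPATH"] = os.pathsep.join(p for p in sys.path if p)
if sys.prefix == sys.base_prefix:
    # not in a venv: point the child at the (possibly bundled) runtime explicitly
    env.setdefault("PYTHONHOME", sys.prefix)
subprocess.run([sys.executable, "-c", "import torch; print(torch.__version__)"], env=env, check=True)
```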

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160008
Approved by: https://github.com/aorenste
2025-08-14 19:57:14 +00:00
0d3461bac0 DOC: update CrossEntropyLoss with note and example of incorrect target specification (#155649)
Fixes #134771
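For context, a small illustration of the two accepted target forms (the exact note wording is in the PR):
```python
import torch
import torch.nn as nn

loss = nn.CrossEntropyLoss()
logits = torch.randn(3, 5)

# class-index targets: integral dtype, values in [0, num_classes)
target_idx = torch.tensor([1, 0, 4])
print(loss(logits, target_idx))

# class-probability targets: floating dtype, same shape as logits
target_probs = torch.softmax(torch.randn(3, 5), dim=1)
print(loss(logits, target_probs))
```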

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155649
Approved by: https://github.com/mikaylagawarecki

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
2025-08-14 18:34:57 +00:00
65053c03a3 [FR] Don't check incomplete ranks for printing (#160195)
When just printing the ranks (`-j` option) we should skip the check for "incomplete ranks" since that doesn't affect the print

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160195
Approved by: https://github.com/fduwjj
ghstack dependencies: #160097
2025-08-14 18:19:45 +00:00
96f9fbe21a Fix flight recorder for P2P ops (#160097)
Fixes errors in debugging a trace as mentioned in https://docs.google.com/document/d/1EKVJYmW2hj_VsvDvnSggXhZzJyvMu9dA0iDJWOZAtjY/edit?tab=t.0

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160097
Approved by: https://github.com/fduwjj
2025-08-14 18:19:45 +00:00
1c25871191 Allow torch.hub.load with unauthorized GITHUB_TOKEN (#159896)
Allow torch.hub.load with unauthorized GITHUB_TOKEN

`torch.hub.load` fails if a `GITHUB_TOKEN` with few permissions is set, as can be seen in the following example. Make sure that the model has not been cached before, for example with `rm ~/.cache/torch`. If the model has been downloaded already, it will not be downloaded again and the authorization error will not occur.

```python
export GITHUB_TOKEN=""
python
>>> import torch
>>> torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 567, in load
    repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load",
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 231, in _get_cache_or_reload
    _validate_not_a_forked_repo(repo_owner, repo_name, ref)
  File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 191, in _validate_not_a_forked_repo
    response = json.loads(_read_url(Request(url, headers=headers)))
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 174, in _read_url
    with urlopen(url) as r:
         ^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/urllib/request.py", line 215, in urlopen
    return opener.open(url, data, timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/urllib/request.py", line 521, in open
    response = meth(req, response)
               ^^^^^^^^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/urllib/request.py", line 630, in http_response
    response = self.parent.error(
               ^^^^^^^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/urllib/request.py", line 559, in error
    return self._call_chain(*args)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/urllib/request.py", line 492, in _call_chain
    result = func(*args)
             ^^^^^^^^^^^
  File "~/miniconda3/lib/python3.12/urllib/request.py", line 639, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 401: Unauthorized
```

The cause of the error is that the function `_validate_not_a_forked_repo` in `hub.py` always uses `GITHUB_TOKEN` for authorization,  even when downloading does not require authorization.

0ba09a6d34/torch/hub.py (L194)

This fix simply retries the download without the token in case of a failure.
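A minimal sketch of that retry idea (not the actual `hub.py` code; the helper name is illustrative):
```python
from urllib.error import HTTPError
from urllib.request import Request, urlopen

def _read_url_with_fallback(url, token=None):
    headers = {"Authorization": f"token {token}"} if token else {}
    try:
        with urlopen(Request(url, headers=headers)) as r:
            return r.read()
    except HTTPError:
        # retry unauthenticated, since the endpoint may not need the token at all
        with urlopen(Request(url)) as r:
            return r.read()
```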
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159896
Approved by: https://github.com/albanD
2025-08-14 18:15:49 +00:00
6c05ea6475 [DTensor] add op support: aten.squeeze_.dim (#159532)
**Summary**
This PR enables in-place op `aten.squeeze_.dim` on DTensor with a change to
DTensor dispatch logic: when processing in-place operator, we should assign
`output_sharding.output_spec` back to the first argument. This is because
the in-place op_call on `arg._local_tensor` could also shift the tensor meta.

**Test**
`pytest test/distributed/tensor/test_view_ops.py -s -k  test_squeeze_`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159532
Approved by: https://github.com/zpcore
2025-08-14 18:01:19 +00:00
5665dc9ab7 [PP] Allow larger world_size schedule tests (#160559)
Update schedule tests to use `world_size=4`, changes needed:
- Move some tests that require world_size=2 to new class
- Move helper methods from class level to function level
- Update some initialization so assertions pass, since gradients were very small.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160559
Approved by: https://github.com/wconstab
ghstack dependencies: #159591, #160558
2025-08-14 17:41:58 +00:00
2ff7c1c774 [PP] Rename _load_actions and validate (#160558)
Rename method and add validation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160558
Approved by: https://github.com/wconstab
ghstack dependencies: #159591
2025-08-14 17:41:58 +00:00
3028fa6ce9 Wrap class definitions in set_fullgraph(False) in test_list/tuple (#160277)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160277
Approved by: https://github.com/zou3519
ghstack dependencies: #160216, #160217, #160276, #160278, #160330, #160331
2025-08-14 17:29:45 +00:00
077cb38974 Add dtype checks in meta dispatch for various ordering ops (#159556)
This adds data type checks for the unsupported bool and complex types for argmax/argmin, topk, sort, minimum, and maximum, as listed here:

0a99b026d6/torch/testing/_internal/common_methods_invocations.py (L21076)

Currently the ops fail during the CPU or CUDA calculation rather than at the meta dispatch stage, as is already the case for example with max: 0a99b026d6/aten/src/ATen/native/TensorCompare.cpp (L285) . This change catches the error early.
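A quick way to see the intended behavior (a sketch; the exact error wording may differ):
```python
import torch

x = torch.zeros(4, dtype=torch.complex64, device="meta")
try:
    torch.sort(x)  # complex is unsupported; should now be rejected at the meta stage
except RuntimeError as e:
    print("rejected early:", e)
```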

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159556
Approved by: https://github.com/janeyx99
2025-08-14 17:06:27 +00:00
cd8d8c18f5 [pytorch][dynamo_compile] Log graph_node_shape to dynamo_compile (#160556)
This PR adds the dynamo graph node shape logging to dynamo compile. Also added unit tests to check if correct graph node shape is being logged.

Test Plan:
$ python -m test_utils
Ran 12 tests in 36.447s
OK

Note: Will merge after D80185628 lands.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160556
Approved by: https://github.com/masnesral, https://github.com/jingsh
2025-08-14 16:42:35 +00:00
63654ba4c5 [BE][Dynamo] Type improvements in _dynamo/utils to generics (#159824)
Follow up to #159580

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159824
Approved by: https://github.com/williamwen42
2025-08-14 16:06:50 +00:00
7e27347fd3 [SymmMem] Check return of nvshmem_malloc (#160603)
`nvshmem_malloc` returns a null pointer when allocation fails. We should check for that here;
otherwise the nullptr can propagate into the device kernel and cause a CUDA illegal memory access.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160603
Approved by: https://github.com/fduwjj, https://github.com/ngimel
2025-08-14 15:57:55 +00:00
1d80d697a2 appending the pythonpath (#160219)
Fixes #160193

`PYTHONPATH=/torchbench` to `PYTHONPATH=/torchbench:$PYTHONPATH` in [pytorch/.ci/pytorch/test.sh](b5fd7223b1/.ci/pytorch/test.sh (L1715))

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160219
Approved by: https://github.com/malfet
2025-08-14 15:55:31 +00:00
b6b74aed60 [ROCm] Support large inputs for coalesceValuesKernel (#158281)
# Description

`.coalesce` cannot handle large inputs on ROCM due to maximal grid size limit.

This PR splits axis `X` into axes `X` and `Y`, and repurposes `Z` for the original `Y` on ROCm to avoid this limitation.

Confirmed the new approach can handle large inputs. Correctness needs validation.

# Testing Command

`python torch_spmv.py 22500000 272500000`

## Script `torch_spmv.py`

``` python
import torch
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description="Sparse COO Matrix by Dense Vector Multiplication using PyTorch"
    )
    parser.add_argument("n", type=int, help="Size of the NxN matrix")
    parser.add_argument("nnz", type=int, help="Number of non-zero entries")
    return parser.parse_args()

def main():
    args = parse_args()
    n = args.n
    nnz = args.nnz
    dtype = torch.float32
    device = torch.device('cuda')

    # Generate random indices for the sparse matrix in COO format.
    torch.manual_seed(42)
    rows = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device)
    cols = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device)
    indices = torch.stack([rows, cols], dim=0)

    # Generate random values.
    values = torch.randn(nnz, dtype=torch.float32, device=device)

    # Create the sparse COO matrix and move it to the target device.
    sparse_matrix = torch.sparse_coo_tensor(indices, values, size=(n, n), dtype=torch.float32, device=device)
    sparse_matrix = sparse_matrix.coalesce()

    # Generate a random dense vector.
    dense_vector = torch.randn(n, dtype=torch.float32, device=device)

    # Perform sparse matrix - dense vector multiplication.
    # Using torch.sparse.mm which expects a 2D tensor for the vector.
    result = torch.sparse.mm(sparse_matrix, dense_vector.unsqueeze(1)).squeeze()
    # result = torch.mv(sparse_matrix, dense_vector)

    # Print the result.
    print("Result of the multiplication:")
    print(torch.sum(result))

if __name__ == "__main__":
    main()
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158281
Approved by: https://github.com/jeffdaily
2025-08-14 15:09:16 +00:00
4a773e1e86 Warn when there is side effect in strict mode (#160060)
Differential Revision: [D79784354](https://our.internmc.facebook.com/intern/diff/D79784354)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160060
Approved by: https://github.com/zhxchen17, https://github.com/StrongerXi
2025-08-14 14:59:44 +00:00
198b5fd2d4 [PP] Add DualPipeV schedule (#159591)
Added the DualPipeV schedule according to http://github.com/deepseek-ai/DualPipe/blob/main/dualpipe/dualpipev.py#L11

<img width="3633" height="486" alt="image" src="https://github.com/user-attachments/assets/4e843bb9-87cd-4d11-936c-7dfe8ee12f16" />

This schedule doesn't perform the actual "overlap" during execution, but provides the scaffolding and schedule definition we need to run it E2E in torchtitan. Supporting the overlapped operation will be worked on in following PRs.

Tests:
```sh
python test/distributed/pipelining/test_schedule_multiproc.py -k test_v_shape_schedules
python test/distributed/pipelining/test_schedule.py -k test_pipeline_order_for_v_schedules
```

Also tested in TorchTitan and is running.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159591
Approved by: https://github.com/wconstab
2025-08-14 14:58:35 +00:00
20bdabbb3c [Dynamo] Fix MTIA dynamo backend by avoiding has_trition() at import time (#160604)
# Summary
MTIA's torch.compile tests were broken by D80037015. (For details, see internal task T234563969.) The root cause was that `has_triton` can change state after we call `torch.mtia.init()`, but it was used in a way that fixes Inductor's behavior at import time. (Note that `has_triton` is cached, and there's no opportunity to call `torch.mtia.init()` prior to `import torch`.)

To fix this, we use `try: import triton` as opposed to `has_triton()` at the module level.

# Test Plan

See the internal diff. As a follow-up, we will add appropriate unit tests and/or CI hints so this type of issue can be caught at PR/diff time.

Differential Revision: D80228000

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160604
Approved by: https://github.com/PaulZhang12, https://github.com/eellison
2025-08-14 14:54:49 +00:00
d556586448 [cutlass backend] re-add pip cutlass path (#160180)
Revert #156651 to allow using the cutlass PIP package which is easier for users than the Git checkout or similar method.

Also fix a bug where the pip cutlass path wouldn't be available to subprocesses spawned during benchmarking for algorithm selection. The "spawn" method does not inherit the (potentially) already set up `config.cuda.cutlass_dir`, so in the subprocess the include paths would still point to `"../third_party/cutlass/"`, leading to compilation failures due to missing headers.

Ensure `try_import_cutlass` is called at that point; thanks to caching this is a no-op in most cases, so it doesn't hurt.
Change the logic to return `None` when cutlass isn't available and to return more useful values for the include paths, namely an empty list. This is in line with other Inductor code, which disables the CUTLASS backend when `try_import_cutlass` returns False.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160180
Approved by: https://github.com/henrylhtsang, https://github.com/mlazos
2025-08-14 14:48:31 +00:00
781e9a7724 Fix meta for constant_pad_nd (#159878)
Fixes https://github.com/pytorch/pytorch/issues/144187

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159878
Approved by: https://github.com/Skylion007, https://github.com/ezyang
2025-08-14 14:47:47 +00:00
e4de93f6a3 Add sm50 and sm60 back to windows builds (#160586)
Addresses the issue reported in  https://github.com/pytorch/pytorch/issues/160575
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160586
Approved by: https://github.com/malfet
2025-08-14 12:46:35 +00:00
a5652407e4 [CI] Fix triton xpu build on Windows (#160442)
Pin the ninja version to 1.11

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160442
Approved by: https://github.com/atalman
2025-08-14 12:43:49 +00:00
6f0f4e0c3e reduce threshold to suggest changes to expected results (#160463)
Since we increased the threshold to 10%, I would like suggestions to update the expected results to show up at ±2% instead of the current 3.3%.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160463
Approved by: https://github.com/jamesjwu
2025-08-14 09:11:27 +00:00
db763b1717 [Intel GPU] Support SDPA backend selection and priority setting on XPU (#159464)
Currently, SDPA on XPU uses its own `priority_order` instead of the one from the global context. Hence it does not support `with sdpa_kernel(order, set_priority=True)`.

This PR enables this feature. To make the default `priority_order` from the global context work for XPU, I also move the MATH backend to the lowest priority; otherwise `cudnn attention` and `overrideable attention` would never be selected.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159464
Approved by: https://github.com/guangyey, https://github.com/drisspg

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
Co-authored-by: mayuyuace <qiming1.zhang@intel.com>
2025-08-14 08:55:31 +00:00
089c4a1ba0 Fix wrong log file name in the docs of torch.distributed.elastic.multiprocessing.start_processes() (#160396)
Fixes #160395

In https://docs.pytorch.org/docs/stable/elastic/multiprocessing.html#starting-multiple-workers and also in the code comment of the function[1], it was specified that:

```
    For each process, the ``log_dir`` will contain:

    #. ``{local_rank}/error.json``: if the process failed, a file with the error info
    #. ``{local_rank}/stdout.json``: if ``redirect & STDOUT == STDOUT``
    #. ``{local_rank}/stderr.json``: if ``redirect & STDERR == STDERR``
```

While in code[2], the files are `stdout.log` and `stderr.log`, instead of the `.json` ones listed in the doc.

[1]: https://github.com/pytorch/pytorch/blob/main/torch/distributed/elastic/multiprocessing/__init__.py#L144-L145
[2]: https://github.com/pytorch/pytorch/blob/main/torch/distributed/elastic/multiprocessing/api.py#L354-L357

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160396
Approved by: https://github.com/fduwjj
2025-08-14 08:24:07 +00:00
97c8c98f8d measure dispatch overhead (#160504)
Reopen https://github.com/pytorch/pytorch/pull/159699 to merge to main.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160504
Approved by: https://github.com/wconstab
2025-08-14 06:13:53 +00:00
39aa3d1471 Remove the dead code in setup.py (#160515)
The following line has no effect.

34ec5ed275/setup.py (L1205)

This code was originally introduced in this PR: dd7cec680c,
and clang11 and later now support `-fstack-clash-protection`. Can we remove this line?

@malfet
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160515
Approved by: https://github.com/isuruf, https://github.com/albanD
2025-08-14 06:02:11 +00:00
639778b3ee [2/3 step][ vllm ci build setup] Add vlllm buld logic and dockerfile (#160089)
# set up vllm build logic
- dockerfile: please note that the Dockerfile introduced here is only temporary; once we migrate this file to vllm, we will fetch it directly from there
- VllmBuildRunner:
   - implement logic to prepare and run the vllm build with the Dockerfile

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160089
Approved by: https://github.com/huydhn
ghstack dependencies: #160043
2025-08-14 05:51:45 +00:00
00d7d6f123 [1/3][ghstack] [vllm ci build setup ]setup lumen_cli (#160043)
# Description
Set up torch_cli using argparse.

## Details:
- add a vllm placeholder in the CLI
- add unit tests for the CLI commands

See Readme.md for how to run the CLI.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160043
Approved by: https://github.com/huydhn
2025-08-14 05:51:45 +00:00
c6d78d4dbd [ROCm] enable miopen channels last 3d for conv and batchnorm (#160529)
MIOpen batchnorm for channels-last is guarded by the env var PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM, similar to the existing PYTORCH_MIOPEN_SUGGEST_NHWC for conv.
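A hedged usage sketch (assuming a ROCm build; the env var is read before the op runs, shapes are illustrative):
```python
import os
os.environ["PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM"] = "1"

import torch

bn = torch.nn.BatchNorm3d(8, device="cuda")  # ROCm devices show up as "cuda"
x = torch.randn(2, 8, 4, 16, 16, device="cuda").to(memory_format=torch.channels_last_3d)
y = bn(x)
print(y.is_contiguous(memory_format=torch.channels_last_3d))
```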

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160529
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-08-14 05:30:19 +00:00
2898d3f965 [Lowering] Add assertion msg to sym_size and sym_stride (#160591)
Summary: Add assertion msg to sym_size and sym_stride lowering function.

Test Plan:
Will test in mast job.

Rollback Plan:

Differential Revision: D80187693

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160591
Approved by: https://github.com/angelayi
2025-08-14 04:55:32 +00:00
34358f335d [vllm hash update] update the pinned vllm hash (#160594)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160594
Approved by: https://github.com/pytorchbot
2025-08-14 04:21:28 +00:00
fe3f5fe4ea Optimize min, max gradient behavior description (#160312)
Fixes #160273

## Test Result
<img width="897" height="593" alt="image" src="https://github.com/user-attachments/assets/6ebcdb2c-8a2c-4f0d-8195-656089e88325" />
<img width="985" height="653" alt="image" src="https://github.com/user-attachments/assets/606a7264-e223-4d2b-8c3f-f153ce43b208" />
<img width="903" height="607" alt="image" src="https://github.com/user-attachments/assets/0ae2f56f-820f-4194-b15c-a02a078c0487" />
<img width="903" height="607" alt="image" src="https://github.com/user-attachments/assets/79c38a17-45ac-4808-829f-d538178de36b" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160312
Approved by: https://github.com/ngimel
2025-08-14 04:18:49 +00:00
45ba7ecda8 Flex Attention heuristics: a Blackwell config (#160192)
Fixes #160074 and more.

This is the working config for B200 and RTX 5080.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160192
Approved by: https://github.com/drisspg
2025-08-14 03:47:02 +00:00
194fcfcfbd Add support for param mutation under inference mode (#159661)
Summary:
In HF model rwkv, we have parameter mutation under inference mode which should be safe. This PR does multiple things to make sure it works:
1. We execute global autograd mutation while tracing so that we can actually trace through parameter inplace mutation
2. Add support for parameter mutation under inference mode in AOTAutograd
3. Add support for parameter mutation under inference mode in export.

Test Plan:
test

Rollback Plan:

Differential Revision: D79460136

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159661
Approved by: https://github.com/ydwu4
2025-08-14 03:34:04 +00:00
29d20d49f0 [cutlass] fix dictionary iteration error (#160552)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160552
Approved by: https://github.com/henrylhtsang, https://github.com/jingsh
2025-08-14 03:23:46 +00:00
3faee0a631 Update nullcontext to return input args (#158776)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158776
Approved by: https://github.com/zou3519
2025-08-14 03:02:44 +00:00
8cfaf51d4e Generalize support of background thread in pinned allocator (#160505)
# Motivation
https://github.com/pytorch/pytorch/pull/135524 only introduces support for a background thread for CUDA; this PR intends to support it for other backends such as XPU as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160505
Approved by: https://github.com/albanD
2025-08-14 02:22:39 +00:00
af3cabc55d Wrap class definitions in set_fullgraph(False) in test_sort (#160331)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160331
Approved by: https://github.com/zou3519
ghstack dependencies: #160216, #160217, #160276, #160278, #160330
2025-08-14 02:12:20 +00:00
74bbe7b4a3 Wrap class definitions in set_fullgraph(False) in test_math/cmath (#160330)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160330
Approved by: https://github.com/zou3519
ghstack dependencies: #160216, #160217, #160276, #160278
2025-08-14 02:12:20 +00:00
7bfc424a61 Wrap class definitions in set_fullgraph(False) in test_iter (#160278)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160278
Approved by: https://github.com/williamwen42, https://github.com/zou3519
ghstack dependencies: #160216, #160217, #160276
2025-08-14 02:12:20 +00:00
5ace061254 finfo eps doc fix (#160502)
Existing documentation for torch.finfo().eps is as below:
| eps             | float  | The smallest representable number such that ``1.0 + eps != 1.0``.          |

Proposed documentation for torch.finfo().eps is as below:
| eps             | float  | The difference between 1.0 and the next smallest representable float larger than 1.0.	|
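A quick check of the proposed wording:
```python
import torch

eps = torch.finfo(torch.float32).eps
next_after_one = torch.nextafter(torch.tensor(1.0), torch.tensor(2.0))
print(eps)                                    # ~1.1920929e-07
print((next_after_one - 1.0).item() == eps)   # True: eps is the gap to the next float above 1.0
```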

Fixes #160397

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160502
Approved by: https://github.com/ngimel
2025-08-14 01:49:35 +00:00
15e49f6164 Factor out the strings to templates for better editor integration (#160357)
# Summary

More code motion, tldr is that install 'Better Jinja' in vscode and now you can get highlighting

Before
<img width="776" height="926" alt="Screenshot 2025-08-11 at 2 41 08 PM" src="https://github.com/user-attachments/assets/10868b31-f8ac-4cf5-99fe-19b8789ce06b" />

After:
<img width="1184" height="1299" alt="Screenshot 2025-08-11 at 2 40 27 PM" src="https://github.com/user-attachments/assets/45203765-589e-4d76-8196-d895a2f2fbf6" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160357
Approved by: https://github.com/eellison
2025-08-14 01:07:53 +00:00
dd21c8a578 refresh expected results (#160537)
regression introduced  by https://github.com/pytorch/pytorch/pull/160314
Not too worried about it since it did not affect other Inductor benchmarks; I could not repro it locally.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160537
Approved by: https://github.com/eellison
2025-08-14 00:56:14 +00:00
a06ec54d40 [MPS] Add API to query GPU core count (#160414)
Using good old IOKit to get `gpu-core-count` property from device implementing `AGXAccelerator` service
Expose it as `torch.backends.mps.get_core_count()` and make it accessible to Inductor via `MpsInterface`.

Test Plan: Run `python3 -c "import torch;print(torch.backends.mps.get_name(), torch.backends.mps.get_core_count())"` and compare it to `system_profiler SPDisplaysDataType|head -n10`
```
% python3 -c "import torch;print(torch.backends.mps.get_name(), torch.backends.mps.get_core_count())"
Apple M1 Pro 16
% system_profiler SPDisplaysDataType|head -n10
Graphics/Displays:

    Apple M1 Pro:

      Chipset Model: Apple M1 Pro
      Type: GPU
      Bus: Built-In
      Total Number of Cores: 16
      Vendor: Apple (0x106b)
      Metal Support: Metal 3
```

This would significantly improve occupancy for torch.compile generated kernels

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160414
Approved by: https://github.com/dcci
2025-08-14 00:05:17 +00:00
50a8c11875 Add getCurrentDeviceIndex to torch::stable::accelerator (#160453)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160453
Approved by: https://github.com/janeyx99
ghstack dependencies: #159679
2025-08-13 23:42:24 +00:00
e4e4dbd2f8 Add beginnings of torch::stable::accelerator (#159679)
Adds
- `torch::stable::accelerator::DeviceGuard`: `std::unique_ptr` to `DeviceGuardOpaque`, mostly copied from the code below (but made generic)

   50eac811a6/torch/csrc/inductor/aoti_runtime/utils_cuda.h (L30-L46)
    - constructor `DeviceGuard(DeviceIndex)` (**this matches aoti but differs from the actual c10 DeviceGuard constructor, which takes a device**)
    - `set_index(DeviceIndex)`
- `torch::stable::accelerator::Stream`: `std::shared_ptr` to `StreamOpaque`
     - constructor `Stream(StreamHandle stream)` (similar to torch::stable::Tensor)
     - `id() -> StreamId`

- `getCurrentStream(DeviceIndex device_index) -> stable::accelerator::Stream`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159679
Approved by: https://github.com/guangyey, https://github.com/janeyx99
2025-08-13 23:42:24 +00:00
d670304001 [ATen][CUDA] Use new CCCL API in v2.8 (#160554)
Silences deprecation warnings like:
```
In file included from tmpxft_003a195d_00000000-6_Nonzero.cudafe1.stub.c:1:
/tmp/tmpxft_003a195d_00000000-6_Nonzero.cudafe1.stub.c: At global scope:
/tmp/tmpxft_003a195d_00000000-6_Nonzero.cudafe1.stub.c:243:219: warning: 'template<class ValueType, class OffsetT> class at_cuda_detail::cub::CountingInputIterator' is deprecated: Use thrust::counting_iterator instead [-Wdeprecated-declarations]
  243 | static void __device_stub__ZN2at6native43_GLOBAL__N__3cee4041_10_Nonzero_cu_cba1aaa011flag_kernelILi512ELi16EhEEvPKT1_PlPKllli( const _ZN3c104impl20ScalarTypeToCPPTypeTILNS_10ScalarTypeE0EEE *__par0,  int64_t *__par1,  const int64_t *__par2,  int64_t __par3,  int64_t __par4,  int __par5) {  __cudaLaunchPrologue(6); __cudaSetupArgSimple(__par0, 0UL); __cudaSetupArgSimple(__par1, 8UL); __cudaSetupArgSimple(__par2, 16UL); __cudaSetupArgSimple(__par3, 24UL); __cudaSetupArgSimple(__par4, 32UL); __cudaSetupArgSimple(__par5, 40UL); __cudaLaunch(((char *)((void ( *)(const _ZN3c104impl20ScalarTypeToCPPTypeTILNS_10ScalarTypeE0EEE *, int64_t *, const int64_t *, int64_t, int64_t, int))at::native::_NV_ANON_NAMESPACE::flag_kernel<(int)512, (int)16, unsigned char> ))); }namespace at{
      |                                                                                                                                                                                                                           ^~~~~~~~~~~~~~~~~~~~~
/usr/local/cuda-12.9/include/cub/iterator/counting_input_iterator.cuh:93:63: note: declared here
   93 | class CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator instead") CountingInputIterator
      |                                                               ^~~~~~~~~~~~~~~~~~~~~
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160554
Approved by: https://github.com/ngimel, https://github.com/malfet, https://github.com/atalman
2025-08-13 23:15:53 +00:00
c5efc5c8a6 Fix unit test test_equivalent_template_code (#160432)
Summary:
Fix unit test test_equivalent_template_code

https://github.com/pytorch/pytorch/pull/159920 treats ReinterpretView as a not-realized node when searching FX origin nodes for a fused Triton kernel. In test_equivalent_template_code there is a transpose node (which is a ReinterpretView) before the matmul; it was not part of the FX graph segment before PR 159920. FX origin nodes are used to define the name of the Triton kernel, which is why test_equivalent_template_code failed with PR 159920: it used a hard-coded Triton kernel name to check the result. The fix is to update the Triton kernel name in the unit test.

Test Plan:
buck2 run mode/opt caffe2/test/inductor:benchmark_fusion -- caffe2.test.inductor.test_benchmark_fusion.BenchmarkMultiTemplateFusionCudaTest

Rollback Plan:

Differential Revision: D80101711

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160432
Approved by: https://github.com/clee2000
2025-08-13 23:14:51 +00:00
6da11d9aaf [C10D] Add check_rng_sync util (#160283)
Debugs RNG desync by checking the current state on each rank in the group and summarizing the differences if any are detected.

Notes:
- used allgather instead of gather since its simpler to do this SPMD rather than add conditional behavior, though I could be convinced we only want to log on rank0.

Usage:
`check_rng_sync(generator, group)`

Prints something like this:

(cuda):
```
[rank0]:E0808 ] Generator desync detected:
[rank0]:E0808 ] Ranks    (Seed, Offset) values
[rank0]:E0808 ] -------  -----------------------
[rank0]:E0808 ] 0        (456, 0)
[rank0]:E0808 ] 1        (123, 4)
[rank0]:E0808 ] 2-3      (123, 0)
```

(cpu):
```
[rank2]:E0810 ] Generator desync detected:
[rank2]:E0810 ] Ranks      Generator State Hash values
[rank2]:E0810 ] -------  -----------------------------
[rank2]:E0810 ] 0                  7633364531954955665
[rank2]:E0810 ] 1                  8807615394212033278
[rank2]:E0810 ] 2-3               -6150027303226666531
```
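A minimal sketch of the underlying idea, assuming an already-initialized process group (the real utility produces the per-rank summaries shown above):
```python
import torch
import torch.distributed as dist

def naive_check_rng_sync(generator: torch.Generator, group=None) -> None:
    # hash the full generator state and allgather it so every rank can compare
    state_hash = hash(bytes(generator.get_state().tolist()))
    gathered = [None] * dist.get_world_size(group)
    dist.all_gather_object(gathered, state_hash, group=group)
    if len(set(gathered)) > 1 and dist.get_rank(group) == 0:
        print("Generator desync detected:", gathered)
```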

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160283
Approved by: https://github.com/ezyang
2025-08-13 23:05:29 +00:00
182efe31db [inductor] add lowering for repeat_interleave.Tensor with output size specified (#147160) (#158462)
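For context, a small example of the variant this lowering targets (illustrative values; `output_size` must equal `repeats.sum()`):
```python
import torch

@torch.compile
def fn(x, repeats):
    # passing output_size avoids a host sync on repeats.sum() inside the compiled graph
    return torch.repeat_interleave(x, repeats, output_size=6)

x = torch.tensor([1, 2, 3])
repeats = torch.tensor([2, 1, 3])
print(fn(x, repeats))  # tensor([1, 1, 2, 3, 3, 3])
```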
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158462
Approved by: https://github.com/eellison
2025-08-13 22:54:18 +00:00
1ea688f9a2 [dynamo] fix EXTENDED_ARG starts_line dropping bug (#160478)
Fixes https://github.com/pytorch/pytorch/issues/160471

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160478
Approved by: https://github.com/Lucaskabela, https://github.com/billmguo
2025-08-13 22:27:40 +00:00
53e3949495 [MTIA-T][CFF] Pass backend parameter into GPU vertical pass file and pattern matcher (#160404)
Summary:
As titled
Please see https://fb.workplace.com/groups/1075192433118967/posts/1735215827116621/?comment_id=1735220747116129&reply_comment_id=1735242997113904

Basically, for MTIA, we want mtia_afg to show up in the counters and backend instead of Inductor, since MTIA is not using Inductor yet. The env var TORCHINDUCTOR_PATTERN_MATCH_BACKEND is used to pass in the actual backend.

The env var default value is "inductor", so nothing should break for GPU.

Test Plan:
Default is always "inductor", so existing test should not break.

CI tests

Rollback Plan:

Differential Revision: D80069072

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160404
Approved by: https://github.com/BoyuanFeng
2025-08-13 22:24:27 +00:00
33d9401866 Revert "[BE][Dynamo] Type improvements in _dynamo/utils to generics (#159824)"
This reverts commit 3ef2e1ef769582a82c6ddf150e9d11bf4bf1c44f.

Reverted https://github.com/pytorch/pytorch/pull/159824 on behalf of https://github.com/clee2000 due to I think this broke dynamo/test_trace_rules.py::TraceRuleTests::test_almost_impossible_missing_name [GH job link](https://github.com/pytorch/pytorch/actions/runs/16948305999/job/48035192324) [HUD commit link](3ef2e1ef76) ([comment](https://github.com/pytorch/pytorch/pull/159824#issuecomment-3186003531))
2025-08-13 22:17:29 +00:00
d1950d4bb5 Change IR node's stack trace to be computed lazily (#160487)
Summary: When an IR node is an inherited class, post_init is called once for each super().__init__() call. To avoid duplicated calls, we make stack trace computation happen lazily.

Test Plan:
CI

Rollback Plan:

Differential Revision: D80137870

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160487
Approved by: https://github.com/angelayi
2025-08-13 21:41:25 +00:00
1196bb1c2e Add utility to get computed kernel in torch.library (#158393)
Adds `OperatorEntry::getComputedKernelForDispatchKey` which returns the KernelFunction corresponding to `OperatorEntry.dispatchTable_[dispatch_ix]` for a given dispatch key
- Specifically it returns a `SafeKernelFunction` that holds a `KernelToken`. This `KernelToken` is registered to the `KernelFunction` in `OperatorEntry.kernels_` and will be invalidated when the `KernelFunction` is destructed (i.e. when the `AnnotatedKernel` that holds this `KernelFunction` is removed from `kernels_`, which happens when the corresponding impl is deregistered).
- `SafeKernelFunction` can be called via `callBoxed`, the validity of the token will be checked before this happens
- `SafeKernelFunction` is pybinded and `getComputedKernelForDispatchKey` is exposed to the frontend via `torch.library.get_kernel`

Related to https://github.com/pytorch/pytorch/issues/155330

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158393
Approved by: https://github.com/albanD
2025-08-13 21:00:59 +00:00
e9eb2096a5 [cutlass backend] Allow bmm use cases when batch stride is 0 (#160356)
Differential Revision: [D80035771](https://our.internmc.facebook.com/intern/diff/D80035771/)

The original change was meant to reduce the number of parameters we pass into the kernel, which was motivated by aesthetic reasons only.

But seeing the need to use different batch stride, we should just pass in the batch stride. That would be a good long term fix.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160356
Approved by: https://github.com/mlazos
2025-08-13 20:52:24 +00:00
3ef2e1ef76 [BE][Dynamo] Type improvements in _dynamo/utils to generics (#159824)
Follow up to #159580

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159824
Approved by: https://github.com/williamwen42
2025-08-13 20:17:01 +00:00
4cde0acc0e Make triton build ROCm library version-agnostic (#158408)
Fixes maintenance of triton packaging script when library versions change from one ROCm version to next.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158408
Approved by: https://github.com/jeffdaily

Co-authored-by: Ethan Wee <Ethan.Wee@amd.com>
2025-08-13 19:49:23 +00:00
70ccdec44b [ROCm] Improve reduction sum performance (#160466)
* Use input vectorization for reduction_on_fastest_striding_dimension when dim0 >= 128

**Reproducer:**
```
import time
import torch

shapes = [
    (5079670, 128)
]

dims = [
    (1)
]

for i, shape in enumerate(shapes):
    x = torch.randn(shape, device='cuda', dtype=torch.float)
    for _ in range(10):
        w = torch.sum(x, dims[i])
    torch.cuda.synchronize()
    print(w.size())

    start_time = time.time()
    for _ in range(50):
        _ = torch.sum(x, dims[i])
    torch.cuda.synchronize()
    end_time = time.time()
    mean_time = (end_time - start_time)/50
    print(f"Avg time for shape {shape}: {mean_time * 1e6:.2f} us")
```

**Before (MI300X):**
Avg time for shape (5079670, 128): 1629.99 us

**After (MI300X)**
Avg time for shape (5079670, 128): 1008.59 us

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160466
Approved by: https://github.com/petrex, https://github.com/jeffdaily
2025-08-13 18:46:58 +00:00
db0b7f1cc9 [BE][CI] Adjust error_inputs for cat and complex (#160378)
MPS backend does not support double, so errors should be different
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160378
Approved by: https://github.com/dcci
2025-08-13 18:35:06 +00:00
1c26c53851 Fix the Doc of pivot in torch.lu (#159617)
Fixes #159616
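For context, a small example of where the pivots come from:
```python
import torch

A = torch.randn(3, 3)
A_LU, pivots = torch.lu(A)              # pivots encode the row interchanges described in the docs
P, L, U = torch.lu_unpack(A_LU, pivots)
print(torch.allclose(P @ L @ U, A, atol=1e-6))  # True
```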

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159617
Approved by: https://github.com/lezcano, https://github.com/jansel
2025-08-13 18:30:54 +00:00
adcca7d9a1 Do not rpath CUDA stubs folder in JIT generated code (#160179)
`_transform_cuda_paths` intentionally includes the CUDA stubs folder.

However this path must not be added to the rpath as otherwise any CUDA command will fail at runtime with
> CUDA_ERROR_STUB_LIBRARY: "CUDA driver is a stub library"

This results in e.g. non-descriptive errors like
```
cutlass_library/source/tools/util/include/cutlass/util/device_memory.h:67  cutlass::device_memory::allocate: cudaMalloc failed: bytes=4096
terminate called after throwing an instance of 'cutlass::cuda_exception'
  what():  std::exception
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160179
Approved by: https://github.com/jansel
2025-08-13 18:29:24 +00:00
01584d2a7d [ROCm] remove extra transposes in NHWC convolutions on MIOpen (#160435)
remove aten::contiguous for NHWC convolutions on ROCm

Tests:
- nn/test_convolution.py::TestConvolutionNNDeviceTypeCUDA::test_conv_cudnn_nhwc_cuda_float32
- nn/test_convolution.py::TestConvolutionNNDeviceTypeCUDA::test_conv_cudnn_nhwc_cuda_float16

Before:
<img width="1255" height="228" alt="image"
src="https://github.com/user-attachments/assets/b125ccab-00c2-4d3a-a341-4583e51d8d57" />

After:
<img width="874" height="153" alt="image"
src="https://github.com/user-attachments/assets/ec200754-3622-488e-8762-bff1c2d22818" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160435
Approved by: https://github.com/jeffdaily
2025-08-13 17:58:22 +00:00
87e6c4079d Fix the Doc issue on the description of edge_order in torch.gradient() (#159130)
Fixes #159129
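For context, `edge_order` only affects the boundary estimates:
```python
import torch

x = torch.tensor([1.0, 2.0, 4.0, 7.0, 11.0])
(g1,) = torch.gradient(x, edge_order=1)  # first-order one-sided differences at the edges
(g2,) = torch.gradient(x, edge_order=2)  # second-order estimates at the edges
print(g1, g2)  # interior values match; only the first and last entries differ
```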

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159130
Approved by: https://github.com/soulitzer
2025-08-13 16:48:47 +00:00
7d87e358ac Fix MPS conv3d autocast bias dtype mismatch (#160423)
## Summary
- register conv3d with MPS autocast to ensure bias dtypes match under AMP
- add regression test chaining two Conv3d layers on MPS autocast
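A minimal sketch of that scenario (assuming an MPS-enabled build; shapes are illustrative):
```python
import torch
import torch.nn as nn

model = nn.Sequential(
    nn.Conv3d(3, 8, kernel_size=3, padding=1),
    nn.Conv3d(8, 8, kernel_size=3, padding=1),
).to("mps")
x = torch.randn(1, 3, 8, 8, 8, device="mps")

with torch.autocast(device_type="mps", dtype=torch.float16):
    out = model(x)  # previously failed with a bias dtype mismatch in the second conv
print(out.dtype)
```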

Written by Codex, see https://chatgpt.com/codex/tasks/task_e_689b64192df883278648935963d2776d

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160423
Approved by: https://github.com/dcci
2025-08-13 16:23:21 +00:00
6ee175195a [DCP][OSS] Rank local checkpointing in DCP without collectives (#147758)
Summary:
DCP metadata collectives become prohibitively expensive as the job scale grows. This PR introduces rank-local checkpointing, which saves and loads the checkpoint without any collective. The trade-off for now is the lack of dedupe and re-sharding; support for these will be introduced soon.

Differential Revision: D70112642

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147758
Approved by: https://github.com/meetv18
2025-08-13 16:20:28 +00:00
db32b60662 [ci] Add riscv opt-int build (#143979)
Hi, @malfet
Based on the previous discussion:

[RISCV CI support · Issue #141550 · pytorch/pytorch](https://github.com/pytorch/pytorch/issues/141550)

I have cross-compiled PyTorch for the RISC-V architecture on x86_64 Ubuntu 24.04 and created a new PR for it. Could you please help review it?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/143979
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-08-13 16:12:02 +00:00
56c828bef9 Followup of #160002, gracefully fail if Triton functions don't contain attributes (#160436)
Summary: Fixes internal test failures of D80037015

Test Plan:
CI

Rollback Plan:

Differential Revision: D80094187

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160436
Approved by: https://github.com/clee2000
2025-08-13 16:04:56 +00:00
a2fd106d67 guard cuMulticastUnbind call (#160499)
Fixes builds for old compilers

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160499
Approved by: https://github.com/Skylion007
2025-08-13 15:45:51 +00:00
c656334120 Revert "Factor out the strings to templates for better editor integration (#160357)"
This reverts commit cbffde774557752cf20447d42d99ec6102673c31.

Reverted https://github.com/pytorch/pytorch/pull/160357 on behalf of https://github.com/clee2000 due to broke a bunch of internal builds due to not being able to find the file  No such file or directory: torch/_inductor/kernel/flex/templates/flex_decode.py.jinja D80145761, might need a buck targets change? ([comment](https://github.com/pytorch/pytorch/pull/160357#issuecomment-3184435581))
2025-08-13 15:40:50 +00:00
31c9ac4319 [c10d] Fix test test_nccl_user_buffer_registration (#160497)
Fixed `test_nccl_user_buffer_registration`, which broke due to https://github.com/pytorch/pytorch/pull/160145; somehow CI didn't catch it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160497
Approved by: https://github.com/ngimel
2025-08-13 15:29:41 +00:00
deea71a90e [ez][CI] Set timeout for linux-jammy-py3_13-clang12-test from 600min -> default val of 240 (#160500)
10 hours is very long
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160500
Approved by: https://github.com/huydhn
2025-08-13 15:14:24 +00:00
114a6c4043 Add placeholder for the User Guide (#159379)
- Add pytorch_overview.md
- Add pytorch_main_components.md
- Reorganize top nav to have Get Started, User Guide, Reference API, Community, Tutorials
- Move notes under user guide

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159379
Approved by: https://github.com/albanD

Co-authored-by: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-08-13 14:56:04 +00:00
ee1b0412b9 [1/N]Port 3 distributed/_tools test cases to Intel GPU (#159543)
For [#114850](https://github.com/pytorch/pytorch/issues/114850), we will port distributed tests to Intel GPU.

We could enable Intel GPU with following methods and try the best to keep the original code styles:

1. use "torch.accelerator.current_accelerator()" to determine the accelerator backend
2. enabled XPU for some test path
3. skip some test cases which Intel GPU does not support

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159543
Approved by: https://github.com/guangyey, https://github.com/d4l3k

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-08-13 12:49:01 +00:00
42e51cd4b3 Support ddp zero hook XCCL path (#159240)
The XCCL backend does not have the issue described in https://github.com/pytorch/pytorch/issues/62300, so add the XCCL path here.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159240
Approved by: https://github.com/guangyey, https://github.com/Skylion007, https://github.com/EikanWang
2025-08-13 12:37:33 +00:00
96bd33b2de Fix get_free_symbol_uses for several nodes (#160314)
get_free_symbol_uses is used to know which unbacked symbols are used by a given node.
Not having get_free_symbol_uses defined correctly leads to:

- elimination of some nodes because no users are detected (see the added unit test)
- incorrect topological sort

Fix get_free_symbol_uses for NopKernel, ConcatKernel, InputsKernel, and external kernels.
ComputedBuffer with NonOwningLayout is an interesting case: when the layout is NonOwningLayout,
we need to access the layout of the underlying view op's base and detect the symbols in it,
because those symbols are used when we codegen the ComputedBuffer.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160314
Approved by: https://github.com/eellison
2025-08-13 12:28:29 +00:00
ecde76c764 [Hierarchical Compile] Sort all regions identically (#158814)
Before, we would topologically sort each region individually. This works well except when some nodes have no arguments, in which case their order may change. To rectify this, we treat the first region as the reference region and use its sort order to sort the remaining regions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158814
Approved by: https://github.com/williamwen42
2025-08-13 11:55:23 +00:00
34ec5ed275 [Dynamo][Hierarchical Compile] Allow parameters to be propagated to submodules (#157979)
Fixes issue with HF Gen AI models where we mark a param as static and a get_attr node gets put in the region.

The effect of this is lifting get_attr nodes to be inputs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157979
Approved by: https://github.com/williamwen42
2025-08-13 09:12:10 +00:00
641ee74781 Revert "Add label_smoothing param in nn.BCELoss and nn.BCEWithLogitsLoss (#150282)"
This reverts commit f990490a23815ea6ee27e487c70ba2cf513ba43d.

Reverted https://github.com/pytorch/pytorch/pull/150282 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/150282#issuecomment-3182844949))
2025-08-13 09:01:52 +00:00
6e8865fbc1 port 3 distributed test to Intel GPU and unified some common functions (#158533)
For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU.
We enable Intel GPU with the following methods while trying our best to keep the original code style:

- instantiate_device_type_tests()
- use "torch.accelerator.current_accelerator()" to determine the accelerator backend
- enabled XPU for some test path
- Unify some common code under torch/testing/_internal for multiple backend, for example:
  - requires_nccl_version
  - _dynamo_dist_per_rank_init
  - DynamoDistributedSingleProcTestCase
  - DistTestCases
  - FSDPTestMultiThread

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158533
Approved by: https://github.com/guangyey, https://github.com/d4l3k

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-08-13 08:13:23 +00:00
9a06e6d031 [claude-code] Add top-level module doc for torch/distributed/tensor/_op_schema.py (#157804)
Not sure how good the description is, seeking insight from maintainers.

Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157804
Approved by: https://github.com/wanchaol
2025-08-13 07:27:11 +00:00
6ea8376f84 Enable XPU for test_autograd_function.py (#160309)
# Description
Fixes #114850; we will port dynamo tests to Intel GPU.
We enable Intel GPU with the following methods while trying our best to keep the original code style:

# Changes
1. Get device type from get_devtype() method.
2. Replace the requires_cuda_and_triton with requires_gpu.
3. Add HAS_XPU_AND_TRITON into the scope.

# Notify

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160309
Approved by: https://github.com/guangyey, https://github.com/ezyang
2025-08-13 06:38:34 +00:00
8eee08d227 Replace TORCH_INTERNAL_ASSERT with TORCH_CHECK (#160411)
As the title stated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160411
Approved by: https://github.com/ezyang
2025-08-13 06:31:10 +00:00
e497620260 Add compile_id: Optional[CompileID] to torch._logging._internal.trace_structured_artifact (#160440)
Context:
When writing a custom `torch.compile` backend, I quite frequently (ab)use `trace_structured_artifact` because I'm too lazy to customize tlparse (ref: 6d8b13c867).

I recently noticed that some of the artifacts I want to store are generated where a CompileID cannot be correlated, and the `tlparse` HTML says:
> Sometimes, logs are made without a compile id. This makes it difficult to correlate related logs. This stack trie shows all places where log entries occurred without compile context; to fix, look an appropriate place in the stack where compile id should have been specified.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160440
Approved by: https://github.com/ezyang
2025-08-13 06:28:23 +00:00
199e9abb6a [fx] fix split_module with symint (#160093)
Fixes https://github.com/pytorch/pytorch/issues/155220

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160093
Approved by: https://github.com/ezyang
2025-08-13 05:50:15 +00:00
685f15dbea [vllm hash update] update the pinned vllm hash (#160484)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160484
Approved by: https://github.com/pytorchbot
2025-08-13 04:54:03 +00:00
85db508af5 Wrap class definitions in set_fullgraph(False) in test_int/bool/float/complex (#160276)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160276
Approved by: https://github.com/zou3519
ghstack dependencies: #160216, #160217
2025-08-13 04:53:03 +00:00
27156ec804 Wrap class definitions in set_fullgraph(False) in test_operator (#160217)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160217
Approved by: https://github.com/zou3519
ghstack dependencies: #160216
2025-08-13 04:53:03 +00:00
6746bc59df Wrap class definitions in set_fullgraph(False) in test_set (#160216)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160216
Approved by: https://github.com/zou3519
2025-08-13 04:53:03 +00:00
3008d985a8 [CD] Do not build pytorch with nvshmem on ARM (#160465)
As the nvshmem binary from 3.3.9 is not compatible with manylinux2_28, and 3.3.20 is not available for download yet.
Also, package the nvshmem binary into the full wheel.

Fixes https://github.com/pytorch/pytorch/issues/160425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160465
Approved by: https://github.com/atalman, https://github.com/huydhn
2025-08-13 04:10:43 +00:00
652a6f5954 Revert "[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#160403)"
This reverts commit 5a9c4cfce42b9eb87da0de40c5633f083115c307.

Reverted https://github.com/pytorch/pytorch/pull/160403 on behalf of https://github.com/malfet due to It indeed consistently broken inductor, see 118bc97b14/1 ([comment](https://github.com/pytorch/pytorch/pull/160403#issuecomment-3182101130))
2025-08-13 04:05:46 +00:00
118bc97b14 Write full tensors out at once in HF consolidation script (#159394)
Not all storage systems support writing at random offsets. This PR changes the consolidation script to write each tensor into a buffer and then write that buffer out, going through every tensor in the output file sequentially. This also helps when the sharded files weren't sharded only along the row-wise dimension: small writes are expensive, and we previously issued one write per chunk, where a chunk was the largest run of contiguous bytes in the final tensor, which can be very few bytes for column-wise sharding. Now the full tensor is assembled before the write, so far fewer small writes are issued; see the sketch below.
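
A minimal sketch of the buffering idea; the helper and file names are hypothetical, not the script's real API:

```python
def write_tensor_sequentially(out_file, shard_chunks, full_nbytes):
    # Assemble the full output tensor from its serialized shards in memory,
    # then issue one sequential write instead of many small random-offset writes.
    buf = bytearray(full_nbytes)
    for offset, data in shard_chunks:  # (byte offset in the final tensor, bytes)
        buf[offset:offset + len(data)] = data
    out_file.write(bytes(buf))  # one write per output tensor

with open("/tmp/consolidated.bin", "wb") as f:
    write_tensor_sequentially(f, [(0, b"\x00" * 8), (8, b"\x01" * 8)], 16)
```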

Differential Revision: [D78684452](https://our.internmc.facebook.com/intern/diff/D78684452/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159394
Approved by: https://github.com/saumishr
ghstack dependencies: #159392, #159393
2025-08-13 03:51:16 +00:00
305fa22393 [GHF] Remove app { name databaseId} query (#160494)
From the `PRCheckSuites` fragment, as it causes a security exception when used with the new GITHUB_TOKEN, which looks as follows
```
RuntimeError: GraphQL query
fragment PRReviews on PullRequestReviewConnection {
  nodes {
    author {
      login
    }
    bodyText
    createdAt
    authorAssociation
    editor {
      login
    }
    databaseId
    url
    state
  }
  pageInfo {
    startCursor
    hasPreviousPage
  }
}

fragment PRCheckSuites on CheckSuiteConnection {
  edges {
    node {
      app {
        name
        databaseId
      }
      workflowRun {
        workflow {
          name
          databaseId
        }
        databaseId
        url
      }
      checkRuns(first: 50) {
        nodes {
          name
          conclusion
          detailsUrl
          databaseId
          title
          summary
        }
        pageInfo {
          endCursor
          hasNextPage
        }
      }
      conclusion
    }
    cursor
  }
  pageInfo {
    hasNextPage
  }
}

fragment CommitAuthors on PullRequestCommitConnection {
  nodes {
    commit {
      authors(first: 2) {
        nodes {
          user {
            login
          }
          email
          name
        }
      }
      oid
    }
  }
  pageInfo {
    endCursor
    hasNextPage
  }
}

query ($owner: String!, $name: String!, $number: Int!) {
  repository(owner: $owner, name: $name) {
    pullRequest(number: $number) {
      closed
      isCrossRepository
      author {
        login
      }
      title
      body
      headRefName
      headRepository {
        nameWithOwner
      }
      baseRefName
      baseRefOid
      baseRepository {
        nameWithOwner
        isPrivate
        defaultBranchRef {
          name
        }
      }
      mergeCommit {
        oid
      }
      commits_with_authors: commits(first: 100) {
        ...CommitAuthors
        totalCount
      }
      commits(last: 1) {
        nodes {
          commit {
            checkSuites(first: 10) {
              ...PRCheckSuites
            }
            status {
              contexts {
                context
                state
                targetUrl
              }
            }
            oid
          }
        }
      }
      changedFiles
      files(first: 100) {
        nodes {
          path
        }
        pageInfo {
          endCursor
          hasNextPage
        }
      }
      reviews(last: 100) {
        ...PRReviews
      }
      comments(last: 5) {
        nodes {
          bodyText
          createdAt
          author {
            login
          }
          authorAssociation
          editor {
            login
          }
          databaseId
          url
        }
        pageInfo {
          startCursor
          hasPreviousPage
        }
      }
      labels(first: 100) {
        edges {
          node {
            name
          }
        }
      }
    }
  }
}
, args {'name': 'pytorch', 'owner': 'pytorch', 'number': 159820} failed: [{'type': 'FORBIDDEN', 'path': ['repository', 'pullRequest', 'commits', 'nodes', 0, 'commit', 'checkSuites', 'edges', 4, 'node', 'app'], 'extensions': {'saml_failure': False}, 'locations': [{'line': 26, 'column': 7}], 'message': 'Resource not accessible by integration'}]
```
But the same query works fine if executed using one's Personal Access Token

Updated mocks file by running
```
sed -i -e s/a32a7ca3a2f6e2c9de07aef821b0111539758b4ac254f8a3432af32314f94876/8e262b0495bd934d39dda198d4c09144311c5ddd6cca6a227194bd48dbfe7201/ gql_mocks.json
sed -i -e s/157add81c519f614388f3a67e287bdf4fbb1791e6d0bffe312e169d02ac2813f/28349cb4c891bbf85255fab2c33c770baf77c3e02b29ca9a0e4c6c97bed041db/ gql_mocks.json
sed '/"app": {/,+3d' gql_mocks-orig.json >gql_mocks.json
sed '/"app": null/d' gql_mocks-orig.json >gql_mocks.json
```

Undisable offending jobs

Fixes https://github.com/pytorch/pytorch/issues/159894
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160494
Approved by: https://github.com/huydhn
ghstack dependencies: #160490, #160492
2025-08-13 03:46:39 +00:00
1151b40cbf [BE] Filter unused mocks (#160492)
Somebody checked in twice as many mocks as needed into the archive

Filter them out by running the following script
```python
import json
with open("gql_mocks-orig.json") as f:
    mocks = json.load(f)

keys = list(mocks.keys())
good_shas = {'a32a7ca3a2f6e2c9de07aef821b0111539758b4ac254f8a3432af32314f94876',
             '157add81c519f614388f3a67e287bdf4fbb1791e6d0bffe312e169d02ac2813f',
             '4715ed05b382e572135c049664939f22f9b1249bc0c499ae278d655ad8cb598b',
             'a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5',
             'e5130469b5373479776bfbccade8039ce4741b97873bb3bec4e279fed08602be',
             '5dc32efeb8306f03744f6804ef4b500882f2759f7ac17fdc9f123669bfe4805a',
             '0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98',
             '8b50878b010492fe64005cc4b4ed34ac5f6695ce093f06b0d8d5403b7787c2c0',
             '2877b3b1e8630ca4ae797b9d85d5673d25ca8488c01141e11ff55f4a1359fca7'}
for k in keys:
    if any(sha in k for sha in good_shas):
        continue
    del mocks[k]

with open("gql_mocks.json","w") as f:
    json.dump(mocks, f, indent=2)
    f.write("\n")
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160492
Approved by: https://github.com/huydhn
ghstack dependencies: #160490
2025-08-13 03:46:39 +00:00
d0f9785af3 [CI] Prevent accidental gql_mocks updates by test_trymerge (#160490)
As they can no longer be fetched from GitHub, see https://github.com/pytorch/pytorch/issues/160489
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160490
Approved by: https://github.com/huydhn
2025-08-13 03:46:32 +00:00
ba47821f52 [ROCm] Set thread_work_size to 16 for vectorized elementwise kernels for MI300X (#160444)
* A thread_work_size of 16 gives better perf for many workloads on MI300X

cherry-pick of fb81400d34

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160444
Approved by: https://github.com/jeffdaily
2025-08-13 03:41:25 +00:00
2c5e10a5fc Add new function consolidate_safetensors_files_on_every_rank for HF consolidation (#159393)
Currently we are only using rank-0 for HF consolidation. But we should be able to use every rank to consolidate the sharded files, which will speed up the consolidation by Nx (where N is the number of ranks). Adding a new method consolidate_safetensors_files_on_every_rank to do this.

Differential Revision: [D79000720](https://our.internmc.facebook.com/intern/diff/D79000720/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159393
Approved by: https://github.com/saumishr
ghstack dependencies: #159392
2025-08-13 03:31:36 +00:00
355462e127 Add stable Tensor get_device_index, use more stable DeviceIndex (#160143)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160143
Approved by: https://github.com/mikaylagawarecki
2025-08-13 03:27:10 +00:00
41673110cd [inductor] Windows inductor use intel-openmp. (#160258)
After some debugging, I found that PyTorch's torch_cpu.dll uses intel-openmp, not MSVC OpenMP.
So, switch the Windows inductor build to intel-openmp.

It fixed: c8205cb354/test/inductor/test_aot_inductor.py (L2405-L2408)
<img width="896" height="230" alt="image" src="https://github.com/user-attachments/assets/273b00f8-7dc1-43c9-9b7f-752e16355a80" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160258
Approved by: https://github.com/ezyang
2025-08-13 02:36:19 +00:00
6be6d06295 Avoid potential deadlocks in host allocator (#159352)
# Motivation
This PR fixes a potential deadlock in the host allocator.
When calling `event->record(stream)`, the `record_stream` implementation may acquire the Python GIL.
In places such as 842cc77ab9/aten/src/ATen/cuda/CachingHostAllocator.cpp (L145-L151), and 842cc77ab9/aten/src/ATen/xpu/CachingHostAllocator.cpp (L22-L28) `record_stream` is invoked while holding the allocator lock.

To prevent deadlocks, we must ensure the locking order is:
**GIL → Allocator Lock**.
Reversing the order (**Allocator Lock → GIL**) can cause a deadlock.
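
A minimal Python sketch of the ordering invariant, with two ordinary locks standing in for the GIL and the allocator lock; this only illustrates the rule, it is not the C++ fix:

```python
import threading

gil = threading.Lock()             # stands in for the Python GIL
allocator_lock = threading.Lock()  # stands in for the host-allocator lock

def record_event(record_stream):
    # Always acquire GIL -> allocator lock. If another path took the
    # allocator lock first and then tried to grab the GIL, each thread
    # could hold one lock while waiting forever on the other.
    with gil:
        with allocator_lock:
            record_stream()

record_event(lambda: print("event recorded"))
```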
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159352
Approved by: https://github.com/cyyever, https://github.com/ezyang
2025-08-13 02:30:17 +00:00
f15ada5c6f Enable output padding when only outermost dim is dynamic (#159404)
Summary: When the shape of the output tensor has a dynamic outermost dim, the stride can still be padded to conform to the configured alignment if required.

Test Plan:
CI

Rollback Plan:

Differential Revision: D79146886

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159404
Approved by: https://github.com/blaine-rister, https://github.com/eellison
2025-08-13 01:28:22 +00:00
69a0a9aa7f [Inductor][Triton] Pass GPUTarget param to updated make_ir function (#160422)
Summary: A recent Triton commit changed `ASTSource.make_ir` to a 5-arg signature that includes a `GPUTarget`. We need to pass in this new argument.

Test Plan:
`buck2 test 'fbcode//mode/opt' -m ovr_config//triton:trunk  fbcode//caffe2/test/inductor:test_inductor_cuda -- triton_kernel`

Rollback Plan:

Reviewed By: davidberard98

Differential Revision: D80069909

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160422
Approved by: https://github.com/davidberard98, https://github.com/mlazos
2025-08-13 01:27:57 +00:00
32099961d5 [EZ] Delete CircleCI case (#160479)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160479
Approved by: https://github.com/izaitsevfb
ghstack dependencies: #160477
2025-08-13 01:19:09 +00:00
8d1cf52922 [EZ][BE] Remove unused conda-env-macOS-ARM64 (#160477)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160477
Approved by: https://github.com/atalman
2025-08-12 23:41:25 +00:00
b1f43548ca [c10d] Error out the case when registering symmetric memory without eager init (#160145)
Instead of implicitly creating the nccl comm inside mem pool registration for symmetric memory, we decided to error out so that we only support the eager-init case, where the nccl comm is already initialized.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160145
Approved by: https://github.com/kwen2501
2025-08-12 23:25:04 +00:00
0d71ca2c46 [EZ] Replace pytorch-labs with meta-pytorch (#160459)
This PR replaces all instances of 'pytorch-labs' with 'meta-pytorch' in this repository now that the 'pytorch-labs' org has been renamed to 'meta-pytorch'

## Changes Made
- Replaced all occurrences of 'pytorch-labs' with 'meta-pytorch'
- Only modified files with extensions: .py, .md, .sh, .rst, .cpp, .h, .txt, .yml
- Skipped binary files and files larger than 1MB due to GitHub API payload limits in the script that covers all repos in this org. Will do a more manual second pass later to cover any larger files

## Files Modified
This PR updates files that contained the target text.

Generated by automated script on 2025-08-12T20:41:29.888681+00:00Z
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160459
Approved by: https://github.com/huydhn, https://github.com/clee2000, https://github.com/atalman, https://github.com/malfet
2025-08-12 22:44:25 +00:00
5737372862 [CI] Switch ROCm MI300 GitHub Actions workflows from 2-GPU to 1-GPU runners (#158882)
Updated .github/actionlint.yaml to replace linux.rocm.gpu.mi300.2 with linux.rocm.gpu.mi300.1 in the supported runner list

Modified all affected workflows (inductor-perf-test-nightly-rocm.yml, inductor-periodic.yml, inductor-rocm-mi300.yml, and rocm-mi300.yml) to run jobs on 1-GPU MI300 runners instead of 2-GPU runners

This should help increase available runners even with the same number of CI nodes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158882
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-08-12 22:42:40 +00:00
2e4e5ab4be [MPS] Add mps keys to indices and values ops (#160223)
Enable indices and values ops on sparse MPS

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160223
Approved by: https://github.com/malfet
2025-08-12 22:08:44 +00:00
16d15445f8 Fullgraph graph capture with dynamo. (#159749)
Summary:
Following up on Avik's doc https://docs.google.com/document/d/11RW0Bbkp1QwFbEu8rCNW5d7wUFaEkxbL0uLyqcc2jTk/edit?tab=t.0

We are experimenting with a new API which utilizes torch.compile(fullgraph=True) and intend to use it to replace the old dynamo.export() API.

This PR adds a prototype for the API described in the doc.

Test Plan:
test_misc -- -k test_aot_capture

Rollback Plan:

Differential Revision: D79534608

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159749
Approved by: https://github.com/tugsbayasgalan
2025-08-12 22:06:18 +00:00
101276f81b [BE] Save attributes for CppCompileError for pickleing (#160294)
Differential Revision: [D79977408](https://our.internmc.facebook.com/intern/diff/D79977408/)

Context:
When testing cutlass backend and used autotune with subproc, sometimes I would see C++ compilation error (expected) followed by
```
Traceback (most recent call last):
  File "/torch/_inductor/autotune_process.py", line 175, in get
    result = TuningProcess.recv(self.read_pipe)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/torch/_inductor/autotune_process.py", line 99, in recv
    return pickle.load(read_pipe)
           ^^^^^^^^^^^^^^^^^^^^^^
TypeError: CppCompileError.__init__() missing 1 required positional argument: 'output'
```
which is unexpected. After asking claude, it seems

> Now I can see the issue. The `CppCompileError` class requires two arguments: `cmd` (a list of strings) and `output` (a string). However, when exceptions are being pickled and unpickled across process boundaries, the pickling process might not be preserving the constructor arguments correctly.
>
> The problem is likely that when a `CppCompileError` is raised in the subprocess and then pickled/unpickled through the `recv` function, the unpickling process is trying to reconstruct the exception but doesn't have the required constructor arguments.
>
> The issue is clear now. The `CppCompileError` class doesn't have custom pickle methods (`__reduce__`, `__getstate__`, `__setstate__`), so when it's pickled and unpickled across process boundaries, Python's default pickling mechanism tries to reconstruct it but fails because it doesn't preserve the constructor arguments properly.
>
> The solution is to add a `__reduce__` method to the `CppCompileError` class to ensure it can be properly pickled and unpickled. Let me implement this fix:

Adding these seems to help.
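
A minimal sketch of the fix, with a hypothetical exception class standing in for `CppCompileError`:

```python
import pickle

class DemoCompileError(Exception):
    def __init__(self, cmd, output):
        super().__init__(f"compile failed: {output}")
        self.cmd = cmd
        self.output = output

    def __reduce__(self):
        # Default exception pickling replays __init__ with only self.args
        # (the message), dropping the required 'output' argument. Returning
        # (callable, args) preserves both constructor arguments.
        return (type(self), (self.cmd, self.output))

err = pickle.loads(pickle.dumps(DemoCompileError(["clang++", "a.cpp"], "no input files")))
assert err.output == "no input files"
```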

fbcode repro: [D79977541](https://www.internalfb.com/diff/D79977541)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160294
Approved by: https://github.com/masnesral
2025-08-12 22:03:36 +00:00
cbffde7745 Factor out the strings to templates for better editor integration (#160357)
# Summary

More code motion; tl;dr: install 'Better Jinja' in VS Code and you now get syntax highlighting

Before
<img width="776" height="926" alt="Screenshot 2025-08-11 at 2 41 08 PM" src="https://github.com/user-attachments/assets/10868b31-f8ac-4cf5-99fe-19b8789ce06b" />

After:
<img width="1184" height="1299" alt="Screenshot 2025-08-11 at 2 40 27 PM" src="https://github.com/user-attachments/assets/45203765-589e-4d76-8196-d895a2f2fbf6" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160357
Approved by: https://github.com/eellison
2025-08-12 21:59:54 +00:00
78a2fe1d42 [TorchScript] thread-safe ErrorReport::CallStack (#160386)
Context: During jit.script, the TorchScript frontend maintains a callstack of Python frames, which is used to present the corresponding user code in case TorchScript errors. The callstack is maintained via ErrorReport::CallStack RAII guards. Before recursing into a function, an ErrorReport::CallStack guard is created and the CallStack guard pushes the frame information onto a thread_local callstack (a list of calls); and after exiting, the frame information is popped off the callstack. Note that the CallStack guards are also sometimes used in python via pybindings.

The problem is that sometimes another thread can obtain a reference to the CallStack guard (if it's a Python CallStack guard). **This means that the destructor for a CallStack guard can be called from a different thread than the constructor was called**. When this happens, it causes a segfault.

This PR makes the callstack vector thread-safe to access, and each CallStack guard will store a reference to the callstack vector onto which it pushed. When the CallStack guard is destructed, it pops off the appropriate callstack vector. Although this could potentially lead to mangled callstacks, it should prevent segfaults.
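
A Python analog of the idea, a sketch only (the real guard is a C++ RAII class): each guard keeps a reference to the exact callstack list it pushed onto and pops from that same list under a lock, even if it is torn down on a different thread.

```python
import threading

class CallStackGuard:
    _lock = threading.Lock()

    def __init__(self, callstack, frame):
        self._callstack = callstack  # remember the list we pushed onto
        self._frame = frame

    def __enter__(self):
        with self._lock:
            self._callstack.append(self._frame)
        return self

    def __exit__(self, *exc):
        with self._lock:  # safe even when run from another thread
            self._callstack.remove(self._frame)

stack = []
with CallStackGuard(stack, "MyModule.forward"):
    assert stack == ["MyModule.forward"]
assert stack == []
```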

Added a test `test_thread_safe_error_stacks` which segfaults prior to these changes, and no longer segfaults.

Differential Revision: [D80054972](https://our.internmc.facebook.com/intern/diff/D80054972)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160386
Approved by: https://github.com/eellison
2025-08-12 21:59:04 +00:00
f8f0414a59 fix cpp builder to avoid missing-source compile error (#160354)
Summary:
the condition
```
if config.is_fbcode() and (not self._aot_mode or self._use_relative_path):
    sources = [os.path.basename(i) for i in sources]
```
unintentionally (?) stripped paths even when use_relative_path was False (as long as aot_mode was False), breaking local tests that rely on absolute temp-file paths.

Fixes internal issue:
```

FAILED (errors=1)

CppCompileError: C++ compile error

Command:
/mnt/gvfs/third-party2/llvm-fb/0f1f083aa5508772f3db24bf4f697bc118ba0958/17/platform010/72a2ff8/bin/clang-17 czyi3nhzin5b3mc3376vmfnlbjobvjcghbvv4tatuazs3syqubay.cpp -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -Werror=ignored-optimization-argument -g -o /re_tmp/tmpsp58ya2h/zy/test_symbol.so

Output:
clang-17: error: no such file or directory: 'czyi3nhzin5b3mc3376vmfnlbjobvjcghbvv4tatuazs3syqubay.cpp'
clang-17: error: no input files
```

Reviewed By: clee2000

Differential Revision: D80025417

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160354
Approved by: https://github.com/benjaminglass1, https://github.com/clee2000
2025-08-12 21:36:22 +00:00
4d419a7461 Add pad and narrow to torch/csrc/stable/ops.h (#159328)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159328
Approved by: https://github.com/janeyx99
ghstack dependencies: #159507
2025-08-12 21:29:49 +00:00
655137b678 Update torch::stable::Tensor() default constructor (#159507)
Allows things like

```cpp
Tensor cu_seqlens_q;
if (...) {
   cu_seqlens_q = ...
}
...
```

Also adds `torch::stable::Tensor.defined()`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159507
Approved by: https://github.com/janeyx99
2025-08-12 21:29:49 +00:00
f27232a213 [ROCm] Limit number of values per thread for reductions on three dimensions (#159652)
In the current implementation of reductions in three dimensions for AMD GPUs, the number of values per thread is unbounded and can end up in the hundreds of thousands for certain tensors, which is of course bad for performance. This patch fixes the issue by increasing the parallelism and thus lowering the number of values per thread to reasonable limits, i.e. fewer than 2048 values per thread. The performance gains can be between 10x-17x for certain examples where the number of values per thread was originally very high.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159652
Approved by: https://github.com/jeffdaily
2025-08-12 21:15:56 +00:00
c24ca7f4bf [FSDP][Collectives] skipping allgather when world size is 1 (#160135)
**Summary:** In its current state, FSDP collectives use cuda synchronizations and communication ops regardless of the world size. However, now that replicate will use FSDP, there will be instances where the group size is 1 and these synchronizations and ops are needless. I have updated the fsdp_params group to skip the foreach_all_gather and foreach_all_gather_copy_out APIs when world_size == 1. I have created a test that uses CommDebugMode to verify that the all-gather comm has been removed. I also edited an affected test which used 1-way FSDP by verifying and adjusting its assert statements for CommDebugMode. Below, I have included the link to the profile trace verifying these two APIs were skipped, plus two test commands.

https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/perfetto_internal_traces/tree/shared_trace/anshulsi_f846ac3b-9467-4060-8e36-8cc3bc4449c3_devgpu263.prn2.facebook.com_652183.1753822140871934814.pt.trace.json

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160135
Approved by: https://github.com/weifengpy
2025-08-12 21:13:29 +00:00
b4596895b9 [DTensor] Registers sharding rule for rms_norm (#159692)
Reduces collective calls in the forward pass from 2 to 1

In #158716 I added the sharding rule for the backward pass but didn't add the forward pass as it didn't get dispatched. After #159324 this should get properly dispatched hence I am adding it now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159692
Approved by: https://github.com/tianyu-l
2025-08-12 21:05:24 +00:00
5a9c4cfce4 [Fix XPU CI][Inductor UT] Fix test cases broken by community. (#160403)
Fixes #160243, Fixes #160244, Fixes #160245

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160403
Approved by: https://github.com/janeyx99
2025-08-12 21:02:44 +00:00
a354fa91e2 added class or module info for functions blocked by weight-only load (#159935)
Fixes #152985
In #152985, users were confused about why a weights-only load failed even though functions were registered in safe_globals.
Because the error message didn't make the critical failure reason clear, they couldn't figure out that only some functions were missing from the safe_globals registration.
This fix makes that point clearer.

Here's the new error message; the blocked-function information follows the warning message, separated by a line break to make it stand out.
```
_pickle.UnpicklingError: Weights only load failed. In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
Please file an issue with the following so that we can make `weights_only=True` compatible with your use case: WeightsUnpickler error:

Trying to call reduce for unrecognized function <built-in method _unpickle of type object at 0x641e8a57d1f0> which belongs to <class 'zoneinfo.ZoneInfo'>

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

To execute this test, run the following from the base repo dir:
    python test/test_serialization.py TestSerialization.test_weights_only_with_safe_zoneinfo_unpickle_registration_success

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0

```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159935
Approved by: https://github.com/mikaylagawarecki
2025-08-12 20:52:25 +00:00
f95b58c284 Remove usage of fsspec in HF consolidation script (#159392)
Moving towards supporting only local storage to take advantage of HF APIs such as safe_open. This was already done in the Storage component in https://github.com/pytorch/pytorch/pull/159405. This PR removes fsspec usages in the consolidation script and relies on local storage only.

Differential Revision: [D78997975](https://our.internmc.facebook.com/intern/diff/D78997975/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159392
Approved by: https://github.com/sibuachu
2025-08-12 20:41:06 +00:00
8e6a313858 Add ownership token when needed on GradientEdge (#160098)
We can avoid the token by introducing PyObject preservation for THPFunction. But I think it will be too much complexity given that this kind of issue is very rare.
Happy to be talked into doing it though if someone really wants to.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160098
Approved by: https://github.com/ezyang, https://github.com/soulitzer
2025-08-12 20:14:18 +00:00
7e91394955 Support NUMA Binding for Callable Entrypoints (#160163)
# Context
This is an extension of #149334.

# This PR
Add support for NUMA bindings with Callable entrypoints, such as `do_train` instead of `/usr/local/bin/python`.

Most notably, we utilize a hack in order to force `Process.start()` to use custom NUMA bindings for each subprocess. Please search for `HACK:` in the code to see a description of the implementation we chose, and #160006 for discussion of alternatives and why this is necessary.

Other changes:
* Remove unnecessary `--preferred` option from all binding strategies. By default, Linux already allocates memory to the NUMA node local to the CPU which triggered the allocation. (See [MPOL_LOCAL](https://man7.org/linux/man-pages/man2/set_mempolicy.2.html).)
* Refactor so that the main API is `maybe_wrap_command_with_numa_bindings`, which computes bindings for a single rank at a time, rather than `maybe_wrap_with_numa_bindings` which computed bindings for all ranks at once. This allowed for more code sharing between `Callable` and `str` entrypoints.

# Test Plan
## Automated
`$ pytest test/test_numa_binding.py`

## Manual
Using [this benchmark](https://gist.github.com/pdesupinski/bbe01ade455d86e989794f2c612e2d91), ran

```
$ PYTHONUNBUFFERED=1 LOGLEVEL=INFO perf stat -e ls_dmnd_fills_from_sys.dram_io_far,ls_dmnd_fills_from_sys.dram_io_near -- python -m torch.distributed.run --standalone --nproc-per-node=8 --numa-binding=node --run-path mlp_train.py 2>&1 | tee node_callable.txt && PYTHONUNBUFFERED=1 LOGLEVEL=INFO perf stat -e ls_dmnd_fills_from_sys.dram_io_far,ls_dmnd_fills_from_sys.dram_io_near -- python -u -m torch.distributed.run --standalone --nproc-per-node=8 --run-path mlp_train.py 2>&1 | tee none_callable.txt
```

and observed
* 6.6% remote memory accesses with 'node' bindings
* 11.6% remote without bindings

I also ran similar with `str` entrypoints as before just to be sure it's still working.

NOTE: [--run-path triggers the code to be run inside a `Callable`.](017259f9c6/torch/distributed/run.py (L870))

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160163
Approved by: https://github.com/d4l3k
2025-08-12 20:08:49 +00:00
89654db1ab [inductor] fix triton bucketize mask propagation (#159961)
See 6b414f56a4

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159961
Approved by: https://github.com/eellison
2025-08-12 19:59:32 +00:00
2d0cdee394 move thread-local capture mode guard to include work.isStarted (#160398)
Per title, should fix capture errors that happen because nccl watchdog races with capture start.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160398
Approved by: https://github.com/aorenste
2025-08-12 19:25:04 +00:00
9903ca4f70 [cuDNN][64-bit indexing] update conv depthwise 64bit indexing dispatch condition to match native kernel (#156140)
The native kernel doesn't support batch splitting so the previous check wasn't aggressive enough in dispatching to cuDNN

https://github.com/pytorch/pytorch/issues/155225

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156140
Approved by: https://github.com/ngimel, https://github.com/atalman
2025-08-12 18:07:41 +00:00
f341077ce4 Revert "[ROCm] Support large inputs for coalesceValuesKernel (#158281)"
This reverts commit a7abf57aabec0ce686092e2d66e53ba185dbc56b.

Reverted https://github.com/pytorch/pytorch/pull/158281 on behalf of https://github.com/clee2000 due to broke windows cuda build? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16915172288/job/47927141460) [HUD commit link](a7abf57aab).  Not caught b/c PR didn't have ciflow/trunk ([comment](https://github.com/pytorch/pytorch/pull/158281#issuecomment-3180408766))
2025-08-12 17:57:57 +00:00
3cec82a7e9 Ensure outer aliasing on DTensor matches inner aliasing (#158954)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158954
Approved by: https://github.com/albanD, https://github.com/wconstab
2025-08-12 17:47:48 +00:00
ee9f8ba11d [ROCm] Use opportunistic fastatomics based on heuristics (#159430)
* Opportunistic fast atomics work better with small sizes, since there is a higher chance of lanes doing atomics on the same address

Co-author: @amd-hhashemi

Reproducer:
```
import time
import torch

x = torch.randn((1_632_960, 128), device='cuda', dtype=torch.float)
ind = torch.randint(0, x.size(0), size=(5_079_670,), device='cuda')
src = torch.randn((5_079_670, 128), device='cuda', dtype=torch.float)

for _ in range(20):
    x.index_add_(0, ind, src)

start_time = time.time()
for i in range(100):
    x.index_add_(0, ind, src)
torch.cuda.synchronize()
end_time = time.time()
mean_time = (end_time - start_time)/100
print(f"Avg time for index_add_: {mean_time * 1e6:.2f} us")
```

Perf numbers:
```
Before:
Avg time for index_add_: 25652.16 us

After:
Avg time for index_add_: 2675.15 us
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159430
Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily
2025-08-12 17:13:54 +00:00
1f4057c11a [inductor] remove no_x_dim (#159810)
no_x_dim is used to indicate that a reduction operates on a single row, and data loaded for the reduction is 1-dimensional.

no_x_dim was introduced in https://github.com/pytorch/pytorch/pull/102444 - in which there was bad perf in some reductions, and using 1D tensors fixed the perf issue.

However, it appears that this perf issue no longer exists in current Triton versions. https://github.com/pytorch/pytorch/pull/118822 checked this, and we can also check this on H100 benchmarks (linked below). And another motivation for removing this behavior is that it enables larger loads, which we observe is necessary for good performance on certain shapes on Blackwell.

H100 inference benchmarks:
https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2004%20Aug%202025%2004%3A13%3A24%20GMT&stopTime=Mon%2C%2011%20Aug%202025%2004%3A13%3A24%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=gh/davidberard98/396/orig&lCommit=a6bcd4692fb39fa2fad260f290bff545d4425829&rBranch=main&rCommit=e96c7c4bb0f6aeae2ab3b6f040f7d67edbec199a

H100 training benchmarks:
https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2004%20Aug%202025%2004%3A13%3A24%20GMT&stopTime=Mon%2C%2011%20Aug%202025%2004%3A13%3A24%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(h100)&lBranch=gh/davidberard98/396/orig&lCommit=a6bcd4692fb39fa2fad260f290bff545d4425829&rBranch=main&rCommit=e96c7c4bb0f6aeae2ab3b6f040f7d67edbec199a

Overall, the benchmarks show minimal change in performance.

Differential Revision: [D79599286](https://our.internmc.facebook.com/intern/diff/D79599286)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159810
Approved by: https://github.com/ngimel, https://github.com/eellison
2025-08-12 17:10:31 +00:00
94b91a8763 [redone][pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#160352)
Summary:
Writing torch.compile worker logs to dedicated_log_rank{RANK} if we're running on mast.
ref: D79456310 (got reverted because of linter)

Testing:
Refer to Differential Revision: D79917440

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160352
Approved by: https://github.com/masnesral
2025-08-12 16:49:08 +00:00
a7abf57aab [ROCm] Support large inputs for coalesceValuesKernel (#158281)
# Description

`.coalesce` cannot handle large inputs on ROCm due to the maximal grid size limit.

This PR splits axis `X` into axes `X` and `Y`, and repurposes `Z` for the original `Y` on ROCm to avoid this limitation.

Confirmed the new approach can handle large inputs. Correctness needs validation.

# Testing Command

`python torch_spmv.py 22500000 272500000`

## Script `torch_spmv.py`

``` python
import torch
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description="Sparse COO Matrix by Dense Vector Multiplication using PyTorch"
    )
    parser.add_argument("n", type=int, help="Size of the NxN matrix")
    parser.add_argument("nnz", type=int, help="Number of non-zero entries")
    return parser.parse_args()

def main():
    args = parse_args()
    n = args.n
    nnz = args.nnz
    dtype = torch.float32
    device = torch.device('cuda')

    # Generate random indices for the sparse matrix in COO format.
    torch.manual_seed(42)
    rows = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device)
    cols = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device)
    indices = torch.stack([rows, cols], dim=0)

    # Generate random values.
    values = torch.randn(nnz, dtype=torch.float32, device=device)

    # Create the sparse COO matrix and move it to the target device.
    sparse_matrix = torch.sparse_coo_tensor(indices, values, size=(n, n), dtype=torch.float32, device=device)
    sparse_matrix = sparse_matrix.coalesce()

    # Generate a random dense vector.
    dense_vector = torch.randn(n, dtype=torch.float32, device=device)

    # Perform sparse matrix - dense vector multiplication.
    # Using torch.sparse.mm which expects a 2D tensor for the vector.
    result = torch.sparse.mm(sparse_matrix, dense_vector.unsqueeze(1)).squeeze()
    # result = torch.mv(sparse_matrix, dense_vector)

    # Print the result.
    print("Result of the multiplication:")
    print(torch.sum(result))

if __name__ == "__main__":
    main()
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158281
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
2025-08-12 16:42:55 +00:00
f7b2f3314c Revert "[triton_heuristics] Optimize the triton launcher in pt2 (#160000)"
This reverts commit d0e2240f680ea2a553f7ee8188f52482e130bfd0.

Reverted https://github.com/pytorch/pytorch/pull/160000 on behalf of https://github.com/davidberard98 due to D80054972 failing with test_triton_kernel_2d_autotune_grad_False_dynamic_True_backend_inductor_grid_type_1_tdlp_1 ([comment](https://github.com/pytorch/pytorch/pull/160000#issuecomment-3180144676))
2025-08-12 16:33:02 +00:00
9d37c960a4 [ROCm][CI] use new benchmark image for dynamo (#160421)
Follow-up to #160047 that separated the rocm image into default CI and benchmarks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160421
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-08-12 16:07:19 +00:00
b219ca2a00 Revert "Update triton xpu commit to support python 3.14 (#160183)"
This reverts commit 7fbc22855c17741ae016992803b2e147a13aa22d.

Reverted https://github.com/pytorch/pytorch/pull/160183 on behalf of https://github.com/clee2000 due to I'm not sure how, but it seems to have broken inductor/test_extension_backend.py::ExtensionBackendTests::test_open_device_registration [GH job link](https://github.com/pytorch/pytorch/actions/runs/16911267995/job/47917091939) [HUD commit link](7fbc22855c).  Maybe because the docker build changed?  Note to self: not bad TD ([comment](https://github.com/pytorch/pytorch/pull/160183#issuecomment-3179840160))
2025-08-12 15:29:19 +00:00
b7db86600a Fix Tensor illustration, use permalinks for image embedding in Readme.md (#160416)
Fixes Tensor illustration being broken on pypi.org. Also uses permalinks instead of links to images for embedding as per this suggestion of Alban: https://github.com/pytorch/pytorch/pull/160187#discussion_r2262978006

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160416
Approved by: https://github.com/malfet
2025-08-12 15:15:12 +00:00
9708fcf92d Account for triton kernel source code hidden in custom ops properly in AOTAutogradCache (#160120)
This PR fixes a bug where user-defined triton kernels hidden behind `triton_op` do not register source code changes. If a user *only* changes a triton kernel's source code, the change goes unnoticed because triton kernels are hidden under the custom op and dynamo hasn't traced into them yet.

This means at AOTAutograd time, we don't know the list of triton kernels that are defined by custom ops. This is an initial fix for the issue by parsing the AST of the custom op looking for triton kernels. This won't catch more degenerate cases if the custom op calls other custom ops/functions that then call triton kernels, and then the toplevel compiled graph doesn't know about it. To handle that, we'd have to trace through the custom op at dynamo time.
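
A minimal sketch of the AST-scanning idea (illustrative only, not the actual AOTAutogradCache code): collect the names used in `kernel[grid](...)`-style launches inside the custom op's source, which can then be matched against known triton kernels.

```python
import ast
import inspect
import textwrap

def find_candidate_kernel_names(fn):
    tree = ast.parse(textwrap.dedent(inspect.getsource(fn)))
    names = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Call):
            func = node.func
            # Triton kernels are typically launched as kernel[grid](...),
            # which parses as a Call whose func is a Subscript.
            if isinstance(func, ast.Subscript) and isinstance(func.value, ast.Name):
                names.add(func.value.id)
    return names

def my_custom_op(x):
    my_triton_kernel[(8,)](x)  # pretend launch; only parsed, never executed
    return x

print(find_candidate_kernel_names(my_custom_op))  # {'my_triton_kernel'}
```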

This should handle 99% of cases, though. I added an expectedFailure test to show the limitation.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160120
Approved by: https://github.com/zou3519
2025-08-12 14:11:06 +00:00
a288b15ea9 [CI] Reduce XPU Windows build time (#159763)
Reduce the time cost from 2.5 hours to about 1.5 hours.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159763
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-08-12 14:04:29 +00:00
7fbc22855c Update triton xpu commit to support python 3.14 (#160183)
Follow PR #159725
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160183
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-08-12 14:02:36 +00:00
f33ce40bc0 [bucketing] Bucket only adjacent collectives to prevent reordering (#159983)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159983
Approved by: https://github.com/wconstab, https://github.com/eellison
2025-08-12 11:57:00 +00:00
4d5b3f2d5a [dynamo][guards] Install dict watchers for recursive dict tag optimization (#159796)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159796
Approved by: https://github.com/jansel
2025-08-12 09:49:11 +00:00
f990490a23 Add label_smoothing param in nn.BCELoss and nn.BCEWithLogitsLoss (#150282)
Fixes #91545

## Changes

- Add `label_smoothing` param and docs (a formulation sketch follows this list)
- Add test case for `label_smoothing`
- Remove duplicate description in `nn.BCELoss` and `nn.BCEWithLogitsLoss`
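
A minimal sketch of one common binary label-smoothing formulation, shown only to illustrate the semantics of the new parameter (not necessarily the exact implementation in this PR): hard 0/1 targets are pulled toward 0.5 before the BCE term is computed.

```python
import torch
import torch.nn.functional as F

def bce_with_logits_label_smoothing(logits, targets, label_smoothing=0.1):
    # Mix hard targets with the uniform binary distribution (0.5).
    smoothed = targets * (1.0 - label_smoothing) + 0.5 * label_smoothing
    return F.binary_cross_entropy_with_logits(logits, smoothed)

logits = torch.tensor([2.0, -1.0, 0.5, -0.5])
targets = torch.tensor([1.0, 0.0, 1.0, 0.0])
print(bce_with_logits_label_smoothing(logits, targets))
```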

##  Test Result

```bash
pytest -s test/test_nn.py -k test_bce
```

![image](https://github.com/user-attachments/assets/30c0b7fe-fe49-4aa0-9b05-4d70403a7b05)

![image](https://github.com/user-attachments/assets/4fe3fd1c-54b8-4012-afd9-133ce9fb4964)

![image](https://github.com/user-attachments/assets/5cad019a-3a4c-475a-9fde-9c1acad5792d)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150282
Approved by: https://github.com/cyyever, https://github.com/mikaylagawarecki
2025-08-12 09:37:03 +00:00
b9003ed3d8 Dynamo Deep Dive Documentation Fix (#158860)
changed SourceBuilder to VariableBuilder

Fixes #158447

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158860
Approved by: https://github.com/mlazos
2025-08-12 08:53:33 +00:00
fea7e9dd37 extract shape in _view_has_unbacked_input (#160255)
Summary: We were still getting DDEs on reshape! I looked deeper and found an issue in _view_has_unbacked_input: namely, when the input is [[,,]] it needs to be normalized to [..]

Test Plan:
existing tests.

Rollback Plan:

Differential Revision: D79951119

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160255
Approved by: https://github.com/bobrenjc93
2025-08-12 08:38:19 +00:00
9a0f7a3bb0 [retry-land][pytorch][dynamo_compile] Log stack_trace to dynamo_compile (#160348)
refer: https://github.com/pytorch/pytorch/pull/159655

The earlier PR failed on dynamo/test_utils.py::TestDynamoTimed::test_dynamo_timed.
Updated test_dynamo_timed and re-ran locally to verify.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160348
Approved by: https://github.com/masnesral
2025-08-12 06:24:54 +00:00
01bcf9a40d Bump transformers pin (#159291)
Trying to update hf pin.

Benchmarking run to figure out issues

<img width="1356" height="123" alt="image" src="https://github.com/user-attachments/assets/fbc435f3-a7cb-4280-9636-2ea6d15d7b6d" />

Retrying - https://github.com/pytorch/pytorch/pull/156118

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159291
Approved by: https://github.com/BoyuanFeng, https://github.com/huydhn

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-08-12 05:14:17 +00:00
8d3d1c8443 [dynamo] fixes to propagate tag safeness (#159807)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159807
Approved by: https://github.com/jansel
2025-08-12 04:50:13 +00:00
0f3b10b8ee [audio hash update] update the pinned audio hash (#160384)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160384
Approved by: https://github.com/pytorchbot
2025-08-12 04:38:04 +00:00
5f1010fbb3 [Graph Partition] Pass all OSS unit tests (#154667)
Graph partition leads to a 6.2% speedup on vision_maskrcnn and a 5.8% speedup on yolov3 [P1819700563](https://www.internalfb.com/phabricator/paste/view/P1819700563), a 39.5% speedup on speech_transformer inference [P1830602200](https://www.internalfb.com/phabricator/paste/view/P1830602200), and an 85% speedup on speech_transformer training [P1831115315](https://www.internalfb.com/phabricator/paste/view/P1831115315).

Running the same diff on two different days, both runs show a speedup on average.

[first TorchInductor Benchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2021%20Jul%202025%2016%3A37%3A55%20GMT&stopTime=Mon%2C%2028%20Jul%202025%2016%3A37%3A55%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=75ef90fe89b82c967362a2d40fdf1af047202bc2&rBranch=main&rCommit=abcb24f4de11f8fedf2c2c9ff53b6092ef42306d)
<img width="1885" height="752" alt="image" src="https://github.com/user-attachments/assets/13bba9fc-5dbf-42ad-8558-d54f7e367b41" />

[second TorchInductorBenchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2023%20Jul%202025%2016%3A38%3A27%20GMT&stopTime=Wed%2C%2030%20Jul%202025%2016%3A38%3A27%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=66de27e29338c26b1be94733049868cb0309ea52&rBranch=main&rCommit=70d2e9ba455c3c910f6f95b24171c8eee7bc00bf)
<img width="2513" height="1030" alt="image" src="https://github.com/user-attachments/assets/3a413dcb-2314-4292-919a-7ca181f9eeac" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154667
Approved by: https://github.com/eellison
2025-08-12 04:37:58 +00:00
edaa151d0d [CI] Move CUDA tests to trunk workflow (#160379)
The trunk workflow is run before a PR is merged anyway, but roughly 3X less frequently than the pull workflow, according to [Flambeau](https://pytorchci.grafana.net/public-dashboards/1c571e79090443eaaa9811db71f8d23b)
<img width="796" height="573" alt="image" src="https://github.com/user-attachments/assets/0235e610-4e1c-4be5-88bf-ea8278d1c656" />

I.e. this will probably result in somewhat longer time-to-signal, but considering that the frequency of changes to eager PyTorch-on-CUDA has slowed down and Inductor changes are decorated with ciflow/inductor, this looks like an acceptable tradeoff to reduce costs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160379
Approved by: https://github.com/izaitsevfb
2025-08-12 04:23:50 +00:00
10bc36fe84 Get tensor subclasses and torch.library.triton_op to dispatch correctly (#160341)
Short-term fix for https://github.com/pytorch/pytorch/issues/160333

The problem is:
1) `triton_op` adds a decomposition for FunctionalTensorMode for this operation
2) Tensor Subclasses rely on FunctionalTensorMode's `__torch_dispatch__` returning NotImplemented.
3) `triton_op`'s FunctionalTensorMode decomposition takes precedence over FunctionalTensorMode's decomposition.

The easy fix is to copy-paste the FunctionalTensorMode's NotImplemented
return logic into the decomposition.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160341
Approved by: https://github.com/drisspg
2025-08-12 04:09:37 +00:00
32e5e2f596 [vllm hash update] update the pinned vllm hash (#160259)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160259
Approved by: https://github.com/pytorchbot
2025-08-12 04:04:53 +00:00
bfc873d02e [ROCm][Windows] Revert copying hipblaslt and rocblas dirs. (#159083)
This reverts the changes from b367e5f6a6. This will also close https://github.com/pytorch/pytorch/pull/158922.

Since 30387ab2e4, ROCm is bootstrapped using the 'rocm' Python module which contains these files (see https://github.com/ROCm/TheRock/blob/main/docs/packaging/python_packaging.md), so they do not need to be bundled into torch/lib.

There was also a bug in here - if `ROCM_DIR` is unset, the code crashes:
```
  File "D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\setuptools\_distutils\dist.py", line 1002, in run_command
    cmd_obj.run()
  File "D:\b\pytorch_main\setup.py", line 853, in run
    rocm_dir_path = Path(os.environ["ROCM_DIR"])
                         ~~~~~~~~~~^^^^^^^^^^^^
  File "<frozen os>", line 714, in __getitem__
KeyError: 'ROCM_DIR'
```
The code could have checked for `ROCM_PATH` too.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159083
Approved by: https://github.com/jeffdaily
2025-08-12 02:45:49 +00:00
eed9dbf70f [ROCm] Add torch/_rocm_init.py to .gitignore. (#159806)
Follow-up to https://github.com/pytorch/pytorch/pull/155285.

Build scripts like https://github.com/ROCm/TheRock/blob/main/external-builds/pytorch/build_prod_wheels.py generate this file with contents like:

```python
def initialize():
    import rocm_sdk
    rocm_sdk.initialize_process(
        preload_shortnames=['amd_comgr', 'amdhip64', 'hiprtc', 'hipblas', 'hipfft', 'hiprand', 'hipsparse', 'hipsolver', 'hipblaslt', 'miopen'],
        check_version='7.0.0rc20250804')
```

We may also have https://github.com/pytorch/pytorch/blob/main/tools/amd_build/build_amd.py do the same thing as more of that build support moves here into the upstream PyTorch repository itself (see https://github.com/pytorch/pytorch/issues/159520).

This file is then loaded if present here: a7f3bdf550/torch/__init__.py (L145-L157)

Given that the file is generated by build scripts, I think adding it to `.gitignore` makes sense, as that will prevent accidental check-ins and keep local history cleaner.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159806
Approved by: https://github.com/jeffdaily
2025-08-12 02:24:21 +00:00
be53f609aa fix retaining multimem in symmetric memory (#160343)
fixes OOM in #160289

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160343
Approved by: https://github.com/eqy
2025-08-12 02:03:20 +00:00
95210cc409 [BE] Isolate pre-push hook dependencies in dedicated virtual environment (#160048)
This adds two changes:
- Isolates pre-push hook dependencies into a dedicated venv, so they no longer affect your system environment
- Lets you manually run the pre-push lintrunner (including with lintrunner -a) by invoking `python scripts/lintrunner.py [-a]` (it's ugly, but better than nothing...for now)

This is a follow up to:
- https://github.com/pytorch/pytorch/pull/158389

## Problem
The current pre-push hook setup installs lintrunner and related dependencies globally, which makes developers nervous about system pollution and can cause version conflicts with existing installations.

Also, if the pre-push lintrunner found errors, you had to hope your normal lintrunner could fix them (which wasn't always the case, e.g. if those errors only manifested in certain python versions)

##  Key Changes:
  - Isolated Environment: Creates .git/hooks/linter/.venv/ with Python 3.9 (the python used in CI) and an isolated lintrunner installation
  - User-Friendly CLI: New python scripts/lintrunner.py wrapper allows developers to run lintrunner (including -a auto-fix) from any environment
  - Simplified Architecture: Eliminates pre-commit dependency entirely - uses direct git hooks

  File Changes:
  - scripts/setup_hooks.py: Rewritten to create isolated uv-managed virtual environment
  - scripts/lintrunner.py: New wrapper script with shared hash management logic
  - scripts/run_lintrunner.py: Removed (functionality merged into lintrunner.py)
  - .pre-commit-config.yaml: Removed (no longer needed)

##  Usage:
```
  # Setup (run once)
  python scripts/setup_hooks.py

  # Manual linting (works from any environment)
  python scripts/lintrunner.py        # Check mode
  python scripts/lintrunner.py -a     # Auto-fix mode

  # Git hooks work automatically
  git push  # Runs lintrunner in isolated environment

  # Need to skip the pre-push hook?
  git push --no-verify
```

##  Benefits:
  -  Zero global dependency installation
  -  Per-repository isolation prevents version conflicts
  -  Full lintrunner functionality is now accessible

##  Implementation Notes:
  - Virtual env is kept in a dedicated dir in .git, to keep per-repo mechanics
  - lintrunner.py does not need to be invoked from a specific venv.  It'll invoke the right venv itself.

A minor bug: It tends to garble the lintrunner output a bit, like the screenshot below shows, but I haven't found a workaround so far and it remains understandable to users:
<img width="241" height="154" alt="image" src="https://github.com/user-attachments/assets/9496f925-8524-4434-8486-dc579442d688" />

## What's next?
Features that could be added:
- Check for lintrunner updates, auto-update if needed
- Depending on dev response, this could be enabled by default for all pytorch/pytorch environments
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160048
Approved by: https://github.com/seemethere
2025-08-12 01:58:46 +00:00
7a974a88f2 [ROCm] Fix resource_strings.h (#159996)
This PR fixes errors like the one below:

```
[rank7]: RuntimeError: /tmp/comgr-c3c81b/input/CompileSourceejOPx6:34:8: error: unknown type name 'uint64_t'; did you mean
'__hip_internal::uint64_t'? [rank7]: 34 | if(((uint64_t) t0.data) % (4 * sizeof(half)) != 0) flag_vec4 = false;
```

The following datatypes need to be defined in `torch/csrc/jit/codegen/fuser/cuda/resource_strings.h` for ROCm versions >= 7.0.

```
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int  int16_t;
typedef long long int int64_t;
typedef unsigned long long int uint64_t;
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159996
Approved by: https://github.com/pruthvistony, https://github.com/Skylion007, https://github.com/jeffdaily
2025-08-12 01:58:02 +00:00
f3f159ff8c [BE][cutlass backend] Reduce severity of log message for no cutlass config found (#160148)
This is not really a problem. Sometimes we cannot find a cutlass config due to shape, e.g. when k is odd.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160148
Approved by: https://github.com/mlazos, https://github.com/Skylion007
2025-08-12 01:41:58 +00:00
b90feeac86 [BE][cutlass backend] Fix subproc addmm tests (#160295)
Differential Revision: [D79977421](https://our.internmc.facebook.com/intern/diff/D79977421/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160295
Approved by: https://github.com/jingsh
2025-08-12 01:41:06 +00:00
0d40ff3b49 [inductor] fix test_different_file_paths_local_pgo on Windows. (#160382)
fix test_different_file_paths_local_pgo on Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160382
Approved by: https://github.com/angelayi
2025-08-12 01:35:39 +00:00
cae2b5e3d2 [ROCm][Windows] Enable USE_ROCM, disable USE_RCCL on Windows. (#159079)
This allows setting `USE_ROCM` on Windows. A few other patches are still required to build (see https://github.com/ROCm/TheRock/issues/589), but we have instructions using open source code and rocm python packages available at https://github.com/ROCm/TheRock/tree/main/external-builds/pytorch#build-pytorch-with-rocm-support.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159079
Approved by: https://github.com/jeffdaily
2025-08-12 01:28:20 +00:00
ee89cc7a0a [ROCm][Windows] Fix LoadHIP handling of environment variable paths on Windows. (#159080)
See https://cmake.org/cmake/help/latest/command/file.html#path-conversion. Paths stored in environment variables may use `/` or `\` (e.g. on Windows), while cmake-style paths always use `/`.

This fixes configure errors like:
```
CMake Error at D:/b/pytorch_main/build/CMakeFiles/CMakeScratch/TryCompile-srhq07/CMakeLists.txt:2 (set):
  Syntax error in cmake code at

    D:/b/pytorch_main/build/CMakeFiles/CMakeScratch/TryCompile-srhq07/CMakeLists.txt:2

  when parsing string

    D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\_rocm_sdk_devel/cmake/;D:/b/pytorch_main/cmake/Modules

  Invalid character escape '\p'.

CMake Error at D:/projects/TheRock/external-builds/pytorch/.venv/Lib/site-packages/cmake/data/share/cmake-3.31/Modules/Internal/CheckSourceCompiles.cmake:108 (try_compile):
  Failed to configure test project build system.
```

(note the mixed usage of `\` and `/` in that string)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159080
Approved by: https://github.com/jeffdaily
2025-08-12 00:18:19 +00:00
e63c2b21c1 [PP] Initialize P2P communicators on first step (#160210)
We were hitting hangs in multi-node settings; initializing the NCCL communicators needed for batched p2p ops ahead of time fixes this.

This change adds extra communication, since it sends a dummy tensor to the next and previous stage ranks. However, this cost is only paid on the first step, so it is negligible.
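
A minimal sketch of the warm-up idea (assuming `torch.distributed` is already initialized with an NCCL backend and that `prev_rank`/`next_rank` are this stage's neighbors; this is not the actual pipeline-parallel implementation):

```python
import torch
import torch.distributed as dist

def warmup_p2p_communicators(prev_rank, next_rank, device):
    """Exchange a tiny dummy tensor with the neighboring stage ranks so the NCCL
    communicators needed by later batched p2p ops are created up front."""
    ops = []
    if next_rank is not None:
        ops.append(dist.P2POp(dist.isend, torch.zeros(1, device=device), next_rank))
        ops.append(dist.P2POp(dist.irecv, torch.empty(1, device=device), next_rank))
    if prev_rank is not None:
        ops.append(dist.P2POp(dist.isend, torch.zeros(1, device=device), prev_rank))
        ops.append(dist.P2POp(dist.irecv, torch.empty(1, device=device), prev_rank))
    if ops:
        # Each matching isend/irecv pair forces the underlying communicator to
        # be built now instead of during the first real pipeline step.
        for req in dist.batch_isend_irecv(ops):
            req.wait()
```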

Debug history: https://docs.google.com/document/d/1EKVJYmW2hj_VsvDvnSggXhZzJyvMu9dA0iDJWOZAtjY/edit?tab=t.0

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160210
Approved by: https://github.com/wconstab
2025-08-11 23:46:58 +00:00
3626ba711b [FlexAttention] Swap from and to & for new triton (#160227)
Fixes #158463
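
For context on what the title's swap from `and` to `&` refers to (a hedged illustration in plain PyTorch, not the actual template diff): Python's `and` cannot combine multi-element boolean tensors elementwise, whereas the bitwise `&` operator can, which is presumably the pattern the newer triton frontend requires when boolean masks are combined.

```python
import torch

row_ok = torch.tensor([True, True, False])
col_ok = torch.tensor([True, False, True])

# Elementwise combination of boolean masks, as intended.
print(row_ok & col_ok)  # tensor([ True, False, False])

try:
    print(row_ok and col_ok)
except RuntimeError as err:
    # `and` calls bool() on the whole tensor, which is ambiguous for >1 element.
    print("RuntimeError:", err)
```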

On B200 I am getting a bunch of error spew:
```Shell
/tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline
/tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: note: Pipeline failed while executing [`TritonGPUHoistTMEMAlloc` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
Triton compilation failed: triton_tem_fused_zeros_1
def triton_tem_fused_zeros_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0):
    PRESCALE_QK : tl.constexpr = False
```
```Shell
74 = arith.subi %170, %166 : i32
          %175 = arith.muli %174, %c128_i32 : i32
          %176 = arith.subi %175, %c64_i32 : i32
          %177 = arith.extui %173 : i1 to i32
          %178 = arith.muli %176, %177 : i32
          %179 = arith.subi %c1_i32, %177 : i32
          %180 = arith.muli %179, %c64_i32 : i32
          %181 = arith.addi %178, %180 : i32
          %182 = arith.muli %181, %c64_i32 : i32
          %183 = tt.splat %182 : i32 -> tensor<64x64xi32>
          %184 = tt.addptr %arg19, %183 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
          %185 = tt.addptr %arg20, %183 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
          %186 = tt.splat %181 : i32 -> tensor<64xi32>
          %187 = arith.addi %arg21, %186 : tensor<64xi32>
          scf.yield %163, %184, %185, %187 : tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>
        }
        %114 = tt.expand_dims %113#3 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
        %115 = arith.cmpi slt, %114, %cst_7 : tensor<1x64xi32>
        %116 = tt.broadcast %115 : tensor<1x64xi1> -> tensor<64x64xi1>
        %117 = tt.load %113#1, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>>
        %118 = tt.dot %46, %117, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
        %119 = arith.mulf %118, %cst_13 : tensor<64x64xf32>
        %120 = arith.mulf %119, %cst_3 : tensor<64x64xf32>
        %121 = arith.select %116, %120, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32>
        %122 = arith.select %115, %cst_4, %cst_5 : tensor<1x64xi1>, tensor<1x64xi1>
        %123 = tt.broadcast %122 : tensor<1x64xi1> -> tensor<64x64xi1>
        %124 = arith.select %123, %121, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32>
        %125 = arith.mulf %124, %cst_2 : tensor<64x64xf32>
        %126 = tt.broadcast %61 : tensor<64x1xf32> -> tensor<64x64xf32>
        %127 = arith.subf %125, %126 : tensor<64x64xf32>
        %128 = math.exp2 %127 : tensor<64x64xf32>
        %129 = tt.load %113#2, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>>
        %130 = tt.dot %51, %129, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
        %131 = tt.expand_dims %55 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32>
        %132 = tt.broadcast %131 : tensor<64x1xf32> -> tensor<64x64xf32>
        %133 = arith.subf %130, %132 : tensor<64x64xf32>
        %134 = arith.mulf %128, %133 : tensor<64x64xf32>
        %135 = arith.mulf %134, %cst_3 : tensor<64x64xf32>
        %136 = arith.select %116, %135, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32>
        %137 = arith.select %115, %122, %cst_5 : tensor<1x64xi1>, tensor<1x64xi1>
        %138 = tt.broadcast %137 : tensor<1x64xi1> -> tensor<64x64xi1>
        %139 = arith.select %138, %136, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32>
        %140 = arith.truncf %139 : tensor<64x64xf32> to tensor<64x64xf16>
        %141 = tt.trans %117 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
        %142 = tt.dot %140, %141, %113#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
        scf.yield %142 : tensor<64x64xf32>
      } else {
        scf.yield %cst_9 : tensor<64x64xf32>
      }
      %84 = tt.addptr %arg13, %22 : !tt.ptr<i32>, i32
      %85 = tt.load %84 : !tt.ptr<i32>
      %86 = arith.muli %85, %c128_i32 : i32
      %87 = tt.addptr %arg12, %21 : !tt.ptr<i32>, i32
      %88 = tt.load %87 : !tt.ptr<i32>
      %89 = tt.splat %86 : i32 -> tensor<64xi32>
      %90 = arith.addi %89, %14 : tensor<64xi32>
      %91 = tt.expand_dims %90 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
      %92 = arith.muli %91, %cst_11 : tensor<1x64xi32>
      %93 = tt.addptr %71, %92 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32>
      %94 = tt.broadcast %93 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
      %95 = tt.addptr %94, %74 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %96 = tt.addptr %76, %92 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32>
      %97 = tt.broadcast %96 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
      %98 = tt.addptr %97, %74 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %99 = arith.muli %88, %c2_i32 : i32
      %100 = arith.minsi %99, %c4_i32 : i32
      %101 = arith.cmpi sge, %100, %c1_i32 : i32
      %102 = scf.if %101 -> (tensor<64x64xf32>) {
        %112 = arith.subi %100, %c1_i32 : i32
        %113:4 = scf.for %arg17 = %c0_i32 to %112 step %c1_i32 iter_args(%arg18 = %83, %arg19 = %95, %arg20 = %98, %arg21 = %90) -> (tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>)  : i32 {
          %137 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
          %138 = arith.cmpi slt, %137, %cst_7 : tensor<1x64xi32>
          %139 = tt.broadcast %138 : tensor<1x64xi1> -> tensor<64x64xi1>
          %140 = tt.load %arg19, %139, %cst_8 : tensor<64x64x!tt.ptr<f16>>
          %141 = tt.dot %46, %140, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %142 = arith.mulf %141, %cst_13 : tensor<64x64xf32>
          %143 = arith.mulf %142, %cst_3 : tensor<64x64xf32>
          %144 = arith.mulf %143, %cst_2 : tensor<64x64xf32>
          %145 = tt.broadcast %61 : tensor<64x1xf32> -> tensor<64x64xf32>
          %146 = arith.subf %144, %145 : tensor<64x64xf32>
          %147 = math.exp2 %146 : tensor<64x64xf32>
          %148 = tt.load %arg20, %139, %cst_8 : tensor<64x64x!tt.ptr<f16>>
          %149 = tt.dot %51, %148, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %150 = tt.expand_dims %55 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32>
          %151 = tt.broadcast %150 : tensor<64x1xf32> -> tensor<64x64xf32>
          %152 = arith.subf %149, %151 : tensor<64x64xf32>
          %153 = arith.mulf %147, %152 : tensor<64x64xf32>
          %154 = arith.mulf %153, %cst_3 : tensor<64x64xf32>
          %155 = arith.truncf %154 : tensor<64x64xf32> to tensor<64x64xf16>
          %156 = tt.trans %140 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
          %157 = tt.dot %155, %156, %arg18, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %158 = arith.divsi %arg17, %c2_i32 : i32
          %159 = tt.addptr %84, %158 : !tt.ptr<i32>, i32
          %160 = tt.load %159 evictionPolicy = evict_last : !tt.ptr<i32>
          %161 = arith.addi %158, %c1_i32 : i32
          %162 = arith.cmpi slt, %161, %88 : i32
          %163 = tt.addptr %159, %c1_i32 : !tt.ptr<i32>, i32
          %164 = tt.load %163, %162 evictionPolicy = evict_last : !tt.ptr<i32>
          %165 = arith.addi %arg17, %c1_i32 : i32
          %166 = arith.remsi %165, %c2_i32 : i32
          %167 = arith.cmpi eq, %166, %c0_i32 : i32
          %168 = arith.subi %164, %160 : i32
          %169 = arith.muli %168, %c128_i32 : i32
          %170 = arith.subi %169, %c64_i32 : i32
          %171 = arith.extui %167 : i1 to i32
          %172 = arith.muli %170, %171 : i32
          %173 = arith.subi %c1_i32, %171 : i32
          %174 = arith.muli %173, %c64_i32 : i32
          %175 = arith.addi %172, %174 : i32
          %176 = arith.muli %175, %c64_i32 : i32
          %177 = tt.splat %176 : i32 -> tensor<64x64xi32>
          %178 = tt.addptr %arg19, %177 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
          %179 = tt.addptr %arg20, %177 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
          %180 = tt.splat %175 : i32 -> tensor<64xi32>
          %181 = arith.addi %arg21, %180 : tensor<64xi32>
          scf.yield %157, %178, %179, %181 : tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>
        }
        %114 = tt.expand_dims %113#3 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
        %115 = arith.cmpi slt, %114, %cst_7 : tensor<1x64xi32>
        %116 = tt.broadcast %115 : tensor<1x64xi1> -> tensor<64x64xi1>
        %117 = tt.load %113#1, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>>
        %118 = tt.dot %46, %117, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
        %119 = arith.mulf %118, %cst_13 : tensor<64x64xf32>
        %120 = arith.mulf %119, %cst_3 : tensor<64x64xf32>
        %121 = arith.select %116, %120, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32>
        %122 = arith.mulf %121, %cst_2 : tensor<64x64xf32>
        %123 = tt.broadcast %61 : tensor<64x1xf32> -> tensor<64x64xf32>
        %124 = arith.subf %122, %123 : tensor<64x64xf32>
        %125 = math.exp2 %124 : tensor<64x64xf32>
        %126 = tt.load %113#2, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>>
        %127 = tt.dot %51, %126, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
        %128 = tt.expand_dims %55 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32>
        %129 = tt.broadcast %128 : tensor<64x1xf32> -> tensor<64x64xf32>
        %130 = arith.subf %127, %129 : tensor<64x64xf32>
        %131 = arith.mulf %125, %130 : tensor<64x64xf32>
        %132 = arith.mulf %131, %cst_3 : tensor<64x64xf32>
        %133 = arith.select %116, %132, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32>
        %134 = arith.truncf %133 : tensor<64x64xf32> to tensor<64x64xf16>
        %135 = tt.trans %117 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
        %136 = tt.dot %134, %135, %113#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
        scf.yield %136 : tensor<64x64xf32>
      } else {
        scf.yield %83 : tensor<64x64xf32>
      }
      %103 = tt.splat %33 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>>
      %104 = tt.addptr %103, %37 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32>
      %105 = tt.broadcast %104 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
      %106 = tt.addptr %105, %42 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %107 = arith.mulf %102, %cst_13 : tensor<64x64xf32>
      %108 = arith.cmpi slt, %40, %cst_11 : tensor<1x64xi32>
      %109 = tt.broadcast %108 : tensor<1x64xi1> -> tensor<64x64xi1>
      %110 = arith.andi %45, %109 : tensor<64x64xi1>
      %111 = arith.truncf %107 : tensor<64x64xf32> to tensor<64x64xf16>
      tt.store %106, %111, %110 : tensor<64x64x!tt.ptr<f16>>
    } else {
      %16 = arith.divsi %0, %c2_i32 : i32
      %17 = arith.muli %0, %c64_i32 : i32
      %18 = tt.splat %17 : i32 -> tensor<64xi32>
      %19 = arith.addi %18, %14 : tensor<64xi32>
      %20 = tt.expand_dims %19 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
      %21 = arith.muli %20, %cst_14 : tensor<64x1xi32>
      %22 = tt.splat %11 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>>
      %23 = tt.addptr %22, %21 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32>
      %24 = tt.expand_dims %14 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
      %25 = tt.broadcast %23 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
      %26 = tt.broadcast %24 : tensor<1x64xi32> -> tensor<64x64xi32>
      %27 = tt.addptr %25, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %28 = arith.cmpi slt, %20, %cst_10 : tensor<64x1xi32>
      %29 = tt.broadcast %28 : tensor<64x1xi1> -> tensor<64x64xi1>
      %30 = tt.load %27, %29, %cst_8 : tensor<64x64x!tt.ptr<f16>>
      %31 = tt.splat %12 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>>
      %32 = tt.addptr %31, %21 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32>
      %33 = tt.broadcast %32 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
      %34 = tt.addptr %33, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %35 = tt.load %34, %29, %cst_8 : tensor<64x64x!tt.ptr<f16>>
      %36:2 = scf.for %arg17 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg18 = %cst_9, %arg19 = %cst_9) -> (tensor<64x64xf32>, tensor<64x64xf32>)  : i32 {
        %55 = arith.muli %2, %c4_i32 : i32
        %56 = arith.addi %55, %arg17 : i32
        %57 = arith.muli %56, %c2048_i32 : i32
        %58 = arith.muli %1, %c32768_i32 : i32
        %59 = arith.addi %57, %58 : i32
        %60 = arith.extsi %59 : i32 to i64
        %61 = arith.muli %1, %c16_i32 : i32
        %62 = arith.addi %61, %56 : i32
        %63 = arith.muli %62, %c32_i32 : i32
        %64 = arith.extsi %63 : i32 to i64
        %65 = tt.addptr %arg0, %60 : !tt.ptr<f16>, i64
        %66 = tt.addptr %arg5, %60 : !tt.ptr<f16>, i64
        %67 = tt.addptr %arg3, %64 : !tt.ptr<f32>, i64
        %68 = tt.addptr %arg4, %64 : !tt.ptr<f32>, i64
        %69 = arith.remsi %56, %c16_i32 : i32
        %70 = arith.muli %3, %c16_i32 : i32
        %71 = arith.addi %70, %69 : i32
        %72 = arith.muli %71, %c2_i32 : i32
        %73 = arith.addi %72, %16 : i32
        %74 = tt.addptr %arg11, %73 : !tt.ptr<i32>, i32
        %75 = tt.load %74 : !tt.ptr<i32>
        %76 = arith.muli %75, %c128_i32 : i32
        %77 = tt.addptr %arg10, %73 : !tt.ptr<i32>, i32
        %78 = tt.load %77 : !tt.ptr<i32>
        %79 = tt.splat %76 : i32 -> tensor<64xi32>
        %80 = arith.addi %79, %14 : tensor<64xi32>
        %81 = tt.expand_dims %80 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
        %82 = arith.muli %81, %cst_11 : tensor<1x64xi32>
        %83 = tt.splat %65 : !tt.ptr<f16> -> tensor<1x64x!tt.ptr<f16>>
        %84 = tt.addptr %83, %82 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32>
        %85 = tt.expand_dims %14 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
        %86 = tt.broadcast %84 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
        %87 = tt.broadcast %85 : tensor<64x1xi32> -> tensor<64x64xi32>
        %88 = tt.addptr %86, %87 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
        %89 = tt.expand_dims %80 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
        %90 = arith.muli %89, %cst_14 : tensor<64x1xi32>
        %91 = tt.splat %66 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>>
        %92 = tt.addptr %91, %90 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32>
        %93 = tt.broadcast %92 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
        %94 = tt.addptr %93, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
        %95 = arith.muli %78, %c2_i32 : i32
        %96 = arith.minsi %95, %c1_i32 : i32
        %97 = arith.cmpi sge, %96, %c1_i32 : i32
        %98:2 = scf.if %97 -> (tensor<64x64xf32>, tensor<64x64xf32>) {
          %120 = arith.subi %96, %c1_i32 : i32
          %121:5 = scf.for %arg20 = %c0_i32 to %120 step %c1_i32 iter_args(%arg21 = %arg18, %arg22 = %arg19, %arg23 = %88, %arg24 = %94, %arg25 = %80) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>)  : i32 {
            %167 = tt.expand_dims %arg25 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
            %168 = arith.cmpi slt, %167, %cst_1 : tensor<1x64xi32>
            %169 = tt.broadcast %168 : tensor<1x64xi1> -> tensor<64x64xi1>
            %170 = tt.load %arg23, %169, %cst_8 : tensor<64x64x!tt.ptr<f16>>
            %171 = arith.cmpi slt, %arg25, %cst_17 : tensor<64xi32>
            %172 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
            %173 = tt.addptr %172, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
            %174 = tt.load %173, %171 : tensor<64x!tt.ptr<f32>>
            %175 = arith.cmpf oeq, %174, %cst_16 : tensor<64xf32>
            %176 = arith.select %175, %cst_15, %174 : tensor<64xi1>, tensor<64xf32>
            %177 = tt.dot %30, %170, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %178 = arith.mulf %177, %cst_13 : tensor<64x64xf32>
            %179 = arith.mulf %178, %cst_3 : tensor<64x64xf32>
            %180 = arith.mulf %179, %cst_2 : tensor<64x64xf32>
            %181 = tt.expand_dims %176 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
            %182 = tt.broadcast %181 : tensor<1x64xf32> -> tensor<64x64xf32>
            %183 = arith.subf %180, %182 : tensor<64x64xf32>
            %184 = math.exp2 %183 : tensor<64x64xf32>
            %185 = tt.expand_dims %arg25 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
            %186 = arith.cmpi slt, %185, %cst_12 : tensor<64x1xi32>
            %187 = tt.broadcast %186 : tensor<64x1xi1> -> tensor<64x64xi1>
            %188 = tt.load %arg24, %187, %cst_8 : tensor<64x64x!tt.ptr<f16>>
            %189 = arith.truncf %184 : tensor<64x64xf32> to tensor<64x64xf16>
            %190 = tt.dot %189, %188, %arg22, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %191 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
            %192 = tt.addptr %191, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
            %193 = tt.load %192, %171 : tensor<64x!tt.ptr<f32>>
            %194 = tt.trans %188 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
            %195 = tt.dot %35, %194, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %196 = tt.expand_dims %193 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
            %197 = tt.broadcast %196 : tensor<1x64xf32> -> tensor<64x64xf32>
            %198 = arith.subf %195, %197 : tensor<64x64xf32>
            %199 = arith.mulf %184, %198 : tensor<64x64xf32>
            %200 = arith.mulf %199, %cst_3 : tensor<64x64xf32>
            %201 = arith.truncf %200 : tensor<64x64xf32> to tensor<64x64xf16>
            %202 = tt.trans %170 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
            %203 = tt.dot %201, %202, %arg21, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %204 = arith.divsi %arg20, %c2_i32 : i32
            %205 = tt.addptr %74, %204 : !tt.ptr<i32>, i32
            %206 = tt.load %205 evictionPolicy = evict_last : !tt.ptr<i32>
            %207 = arith.addi %204, %c1_i32 : i32
            %208 = arith.cmpi slt, %207, %78 : i32
            %209 = tt.addptr %205, %c1_i32 : !tt.ptr<i32>, i32
            %210 = tt.load %209, %208 evictionPolicy = evict_last : !tt.ptr<i32>
            %211 = arith.addi %arg20, %c1_i32 : i32
            %212 = arith.remsi %211, %c2_i32 : i32
            %213 = arith.cmpi eq, %212, %c0_i32 : i32
            %214 = arith.subi %210, %206 : i32
            %215 = arith.muli %214, %c128_i32 : i32
            %216 = arith.subi %215, %c64_i32 : i32
            %217 = arith.extui %213 : i1 to i32
            %218 = arith.muli %216, %217 : i32
            %219 = arith.subi %c1_i32, %217 : i32
            %220 = arith.muli %219, %c64_i32 : i32
            %221 = arith.addi %218, %220 : i32
            %222 = arith.muli %221, %c64_i32 : i32
            %223 = tt.splat %222 : i32 -> tensor<64x64xi32>
            %224 = tt.addptr %arg23, %223 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
            %225 = tt.addptr %arg24, %223 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
            %226 = tt.splat %221 : i32 -> tensor<64xi32>
            %227 = arith.addi %arg25, %226 : tensor<64xi32>
            scf.yield %203, %190, %224, %225, %227 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>
          }
          %122 = tt.expand_dims %121#4 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
          %123 = arith.cmpi slt, %122, %cst_1 : tensor<1x64xi32>
          %124 = tt.broadcast %123 : tensor<1x64xi1> -> tensor<64x64xi1>
          %125 = tt.load %121#2, %124, %cst_8 : tensor<64x64x!tt.ptr<f16>>
          %126 = arith.cmpi slt, %121#4, %cst_17 : tensor<64xi32>
          %127 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
          %128 = tt.addptr %127, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
          %129 = tt.load %128, %126 : tensor<64x!tt.ptr<f32>>
          %130 = arith.cmpf oeq, %129, %cst_16 : tensor<64xf32>
          %131 = arith.select %130, %cst_15, %129 : tensor<64xi1>, tensor<64xf32>
          %132 = tt.dot %30, %125, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %133 = arith.mulf %132, %cst_13 : tensor<64x64xf32>
          %134 = arith.mulf %133, %cst_3 : tensor<64x64xf32>
          %135 = arith.select %29, %134, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32>
          %136 = arith.select %28, %cst, %cst_0 : tensor<64x1xi1>, tensor<64x1xi1>
          %137 = tt.broadcast %136 : tensor<64x1xi1> -> tensor<64x64xi1>
          %138 = arith.select %137, %135, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32>
          %139 = arith.mulf %138, %cst_2 : tensor<64x64xf32>
          %140 = tt.expand_dims %131 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
          %141 = tt.broadcast %140 : tensor<1x64xf32> -> tensor<64x64xf32>
          %142 = arith.subf %139, %141 : tensor<64x64xf32>
          %143 = math.exp2 %142 : tensor<64x64xf32>
          %144 = tt.expand_dims %121#4 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
          %145 = arith.cmpi slt, %144, %cst_12 : tensor<64x1xi32>
          %146 = tt.broadcast %145 : tensor<64x1xi1> -> tensor<64x64xi1>
          %147 = tt.load %121#3, %146, %cst_8 : tensor<64x64x!tt.ptr<f16>>
          %148 = arith.truncf %143 : tensor<64x64xf32> to tensor<64x64xf16>
          %149 = tt.dot %148, %147, %121#1, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %150 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
          %151 = tt.addptr %150, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
          %152 = tt.load %151, %126 : tensor<64x!tt.ptr<f32>>
          %153 = tt.trans %147 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
          %154 = tt.dot %35, %153, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %155 = tt.expand_dims %152 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
          %156 = tt.broadcast %155 : tensor<1x64xf32> -> tensor<64x64xf32>
          %157 = arith.subf %154, %156 : tensor<64x64xf32>
          %158 = arith.mulf %143, %157 : tensor<64x64xf32>
          %159 = arith.mulf %158, %cst_3 : tensor<64x64xf32>
          %160 = arith.select %29, %159, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32>
          %161 = arith.select %28, %136, %cst_0 : tensor<64x1xi1>, tensor<64x1xi1>
          %162 = tt.broadcast %161 : tensor<64x1xi1> -> tensor<64x64xi1>
          %163 = arith.select %162, %160, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32>
          %164 = arith.truncf %163 : tensor<64x64xf32> to tensor<64x64xf16>
          %165 = tt.trans %125 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
          %166 = tt.dot %164, %165, %121#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          scf.yield %166, %149 : tensor<64x64xf32>, tensor<64x64xf32>
        } else {
          scf.yield %arg18, %arg19 : tensor<64x64xf32>, tensor<64x64xf32>
        }
        %99 = tt.addptr %arg15, %73 : !tt.ptr<i32>, i32
        %100 = tt.load %99 : !tt.ptr<i32>
        %101 = arith.muli %100, %c128_i32 : i32
        %102 = tt.addptr %arg14, %73 : !tt.ptr<i32>, i32
        %103 = tt.load %102 : !tt.ptr<i32>
        %104 = tt.splat %101 : i32 -> tensor<64xi32>
        %105 = arith.addi %104, %14 : tensor<64xi32>
        %106 = tt.expand_dims %105 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
        %107 = arith.muli %106, %cst_11 : tensor<1x64xi32>
        %108 = tt.addptr %83, %107 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32>
        %109 = tt.broadcast %108 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
        %110 = tt.addptr %109, %87 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
        %111 = tt.expand_dims %105 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
        %112 = arith.muli %111, %cst_14 : tensor<64x1xi32>
        %113 = tt.addptr %91, %112 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32>
        %114 = tt.broadcast %113 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
        %115 = tt.addptr %114, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
        %116 = arith.muli %103, %c2_i32 : i32
        %117 = arith.minsi %116, %c1_i32 : i32
        %118 = arith.cmpi sge, %117, %c1_i32 : i32
        %119:2 = scf.if %118 -> (tensor<64x64xf32>, tensor<64x64xf32>) {
          %120 = arith.subi %117, %c1_i32 : i32
          %121:5 = scf.for %arg20 = %c0_i32 to %120 step %c1_i32 iter_args(%arg21 = %98#0, %arg22 = %98#1, %arg23 = %110, %arg24 = %115, %arg25 = %105) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>)  : i32 {
            %161 = tt.expand_dims %arg25 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
            %162 = arith.cmpi slt, %161, %cst_1 : tensor<1x64xi32>
            %163 = tt.broadcast %162 : tensor<1x64xi1> -> tensor<64x64xi1>
            %164 = tt.load %arg23, %163, %cst_8 : tensor<64x64x!tt.ptr<f16>>
            %165 = arith.cmpi slt, %arg25, %cst_17 : tensor<64xi32>
            %166 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
            %167 = tt.addptr %166, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
            %168 = tt.load %167, %165 : tensor<64x!tt.ptr<f32>>
            %169 = arith.cmpf oeq, %168, %cst_16 : tensor<64xf32>
            %170 = arith.select %169, %cst_15, %168 : tensor<64xi1>, tensor<64xf32>
            %171 = tt.dot %30, %164, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %172 = arith.mulf %171, %cst_13 : tensor<64x64xf32>
            %173 = arith.mulf %172, %cst_3 : tensor<64x64xf32>
            %174 = arith.mulf %173, %cst_2 : tensor<64x64xf32>
            %175 = tt.expand_dims %170 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
            %176 = tt.broadcast %175 : tensor<1x64xf32> -> tensor<64x64xf32>
            %177 = arith.subf %174, %176 : tensor<64x64xf32>
            %178 = math.exp2 %177 : tensor<64x64xf32>
            %179 = tt.expand_dims %arg25 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
            %180 = arith.cmpi slt, %179, %cst_12 : tensor<64x1xi32>
            %181 = tt.broadcast %180 : tensor<64x1xi1> -> tensor<64x64xi1>
            %182 = tt.load %arg24, %181, %cst_8 : tensor<64x64x!tt.ptr<f16>>
            %183 = arith.truncf %178 : tensor<64x64xf32> to tensor<64x64xf16>
            %184 = tt.dot %183, %182, %arg22, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %185 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
            %186 = tt.addptr %185, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
            %187 = tt.load %186, %165 : tensor<64x!tt.ptr<f32>>
            %188 = tt.trans %182 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
            %189 = tt.dot %35, %188, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %190 = tt.expand_dims %187 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
            %191 = tt.broadcast %190 : tensor<1x64xf32> -> tensor<64x64xf32>
            %192 = arith.subf %189, %191 : tensor<64x64xf32>
            %193 = arith.mulf %178, %192 : tensor<64x64xf32>
            %194 = arith.mulf %193, %cst_3 : tensor<64x64xf32>
            %195 = arith.truncf %194 : tensor<64x64xf32> to tensor<64x64xf16>
            %196 = tt.trans %164 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
            %197 = tt.dot %195, %196, %arg21, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
            %198 = arith.divsi %arg20, %c2_i32 : i32
            %199 = tt.addptr %99, %198 : !tt.ptr<i32>, i32
            %200 = tt.load %199 evictionPolicy = evict_last : !tt.ptr<i32>
            %201 = arith.addi %198, %c1_i32 : i32
            %202 = arith.cmpi slt, %201, %103 : i32
            %203 = tt.addptr %199, %c1_i32 : !tt.ptr<i32>, i32
            %204 = tt.load %203, %202 evictionPolicy = evict_last : !tt.ptr<i32>
            %205 = arith.addi %arg20, %c1_i32 : i32
            %206 = arith.remsi %205, %c2_i32 : i32
            %207 = arith.cmpi eq, %206, %c0_i32 : i32
            %208 = arith.subi %204, %200 : i32
            %209 = arith.muli %208, %c128_i32 : i32
            %210 = arith.subi %209, %c64_i32 : i32
            %211 = arith.extui %207 : i1 to i32
            %212 = arith.muli %210, %211 : i32
            %213 = arith.subi %c1_i32, %211 : i32
            %214 = arith.muli %213, %c64_i32 : i32
            %215 = arith.addi %212, %214 : i32
            %216 = arith.muli %215, %c64_i32 : i32
            %217 = tt.splat %216 : i32 -> tensor<64x64xi32>
            %218 = tt.addptr %arg23, %217 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
            %219 = tt.addptr %arg24, %217 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
            %220 = tt.splat %215 : i32 -> tensor<64xi32>
            %221 = arith.addi %arg25, %220 : tensor<64xi32>
            scf.yield %197, %184, %218, %219, %221 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>
          }
          %122 = tt.expand_dims %121#4 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32>
          %123 = arith.cmpi slt, %122, %cst_1 : tensor<1x64xi32>
          %124 = tt.broadcast %123 : tensor<1x64xi1> -> tensor<64x64xi1>
          %125 = tt.load %121#2, %124, %cst_8 : tensor<64x64x!tt.ptr<f16>>
          %126 = arith.cmpi slt, %121#4, %cst_17 : tensor<64xi32>
          %127 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
          %128 = tt.addptr %127, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
          %129 = tt.load %128, %126 : tensor<64x!tt.ptr<f32>>
          %130 = arith.cmpf oeq, %129, %cst_16 : tensor<64xf32>
          %131 = arith.select %130, %cst_15, %129 : tensor<64xi1>, tensor<64xf32>
          %132 = tt.dot %30, %125, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %133 = arith.mulf %132, %cst_13 : tensor<64x64xf32>
          %134 = arith.mulf %133, %cst_3 : tensor<64x64xf32>
          %135 = arith.select %29, %134, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32>
          %136 = arith.mulf %135, %cst_2 : tensor<64x64xf32>
          %137 = tt.expand_dims %131 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
          %138 = tt.broadcast %137 : tensor<1x64xf32> -> tensor<64x64xf32>
          %139 = arith.subf %136, %138 : tensor<64x64xf32>
          %140 = math.exp2 %139 : tensor<64x64xf32>
          %141 = tt.expand_dims %121#4 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32>
          %142 = arith.cmpi slt, %141, %cst_12 : tensor<64x1xi32>
          %143 = tt.broadcast %142 : tensor<64x1xi1> -> tensor<64x64xi1>
          %144 = tt.load %121#3, %143, %cst_8 : tensor<64x64x!tt.ptr<f16>>
          %145 = arith.truncf %140 : tensor<64x64xf32> to tensor<64x64xf16>
          %146 = tt.dot %145, %144, %121#1, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %147 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>>
          %148 = tt.addptr %147, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32>
          %149 = tt.load %148, %126 : tensor<64x!tt.ptr<f32>>
          %150 = tt.trans %144 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
          %151 = tt.dot %35, %150, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          %152 = tt.expand_dims %149 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32>
          %153 = tt.broadcast %152 : tensor<1x64xf32> -> tensor<64x64xf32>
          %154 = arith.subf %151, %153 : tensor<64x64xf32>
          %155 = arith.mulf %140, %154 : tensor<64x64xf32>
          %156 = arith.mulf %155, %cst_3 : tensor<64x64xf32>
          %157 = arith.select %29, %156, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32>
          %158 = arith.truncf %157 : tensor<64x64xf32> to tensor<64x64xf16>
          %159 = tt.trans %125 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16>
          %160 = tt.dot %158, %159, %121#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32>
          scf.yield %160, %146 : tensor<64x64xf32>, tensor<64x64xf32>
        } else {
          scf.yield %98#0, %98#1 : tensor<64x64xf32>, tensor<64x64xf32>
        }
        scf.yield %119#0, %119#1 : tensor<64x64xf32>, tensor<64x64xf32>
      }
      %37 = tt.splat %13 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>>
      %38 = tt.addptr %37, %21 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32>
      %39 = tt.broadcast %38 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>>
      %40 = tt.addptr %39, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %41 = arith.cmpi slt, %24, %cst_11 : tensor<1x64xi32>
      %42 = tt.broadcast %41 : tensor<1x64xi1> -> tensor<64x64xi1>
      %43 = arith.andi %29, %42 : tensor<64x64xi1>
      %44 = arith.truncf %36#1 : tensor<64x64xf32> to tensor<64x64xf16>
      tt.store %40, %44, %43 : tensor<64x64x!tt.ptr<f16>>
      %45 = arith.mulf %36#0, %cst_13 : tensor<64x64xf32>
      %46 = tt.broadcast %21 : tensor<64x1xi32> -> tensor<64x64xi32>
      %47 = arith.addi %26, %46 : tensor<64x64xi32>
      %48 = tt.splat %4 : i32 -> tensor<64x64xi32>
      %49 = arith.addi %47, %48 : tensor<64x64xi32>
      %50 = tt.splat %8 : i32 -> tensor<64x64xi32>
      %51 = arith.addi %49, %50 : tensor<64x64xi32>
      %52 = tt.splat %arg16 : !tt.ptr<f16> -> tensor<64x64x!tt.ptr<f16>>
      %53 = tt.addptr %52, %51 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32>
      %54 = arith.truncf %45 : tensor<64x64xf32> to tensor<64x64xf16>
      tt.store %53, %54, %29 : tensor<64x64x!tt.ptr<f16>>
    }
    tt.return
  }
}

{-#
  external_resources: {
    mlir_reproducer: {
      pipeline: "builtin.module(convert-triton-to-tritongpu{enable-source-remat=false num-ctas=1 num-warps=4 target=cuda:100 threads-per-warp=32}, tritongpu-coalesce, tritongpu-F32DotTC, triton-nvidia-gpu-plan-cta, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritongpu-accelerate-matmul, tritongpu-remove-layout-conversions, tritongpu-optimize-dot-operands{hoist-layout-conversion=true}, triton-nvidia-optimize-descriptor-encoding, triton-loop-aware-cse, tritongpu-fuse-nested-loops, canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-licm, tritongpu-optimize-accumulator-init, tritongpu-hoist-tmem-alloc, tritongpu-promote-lhs-to-tmem, tritongpu-assign-latencies{num-stages=3}, tritongpu-schedule-loops, tritongpu-automatic-warp-specialization{num-stages=3}, tritongpu-pipeline{dump-intermediate-steps=false num-stages=3}, tritongpu-combine-tensor-select-and-if, triton-nvidia-gpu-remove-tmem-tokens, canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-loop-aware-cse, tritongpu-prefetch, tritongpu-optimize-dot-operands{hoist-layout-conversion=true}, tritongpu-coalesce-async-copy, triton-nvidia-optimize-tmem-layouts, tritongpu-remove-layout-conversions, triton-nvidia-interleave-tmem, tritongpu-reduce-data-duplication, tritongpu-reorder-instructions, triton-loop-aware-cse, symbol-dce, triton-nvidia-tma-lowering, triton-nvidia-gpu-fence-insertion{compute-capability=90}, sccp, canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true})",
      disable_threading: false,
      verify_each: true
    }
  }
#-}
/tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline
/tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: note: Pipeline failed while executing [`TritonGPUHoistTMEMAlloc` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
Triton compilation failed: triton_tem_fused_zeros_1
def triton_tem_fused_zeros_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0):
    PRESCALE_QK : tl.constexpr = False
    ROWS_GUARANTEED_SAFE : tl.constexpr = False
    BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False
    WRITE_DQ : tl.constexpr = True
    OUTPUT_LOGSUMEXP : tl.constexpr = True
    FLOAT32_PRECISION : tl.constexpr = 'tf32'
    IS_DIVISIBLE : tl.constexpr = False
    SM_SCALE : tl.constexpr = 0.125
    GQA_SHARED_HEADS : tl.constexpr = 4
    HAS_FULL_BLOCKS : tl.constexpr = True
    QK_HEAD_DIM : tl.constexpr = 64
    QK_HEAD_DIM_ROUNDED : tl.constexpr = 64
    V_HEAD_DIM : tl.constexpr = 64
    V_HEAD_DIM_ROUNDED : tl.constexpr = 64
    SAFE_HEAD_DIM : tl.constexpr = True
    BLOCK_M1 : tl.constexpr = 64
    BLOCK_N1 : tl.constexpr = 64
    BLOCK_M2 : tl.constexpr = 64
    BLOCK_N2 : tl.constexpr = 64
    SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128
    SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128
    Q = arg_Q
    K = arg_K
    V = arg_V
    LSE = arg_LSE
    DELTA = arg_DELTA
    DO = arg_DO
    DQ = arg_DQ
    DV = arg_DV
    KV_NUM_BLKS = arg_KV_NUM_BLKS
    KV_IDX = arg_KV_IDX
    Q_NUM_BLKS = arg_Q_NUM_BLKS
    Q_IDX = arg_Q_IDX
    FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS
    FULL_KV_IDX = arg_FULL_KV_IDX
    FULL_Q_NUM_BLKS = arg_FULL_Q_NUM_BLKS
    FULL_Q_IDX = arg_FULL_Q_IDX

    # Sub notation for this kernel:
    #
    # Q: Query, K: Key, V: Value
    # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype)
    # DELTA: Precomputed sum(OUT*DO, axis=-1)
    # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value
    # DK: Derivative of Key, is the written to via the store_output call due to some limitations with
    # inductor codegen
    # M: Number of queries, N: Number of keys/values
    # QK_HEAD_DIM: The dimension of the query and key embeddings
    # V_HEAD_DIM: The dimension of the value embeddings
    # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim
    # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups.
    # (Modifiable) Performance tuning options
    # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block.
    # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V.
    # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q.
    # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block.
    #
    # The following FULL_* and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid.
    # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query.
    # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query.
    # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query.
    # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query.
    # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query.
    # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query.
    # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query.
    # FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query.

    # The below are kernel options that can be applied for certain score_mods,
    # or involve a numerics vs. perf tradeoff
    # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has
    # about 20% more numerical error, but slightly faster.

    # Define strides of inputs
    stride_qz, stride_qh, stride_qm, stride_qd = 32768, 2048, 64, 1
    stride_kz, stride_kh, stride_kn, stride_kd = 65536, 16384, 64, 1
    stride_vz, stride_vh, stride_vn, stride_vd = 65536, 16384, 64, 1
    stride_doz, stride_doh, stride_dom, stride_dod = 32768, 2048, 64, 1

    stride_dqz, stride_dqh, stride_dqm, stride_dqd = 32768, 2048, 64, 1
    stride_dvz, stride_dvh, stride_dvm, stride_dvd = 65536, 16384, 64, 1

    ZQ = 2
    HQ = 16
    HKV = 4
    Q_LEN = 32
    ZKV = 2
    KV_LEN = 256

    MATMUL_PRECISION = Q.dtype.element_ty

    pid = tl.program_id(0)
    NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
    NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)

    off_zq = tl.program_id(1) # q batch idx
    off_hkv = tl.program_id(2) # kv head idx
    off_zkv = off_zq % ZKV # kv batch idx

    SPARSE_Z = 2
    SPARSE_HQ = 16

    sparse_idx_z = off_zq % SPARSE_Z

    k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64)
    v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64)
    # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
    # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
    dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64)

    # offset K, V, DV pointers for batch/kv-head
    K += k_adj
    V += v_adj
    DV += dv_adj

    RCP_LN2 = 1.44269504
    offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
    offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)

    if pid >= NUM_KV_BLOCKS:
        off_pid = pid - NUM_KV_BLOCKS
        # THIS BLOCK DOES DQ
        SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2)
        SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2)
        off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS
        start_m2_block = off_pid % NUM_Q_BLOCKS
        off_pid_mask = start_m2_block // SPARSE_Q_MULTIPLE
        stride_kv_num_blks_h = 1
        stride_kv_idx_h = 2
        stride_kv_idx_m = 2

        sparse_idx_hq2 = off_hq2 % SPARSE_HQ
        sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2

        sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask
        sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m  # noqa: B950

        # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
        q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64)
        do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64)
        dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64)
        off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64)

        Q2 = Q + q_adj2
        DO2 = DO + do_adj2
        # TODO: This does not work if DQ is not the same layout as Q (for example,
        # if Q is broadcasted)
        DQ2 = DQ + dq_adj2
        LSE2 = LSE + off_chz2
        DELTA2 = DELTA + off_chz2

        # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32)
        dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)

        start_m2 = start_m2_block * BLOCK_M2
        offs_m2 = start_m2 + tl.arange(0, BLOCK_M2)

        # load Q and do: they stay in SRAM throughout the inner loop.
        q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
        do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM)

        if PRESCALE_QK:
            q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)

        if IS_DIVISIBLE:
            Di = tl.load(DELTA2 + offs_m2)
            lse = tl.load(LSE2 + offs_m2)
        else:
            Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN)
            lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN)
        lse = tl.where(lse == -float("inf"), 0.0, lse)
        lse = lse[:, None]

        # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # KV_IDX and KV_NUM_BLKS are always contiguous.
        kv_indices = KV_IDX + sparse_kv_idx_offset
        kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
        sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset)

        offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
        dq = bwd_dq_inner(
            arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0,
            K, V,
            dq, q, do, Di, lse,
            off_zq, off_hq2, offs_m2, offs_n2,
            stride_kn, stride_kd, stride_vn, stride_vd,
            kv_indices, sparse_kv_num_blocks,
            MATMUL_PRECISION,
            IS_FULL_BLOCKS=False,
        )

        if HAS_FULL_BLOCKS:
            # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous.
            kv_indices = FULL_KV_IDX + sparse_kv_idx_offset
            kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
            sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)

            offs_n2 = kv_start + tl.arange(0, BLOCK_N2)
            dq = bwd_dq_inner(
                arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0,
                K, V,
                dq, q, do, Di, lse,
                off_zq, off_hq2, offs_m2, offs_n2,
                stride_kn, stride_kd, stride_vn, stride_vd,
                kv_indices, sparse_kv_num_blocks,
                MATMUL_PRECISION,
                IS_FULL_BLOCKS=True,
            )

        # Write back dQ.
        dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd
        dq *= SM_SCALE
        if IS_DIVISIBLE and SAFE_HEAD_DIM:
            tl.store(dq_ptrs, dq)
        else:
            tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM))
    else:
        # THIS BLOCK DOES DK & DV
        SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1)
        SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1)

        pid_mask = pid // SPARSE_KV_MULTIPLE

        stride_q_num_blks_h = 2
        stride_q_idx_h = 2
        stride_q_idx_n = 1

        dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32)
        dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32)

        start_n1 = pid * BLOCK_N1
        offs_n1 = start_n1 + tl.arange(0, BLOCK_N1)

        # load K and V: they stay in SRAM throughout the inner loop.
        k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
        v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)

        if PRESCALE_QK:
            k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION)

        for off_g in range(0, GQA_SHARED_HEADS):
            off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g

            # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads.
            q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64)
            do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64)
            dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64)
            off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64)

            Q1 = Q + q_adj1
            DO1 = DO + do_adj1
            # TODO: This does not work if DQ is not the same layout as Q (for example,
            # if Q is broadcasted)
            LSE1 = LSE + off_chz1
            DELTA1 = DELTA + off_chz1

            sparse_idx_hq1 = off_hq1 % SPARSE_HQ
            sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1

            sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask
            sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n  # noqa: B950

            # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # Q_IDX and Q_NUM_BLKS are always contiguous.
            q_indices = Q_IDX + sparse_q_idx_offset
            q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
            sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset)

            offs_m1 = q_start + tl.arange(0, BLOCK_M1)
            dk, dv = bwd_dkdv_inner(
                arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0,
                Q1, DO1, DELTA1, LSE1,
                dk, dv, k, v,
                off_zq, off_hq1, offs_n1, offs_m1,
                stride_qm, stride_qd, stride_dom, stride_dod,
                q_indices, sparse_q_num_blocks,
                MATMUL_PRECISION,
                IS_FULL_BLOCKS=False,
            )

            if HAS_FULL_BLOCKS:
                # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous.
                q_indices = FULL_Q_IDX + sparse_q_idx_offset
                q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading
                sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset)

                offs_m1 = q_start + tl.arange(0, BLOCK_M1)
                dk, dv = bwd_dkdv_inner(
                    arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0,
                    Q1, DO1, DELTA1, LSE1,
                    dk, dv, k, v,
                    off_zq, off_hq1, offs_n1, offs_m1,
                    stride_qm, stride_qd, stride_dom, stride_dod,
                    q_indices, sparse_q_num_blocks,
                    MATMUL_PRECISION,
                    IS_FULL_BLOCKS=True,
                )

        # Write back dV and dK.
        dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd

        index_n = offs_n1[:, None]
        index_k = offs_k[None, :]
        index_v = offs_v[None, :]

        if IS_DIVISIBLE and SAFE_HEAD_DIM:
            tl.store(dv_ptrs, dv)
        else:
            tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM))

        dk *= SM_SCALE

        if SAFE_HEAD_DIM:
            mask = index_n < KV_LEN
        else:
            mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM)

        # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM]
        # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM]
        xindex = index_k + 64*index_n + 16384*off_hkv + 65536*off_zq
        tl.store(out_ptr0 + (tl.broadcast_to(xindex, dk.shape)), dk, mask)

metadata: {'signature': {'arg_Q': '*fp16', 'arg_K': '*fp16', 'arg_V': '*fp16', 'arg_LSE': '*fp32', 'arg_DELTA': '*fp32', 'arg_DO': '*fp16', 'arg_DQ': '*fp16', 'arg_DV': '*fp16', 'arg_KV_NUM_BLKS': '*i32', 'arg_KV_IDX': '*i32', 'arg_Q_NUM_BLKS': '*i32', 'arg_Q_IDX': '*i32', 'arg_FULL_KV_NUM_BLKS': '*i32', 'arg_FULL_KV_IDX': '*i32', 'arg_FULL_Q_NUM_BLKS': '*i32', 'arg_FULL_Q_IDX': '*i32', 'out_ptr0': '*fp16'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 3, 'debug': True, 'cc': 100}
Traceback (most recent call last):
  File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 748, in _precompile_config
    binary = triton.compile(*compile_args, **compile_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/compiler/compiler.py", line 359, in compile
    next_module = compile_ir(module, metadata)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 456, in <lambda>
    stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 298, in make_ttgir
    pm.run(mod)
RuntimeError: PassManager::run failed
frames [('total', 3), ('ok', 3)]
inline_call []
stats [('calls_captured', 8), ('unique_graphs', 3)]
aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)]
inductor [('triton_bundler_save_kernel', 8), ('async_compile_cache_miss', 3), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1), ('fxgraph_cache_bypass', 1)]
graph_break []
F

==================================================== FAILURES =====================================================
_____________________________ TestFlexAttentionCUDA.test_GQA_score_mod1_cuda_float16 ______________________________
Traceback (most recent call last):
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/unittest/case.py", line 58, in testPartExecutor
    yield
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/unittest/case.py", line 634, in run
    self._callTestMethod(testMethod)
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/unittest/case.py", line 589, in _callTestMethod
    if method() is not None:
       ^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
    method(*args, **kwargs)
  File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper
    method(*args, **kwargs)
  File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 446, in instantiated_test
    raise rte
  File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
    result = test(self, **param_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 1349, in dep_fn
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn
    return fn(slf, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/test/inductor/test_flex_attention.py", line 1430, in test_GQA
    self.run_test(*inputs)
  File "/home/drisspg/meta/pytorch/test/inductor/test_flex_attention.py", line 566, in run_test
    compiled_out.backward(backward_grad)
  File "/home/drisspg/meta/pytorch/torch/_tensor.py", line 625, in backward
    torch.autograd.backward(
  File "/home/drisspg/meta/pytorch/torch/autograd/__init__.py", line 354, in backward
    _engine_run_backward(
  File "/home/drisspg/meta/pytorch/torch/autograd/graph.py", line 829, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/autograd/function.py", line 315, in apply
    return user_fn(self, *args)
           ^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2303, in backward
    return impl_fn()
           ^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2289, in impl_fn
    out = CompiledFunction._backward_impl(ctx, all_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2394, in _backward_impl
    CompiledFunction.compiled_bw = aot_config.bw_compiler(
                                   ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/schemas.py", line 1256, in __call__
    return self.compiler_fn(gm, example_inputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_dynamo/backends/common.py", line 76, in _wrapped_bw_compiler
    disable(
  File "/home/drisspg/meta/pytorch/torch/_dynamo/eval_frame.py", line 1005, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_utils_internal.py", line 92, in wrapper_function
    return function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 2428, in bw_compiler
    return inner_compile(
           ^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 773, in compile_fx_inner
    return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_dynamo/repro/after_aot.py", line 124, in debug_wrapper
    inner_compiled_fn = compiler_fn(gm, example_inputs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 952, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
                        ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 1652, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 1506, in codegen_and_compile
    compiled_module = graph.compile_to_module()
                      ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/graph.py", line 2318, in compile_to_module
    return self._compile_to_module()
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/graph.py", line 2328, in _compile_to_module
    mod = self._compile_to_module_lines(wrapper_code)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/graph.py", line 2396, in _compile_to_module_lines
    mod = PyCodeCache.load_by_key_path(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/codecache.py", line 3466, in load_by_key_path
    mod = _reload_python_module(key, path, set_sys_modules=in_toplevel)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/compile_tasks.py", line 33, in _reload_python_module
    exec(code, mod.__dict__, mod.__dict__)
  File "/tmp/tmp0yiz3c94/az/caza2gzmsagyuusmf2ka3oat3na4xv6zudssk244xmlzsbv2knze.py", line 117, in <module>
  File "/home/drisspg/meta/pytorch/torch/_inductor/async_compile.py", line 489, in triton
    kernel.precompile(
  File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 437, in precompile
    self._precompile_worker()
  File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 459, in _precompile_worker
    compile_results.append(self._precompile_config(c))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 748, in _precompile_config
    binary = triton.compile(*compile_args, **compile_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/compiler/compiler.py", line 359, in compile
    next_module = compile_ir(module, metadata)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 456, in <lambda>
    stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 298, in make_ttgir
    pm.run(mod)
RuntimeError: PassManager::run failed

To execute this test, run the following from the base repo dir:
    python test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_GQA_score_mod1_cuda_float16

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
============================================= short test summary info =============================================
FAILED [5.1441s] test/inductor/test_flex_attention.py::TestFlexAttentionCUDA::test_GQA_score_mod1_cuda_float16 - RuntimeError: PassManager::run failed
================================== 1 failed, 1 passed, 1404 deselected in 18.10s ==================================
~/meta/pytorch flex-warning !1 ❯
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160227
Approved by: https://github.com/Skylion007, https://github.com/Chillee
2025-08-11 23:30:20 +00:00
99bc2f94c1 Update export/schema.py (#160220)
Summary:
Model could have multiple ExportedPrograms
- for different methods. They can have different weights.
- for different delegates. They can also have different weights.

For this reason, we make weight per ExportedProgram.

Also, we clean up Model and Program. IIUC, Model and Program are not used anywhere, so it's OK to make a BC-breaking change.
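
A rough sketch of the direction (dataclass names and fields are illustrative assumptions, not the actual schema.py definitions):

```python
from dataclasses import dataclass, field

# Illustrative only: these are not the real export schema types.
@dataclass
class ExportedProgram:
    graph_module: object
    # Weights now live on each ExportedProgram, since different methods and
    # different delegates can carry different weights.
    weights: dict = field(default_factory=dict)

@dataclass
class Model:
    name: str
    # A model can hold multiple ExportedPrograms (one per method / delegate).
    methods: dict = field(default_factory=dict)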

Test Plan:
CI

Rollback Plan:

Differential Revision: D79917395

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160220
Approved by: https://github.com/angelayi, https://github.com/dolpm, https://github.com/jingsh
2025-08-11 23:14:08 +00:00
fc25c68f20 [hop][exc] make UncapturedHigherOrderOpError print user code and avoid re-raise (#159296)
After the change, the error stack trace is attached with the user code stack and is collapsed into a single traceback (without the "scroll up" message). For example:
```python
    class Test(torch.nn.Module):
        def forward(self, c, x):
            def cond_fn(c, x):
                return c > 0 and x.size(0) < 20

            def body_fn(c, x):
                return c - 1, x.sin()

            return torch._higher_order_ops.while_loop(cond_fn, body_fn, (c, x))
```

Now gives the following error message:
```python
Traceback (most recent call last):
  File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1705, in test_while_loop_size_mismatch_tensor_expansion
    self._run_test(
    ~~~~~~~~~~~~~~^
        model=WhileLoopModels.SizeMismatchTensorExpansion(),
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        dynamic=dynamic,
        ^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1417, in _run_test
    result = model(*inputs_with_counters)
  File "/home/yidi/local/pytorch/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1053, in forward
    return torch._higher_order_ops.while_loop(cond_fn, body_fn, (c, x))
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_higher_order_ops/while_loop.py", line 176, in while_loop
    return torch.compile(
           ~~~~~~~~~~~~~~
        _while_loop_op_wrapper, backend=backend, fullgraph=True
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    )(flat_cond_fn, flat_body_fn, tuple(flat_inputs), tuple())
    ~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/eval_frame.py", line 804, in compile_wrapper
    return fn(*args, **kwargs)
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 1595, in __call__
    result = self._torchdynamo_orig_backend(
        frame, cache_entry, self.hooks, frame_state, skip=1
    )
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 1353, in __call__
    result = self._inner_convert(
        frame, cache_entry, hooks, frame_state, skip=skip + 1
    )
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 682, in __call__
    result = _compile(
        frame.f_code,
    ...<16 lines>...
        convert_frame_box=self._box,
    )
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 1172, in _compile
    guarded_code = compile_inner(code, one_graph, hooks, transform)
  File "/home/yidi/local/pytorch/torch/_utils_internal.py", line 98, in wrapper_function
    return function(*args, **kwargs)
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 858, in compile_inner
    return _compile_inner(code, one_graph, hooks, transform)
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 897, in _compile_inner
    out_code = transform_code_object(code, transform)
  File "/home/yidi/local/pytorch/torch/_dynamo/bytecode_transformation.py", line 1461, in transform_code_object
    transformations(instructions, code_options)
    ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 300, in _fn
    return fn(*args, **kwargs)
  File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 818, in transform
    tracer.run()
    ~~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3528, in run
    super().run()
    ~~~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1372, in run
    while self.step():
          ~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1276, in step
    self.dispatch_table[inst.opcode](self, inst)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 852, in wrapper
    return inner_fn(self, inst)
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2240, in CALL_FUNCTION_EX
    self.call_function(fn, argsvars.items, kwargsvars)
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1200, in call_function
    self.push(fn.call_function(self, args, kwargs))  # type: ignore[arg-type]
              ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/lazy.py", line 212, in realize_and_forward
    return getattr(self.realize(), name)(*args, **kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 91, in graph_break_as_hard_error
    raise exc.with_traceback(sys.exc_info()[2]) from None
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 77, in graph_break_as_hard_error
    return fn(*args, **kwargs)
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 1287, in call_function
    ) = speculate_subgraph(
        ~~~~~~~~~~~~~~~~~~^
        tx,
        ^^^
    ...<33 lines>...
        supports_aliasing=self.supports_aliasing,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 877, in speculate_subgraph
    raise ex
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 718, in speculate_subgraph
    output = f.call_function(tx, args, sub_kwargs)
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 580, in call_function
    return super().call_function(tx, args, kwargs)
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 334, in call_function
    return tx.inline_user_function_return(self, [*self.self_args(), *args], kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1217, in inline_user_function_return
    return InliningInstructionTranslator.inline_call(self, fn, args, kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3733, in inline_call
    return tracer.inline_call_()
           ~~~~~~~~~~~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3936, in inline_call_
    self.run()
    ~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1372, in run
    while self.step():
          ~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1276, in step
    self.dispatch_table[inst.opcode](self, inst)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 852, in wrapper
    return inner_fn(self, inst)
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2240, in CALL_FUNCTION_EX
    self.call_function(fn, argsvars.items, kwargsvars)
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1200, in call_function
    self.push(fn.call_function(self, args, kwargs))  # type: ignore[arg-type]
              ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/lazy.py", line 212, in realize_and_forward
    return getattr(self.realize(), name)(*args, **kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 580, in call_function
    return super().call_function(tx, args, kwargs)
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 334, in call_function
    return tx.inline_user_function_return(self, [*self.self_args(), *args], kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1217, in inline_user_function_return
    return InliningInstructionTranslator.inline_call(self, fn, args, kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3733, in inline_call
    return tracer.inline_call_()
           ~~~~~~~~~~~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3936, in inline_call_
    self.run()
    ~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1372, in run
    while self.step():
          ~~~~~~~~~^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1276, in step
    self.dispatch_table[inst.opcode](self, inst)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 830, in inner
    unimplemented_v2(
    ~~~~~~~~~~~~~~~~^
        gb_type="Data-dependent branching",
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        ],
        ^^
    )
    ^
  File "/home/yidi/local/pytorch/torch/_dynamo/exc.py", line 580, in unimplemented_v2
    raise Unsupported(msg)
torch._dynamo.exc.UncapturedHigherOrderOpError: while_loop doesn't work unless it is captured completely with torch.compile. Got Data-dependent branching
  Explanation: Detected data-dependent branching (e.g. `if my_tensor.sum() > 0:`). Dynamo does not support tracing dynamic control flow.
  Hint: This graph break is fundamental - it is unlikely that Dynamo will ever be able to trace through your code. Consider finding a workaround.
  Hint: Use `torch.cond` to express dynamic control flow.

  Developer debug context: attempted to jump with TensorVariable()

 For more details about this graph break, please visit: https://pytorch-labs.github.io/compile-graph-break-site/gb/gb0170.html

from user code:
   File "/home/yidi/local/pytorch/torch/_higher_order_ops/while_loop.py", line 167, in _while_loop_op_wrapper
    return while_loop_op(*args, **kwargs)
  File "/home/yidi/local/pytorch/torch/_higher_order_ops/while_loop.py", line 137, in flat_cond_fn
    return cond_fn(*carried, *additional)
  File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1047, in cond_fn
    return c > 0 and x.size(0) < 20

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

To execute this test, run the following from the base repo dir:
    python test/inductor/test_control_flow.py WhileLoopTests.test_while_loop_size_mismatch_tensor_expansion_device_cpu_dynamic_False

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159296
Approved by: https://github.com/zou3519
2025-08-11 22:48:10 +00:00
5a40c57844 [MTIA] Implement isAvailable() for MTIA hooks (#160304)
Summary: MTIA is missing the `isAvailable()` override, which is necessary for some of the device agnostic methods.

Test Plan:
`torch._C._get_accelerator()`

Rollback Plan:

Differential Revision: D79981115

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160304
Approved by: https://github.com/nautsimon
2025-08-11 21:45:11 +00:00
7d2ec704e4 Fix MPS autocast for ConvTranspose3d (#160345)
## Summary
- ensure ConvTranspose3d uses fp32 under MPS autocast
- add MPS autocast test for ConvTranspose3d
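
A minimal check in the spirit of the added test (a sketch only, assuming an MPS-capable machine):

```python
import torch

conv = torch.nn.ConvTranspose3d(2, 2, kernel_size=3).to("mps")
x = torch.randn(1, 2, 4, 4, 4, device="mps")
with torch.autocast(device_type="mps", dtype=torch.float16):
    out = conv(x)
# ConvTranspose3d should stay in fp32 under MPS autocast after this fix.
assert out.dtype == torch.float32
```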

Generated by Codex, see https://chatgpt.com/codex/tasks/task_e_689a360388288327a2cac6f55bbfc42c

Fixes https://github.com/pytorch/pytorch/issues/160332

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160345
Approved by: https://github.com/dcci
2025-08-11 21:01:52 +00:00
fc80f6859e Fix collective schedule logging and runtime tests (#160260)
Summary:

- Fix collective schedule logging so that it only logs when collectives are present
- Fix the runtime estimate test to check that each op has a numeric value

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160260
Approved by: https://github.com/Skylion007
2025-08-11 20:58:52 +00:00
cf0a0dcb0a Make user defined Triton kernels serializable for fx_graph_runnable (#160002)
Resolves issue https://github.com/pytorch/pytorch/issues/153475 where `fx_graph_runnable` didn't work with user defined triton kernels.
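
For context, a hypothetical user-defined Triton kernel of the kind such a repro has to serialize (illustrative; not the kernel from the linked issue, and it assumes a Triton-capable GPU):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    y = tl.load(y_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x + y, mask=mask)

@torch.compile
def f(x, y):
    # x, y are expected to be CUDA tensors of the same shape.
    out = torch.empty_like(x)
    n = out.numel()
    add_kernel[(triton.cdiv(n, 1024),)](x, y, out, n, BLOCK=1024)
    return out
```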

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160002
Approved by: https://github.com/eellison
2025-08-11 20:54:33 +00:00
b149c7204c Revert "port distributed pipeline test files for Intel GPU (#159033)"
This reverts commit 76a0609b6bddb2bc40f1eb4ade12885023653d59.

Reverted https://github.com/pytorch/pytorch/pull/159033 on behalf of https://github.com/clee2000 due to broke test_cpp_extensions_stream_and_event.py::TestCppExtensionStreamAndEvent::test_stream_event [GH job link](https://github.com/pytorch/pytorch/actions/runs/16890370216/job/47849586456) [HUD commit link](76a0609b6b) note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/159033#issuecomment-3176833314))
2025-08-11 20:44:45 +00:00
09381f5dac Revert "[Graph Partition] Pass all OSS unit tests (#154667)"
This reverts commit ca7315c17162ea21b1ca5ba23f4bf6168766c7b9.

Reverted https://github.com/pytorch/pytorch/pull/154667 on behalf of https://github.com/clee2000 due to broke inductor/test_memory.py::TestOperatorReorderForPeakMemory::test_reorder_peak_memory_lpmf [GH job link](https://github.com/pytorch/pytorch/actions/runs/16885961204/job/47836769279) [HUD commit link](ca7315c171) note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/154667#issuecomment-3176805477))
2025-08-11 20:34:27 +00:00
9eedd2a20b [PGO] no counterfactual suggestions for dynamic allowlist (#160231)
Being more conservative with whitelist suggestions as we roll out suggestions; now we only suggest sources that were dynamic in previous runs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160231
Approved by: https://github.com/bobrenjc93
2025-08-11 20:13:25 +00:00
c3dc8dc412 159965 is merged, no need to patch it in (#160275)
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160275
Approved by: https://github.com/albanD, https://github.com/ZainRizvi
2025-08-11 19:55:04 +00:00
76a0609b6b port distributed pipeline test files for Intel GPU (#159033)
In this PR we port all distributed pipeline test files.
We enable Intel GPU with the following methods while trying our best to keep the original code style (a sketch of the resulting pattern follows the list):

1. instantiate_device_type_tests()
2. use "torch.accelerator.current_accelerator()" to determine the accelerator backend
3. use "requires_accelerator_dist_backend()" to replace requires_nccl()
4. use "get_default_backend_for_device()" to get the backend
5. enable XPU for some test paths
6. add TEST_MULTIACCELERATOR in common_utils for all backends.
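
A rough sketch of the resulting device-agnostic pattern (the helper names come from the list above; their exact import locations are assumed here):

```python
import torch
import torch.distributed as dist

# Pick whatever accelerator is present (CUDA, XPU, ...) instead of hard-coding "cuda".
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"

# Ask for the matching default process-group backend (e.g. "nccl" for CUDA)
# rather than requiring NCCL explicitly.
backend = dist.get_default_backend_for_device(device_type)
print(device_type, backend)
```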

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159033
Approved by: https://github.com/guangyey, https://github.com/d4l3k

Co-authored-by: Daisy Deng <daisy.deng@intel.com>
2025-08-11 19:43:15 +00:00
c8205cb354 [autograd] match 0-dim gradients device type regardless of subclassness (#160165)
Not sure if there are some subclasses where outer.dim() == 0 but you wouldn't want to move it?

FIXES https://github.com/pytorch/pytorch/issues/160084

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160165
Approved by: https://github.com/ezyang, https://github.com/albanD
2025-08-11 17:57:32 +00:00
d25c4f954d [MPS] Type-promote tensor-iterator common dtype (#160334)
Otherwise, `torch.add(FloatTensor, IntTensor, alpha=2)` and `torch.add(FloatTensor, IntTensor, alpha=2)` were dispatched to different kernels
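
For illustration (shown on CPU here; the fix is about making the MPS tensor iterator promote to the same common dtype):

```python
import torch

a = torch.rand(3)                    # float32
b = torch.randint(0, 10, (3,))       # int64
out1 = torch.add(a, b, alpha=2)      # float + alpha * int
out2 = torch.add(b, a, alpha=2)      # int + alpha * float
# Both should promote to the common dtype and therefore hit the same kernel.
assert out1.dtype == out2.dtype == torch.float32
```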

Fixes https://github.com/pytorch/pytorch/issues/160208
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160334
Approved by: https://github.com/Skylion007, https://github.com/dcci
2025-08-11 17:53:56 +00:00
d0e2240f68 [triton_heuristics] Optimize the triton launcher in pt2 (#160000)
Summary:

(Original author: Xu Zhao. Commandeered by David to land this since it is relatively urgent)

We observed ~10us PT2-Triton launch overhead regression after pin update.

Before Triton pin-update:
 {F1980557238}

After Triton pin-update:
 {F1980557240}

The root cause is that https://github.com/pytorch/pytorch/pull/145051 adds `_get_args_with_constexprs` to the cubin launcher caller function, which is on the critical path.

The motivation for `_get_args_with_constexprs` was that between triton 3.2 and triton 3.3, the convention for calling Triton kernels (at the level that non-static-cuda-launcher inductor integrates) changed. Previously, the callable did not take constexpr arguments as parameters; after 3.3, it does. With pointwise/reduction kernels, we don't know the constexpr values until after autotuning occurs; so `_get_args_with_constexprs` would inject constexprs into the arguments list before calling the Triton kernel. The fix (in this PR) is to instead inject the constexpr args into the launcher string - this avoids the cost of sorting/reordering arguments which previously occurred upon execution of each kernel.

Note that static_cuda_launcher.py does not require constants to be passed to the cubin launcher (e96c7c4bb0/torch/_inductor/runtime/static_cuda_launcher.py (L220)), so there is no need to pass constexprs to the generated launcher code.
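
A toy illustration of the change in shape (all names are made up; this is not the actual Inductor codegen):

```python
# Before: constexpr values were merged into the argument list on every launch,
# a per-call Python-level cost on the hot path.
def launch_with_merge(kernel, runtime_args, constexprs):
    return kernel(*runtime_args, **constexprs)

# After: bake the constexpr values into the generated launcher source once,
# after autotuning, so each call just forwards the runtime args.
def make_launcher_source(kernel_name, constexprs):
    baked = ", ".join(f"{k}={v!r}" for k, v in constexprs.items())
    return (
        f"def launcher(*runtime_args):\n"
        f"    return {kernel_name}(*runtime_args, {baked})\n"
    )

print(make_launcher_source("triton_poi_fused_add_0", {"XBLOCK": 1024}))
```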

The new launcher code needs to work on three cases:
- StaticallyLaunchedCudaKernel
- triton.compile.CompiledKernel
- AOTInductor

Analysis: https://docs.google.com/document/d/1PHaSmx2w59K8qpjw5_qzKWShfEgptf_Zpv_DL7YxiWU/edit?tab=t.0

Test Plan:
Before:
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs

1.893x
```

```

$ buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency
  x_val    nop_python_function-walltime    nop_triton_kernel-walltime    nop_triton_compiled_kernel_run-walltime    nop_inductor_kernel-walltime    nop_inductor_kernel_cudagraph-walltime
-------  ------------------------------  ----------------------------  -----------------------------------------  ------------------------------  ----------------------------------------
      0                      0.00760921                       1.80298                                   0.623282                         5.25024                                  0.203722
     19                      0.00799885                       4.78223                                   1.00226                          5.8213                                   0.239084
average                      0.00780403                       3.29261                                   0.812769                         5.53577                                  0.221403
```

After:

```
buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency
  x_val    nop_python_function-walltime    nop_triton_kernel-walltime    nop_triton_compiled_kernel_run-walltime    nop_inductor_kernel-walltime    nop_inductor_kernel_cudagraph-walltime
-------  ------------------------------  ----------------------------  -----------------------------------------  ------------------------------  ----------------------------------------
      0                      0.00747067                       1.92589                                   0.726509                         4.35459                                  0.204205
     19                      0.00747823                       7.36852                                   1.26241                          6.28208                                  0.239278
average                      0.00747445                       4.6472                                    0.994459                         5.31834                                  0.221741
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs

1.985x
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160000
Approved by: https://github.com/jansel

Co-authored-by: Xu Zhao <xzhao9@meta.com>
2025-08-11 17:22:40 +00:00
9ccd0f5e31 Fix unbacked symint and memory leak in inductor memory planning (#159839)
Summary:

In memory planning, some allocation sizes involve unbacked symints. These unbacked symints are not known until they are computed at run time, so **allocation pools that involve unbacked symints cannot be allocated until we have the values of the unbacked symints**.

So we add a notion of `earliest_available` to Allocation nodes. If an allocation node has an unbacked symint, it is only available once its live range begins.

Then in AllocationPool, if a pool involves an Allocation node that has an earliest available time, we restrict its life range.

If a block's earliest available time is later than a pool's life range's start time, we cannot allocate it from the pool.
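
A simplified sketch of that rule (class and field names are hypothetical):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Allocation:
    live_begin: int
    live_end: int
    # Set only when the allocation size contains unbacked symints; the value
    # is the earliest point in the program where those symints are known.
    earliest_available: Optional[int] = None

@dataclass
class AllocationPool:
    live_begin: int
    live_end: int

def can_allocate_from(pool: AllocationPool, block: Allocation) -> bool:
    # A pool cannot serve a block that only becomes available after the
    # pool's life range has already started.
    if block.earliest_available is not None and block.earliest_available > pool.live_begin:
        return False
    return True
```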

We also fix a memory leak that's caused by allocating tensor without wrapping it with RAIIAtenTensor.

In the Python wrapper for JIT Inductor, `codegen_alloc_from_pool` doesn't actually write the alloc lines to the wrapper; it just returns the allocation string. However, in cpp_wrapper, `codegen_alloc_from_pool` actually writes to the wrapper. Specifically, it writes the following and returns the string `RAIIAtenTensorHandle`.

```
AtenTensorHandle handle_name;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch__alloc_from_pool(....);
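// Bug-prone on its own: the matching owning wrapper, e.g.
// RAIIAtenTensorHandle raii_handle(handle_name); (line shown for illustration),
// must be emitted as well, otherwise handle_name is never freed.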
```

This is bug-prone. **If you write aoti_torch__alloc_from_pool lines, you must write the RAIIAtenTensorHandle as well**; otherwise you get memory leaks.

We remove the alloc_from_pool call from codegen_create, because this doesn't work for AOTI. In the Python wrapper, we can generate the same alloc_from_pool variable name for the same block, but cpp_wrapper will generate a different variable name for each call to alloc_from_pool.

Test Plan:
```
 python test/inductor/test_memory_planning.py
```

Rollback Plan:

Differential Revision: D79603119

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159839
Approved by: https://github.com/jansel
2025-08-11 17:16:15 +00:00
ca7315c171 [Graph Partition] Pass all OSS unit tests (#154667)
Graph partition leads to 6.2% speedup on vision_maskrcnn, 5.8% speedup on yolov3. [P1819700563](https://www.internalfb.com/phabricator/paste/view/P1819700563), 39.5% speedup on speech_transformer inference [P1830602200](https://www.internalfb.com/phabricator/paste/view/P1830602200), 85% speedup on speech_transformer training [P1831115315](https://www.internalfb.com/phabricator/paste/view/P1831115315).

Run the same diff on two days and both show speedup on average.

[first TorchInductor Benchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2021%20Jul%202025%2016%3A37%3A55%20GMT&stopTime=Mon%2C%2028%20Jul%202025%2016%3A37%3A55%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=75ef90fe89b82c967362a2d40fdf1af047202bc2&rBranch=main&rCommit=abcb24f4de11f8fedf2c2c9ff53b6092ef42306d)
<img width="1885" height="752" alt="image" src="https://github.com/user-attachments/assets/13bba9fc-5dbf-42ad-8558-d54f7e367b41" />

[second TorchInductorBenchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2023%20Jul%202025%2016%3A38%3A27%20GMT&stopTime=Wed%2C%2030%20Jul%202025%2016%3A38%3A27%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=66de27e29338c26b1be94733049868cb0309ea52&rBranch=main&rCommit=70d2e9ba455c3c910f6f95b24171c8eee7bc00bf)
<img width="2513" height="1030" alt="image" src="https://github.com/user-attachments/assets/3a413dcb-2314-4292-919a-7ca181f9eeac" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/154667
Approved by: https://github.com/eellison
2025-08-11 16:25:12 +00:00
68a4b4b2e3 [codemod] Fix unreachable-break issue in caffe2/c10/cuda/CUDAFunctions.cpp +2 (#160257)
Summary:
LLVM has a warning `-Wunreachable-code-break` which identifies `break` statements that cannot be reached. These compromise readability, are misleading, and may identify bugs. This diff removes such statements.

For questions/comments, contact r-barnes.

 - If you approve of this diff, please use the "Accept & Ship" button :-)

Test Plan:
Sandcastle

Rollback Plan:

Differential Revision: D79835614

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160257
Approved by: https://github.com/Skylion007
2025-08-11 16:09:24 +00:00
80cca83079 [inductor] Skip some AOTI UTs on Windows. (#160287)
Skip some AOTI UTs on Windows; they are not fully ready.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160287
Approved by: https://github.com/ezyang
2025-08-11 13:50:43 +00:00
515cb70367 [inductor] normalize_path_separator for test_different_file_paths_local_pgo (#160286)
`normalize_path_separator` for test_different_file_paths_local_pgo

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160286
Approved by: https://github.com/ezyang
2025-08-11 13:50:18 +00:00
cyy
c184cb3852 [submodule] Bump fbgemm to latest (#158210)
Merge the recent commits of FBGEMM and remove unnecessary CMake code.
Specifically, we
1. enable `fbgemm_autovec` since the target is now correctly handled.
2. remove option `USE_FAKELOWP` which is not used.
3. remove `CAFFE2_COMPILER_SUPPORTS_AVX512_EXTENSIONS` check.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158210
Approved by: https://github.com/q10
2025-08-11 13:48:02 +00:00
2259dbed4e Update slow tests (#158222)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158222
Approved by: https://github.com/pytorchbot
2025-08-11 12:00:13 +00:00
05029ad1c3 [xla hash update] update the pinned xla hash (#160306)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160306
Approved by: https://github.com/pytorchbot
2025-08-11 11:28:49 +00:00
cyy
cf4964be68 Remove unnecessary CMake checks for glog (#158185)
With the update to CMake 3.27, some old scripts can be removed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158185
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-08-11 10:14:47 +00:00
ecea81117b Fix clang builds by adding headers (#160252)
Clang compiler from llvm-14 fails to build full torch from source with the message
```
no template named 'unordered_map' in namespace 'std'
  std::unordered_map<std::string, HandlerFunc> handlers_{};
 ~~~~~^
```
A similar issue here https://github.com/intel/llvm/issues/5264
Fix is to add the correct headers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160252
Approved by: https://github.com/Skylion007, https://github.com/cyyever
2025-08-11 09:03:14 +00:00
1c2cba17ea [FR] Add stack_id and an optional print of stack_id to stack_trace mapping (#160119)
To better help users debug with FR, we want to add stack_id and print a map between stack_id and stack_trace (optional)

Screenshot:

<img width="1029" height="529" alt="image" src="https://github.com/user-attachments/assets/8404a1d3-cc33-4f5f-971b-29609ec316c1" />

<img width="1620" height="358" alt="image" src="https://github.com/user-attachments/assets/3dd29c8c-ff68-41a2-acfd-e770036cfeb1" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160119
Approved by: https://github.com/H-Huang, https://github.com/wconstab
2025-08-11 07:27:10 +00:00
ff0d56d035 [Inductor] [Triton] Enable Configuration warmup/rep iterations when benchmarking in inductor (#159982)
Summary:
When benchmarking with max-autotune on B200, I discovered that the estimates in the autotune logs consistently showed the ATEN result as more than 20% better on an example shape. Here is an example of the output:

```
Autotune Choices Stats:
{"num_choices": 20, "num_triton_choices": 19, "best_kernel": "mm", "best_time": 0.3081120103597641, "best_triton_pos": 1, "best_triton_time": 0.6589759886264801, "best_triton_kernel": "triton_mm_16", "best_triton_kernel_desc": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0"}
AUTOTUNE mm(3840x1152, 1152x49136)
strides: [1, 3840], [49152, 1]
dtypes: torch.bfloat16, torch.bfloat16
  mm 0.3081 ms 100.0%
  triton_mm_16 0.6590 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_17 0.6830 ms 45.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_13 0.7015 ms 43.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_9 0.8487 ms 36.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_11 0.8695 ms 35.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_10 0.8797 ms 35.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_18 0.9089 ms 33.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=5, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_14 0.9718 ms 31.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_15 1.0169 ms 30.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
SingleProcess AUTOTUNE benchmarking takes 2.8574 seconds and 0.1032 seconds precompiling for 20 choices
Removed 3483 outliers from 28645 samples
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:20<00:00, 20.00s/it]
          (M, N, K)    pt2_matmul_maxautotune-latency    pt2_matmul_maxautotune-speedup    pt2_matmul_maxautotune-tflops
-------------------  --------------------------------  --------------------------------  -------------------------------
(3840, 49136, 1152)                 0.359392 (±8.27%)                                                            1209.61
            average                                                                                              1209.61
```

Based on my reading about B200 power usage, I believe this is due to the need for power-aware benchmarking, as a kernel may perform better in short bursts. This PR adds environment variables to expand the autotuning iterations so we can get more consistent results between the estimate and the actual runtime. I did not update the default yet, even for B200, because I'm not sure how this is used in practice.

This is the new output:

```
Autotune Choices Stats:
{"num_choices": 20, "num_triton_choices": 19, "best_kernel": "mm", "best_time": 0.3848319947719574, "best_triton_pos": 1, "best_triton_time": 0.6287680268287659, "best_triton_kernel": "triton_mm_16", "best_triton_kernel_desc": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0"}
AUTOTUNE mm(3840x1152, 1152x49136)
strides: [1, 3840], [49152, 1]
dtypes: torch.bfloat16, torch.bfloat16
  mm 0.3848 ms 100.0%
  triton_mm_16 0.6288 ms 61.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_13 0.6299 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_17 0.6728 ms 57.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_9 0.7189 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_18 0.8566 ms 44.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=5, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_11 0.8693 ms 44.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_14 0.9298 ms 41.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_10 0.9524 ms 40.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
  triton_mm_15 1.0216 ms 37.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0
SingleProcess AUTOTUNE benchmarking takes 3.9245 seconds and 0.0965 seconds precompiling for 20 choices
Removed 3537 outliers from 29530 samples
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:23<00:00, 23.70s/it]
          (M, N, K)    pt2_matmul_maxautotune-latency    pt2_matmul_maxautotune-speedup    pt2_matmul_maxautotune-tflops
-------------------  --------------------------------  --------------------------------  -------------------------------
(3840, 49136, 1152)                 0.359328 (±9.71%)                                                            1209.82
            average                                                                                              1209.82
```

Test Plan:
`TORCH_AUTOTUNE_REP=1000 CUDA_VISIBLE_DEVICES=2 ENABLE_MMA_V5_ATT_PIPELINE=1 TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 buck2 run mode/opt  //pytorch/tritonbench:run -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -- --op gemm --iter $NUM_ITERS --input-loader /home/njriasan/parsed_shapes.json --only pt2_matmul_maxautotune`

Rollback Plan:

Reviewed By: NikhilAPatel

Differential Revision: D79737929

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159982
Approved by: https://github.com/NikhilAPatel
2025-08-11 05:27:51 +00:00
334b38ccc4 Fix typo in README.md (#160160)
The "Get the PyTorch Source" section is now located before the "Install Dependencies/Common" section, so "... using the “Get the PyTorch Source“ section below" should be "... using the “Get the PyTorch Source“ section above".

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160160
Approved by: https://github.com/BoyuanFeng
2025-08-11 05:09:59 +00:00
dc0d18e023 [CUDA] Remove the unnecessary CUDA_GUARD (#160249)
`CUDA_GUARD` is unnecessary in `initDeviceStreamState` because `initSingleStream` has already done it.

29712314dd/c10/cuda/CUDAStream.cpp (L202-L203)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160249
Approved by: https://github.com/Skylion007
2025-08-11 05:08:05 +00:00
cyy
8ae4d2652f Tidy torch/csrc/jit/passes/onnx code (#160262)
Apply clang-tidy fixes to torch/csrc/jit/passes/onnx

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160262
Approved by: https://github.com/justinchuby
2025-08-11 04:50:38 +00:00
8088cfa592 Add type assert for tensor_meta, based on real bug in autoparallel. (#157927)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157927
Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/wconstab
2025-08-11 04:22:02 +00:00
d8cb3db533 Add unsigned support to IValue (#160102)
- Moved repeated logic of saving int64/uint64 into a polymorphic container into `THPUtils_unpackInteger`
- Added `TestPythonDispatch.test_dispatch_uint64` regression test

Fixes https://github.com/pytorch/pytorch/issues/159168

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160102
Approved by: https://github.com/ezyang
2025-08-11 03:57:18 +00:00
e7152ff8a6 [inductor] fix some windows inductor UTs (#160292)
This PR is the UT part of https://github.com/pytorch/pytorch/pull/160161. Per @malfet's comments (https://github.com/pytorch/pytorch/pull/160161#pullrequestreview-3103812178), this PR does not land the turn-on change; it only lands the UT part.

changes:
1. Fixed `test_invalid_artifact_flag_error_msg`.
2. Skipped `test_distributed_rank_logging` and `test_disable_recursive_false`.
3. Skipped the whole UT file `test_cpu_select_algorithm.py`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160292
Approved by: https://github.com/malfet
2025-08-11 02:55:37 +00:00
842cc77ab9 [MPS] Extend addmm to integral types (#160270)
By adding an `addmm` kernel, which is a logical continuation of the `mm` one. The only tricky part is how the alpha and beta constants are handled: they are passed as `optmath_t`, i.e. they could be int64, int32, or float.
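
A rough integer-dtype sanity check against the CPU (a sketch; assumes an MPS-capable machine):

```python
import torch

M = torch.randint(-5, 5, (2, 4), dtype=torch.int32)
A = torch.randint(-5, 5, (2, 3), dtype=torch.int32)
B = torch.randint(-5, 5, (3, 4), dtype=torch.int32)
cpu = torch.addmm(M, A, B, alpha=2, beta=3)
mps = torch.addmm(M.to("mps"), A.to("mps"), B.to("mps"), alpha=2, beta=3)
torch.testing.assert_close(mps.cpu(), cpu)
```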

Unified all MM flavors instantiations thru `INSTANTIATE_MM_OPS` and tested that `addmm` metal kernel works as expected for floating types as well by testing it via
```
 PYTORCH_MPS_PREFER_METAL=1 python test/test_mps.py -v -k test_output_match_addmm_mps_
```

Fixes https://github.com/pytorch/pytorch/issues/154901
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160270
Approved by: https://github.com/Skylion007, https://github.com/dcci
ghstack dependencies: #160228, #160234
2025-08-11 00:54:17 +00:00
b602ea9cab Revert "[inductor] turn on windows inductor UTs (#160161)"
This reverts commit 4416433c7c625127b7f975c92f8ec98ea4c67fd3.

Reverted https://github.com/pytorch/pytorch/pull/160161 on behalf of https://github.com/xuhancn due to auto merged with two related issue ([comment](https://github.com/pytorch/pytorch/pull/160161#issuecomment-3172982125))
2025-08-11 00:04:25 +00:00
4416433c7c [inductor] turn on windows inductor UTs (#160161)
With this PR, we can turn on the inductor UTs on Windows CPU.

changes:
1. Turn on inductor UTs on Windows CPU.
2. Add a shard to balance the added UTs; otherwise the job would time out.
3. Fixed `test_invalid_artifact_flag_error_msg`.
4. Skipped `test_distributed_rank_logging` and `test_disable_recursive_false`.
5. Skipped the whole UT file `test_cpu_select_algorithm.py`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160161
Approved by: https://github.com/jansel
2025-08-10 23:18:35 +00:00
05c19d1ace [Inductor] Add back the revert part (#160054)
Add back the reverted code (https://github.com/pytorch/pytorch/pull/159809) as we've figured out the actual root cause of the internal test failures. More details are in the internal diff.
Rollback Plan:

Differential Revision: D79776691

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160054
Approved by: https://github.com/blaine-rister
2025-08-10 19:20:30 +00:00
d6786741a7 [inductor] slow test some Windows UTs. (#160267)
When we enabled Windows inductor UTs in https://github.com/pytorch/pytorch/pull/160161/, the main-branch CI hit timeout issues, so let's move some UTs to the slow tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160267
Approved by: https://github.com/ezyang
2025-08-10 18:35:42 +00:00
7ae0629d64 Revert "[inductor] turn on windows inductor UTs (#160161)"
This reverts commit f0980fc0bbd656d6c02d23ad97e945353b314f35.

Reverted https://github.com/pytorch/pytorch/pull/160161 on behalf of https://github.com/clee2000 due to broke some inductor tests on windows inductor\test_codecache.py::TestStandaloneCompile::test_different_process [GH job link](https://github.com/pytorch/pytorch/actions/runs/16853706010/job/47748778757) [HUD commit link](f0980fc0bb).  note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/160161#issuecomment-3172784292))
2025-08-10 17:33:19 +00:00
0e3e377bd5 [inductor] fix CompiledArtifact.load path on Windows. (#160268)
fix CompiledArtifact.load path on Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160268
Approved by: https://github.com/ezyang
2025-08-10 14:22:52 +00:00
a84b60c0c4 [MPS] Sparse coalesce more dtypes to match cpu (#160254)
More dtypes to match the cpu

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160254
Approved by: https://github.com/malfet
2025-08-10 12:25:18 +00:00
3ac86e728d Add Alban and Piotr to list of maintainers (#160187)
Add Alban and Piotr to list of maintainers
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160187
Approved by: https://github.com/albanD
2025-08-10 12:00:16 +00:00
c9671dc865 Delete Python reference implementation from torchdim, as it is untested (#160115)
Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160115
Approved by: https://github.com/albanD
2025-08-10 11:21:33 +00:00
af10f1f86c Fix requires_cuda to requires_cuda_and_triton (#160222)
Fixes #159399

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160222
Approved by: https://github.com/janeyx99
2025-08-10 07:05:52 +00:00
5dddcd5b07 Correctly copy self.module_stack in ModuleStackTracer (#159956)
There is a bigger cluster of issues which this does not completely fix, but I think this is a matter of good hygiene, especially because we immediately mutate the dict after assigning it.
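
Illustrative only (not the tracer's actual code): the hygiene issue is plain dict aliasing.

```python
parent = {"mod_a": "TypeA"}

aliased = parent            # no copy: later mutation leaks into the parent
aliased["mod_b"] = "TypeB"
assert "mod_b" in parent

copied = dict(parent)       # what the fix does, conceptually
copied["mod_c"] = "TypeC"
assert "mod_c" not in parent
```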

Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159956
Approved by: https://github.com/pianpwk
2025-08-10 03:33:59 +00:00
d3d359dbaf Revert "Fix get_free_symbol_uses for several nodes. (#160134)"
This reverts commit db78943a1ca13a32a3d6045eb15e2b719ee13a2f.

Reverted https://github.com/pytorch/pytorch/pull/160134 on behalf of https://github.com/malfet due to No, those are not pre-existing, see df55ec7d4b/1 ([comment](https://github.com/pytorch/pytorch/pull/160134#issuecomment-3172314322))
2025-08-10 02:37:40 +00:00
df55ec7d4b [OpInfo][BE] Better inputs for addmm (#160234)
Right now alpha and beta are both less than zero, which makes them useless for all addmm samples for integral types
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160234
Approved by: https://github.com/Skylion007
ghstack dependencies: #160228
2025-08-10 01:26:48 +00:00
f0980fc0bb [inductor] turn on windows inductor UTs (#160161)
With this PR, we can turn on the inductor UTs on Windows CPU.

changes:
1. Turn on inductor UTs on Windows CPU.
2. Add a shard to balance the added UTs; otherwise the job would time out.
3. Fixed `test_invalid_artifact_flag_error_msg`.
4. Skipped `test_distributed_rank_logging` and `test_disable_recursive_false`.
5. Skipped the whole UT file `test_cpu_select_algorithm.py`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160161
Approved by: https://github.com/jansel
2025-08-09 21:06:00 +00:00
db78943a1c Fix get_free_symbol_uses for several nodes. (#160134)
get_free_symbol_uses is used to know which unbacked symbols are used by a given node.
Not having get_free_symbol_uses defined correctly leads to:
1. Elimination of some nodes because no users are detected (see the added unit test).
2. Incorrect topological sort.

Fix get_free_symbol_uses for NopKernel, ConcatKernel, InputsKernel, and external kernels.
ComputedBuffer with NonOwningLayout is an interesting case: when the layout is NonOwningLayout, we need to access the underlying view op's base layout and detect the symbols in it, because those symbols are used when we codegen the ComputedBuffer.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160134
Approved by: https://github.com/bobrenjc93
2025-08-09 18:15:46 +00:00
29712314dd [fx][pass] Support converting a float32 tensor to a scalar in FX trace. (#158216)
Fixes https://github.com/pytorch/pytorch/issues/158083

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158216
Approved by: https://github.com/laithsakka
2025-08-09 15:13:13 +00:00
cyy
01f66d08d9 Remove outdated CMAKE_CUDA_COMPILER_VERSION branch (#160075)
Remove the condition `if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0)` in cmake/Codegen.cmake, because we now default to CUDA >= 12.0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160075
Approved by: https://github.com/Skylion007
2025-08-09 14:23:17 +00:00
2f4c222617 Revert "Make user defined Triton kernels serializable for fx_graph_runnable (#160002)"
This reverts commit 4183d4ff3dcc1d87400326a9a7998c3f9e966f60.

Reverted https://github.com/pytorch/pytorch/pull/160002 on behalf of https://github.com/albanD due to Breaks inductor tests in trunk ([comment](https://github.com/pytorch/pytorch/pull/160002#issuecomment-3170855866))
2025-08-09 14:01:58 +00:00
8047421fbb [Linter] Expanding the scope of detecting device-bias code. (#159949)
Currently, the device-bias linter only targets functions decorated with @requires_gpu. This PR adds support for two new detection scenarios:
1. Detect device-bias code in functions decorated with @requires_triton.
2. Detect device-bias code for entire test suites that are defined as shared across GPUs. For example:
```
if __name__ == "__main__":
    if HAS_GPU:
        run_tests()

```
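
For example, a hypothetical pattern the expanded linter should flag, next to the portable spelling (the GPU_TYPE import location is an assumption):

```python
import torch
# GPU_TYPE ("cuda", "xpu", ...) comes from the shared test utilities.
from torch.testing._internal.inductor_utils import GPU_TYPE

def biased_helper():
    return torch.randn(4, device="cuda")     # flagged: hard-coded "cuda"

def portable_helper():
    return torch.randn(4, device=GPU_TYPE)   # portable across GPU backends
```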

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159949
Approved by: https://github.com/EikanWang, https://github.com/jansel
2025-08-09 09:41:16 +00:00
4183d4ff3d Make user defined Triton kernels serializable for fx_graph_runnable (#160002)
Resolves issue https://github.com/pytorch/pytorch/issues/153475 where `fx_graph_runnable` didn't work with user defined triton kernels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160002
Approved by: https://github.com/eellison
2025-08-09 09:26:05 +00:00
fb887c3bb5 Add Sherlock and Zhengxu as codeowner for schema.py (#160233)
Test Plan:
CI

Rollback Plan:

Differential Revision: D79933462

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160233
Approved by: https://github.com/zhxchen17
2025-08-09 04:44:12 +00:00
bcf23ecc47 [vllm hash update] update the pinned vllm hash (#160235)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160235
Approved by: https://github.com/pytorchbot
2025-08-09 04:17:32 +00:00
303c614f3d [dynamo] Be consistent with UserMethodVariable source (#160155)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160155
Approved by: https://github.com/StrongerXi
2025-08-09 04:16:14 +00:00
0d88593dd8 [audio hash update] update the pinned audio hash (#160153)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160153
Approved by: https://github.com/pytorchbot
2025-08-09 04:01:31 +00:00
5ed4f91779 [dynamo] support itertools.permutations (#159694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159694
Approved by: https://github.com/guilhermeleobas
ghstack dependencies: #159693
2025-08-09 03:01:58 +00:00
e07c52b2c0 [dynamo] Improve support for itertools.product (#159693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159693
Approved by: https://github.com/guilhermeleobas, https://github.com/mlazos
2025-08-09 03:01:58 +00:00
cyy
10e3514c96 Remove tensorexpr tests (#158928)
The tests are not maintained.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158928
Approved by: https://github.com/albanD, https://github.com/malfet
2025-08-09 02:21:22 +00:00
11a3565f18 [Torch Native] Add test for packaging weight (#158750)
Add a test that requires weights to be packaged for torch native.

For now, we need `package_weights_in_so=True` for compile standalone. The constants are in a `.o` file and will be added as a source to the CMakeLists.txt of the model.

After we add weight deduplication, we should be able to set this config to False.

```
python test/inductor/test_aot_inductor_package.py  -k test_compile_with_exporter_weights
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158750
Approved by: https://github.com/desertfire
2025-08-09 01:04:21 +00:00
e96c7c4bb0 [dcp][hf] Improve HF consolidation algorithm (#158648)
Previously we had a bunch of if-else cases, with different logic for each sharding strategy, to decide how to save the tensor. This can be consolidated into one function that handles all cases with a single algorithm: find the maximum possible number of contiguous bytes that can be written.
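
As a rough sketch of the idea (my own illustration, not the code from this diff), the largest contiguous run a shard can write into a row-major full tensor is bounded by how many trailing dimensions the shard covers completely:

```python
# Assumptions: row-major layout, shard aligned to the full tensor's trailing dims.
def max_contiguous_elems(full_shape, shard_shape):
    contiguous = 1
    for full_dim, shard_dim in zip(reversed(full_shape), reversed(shard_shape)):
        contiguous *= shard_dim
        if shard_dim != full_dim:
            break  # this dim is only partially covered, so the run stops growing
    return contiguous


# A [2, 8] shard of a [4, 8] tensor can be written as one 16-element run, while a
# [4, 4] shard of the same tensor must be written 4 elements at a time
# (multiply by the dtype's itemsize to get bytes).
print(max_contiguous_elems((4, 8), (2, 8)))  # 16
print(max_contiguous_elems((4, 8), (4, 4)))  # 4
```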

Differential Revision: [D78489438](https://our.internmc.facebook.com/intern/diff/D78489438/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158648
Approved by: https://github.com/saumishr
2025-08-09 00:11:22 +00:00
9b803cdbe2 [BE] Remove more optim entries from docs coverage ignore list (#160194)
This PR privatizes ReduceLROnPlateau.is_better -> ReduceLROnPlateau._is_better because that API was never meant to be public. A GitHub search also shows that the API is not commonly used: https://github.com/search?q=.is_better%28&type=code&p=2

If you do use this API and you rely on it for some reason, please file an issue. In the meantime, you can access it through `_is_better(...)`.
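
A minimal sketch of the rename for code that relied on this check (the class lives at `torch.optim.lr_scheduler.ReduceLROnPlateau`):

```python
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    torch.optim.SGD(params, lr=0.1), mode="min"
)

metric, best = 0.5, 1.0
# before this PR: sched.is_better(metric, best)
print(sched._is_better(metric, best))  # after this PR
```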

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160194
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-08-09 00:09:45 +00:00
8c41cb800a [MPS][BE] Combine all pre-MacOS14 xfail lists (#160228)
It does not matter whether a test started to fail after 13.1 or 13.3; what matters is that it still fails on the latest macOS.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160228
Approved by: https://github.com/dcci
2025-08-09 00:00:46 +00:00
731ee31f7b [TorchScript, PT2] Add torch._check compatibility support (#159988)
Summary:
Add support for torch._check() in TorchScript jit.script frontend.

* It is special-cased to behave like torch._assert, i.e. turned into an if + raise exception.
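
A minimal sketch of what this enables, assuming this PR's special-casing of torch._check in the jit.script frontend:

```python
import torch


@torch.jit.script
def first_row(x: torch.Tensor) -> torch.Tensor:
    # per the commit message, this is handled like torch._assert,
    # i.e. lowered to an `if not cond: raise` in the TorchScript IR
    torch._check(x.dim() == 2)
    return x[0]


print(first_row(torch.ones(3, 4)))
```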

Test Plan:
Unit tests

Rollback Plan:

Differential Revision: D79744604

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159988
Approved by: https://github.com/davidberard98
2025-08-08 23:14:13 +00:00
566c6d52ef [ONNX] Fix the export of the model having none as output (#160200)
Fixes #160150

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160200
Approved by: https://github.com/justinchuby

Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
2025-08-08 23:09:34 +00:00
4e2ddb5db6 [Inductor][CUTLASS] Copy cutlass_mock_imports directory (#159724)
Pip wheels of PyTorch nightly and 2.8 release candidates do not contain `cutlass_mock_imports`.

This is the path to the source code:
```
root@8120d02fd9c5:$ tree ./torch/_inductor/codegen/cuda/cutlass_lib_extensions/
./torch/_inductor/codegen/cuda/cutlass_lib_extensions/
├── cutlass_mock_imports
│   ├── cuda
│   │   ├── __init__.py
│   │   ├── cuda.py
│   │   └── cudart.py
│   ├── pydot
│   │   └── __init__.py
│   └── scipy
│       ├── __init__.py
│       └── special.py
├── evt_extensions.py
└── gemm_operation_extensions.py

5 directories, 8 files
```

And this what installed wheel has:
```
root@8120d02fd9c5:$ tree /usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/
/usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/
├── __init__.py
├── evt_extensions.py
└── gemm_operation_extensions.py

1 directory, 3 files
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159724
Approved by: https://github.com/henrylhtsang
2025-08-08 22:56:05 +00:00
9e07673deb Fix test_fsdp_ep.py due to _MeshEnv API change (#158695)
#132339 changed parent/child mesh related APIs from _MeshEnv. UT TestFSDPWithEP.test_e2e still uses old APIs and will fail:
```
File "/home/kanya/pytorch/test/distributed/checkpoint/e2e/test_fsdp_ep.py", line 77, in test_e2e
    mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",))
AttributeError: '_MeshEnv' object has no attribute 'create_child_mesh'

To execute this test, run the following from the base repo dir:
    python test/distributed/checkpoint/e2e/test_fsdp_ep.py TestFSDPWithEP.test_e2e

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0. Did you mean: 'create_sub_mesh'?
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158695
Approved by: https://github.com/Skylion007, https://github.com/nWEIdia
2025-08-08 22:36:47 +00:00
1128f4c2a8 [cuDNN][SDPA] cuDNN SDPA refactor/cleanup, nested tensor backward, test priority bump for sm90, sm100 (#149282)
cleanup tuple/tensor boilerplate in cuDNN SDPA, preparation for nested/ragged tensor backward

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149282
Approved by: https://github.com/drisspg

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
2025-08-08 22:22:48 +00:00
334ecbd4ff Add torchao to install_inductor_benchmark_deps cleanup stage (#160191)
It looks like `torchao` was missed from the cleanup during torchbench setup.

Fixes #160188

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160191
Approved by: https://github.com/huydhn
2025-08-08 22:18:41 +00:00
206c1eef65 Revert "[pytorch][dynamo_compile] Log stack_trace to dynamo_compile (#159655)"
This reverts commit 2ee22e435131369a7e4f8cc4732579acc29a941b.

Reverted https://github.com/pytorch/pytorch/pull/159655 on behalf of https://github.com/clee2000 due to broke dynamo/test_utils.py::TestDynamoTimed::test_dynamo_timed [GH job link](https://github.com/pytorch/pytorch/actions/runs/16839294394/job/47711078667) [HUD commit link](2ee22e4351).  Probably a landrace since it did run on the PR ([comment](https://github.com/pytorch/pytorch/pull/159655#issuecomment-3169400889))
2025-08-08 22:04:22 +00:00
28ccc9e724 [MPS] Extend index_put to complex types (#160159)
Also delete the confusing supported-types check.
Move all pseudo-atomic (but eventually consistent) ops to the `c10/metal/atomic.h` header.

Fixes https://github.com/pytorch/pytorch/issues/160034
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160159
Approved by: https://github.com/manuelcandales, https://github.com/dcci, https://github.com/Skylion007
2025-08-08 21:54:30 +00:00
2247aa6d1d Documents tuning NVLink performance on H100/H200 (#159792)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159792
Approved by: https://github.com/ngimel
2025-08-08 20:28:24 +00:00
1febab2a89 Do not treat ReinterpretView as a realized node (#159920)
Summary:
Do not treat ReinterpretView as a realized node

Function [gather_origins](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/utils.py#L888) calls is_realized_node to decide whether an FX node should be included in the origins of an IR node. ReinterpretView is considered a realized node, so it is not included in the origins, which leads to an incomplete graph. For example:

```
@torchdynamo.optimize("inductor")
def fn(input_data, weight):
    normalized_input = input_data * weight.unsqueeze(0)
    return normalized_input
input_data = torch.randn(4272, 192, requires_grad=True).to(device)
weight = torch.randn(192, requires_grad=True).to(device)
fn(input_data, weight)
```

The original FX graph returned in [get_kernel_metadata](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/utils.py#L723) is the following:
%primals_2 : Tensor "f32[4272, 192][192, 1]cuda:0" = PlaceHolder[target=primals_2]
%primals_1 : Tensor "f32[192][1]cuda:0" = PlaceHolder[target=primals_1]
%mul : Tensor "f32[4272, 192][192, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%primals_2, %unsqueeze), kwargs = {})
return %mul
The unsqueeze op is missing.

With this DIFF, the new FX graph is the following:
%primals_2 : Tensor "f32[4272, 192][192, 1]cuda:0" = PlaceHolder[target=primals_2]
%primals_1 : Tensor "f32[192][1]cuda:0" = PlaceHolder[target=primals_1]
%unsqueeze : Tensor "f32[1, 192][192, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%primals_1, 0), kwargs = {})
%mul : Tensor "f32[4272, 192][192, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%primals_2, %unsqueeze), kwargs = {})
return %mul

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159920
Approved by: https://github.com/mlazos
2025-08-08 20:13:35 +00:00
2ee22e4351 [pytorch][dynamo_compile] Log stack_trace to dynamo_compile (#159655)
This change logs the stack trace of the code being compiled by Dynamo, improving visibility into what is compiled. It adds a stack_trace field to compilation metrics. This helps with debugging and analysis of Dynamo compilation behavior.
 Ref [D79287964](https://www.internalfb.com/diff/D79287964)

Test Plan:
$ python -m test_utils
Internal: ref [D79372519](https://www.internalfb.com/diff/D79372519)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159655
Approved by: https://github.com/c00w
2025-08-08 19:53:47 +00:00
c86040a8e6 [torch.export] Fix test_export_api_with_dynamic_shapes (#160164)
Summary: Update test KJT's dynamic_shapes to match the newly exported fields.

Test Plan:
```
buck test 'fbcode//mode/opt' fbcode//caffe2/test:test_export -- --exact 'caffe2/test:test_export - test_export_api_with_dynamic_shapes_cpp_runtime_nonstrict (caffe2.test.export.test_nativert.NativeRTTestExport)'
File changed: fbcode//caffe2/test/export/test_export.py
Buck UI:
https://www.internalfb.com/buck2/8247eaf8-eaf9-4876-95cb-7b4263d15ef2
Test UI:
https://www.internalfb.com/intern/testinfra/testrun/2533275093345198
Network: Up: 100KiB  Down: 0B  (reSessionID-72a2579f-df3f-4262-9aa3-de0db9687
Executing actions. Remaining 0/2
Command: test.
Time elapsed: 2:20.5s
Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. Build failure 0
```

Rollback Plan:

Reviewed By: malaybag

Differential Revision: D79862872

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160164
Approved by: https://github.com/angelayi, https://github.com/ezyang
2025-08-08 19:45:30 +00:00
72009ec6be [replicate][be] improved readability and cleaned up remaining DDP code (#160133)
**Summary**
Since much of the ReplicateState functionality is copied from FSDPState, I fixed any remaining comments that incorrectly said FSDP instead of Replicate. In addition, instead of labeling modules FSDPModule or FSDPLinear, I have changed it so that it now uses Replicate____. Finally, I have removed some leftover code from the DDP implementation. I have included test cases to verify correctness.

**Test Case**
1. pytest test/distributed/_composable/test_replicate_with_fsdp.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160133
Approved by: https://github.com/mori360
ghstack dependencies: #160128
2025-08-08 19:42:23 +00:00
5f5f508aa8 [ROCm] Ck backend UX refactor (#152951)
Refactors how the enablement/disablement of CK Gemms and SDPA works.

- Adds USE_ROCM_CK_GEMM compile flag for enabling CK gemms.
- USE_ROCM_CK_GEMM is set to True by default on Linux
- Updates USE_CK_FLASH_ATTENTION to USE_ROCM_CK_SDPA.
- USE_ROCM_CK_SDPA is set to False by default
- (USE_CK_FLASH_ATTENTION still works for now, but will be deprecated in a future release)
- Prevents these CK libraries from being used unless PyTorch has been built specifically with the functionality AND is running on a system architecture that supports it.
- The getters for these library backends also do some validity checking in case the user used an environment variable to change the backend. If the setting is invalid (i.e. one of the cases above is false), the backend is reset to the current non-CK default.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152951
Approved by: https://github.com/eqy, https://github.com/jeffdaily, https://github.com/m-gallus

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Jithun Nair <jithun.nair@amd.com>
Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>
2025-08-08 18:40:17 +00:00
da1f608ca3 Add UT for torch.accelerator memory-related API (#155200)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155200
Approved by: https://github.com/albanD
ghstack dependencies: #138222, #152932
2025-08-08 17:41:22 +00:00
84f7e88aef Add unified memory APIs for torch.accelerator (#152932)
# Motivation
The following APIs will be put under torch.accelerator (a usage sketch follows the list):
- empty_cache
- max_memory_allocated
- max_memory_reserved
- memory_allocated
- memory_reserved
- memory_stats
- reset_accumulated_memory_stats
- reset_peak_memory_stats
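
A minimal sketch of the device-agnostic call path these APIs enable (assuming an accelerator backend that implements them):

```python
import torch

if torch.accelerator.is_available():
    dev = torch.accelerator.current_accelerator()
    x = torch.randn(1024, 1024, device=dev)
    print("allocated:", torch.accelerator.memory_allocated())
    print("reserved: ", torch.accelerator.memory_reserved())
    del x
    torch.accelerator.empty_cache()
    torch.accelerator.reset_peak_memory_stats()
```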

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152932
Approved by: https://github.com/albanD
ghstack dependencies: #138222
2025-08-08 17:41:22 +00:00
d7114f05b1 Add DeviceAllocator as the base device allocator (#138222)
# Motivation
In line with the RFC [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories; for example, Hugging Face Accelerate has [many if-else conditional code paths](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code) around them. We would like to introduce a generic API set under the torch.accelerator namespace to generalize these use cases.

<div align="center">
<table>
<tr>
<td> Device-specific memory APIs torch.xxx.foo</td> <td> Device-agnostic memory APIs torch.accelerator.foo</td>
</tr>
<tr>
<td>

```python
torch.xxx.empty_cache
```

</td>
<td>

```python
torch.accelerator.empty_cache
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.reset_peak_memory_stats
```

</td>
<td>

```python
torch.accelerator.reset_peak_memory_stats
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.reset_accumulated_memory_stats
```

</td>
<td>

```python
torch.accelerator.reset_accumulated_memory_stats
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.memory_stats
```

</td>
<td>

```python
torch.accelerator.memory_stats
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.memory_allocated
```

</td>
<td>

```python
torch.accelerator.memory_allocated
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.max_memory_allocated
```

</td>
<td>

```python
torch.accelerator.max_memory_allocated
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.memory_reserved
```

</td>
<td>

```python
torch.accelerator.memory_reserved
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.max_memory_reserved
```

</td>
<td>

```python
torch.accelerator.max_memory_reserved
```

</td>
</tr>

</table>
</div>

# Solution
This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit. This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `GetDeviceAllocator(allocator)->empty_cache()`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222
Approved by: https://github.com/albanD, https://github.com/Camyll
2025-08-08 17:41:10 +00:00
c5ec5458a5 Don't build nccl when distributed is disabled (#160086)
Because distributed doesn't build on recent compilers, I have to disable distributed, but the build still fails because NCCL is still built.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160086
Approved by: https://github.com/Skylion007, https://github.com/janeyx99
2025-08-08 17:19:16 +00:00
86eb65f7f0 [MPS] Move max_pool2d to Metal for stride != 1 (#157876)
This PR updates `max_pool2d` to use a Metal kernel instead of the old MPS graph impl. However, when the `stride` argument is 1 in all dimensions, the old implementation gives significantly better performance, so we fall back to it in that case. Below is a performance comparison of `max_pool2d` before and after this PR, obtained from this script: 2f02f2bf7a/max_pool_mps/perf.py
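
For reference, a minimal sketch of the two paths described above (my own illustration, not the benchmark script): strided pooling hits the new Metal kernel, while stride-1 pooling keeps the MPS graph fallback.

```python
import torch
import torch.nn.functional as F

x = torch.randn(3, 1000, 1000, device="mps")
# stride != 1: handled by the new Metal kernel after this PR
out, idx = F.max_pool2d(x, kernel_size=5, stride=2, return_indices=True)
# stride == 1 in all dimensions: falls back to the old MPS graph implementation
out_graph = F.max_pool2d(x, kernel_size=5, stride=1)
```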

<details><summary>Click to expand</summary>

case | before PR | after PR | speedup |   | case info
-- | -- | -- | -- | -- | --
0 | 0.014264 | 0.004473 | 3.188911245 |   | (3, 2, 2), {'kernel_size': 2, 'return_indices': True}
1 | 0.010752 | 0.00421 | 2.55391924 |   | (3, 2, 2), {'kernel_size': 2, 'return_indices': False}
2 | 0.020777 | 0.006123 | 3.393271272 |   | (3, 10, 10), {'kernel_size': 5, 'return_indices': True}
3 | 0.011065 | 0.005759 | 1.921340511 |   | (3, 10, 10), {'kernel_size': 5, 'return_indices': False}
4 | 0.01452 | 0.007829 | 1.854642994 |   | (3, 100, 100), {'kernel_size': 5, 'return_indices': True}
5 | 0.009258 | 0.007075 | 1.308551237 |   | (3, 100, 100), {'kernel_size': 5, 'return_indices': False}
6 | 0.188137 | 0.168688 | 1.115295694 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True}
7 | 0.161362 | 0.154746 | 1.042753932 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False}
8 | 0.182883 | 0.16945 | 1.079274122 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True}
9 | 0.156875 | 0.163346 | 0.9603847049 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False}
10 | 0.193433 | 0.167396 | 1.155541351 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True}
11 | 0.158967 | 0.151246 | 1.051049284 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False}
12 | 0.931071 | 0.932883 | 0.9980576342 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True}
13 | 0.324496 | 0.3252 | 0.9978351784 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False}
14 | 0.944071 | 0.936246 | 1.008357846 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True}
15 | 0.322171 | 0.314854 | 1.023239343 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False}
16 | 0.894158 | 0.886408 | 1.008743152 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True}
17 | 0.309338 | 0.304146 | 1.017070749 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False}
18 | 0.606 | 0.260546 | 2.325884873 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True}
19 | 0.30445 | 0.231054 | 1.317657344 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False}
20 | 0.474708 | 0.261925 | 1.812381407 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True}
21 | 0.23175 | 0.231883 | 0.9994264349 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False}
22 | 0.434475 | 0.266246 | 1.631855502 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True}
23 | 0.236942 | 0.231792 | 1.022218196 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False}
24 | 0.202396 | 0.174888 | 1.157289237 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True}
25 | 0.160679 | 0.158246 | 1.015374796 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False}
26 | 0.200354 | 0.184133 | 1.088093932 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True}
27 | 0.160779 | 0.160679 | 1.000622359 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False}
28 | 0.199175 | 0.178625 | 1.115045486 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True}
29 | 0.159458 | 0.160883 | 0.9911426316 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False}
30 | 0.199021 | 0.165329 | 1.203787599 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True}
31 | 0.156337 | 0.158213 | 0.9881425673 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False}
32 | 0.180146 | 0.174483 | 1.032455884 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True}
33 | 0.156988 | 0.158167 | 0.9925458534 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False}
34 | 0.182133 | 0.176521 | 1.031792251 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True}
35 | 0.169042 | 0.156483 | 1.080257919 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False}
36 | 1.767821 | 1.766254 | 1.000887188 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True}
37 | 1.059346 | 1.058775 | 1.000539302 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False}
38 | 1.85755 | 1.859429 | 0.9989894747 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True}
39 | 1.100417 | 1.097683 | 1.002490701 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False}
40 | 1.843167 | 1.847558 | 0.9976233493 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True}
41 | 1.090142 | 1.093163 | 0.9972364597 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False}
42 | 0.480867 | 0.251733 | 1.910226311 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True}
43 | 0.319246 | 0.236479 | 1.349997251 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False}
44 | 0.49315 | 0.256408 | 1.923301925 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True}
45 | 0.316746 | 0.227854 | 1.390127011 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False}
46 | 0.4912 | 0.257762 | 1.905633879 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True}
47 | 0.324771 | 0.229371 | 1.41592006 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False}
48 | 0.152904 | 0.095079 | 1.608178462 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True}
49 | 0.102963 | 0.089217 | 1.154073775 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False}
50 | 0.155158 | 0.095429 | 1.625899884 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True}
51 | 0.104338 | 0.089979 | 1.15958168 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False}
52 | 0.153121 | 0.096429 | 1.587914424 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True}
53 | 0.103642 | 0.090254 | 1.148336916 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False}
54 | 0.191071 | 0.165125 | 1.157129447 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True}
55 | 0.153971 | 0.149021 | 1.033216795 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False}
56 | 0.193192 | 0.166892 | 1.157586942 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True}
57 | 0.156617 | 0.15215 | 1.029359185 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False}
58 | 0.178033 | 0.167308 | 1.06410333 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True}
59 | 0.157425 | 0.164404 | 0.9575496947 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False}
60 | 1.757638 | 1.750896 | 1.0038506 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True}
61 | 1.048471 | 1.047967 | 1.000480931 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False}
62 | 1.790708 | 1.789767 | 1.000525767 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True}
63 | 1.054575 | 1.054796 | 0.9997904808 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False}
64 | 1.785837 | 1.784192 | 1.000921986 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True}
65 | 1.054713 | 1.054492 | 1.00020958 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False}
66 | 0.478267 | 0.261017 | 1.832321266 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True}
67 | 0.32005 | 0.226654 | 1.412064204 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False}
68 | 0.484008 | 0.254721 | 1.900149575 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True}
69 | 0.321 | 0.218842 | 1.466811672 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False}
70 | 0.482087 | 0.248771 | 1.937874591 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True}
71 | 0.316558 | 0.230533 | 1.373156988 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False}
72 | 0.137842 | 0.085088 | 1.619993419 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True}
73 | 0.100671 | 0.0769 | 1.309115735 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False}
74 | 0.148321 | 0.086967 | 1.705485989 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True}
75 | 0.101392 | 0.075454 | 1.343759112 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False}
76 | 0.150208 | 0.083742 | 1.793699697 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True}
77 | 0.099587 | 0.075825 | 1.313379492 |   | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False}
78 | 0.622546 | 0.602729 | 1.03287879 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True}
79 | 0.531696 | 0.5067 | 1.049330965 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False}
80 | 0.626646 | 0.617038 | 1.015571164 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True}
81 | 0.530354 | 0.525367 | 1.009492412 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False}
82 | 0.633933 | 0.577775 | 1.097197006 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True}
83 | 0.533067 | 0.526954 | 1.011600633 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False}
84 | 3.372867 | 3.386412 | 0.9960001914 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True}
85 | 1.155975 | 1.156604 | 0.9994561665 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False}
86 | 3.401921 | 3.39755 | 1.001286515 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True}
87 | 1.202829 | 1.192538 | 1.008629494 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False}
88 | 3.23675 | 3.220238 | 1.005127571 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True}
89 | 1.077067 | 1.085613 | 0.9921279498 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False}
90 | 1.572925 | 0.925625 | 1.699311276 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True}
91 | 0.791204 | 0.793454 | 0.9971642969 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False}
92 | 1.572742 | 0.922729 | 1.704446268 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True}
93 | 0.784292 | 0.788871 | 0.9941955022 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False}
94 | 1.526546 | 0.925708 | 1.649057802 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True}
95 | 0.769321 | 0.787675 | 0.9766985114 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False}
96 | 0.736033 | 0.612808 | 1.201082558 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True}
97 | 0.574625 | 0.530925 | 1.082309177 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False}
98 | 0.722021 | 0.614488 | 1.174996094 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True}
99 | 0.563171 | 0.533721 | 1.055178642 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False}
100 | 0.735725 | 0.613992 | 1.198264798 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True}
101 | 0.583487 | 0.532513 | 1.095723485 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False}
102 | 0.656383 | 0.575313 | 1.140914598 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True}
103 | 0.559796 | 0.509079 | 1.099625009 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False}
104 | 0.662046 | 0.572362 | 1.156691045 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True}
105 | 0.552633 | 0.508671 | 1.086425214 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False}
106 | 0.634108 | 0.574629 | 1.103508525 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True}
107 | 0.534013 | 0.510996 | 1.045043405 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False}
108 | 7.056642 | 7.066717 | 0.9985743026 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True}
109 | 4.144275 | 4.142658 | 1.000390329 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False}
110 | 7.172683 | 7.189867 | 0.9976099697 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True}
111 | 4.162538 | 4.158875 | 1.000880767 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False}
112 | 7.194233 | 7.181837 | 1.001726021 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True}
113 | 4.294083 | 4.196062 | 1.023360236 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False}
114 | 1.875692 | 0.891071 | 2.104986022 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True}
115 | 1.097479 | 0.781175 | 1.404907991 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False}
116 | 1.8883 | 0.89015 | 2.121327866 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True}
117 | 1.101329 | 0.778542 | 1.414604479 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False}
118 | 1.872833 | 0.893654 | 2.095702587 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True}
119 | 1.096712 | 0.784579 | 1.397835017 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False}
120 | 0.513029 | 0.374417 | 1.370207549 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True}
121 | 0.349546 | 0.305763 | 1.143192603 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False}
122 | 0.518929 | 0.377487 | 1.374693698 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True}
123 | 0.364662 | 0.3145 | 1.159497615 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False}
124 | 0.521275 | 0.375242 | 1.389170189 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True}
125 | 0.367488 | 0.308354 | 1.191773092 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False}
126 | 0.652342 | 0.569308 | 1.145850752 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True}
127 | 0.555696 | 0.506892 | 1.096280865 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False}
128 | 0.654333 | 0.570367 | 1.147213987 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True}
129 | 0.548925 | 0.505825 | 1.085207335 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False}
130 | 0.655908 | 0.571904 | 1.146884792 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True}
131 | 0.560808 | 0.508238 | 1.103435792 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False}
132 | 6.949462 | 6.949112 | 1.000050366 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True}
133 | 4.072913 | 4.065013 | 1.001943413 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False}
134 | 7.200896 | 7.197792 | 1.000431243 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True}
135 | 4.291367 | 4.218538 | 1.017264038 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False}
136 | 7.1823 | 7.306933 | 0.9829431856 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True}
137 | 4.151175 | 4.149592 | 1.000381483 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False}
138 | 1.781279 | 0.884288 | 2.014365229 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True}
139 | 1.050804 | 0.774362 | 1.356993241 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False}
140 | 1.860758 | 0.884637 | 2.103414169 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True}
141 | 1.099908 | 0.775887 | 1.417613647 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False}
142 | 1.857387 | 0.885738 | 2.096993693 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True}
143 | 1.105279 | 0.77365 | 1.428655077 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False}
144 | 0.489408 | 0.269583 | 1.815426047 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True}
145 | 0.322525 | 0.236979 | 1.360985573 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False}
146 | 0.515475 | 0.265813 | 1.93923924 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True}
147 | 0.315525 | 0.228146 | 1.382995976 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False}
148 | 0.503438 | 0.277204 | 1.816128194 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True}
149 | 0.335421 | 0.228275 | 1.469372467 |   | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False}
150 | 5.72495 | 4.909554 | 1.166083518 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': True}
151 | 4.45215 | 4.251333 | 1.047236243 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': False}
152 | 29.953021 | 29.879879 | 1.002447868 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True}
153 | 9.854683 | 9.839517 | 1.001541336 |   | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False}
154 | 6.178033 | 5.697375 | 1.084364817 |   | (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': True}
155 | 6.280317 | 5.712525 | 1.099394226 |   | (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': False}
156 | 10.256062 | 11.336527 | 0.9046917103 |   | (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': True}
157 | 9.469546 | 11.33705 | 0.8352742556 |   | (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': False}
158 | 0.119087 | 0.0797 | 1.494190715 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True}
159 | 0.098713 | 0.047173 | 2.092574142 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False}
160 | 0.960812 | 0.675762 | 1.421820108 |   | (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': True}
161 | 0.536546 | 0.485958 | 1.104099531 |   | (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': False}
162 | 2.555225 | 1.791567 | 1.426251432 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True}
163 | 1.419087 | 1.305137 | 1.087308842 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False}
164 | 5.182008 | 3.48085 | 1.488719135 |   | (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': True}
165 | 2.831779 | 2.498537 | 1.133374851 |   | (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': False}
166 | 8.546038 | 5.7783 | 1.478988284 |   | (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': True}
167 | 4.731004 | 4.161975 | 1.136720908 |   | (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': False}
168 | 0.084754 | 0.07435 | 1.139932751 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True}
169 | 0.057933 | 0.043096 | 1.344277891 |   | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False}
170 | 2.568592 | 1.802117 | 1.425319222 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True}
171 | 1.433054 | 1.307342 | 1.096158465 |   | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False}
172 | 10.3213 | 7.111604 | 1.451332217 |   | (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': True}
173 | 5.680525 | 5.168129 | 1.099145358 |   | (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': False}
174 | 1.02255 | 1.01375 | 1.008680641 |   | (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': False}
175 | 3.074233 | 3.094383 | 0.993488201 |   | (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': True}
176 | 1.016812 | 1.030575 | 0.9866453194 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False}
177 | 3.053658 | 3.089504 | 0.9883974903 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True}
178 | 1.025863 | 1.032088 | 0.9939685376 |   | (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': False}
179 | 3.798942 | 3.799213 | 0.9999286694 |   | (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': True}
180 | 4.492979 | 4.493421 | 0.999901634 |   | (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': False}
181 | 51.543363 | 51.266204 | 1.005406271 |   | (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': True}
182 | 1.018008 | 1.001587 | 1.016394981 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': False}
183 | 3.035404 | 3.003113 | 1.010752509 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': True}
184 | 0.610421 | 0.56 | 1.0900375 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': False}
185 | 1.138983 | 0.757296 | 1.504012962 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': True}
186 | 0.641558 | 0.557808 | 1.150141267 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': False}
187 | 1.181475 | 0.754725 | 1.565437742 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': True}
188 | 1.03045 | 1.026904 | 1.003453098 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': False}
189 | 3.041421 | 3.0263 | 1.00499653 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': True}
190 | 0.609929 | 0.572304 | 1.065743032 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': False}
191 | 1.146875 | 0.756446 | 1.516135983 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': True}
192 | 0.645187 | 0.561708 | 1.148616363 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': False}
193 | 1.181721 | 0.758054 | 1.558887625 |   | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': True}
194 | 0.927654 | 0.925946 | 1.0018446 |   | (10, 1000, 1000), {'kernel_size': 1, 'return_indices': False}
195 | 2.749983 | 2.740354 | 1.00351378 |   | (10, 1000, 1000), {'kernel_size': 1, 'return_indices': True}

</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157876
Approved by: https://github.com/malfet
2025-08-08 16:40:10 +00:00
a4f69a5da0 [dynamo][guards] Remove guards on stdlib modules (#159913)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159913
Approved by: https://github.com/StrongerXi
2025-08-08 16:26:04 +00:00
231c72240d CMake build: preserve PYTHONPATH (#160144)
Fixes #160092

I'm very new to CMake, so let me know if there's a fancier way to do this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160144
Approved by: https://github.com/malfet

Co-authored-by: Xuehai Pan <XuehaiPan@outlook.com>
2025-08-08 16:03:49 +00:00
50f23ff6f8 rename-HAS_CUDA-to-HAS_CUDA_AND_TRITON (#159883)
Fixes #159399
"Modified torch.testing._internal.inductor_utils and test/inductor"

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159883
Approved by: https://github.com/janeyx99
2025-08-08 15:44:52 +00:00
8a37f0c903 improve gather and scatter_add strategy (#160140)
As title.

This PR made a small fix on top of https://github.com/meta-pytorch/autoparallel/pull/81.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160140
Approved by: https://github.com/fmassa
2025-08-08 15:06:24 +00:00
b5fd7223b1 Improve pin_memory error message on CPU-only systems (#159994)
## Summary
- clarify pin_memory error message when no accelerator backend is available

## Testing
- `python repro_pin_memory.py` (fails: Need to provide pin_memory allocator to use pin memory)
- `lintrunner -a`
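
A minimal repro sketch (my own, mirroring the repro script above): on a CPU-only build there is no pin_memory allocator, so the call raises a RuntimeError with the clarified message.

```python
import torch

try:
    torch.randn(4).pin_memory()
except RuntimeError as err:
    print(err)
```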

------
https://chatgpt.com/codex/tasks/task_e_6893ba92c93483238a9bdfdd6c52812b
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159994
Approved by: https://github.com/albanD
2025-08-08 14:36:45 +00:00
9fa8ce26cf Working setup with runnable PyTorch on Codex. (#159968)
Sample transcript: https://chatgpt.com/s/cd_68938effc1a88191ae78bc82a8cefe94

This makes use of https://github.com/pytorch/pytorch/pull/159965 to bypass doing an actual build and use nightly.

Things to improve:
- Once USE_NIGHTLY is in main, we can remove the patching
- We should just keep using the latest nightly instead of a hard-coded one

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159968
Approved by: https://github.com/wdvr
2025-08-08 14:34:15 +00:00
62bac07981 [inductor][triton] support profile_scratch launcher arg (#159772)
This adds support for Triton after https://github.com/triton-lang/triton/pull/7258 landed. That PR adds a new argument to all Triton kernels: a profile_scratch argument, similar to global_scratch. This PR updates the static cuda launcher and the AOTI kernel callers to pass in these arguments when calling the Triton kernel.

Tests: https://github.com/pytorch/pytorch/pull/159158. I also verified these tests locally with Triton 3.2, 3.3, and 3.4.

Fixes:
* static_cuda_launcher (test/repro: `python tools/dynamo/verify_dynamo.py`)
* AOTI calling logic (test/repro: `TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_linalg_vander_cuda_float32`)

Differential Revision: [D79825121](https://our.internmc.facebook.com/intern/diff/D79825121)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159772
Approved by: https://github.com/NikhilAPatel, https://github.com/eellison
2025-08-08 14:27:38 +00:00
7f4cb4a3e0 [MPS] coalesce for sparse tensors (#159729)
MPS coalesce function for sparse tensors

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159729
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-08-08 13:49:55 +00:00
556e2a73f4 [Test][Easy] Use float16 dtype in test_sort_large (#159939)
The test fails with:
>RuntimeError: var_mean only support floating point and complex dtypes

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159939
Approved by: https://github.com/eqy
2025-08-08 09:56:44 +00:00
178515d0ff [BE][PYFMT] remove black: finish black -> ruff format migration (#144557)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144557
Approved by: https://github.com/ezyang
2025-08-08 07:46:10 +00:00
3a56237440 [SymmMem] Send tensors with unerased type information to NVSHMEM Triton kernels (#159788)
This PR introduces a small `@triton.jit` wrapper function over our core NVSHMEM extern functions for users to send tensors as inputs to their NVSHMEM Triton kernels (rather than pointers).

The goal is to abstract away tedious details from the developer, like manual byte-size calculations and handling of raw `int64` pointers. This lets developers work directly with typed Triton tensors and element counts, which will also be useful if you want to do for instance some local math on the data.

-----

**TODO:**
This is almost complete. One pending item is a tensor-aware implementation of `nvshmem.putmem_signal_block` and `nvshmem.signal_wait_until`.

From my investigation, I found the root cause to be that this specific tensor API uses local addresses instead of remote addresses for the peer:

```
Pointer-Based Version:

  Rank 0 → Rank 1:
    Local buffer:   0x430300a00  (src)
    Remote buffer:  0x2430300c00 (dst) ← Rank 1's memory
    Remote signal:  0x2430301600 (sig) ← Rank 1's signal

  Rank 1 (waiting):
    Local signal:   0x430301600 (waits here)

Tensor-Based Version:

  Rank 0 → Rank 1:
    Local buffer:   0x430300a00  (src)
    Local buffer:   0x430300c00  (dst) ← this is wrong
    Local signal:   0x430300e00  (sig) ← this is wrong

  Rank 1 (waiting):
    Local signal:   0x430300e00 (waits here)

```

Next Steps: Need mechanism to resolve local tensor → remote PE address, equivalent to handle.buffer_ptrs[peer] lookup.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159788
Approved by: https://github.com/mandroid6, https://github.com/ngimel
ghstack dependencies: #158515, #158718, #159136, #159215, #159701, #159734, #159755, #159756
2025-08-08 05:20:42 +00:00
e0d8a315c5 [SymmMem] Add helpful docstrings for all NVSHMEM APIs (#159756)
Fed Claude Code NVSHMEM Documentation and asked it to generate helpful docstrings. Verified for correctness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159756
Approved by: https://github.com/mandroid6, https://github.com/ngimel
ghstack dependencies: #158515, #158718, #159136, #159215, #159701, #159734, #159755
2025-08-08 05:20:42 +00:00
bfff2e3592 [SymmMem] Refactor NVSHMEM Reduction API to be more ergonomic with automatic dtype‐based dispatch (#159755)
This change introduces a single, generic Triton‐extern wrapper for NVSHMEM team‐based reductions. We now expose one function, `nvshmem.reduce(team, dest, source, nreduce, operation, dtype_id)`, that covers all supported ops (sum, max, min, prod) and dtypes (int8…int64, uint8…uint64, float16, bfloat16, float32, float64).

It accepts real dtype objects (torch.dtype or tl.dtype) directly in the Triton kernel launch. Internally, we normalize dtype_id (handling tl.dtype, torch.dtype, str, or constexpr) into the canonical NVSHMEM typename and assemble the proper function name, e.g. nvshmem_float_sum_reduce or nvshmem_bfloat16_prod_reduce

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159755
Approved by: https://github.com/ngimel
ghstack dependencies: #158515, #158718, #159136, #159215, #159701, #159734
2025-08-08 05:20:36 +00:00
1c881440f4 [SymmMem] Initialize NVSHMEM module only for kernels that have nvshmem in their name (#159734)
Previously, a global post-compile hook initialized the NVSHMEM module for all Triton kernels, which was inefficient. This change conditionally initializes  `_nvshmemx_cumodule_init(kernel.module)` only for Triton kernels containing "nvshmem" in their name. Also updated the names for all of our nvshmem kernels to align with this.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159734
Approved by: https://github.com/ngimel
ghstack dependencies: #158515, #158718, #159136, #159215, #159701
2025-08-08 05:20:29 +00:00
7c4f7b9340 [SymmMem] Add Triton 3.4 support to NVSHMEM Triton and fix CI tests (make device library discoverable + fix peer calculation bug) (#159701)
This PR introduces support for Triton 3.4 and resolves several CI and test-related issues.

**Triton 3.4 Compatibility**
- The JIT post-compile hook has been updated from the legacy JITFunction.compiled_hook to the new API path at triton.knobs.runtime.jit_post_compile_hook.
- The internal parameter for kernel semantics in extern function definitions has been updated from _semantic to _builder to align with API changes.

**Fix CI Errors**
- The new logic inspects the RPATH of libtorch_nvshmem.so to find the NVSHMEM device library, preventing CI tests from being skipped.
- Added a decorator to run NVSHMEM tests only on H100s (compatible hardware)

**Peer Rank Calculation Fix**
- The peer calculation in test_nvshmem_triton.py was changed from peer = (world_size - 1) - rank to peer = 1 - rank.
Reasoning: The previous logic was only valid for a 2-rank setup. In the 8-rank CI environment, it incorrectly mapped peers (e.g., rank 0 to 7), breaking tests that assume a 0↔1 communication pattern. This was reproduced and validated on an 8-rank dev setup.
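
A tiny check of the arithmetic for the 8-rank CI setup:

```python
world_size = 8
for rank in (0, 1):
    print(rank, "old peer:", (world_size - 1) - rank, "new peer:", 1 - rank)
# old: 0 -> 7 and 1 -> 6; new: 0 -> 1 and 1 -> 0, the pattern the tests assume
```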

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159701
Approved by: https://github.com/ngimel
ghstack dependencies: #158515, #158718, #159136, #159215
2025-08-08 05:20:22 +00:00
1783d6e966 [SymmMem] Fix flaky wait_until test (#159215)
When playing around with it, I noticed some flakiness in this test across sessions.

After debugging, it turns out that the heavy sync primitives I was calling (like `nvshmem_quiet()` or `nvshmem_fence()`) from inside Triton kernels were causing deadlocks. The original test tried to guarantee ordering: `put(data) -> fence/quiet -> put(flag)`. But the GPU thread got stuck in `quiet()` waiting for network confirmation while holding the SM, creating a deadlock.

The fix was realizing `wait_until` already provides all the sync you need. Just do:
- PE A: `nvshmem_wait_until(&ivar, ...)`
- PE B: `nvshmem_put(&ivar_on_PE_A, ...)`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159215
Approved by: https://github.com/mandroid6, https://github.com/ngimel
ghstack dependencies: #158515, #158718, #159136
2025-08-08 05:20:16 +00:00
ea7fe0ecf6 [SymmMem] Standardize NVSHMEM Triton wrappers on byte-based APIs + improve code clarity (#159136)
Quick refactor for consistency and clarity.

1. We now standardize all NVSHMEM data-moving collectives (put, get, alltoall, broadcast) to use their byte-based *_mem_block variants. This makes the API behavior more predictable and avoids mixing paradigms.

2. Previously, some functions operated on element counts (nelems), while others expected byte sizes but still used `nelems` as the param name. That inconsistency was easy to miss and could lead to bugs, especially for devs not familiar with the NVSHMEM internals.

To clean this up:
- All byte-based APIs now use nbytes or nbytes_per_pe to make the units explicit.
- Typed APIs consistently use nelems for element counts.
- Docstrings were added or updated to clarify expected units.

Also did some code cleanup — removed unused functions, fixed typos in comments, and did some general housekeeping.

This should make the API more intuitive and reduce friction for developers.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159136
Approved by: https://github.com/mandroid6, https://github.com/ngimel
ghstack dependencies: #158515, #158718
2025-08-08 05:20:09 +00:00
b0b229b197 [SymmMem] Use _get_default_group() instead of group.WORLD for group_name access (#158718)
Both approaches functionally return the default process group created by `init_process_group()` but `_get_default_group()` is a dedicated function with [better error handling and type safety](4869f71170/torch/distributed/distributed_c10d.py (L1300-L1310)).
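For illustration, a minimal single-process sketch of the access pattern (the backend and init args here are placeholders):

```python
import torch.distributed as dist
from torch.distributed.distributed_c10d import _get_default_group

dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)

pg = _get_default_group()     # errors clearly if init_process_group was never called
print(pg.group_name)          # the name used for group_name access

dist.destroy_process_group()
```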

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158718
Approved by: https://github.com/Skylion007, https://github.com/fduwjj
ghstack dependencies: #158515
2025-08-08 05:20:02 +00:00
b5c937259b [SymmMem] Add NVSHMEM Reduction support (sum, min, max) into Triton (#158515)
Implements sum_reduce, min_reduce, and max_reduce collective operations for NVSHMEM Triton kernels. Enables parallel reduction computations across PE teams for int64 data types.

Tests: `python test/distributed/test_nvshmem_triton.py`

<details>
<summary> Quick debug print for sanity check </summary>

```markdown
============================================================
[Rank 1] Starting min/max reduction test with world_size=2
============================================================
============================================================
[Rank 0] Starting min/max reduction test with world_size=2
============================================================
[Rank 0] Source data for min/max: [10, 20]
[Rank 1] Source data for min/max: [15, 5]
[Rank 1] All values across PEs:
[Rank 0] All values across PEs:
  - Position 0: [10, 15]
  - Position 0: [10, 15]
  - Position 1: [20, 5]
  - Position 1: [20, 5]
[Rank 1] Expected min: [10, 5]
[Rank 0] Expected min: [10, 5]
[Rank 1] Expected max: [15, 20]
[Rank 0] Expected max: [15, 20]
[Rank 0] Executing MIN reduction...
[Rank 1] Executing MIN reduction...
[Rank 0] Executing MAX reduction...
[Rank 1] Executing MAX reduction...
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 1] Results:
[Rank 0] Results:
[Rank 1] MIN reduction result: [10, 5]
[Rank 1] MAX reduction result: [15, 20]
[Rank 0] MIN reduction result: [10, 5]
[Rank 0] MAX reduction result: [15, 20]
[Rank 1] ============================================================
[Rank 1] Min/Max reduction test PASSED ✓
[Rank 1] ============================================================
[Rank 0] ============================================================
[Rank 0] Min/Max reduction test PASSED ✓
[Rank 0] ============================================================
......
============================================================
============================================================
[Rank 0] Starting sum reduction test with world_size=2
[Rank 1] Starting sum reduction test with world_size=2
============================================================
============================================================
[Rank 0] Configuration:
[Rank 1] Configuration:
  - nreduce: 3 (number of separate reductions)
  - nreduce: 3 (number of separate reductions)
  - dtype: torch.int64
  - dtype: torch.int64
[Rank 1] Source data: [2, 4, 6]
[Rank 1] Contribution explanation:
[Rank 0] Source data: [1, 2, 3]
[Rank 0] Contribution explanation:
  - Element 0: 2 = (rank=1+1) * (index=0+1)
  - Element 0: 1 = (rank=0+1) * (index=0+1)
  - Element 1: 4 = (rank=1+1) * (index=1+1)
  - Element 1: 2 = (rank=0+1) * (index=1+1)
  - Element 2: 6 = (rank=1+1) * (index=2+1)
  - Element 2: 3 = (rank=0+1) * (index=2+1)
[Rank 1] Initial destination: [-1, -1, -1]
[Rank 0] Initial destination: [-1, -1, -1]
[Rank 0] Expected results after reduction: [3, 6, 9]
[Rank 1] Expected results after reduction: [3, 6, 9]
[Rank 0] Executing sum reduction...
[Rank 1] Executing sum reduction...
[Rank 1] Sum reduction completed
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 0] Sum reduction completed
/data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user.
  warnings.warn(  # warn only once
[Rank 0] Results after reduction:
[Rank 0] Destination buffer: [3, 6, 9]
[Rank 1] Results after reduction:
[Rank 0] Verification:
  - Reduction 0: PE0: 1 + PE1: 2 = 3
    Result: 3, Match: ✓
  - Reduction 1: PE0: 2 + PE1: 4 = 6
    Result: 6, Match: ✓
[Rank 1] Destination buffer: [3, 6, 9]
  - Reduction 2: PE0: 3 + PE1: 6 = 9
[Rank 1] Verification:
  - Reduction 0: PE0: 1 + PE1: 2 = 3
    Result: 9, Match: ✓
    Result: 3, Match: ✓
  - Reduction 1: PE0: 2 + PE1: 4 = 6
    Result: 6, Match: ✓
  - Reduction 2: PE0: 3 + PE1: 6 = 9
    Result: 9, Match: ✓
[Rank 0] ============================================================
[Rank 0] Sum reduction test PASSED ✓
[Rank 0] All 3 reductions computed correctly across 2 PEs
[Rank 0] ============================================================
[Rank 1] ============================================================
[Rank 1] Sum reduction test PASSED ✓
[Rank 1] All 3 reductions computed correctly across 2 PEs
[Rank 1] ============================================================
```

</details>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158515
Approved by: https://github.com/mandroid6, https://github.com/ngimel
2025-08-08 05:19:55 +00:00
24257f5bfa [vllm hash update] update the pinned vllm hash (#159822)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159822
Approved by: https://github.com/pytorchbot
2025-08-08 04:13:48 +00:00
017259f9c6 [benchmarks] Add nativert benchmark (#159922)
Add NativeRT as an option in the PT2 OSS benchmark

```
python ./benchmarks/dynamo/huggingface.py --performance --inference --export-nativert

python ./benchmarks/dynamo/timm_models.py --performance --inference --export-nativert

python ./benchmarks/dynamo/torchbench.py --performance --inference --export-nativert
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159922
Approved by: https://github.com/angelayi
2025-08-08 03:38:32 +00:00
2ea40fba84 [Linter] Improve device-bias linter by adding detection for with torch.device("cuda"). (#159926)
```
For example, detect the following situation:
>>>Lint for test/dynamo/test_modes.py:
  Error (TEST_DEVICE_BIAS) [device-bias]
    `@requires_gpu` function should not hardcode `with torch.device('cuda')`,
    suggest to use torch.device(GPU_TYPE)

        687  |            flex_attention as flex_attention_eager,
        688  |        )
        689  |
    >>> 690  |        with torch.device("cuda"):
        691  |            flex_attention = torch.compile(flex_attention_eager, dynamic=False)
        692  |
        693  |            with self.assertRaisesRegex(
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159926
Approved by: https://github.com/EikanWang, https://github.com/jansel
ghstack dependencies: #159759
2025-08-08 03:20:42 +00:00
beb4d7816d [BE]: ruff PLC0207 - use maxsplit kwarg (#160107)
Automatically replaces `split` with `rsplit` where relevant, and limits the split to the first (or last) occurrence via the `maxsplit` kwarg. This lets the split return early and improves efficiency.
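A small before/after illustration of the rule (the example string is mine):

```python
path = "archive.tar.gz"

# Before: splits on every "." even though only the last piece is used
ext = path.split(".")[-1]

# After (PLC0207): a single split from the right via the maxsplit kwarg
ext = path.rsplit(".", maxsplit=1)[-1]
assert ext == "gz"
```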

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160107
Approved by: https://github.com/albanD
2025-08-08 03:14:59 +00:00
3fcd79e023 Fix infinite loop when iterating over an empty zip (#159673)
Dynamo would enter an infinite recursion when `ZipVariable.next_variable(tx)` was called and there was no iterable left to iterate.
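A hypothetical repro of the shape described (not the exact test from the PR): iterating a `zip()` built from no iterables inside a compiled function.

```python
import torch

@torch.compile(backend="eager")
def f():
    count = 0
    for _ in zip():   # zip with no arguments is immediately exhausted
        count += 1
    return count

assert f() == 0   # with the fix, Dynamo terminates instead of recursing forever
```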

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159673
Approved by: https://github.com/williamwen42
2025-08-08 02:50:21 +00:00
05c417715f integrate kernacle into inductor (#160121)
This adds integration into inductor in two parts

1) It kicks off the best config lookup at lowering time within mm.py
2) It awaits the future at scheduling time in select_algorithm.py

Notably this does not do the following

1) Support for enumerating between mm, addmm and bmm
2) Support for enumerating between exhaustive/max
3) Enumerating different hardware SKUs eg. H100, A100, etc.

those will come in the next diffs

Differential Revision: [D79824921](https://our.internmc.facebook.com/intern/diff/D79824921/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160121
Approved by: https://github.com/izaitsevfb
2025-08-08 02:14:44 +00:00
ba4ccf5d67 turn on execution frame cleanup by default (#160110)
Summary: Turning execution frame cleanup back on since D78621408 is done

Test Plan:
See D78621408

Rollback Plan:

Differential Revision: D79730674

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160110
Approved by: https://github.com/jingsh
2025-08-08 02:13:48 +00:00
d68c323692 Log max_autotune exceptions (#159687) (#159688)
Summary:

Exceptions during autotune kernel precompilation are now systematically captured and reported via the chromium_event_logger, enabling better debugging and analysis of autotune failures.

Currently, exceptions are dumped to the console in the following format::
```
[0/0] RuntimeError: No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 262144 Hardware limit:232448 Reducing block sizes or `num_stages` may help.
[0/0] Runtime error during autotuning:
[0/0] No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 262144 Hardware limit:232448 Reducing block sizes or `num_stages` may help..
[0/0] Ignoring this choice.
```

The exception tracebacks:
```
# inner exception
traceback:
  File "/torch/_inductor/runtime/triton_heuristics.py", line 603, in _make_launchers
    launchers.append(result.make_launcher())
                     ^^^^^^^^^^^^^^^^^^^^^^
  File "/torch/_inductor/runtime/triton_heuristics.py", line 1503, in make_launcher
    self.kernel.load_kernel(device)
  File "/torch/_inductor/runtime/static_cuda_launcher.py", line 113, in load_kernel
    (self.function, self.n_regs, self.n_spills) = _StaticCudaLauncher._load_kernel(

# wrapped exception
traceback:
  File "/usr/local/fbcode/platform010/lib/python3.12/concurrent/futures/thread.py", line 59, in run
    result = self.fn(*self.args, **self.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<trimmed>#link-tree/torch/_inductor/select_algorithm.py", line 2596, in precompile_with_captured_stdout
    choice.precompile()
  File "<trimmed>#link-tree/torch/_inductor/select_algorithm.py", line 1881, in precompile
    self.bmreq.precompile()
  File "<trimmed>#link-tree/torch/_inductor/autotune_process.py", line 660, in precompile
    getattr(mod, self.kernel_name).precompile()
  File "<trimmed>#link-tree/torch/_inductor/runtime/triton_heuristics.py", line 440, in precompile
    self._make_launchers()
  File "<trimmed>#link-tree/torch/_inductor/runtime/triton_heuristics.py", line 608, in _make_launchers
    raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}")
```

With this change, the exception details will also be logged in the metadata of the `{name}_template_precompiling` event.

The format:
```
{
  "exceptions": [
    {
      "choice_type": "triton",
      "choice": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=5, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0",
      "exception_message": "No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 262144 Hardware limit:232448 Reducing block sizes or `num_stages` may help.",
      "exception": "OutOfMemoryError",
      "required_memory": "262144",
      "hardware_limit": "232448"
    }
  ]
}
```

Test Plan:
buck2 run //scripts/wychi:test_autotune_mm 2>&1 > /tmp/mylog.txt

Rollback Plan:

Differential Revision: D79420953

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159688
Approved by: https://github.com/stashuk-olek
2025-08-08 01:30:08 +00:00
03b254e49f Extend torch function support to ALL arguments, not just scalar type (but not insides of list) (#145089)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145089
Approved by: https://github.com/albanD, https://github.com/zou3519
2025-08-07 23:43:53 +00:00
195b5c2e27 Revert "dynamo: Remove passing or deleted dynamo_expected_failures (#159691)"
This reverts commit 36f46d082a4954921cb8493223f000f2aab79ed7.

Reverted https://github.com/pytorch/pytorch/pull/159691 on behalf of https://github.com/izaitsevfb due to breaking dynamo tests ([comment](https://github.com/pytorch/pytorch/pull/159691#issuecomment-3166067241))
2025-08-07 22:55:51 +00:00
f077c2402e [replicate][be] improved readability of test case description (#160128)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160128
Approved by: https://github.com/mori360
2025-08-07 22:51:58 +00:00
d46768db04 [MTIA] Allow users who know what they are doing to ignore all device mismatches in tracing and take a preferred device. (#159931)
Summary:
Device mismatches in tracing can most often be ignored. These are only logical mismatches not physical.

Take any intermediate computation, and that computation will not actually materialize in a compiled binary execution. So a device mismatch in the middle of the program is not real. The runtime will never materialize those tensors on CPU device during the execution, as they are temporary allocations.

If a user knows his tensors at graph input are all on the correct device, then he can ignore all tracing errors.

Users who know what they are doing should have an escape hatch to ignore any device mismatch in tracing.

Users can set
```
  torch._functorch.config.fake_tensor_prefer_device_type = 'mtia'
```
to forcefully override any mismatch and prefer the non cpu device. This unblocks vLLM graph mode for MTIA.

Test Plan:
Added two unit tests.

Rollback Plan:

Differential Revision: D79698438

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159931
Approved by: https://github.com/jansel
2025-08-07 22:37:15 +00:00
clr
36f46d082a dynamo: Remove passing or deleted dynamo_expected_failures (#159691)
partially generated with
```
for TESTCASE in $(ls | cut -f1 -d'.' | grep -v CPython | uniq); do if grep "$TESTCASE" -m 1 .. -r; then echo; else   sl rm "$TESTCASE"* ; fi; done
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159691
Approved by: https://github.com/xmfan
2025-08-07 21:41:50 +00:00
8147370733 Fix qembeddingbag_byte_prepack_meta to use sym_sizes (#159985)
Summary: In qembeddingbag_byte_prepack_meta, weight.sizes() would return a concrete int. We should use .sym_size() to return a SymInt instead.

Test Plan:
CI

Rollback Plan:

Reviewed By: kqfu, henryoier

Differential Revision: D79744512

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159985
Approved by: https://github.com/jerryzh168, https://github.com/henryoier
2025-08-07 21:22:29 +00:00
e619c6bb90 [export] Apply move_to_device_pass to all submodules (#159992)
Previously we only applied this move_to_device_pass to the toplevel graph. However if we have HOO, this pass will not be applied on the HOO submodules. This PR modifies the pass to run on all submodules.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159992
Approved by: https://github.com/yiming0416
2025-08-07 18:51:15 +00:00
3cf7b4024e [DTensor] Support user-supplied Generator for random ops (#159933)
If the user provides a generator kwarg to a random op (e.g.
nn.init.uniform_(..., generator=my_generator)), we can still advance
that generator's state in a SPMD-global way so that each local-tensor
gets appropriate values and the generator advances to the same state as
if it had operated on the full tensor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159933
Approved by: https://github.com/fduwjj, https://github.com/XilunWu, https://github.com/wanchaol
2025-08-07 18:47:22 +00:00
21392c0e06 [inductor] disable flex decoding on Windows. (#160072)
Discussed with @jianan-gu and @Valentine233 , disable flex decoding on Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160072
Approved by: https://github.com/angelayi
2025-08-07 18:07:36 +00:00
ee1fb43450 Fix docker image creation (#158634)
Since switching from wheel 0.34.2 to wheel 0.45.1
python symlinks are no longer correctly created.

Migrate to packaging package for symlink creation
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158634
Approved by: https://github.com/malfet
2025-08-07 17:41:47 +00:00
0bd3af4fb8 Further fix failing tests in test/inductor/test_analysis.py (#160070)
This is a follow up on #159800 as other tests are still failing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160070
Approved by: https://github.com/aorenste
2025-08-07 17:32:58 +00:00
8399cf88ce Use only safetensors APIs in HFStorageReader (#159681)
Get rid of the logic that manually reads the metadata from the header of the safetensors file, and instead use the functions exposed by safe_open() to get the metadata. This is much cleaner and lets us rely on the safetensors-provided APIs rather than our own custom methods for reading metadata.
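For reference, the safe_open() surface that replaces the manual header parsing looks roughly like this (the file name and contents are illustrative):

```python
import torch
from safetensors import safe_open
from safetensors.torch import save_file

save_file({"w": torch.zeros(2, 2)}, "demo.safetensors", metadata={"format": "pt"})

with safe_open("demo.safetensors", framework="pt") as f:
    print(f.metadata())              # header metadata, no manual byte parsing
    for key in f.keys():
        tensor = f.get_tensor(key)   # loads just this tensor
```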

Differential Revision: [D79460272](https://our.internmc.facebook.com/intern/diff/D79460272/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159681
Approved by: https://github.com/saumishr
ghstack dependencies: #159405, #159406
2025-08-07 17:23:03 +00:00
0b187b3114 DCP HF reader: use safe_open instead of reading the bytes (#159406)
Reading the bytes and converting them to tensors is much slower than using safe_open. For an 8B model across 8 ranks, loading took ~30s before this change and ~4s after.

Differential Revision: [D78994259](https://our.internmc.facebook.com/intern/diff/D78994259/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159406
Approved by: https://github.com/saumishr
ghstack dependencies: #159405
2025-08-07 17:23:03 +00:00
69cc606fda HF component update to not use fsspec components (#159405)
Update HF components to not inherit from fsspec components and instead use the filesystem writer/reader. The reason is that there doesn't seem to be much need for fsspec, since users are using mounted storage. Using local storage allows for performance improvements because we can take advantage of the safe_open API provided by HF safetensors (30s vs 4s to load an 8B model), which is a significant win over reading bytes and converting them to tensors, which is what we do today. Also, we can use the official methods provided by HF instead of relying on reading the metadata as bytes and loading it.

Differential Revision: [D78993550](https://our.internmc.facebook.com/intern/diff/D78993550/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159405
Approved by: https://github.com/saumishr
2025-08-07 17:22:54 +00:00
57f738b635 [inductor] move all cpu scalars using pinned memory for graph partition (#155360) (#158983)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158983
Approved by: https://github.com/eellison
ghstack dependencies: #158758
2025-08-07 17:07:26 +00:00
e167c7d0f3 [inductor] allocate non-blocking copy destinations in pinned memory (#155121) (#158758)
Fixes #155121
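The rationale, in eager terms (a sketch assuming a CUDA device is available): a device-to-host `copy_` with `non_blocking=True` can only overlap with other work when the CPU destination is pinned.

```python
import torch

src = torch.randn(1024, device="cuda")
dst = torch.empty(src.shape, dtype=src.dtype, device="cpu", pin_memory=True)
dst.copy_(src, non_blocking=True)   # truly asynchronous only because dst is pinned
torch.cuda.synchronize()            # wait for the copy before reading dst
```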

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158758
Approved by: https://github.com/EikanWang, https://github.com/eellison
2025-08-07 17:07:26 +00:00
b1a602762e [Profiler] Update README (#159816)
Summary: Updated README with code structure and explanation of core features within profiler

Test Plan:
N/A

Rollback Plan:

Differential Revision: D79604189

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159816
Approved by: https://github.com/sanrise, https://github.com/aaronenyeshi
2025-08-07 16:44:41 +00:00
e1cf0d496e [inductor] unification for inductor debug. (#159998)
Unify the inductor debug build, following @desertfire's suggestion: https://github.com/pytorch/pytorch/pull/159938#pullrequestreview-3093803196

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159998
Approved by: https://github.com/angelayi
2025-08-07 16:38:00 +00:00
06824f3c72 [inductor] fix test_dynamo_timed on Windows. (#159981)
Fixed `test_dynamo_timed`:
<img width="1030" height="389" alt="image" src="https://github.com/user-attachments/assets/02d84dd8-6a65-4f91-8d4c-48ba0a81fac1" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159981
Approved by: https://github.com/angelayi
2025-08-07 16:37:52 +00:00
f3a4d742ec Revert "Add DeviceAllocator as the base device allocator (#138222)"
This reverts commit f7a66da5f9f6b8b75119b1ee8ce9ddc23e15570e.

Reverted https://github.com/pytorch/pytorch/pull/138222 on behalf of https://github.com/jithunnair-amd due to Broke ROCm periodic runs on MI300 e.g. https://github.com/pytorch/pytorch/actions/runs/16764977800/job/47470050573 ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3164941815))
2025-08-07 16:34:36 +00:00
74da2604c9 Revert "Add unified memory APIs for torch.accelerator (#152932)"
This reverts commit 15f1173e5d72d6d45faba4cecd135e0160f06c6f.

Reverted https://github.com/pytorch/pytorch/pull/152932 on behalf of https://github.com/jithunnair-amd due to Broke ROCm periodic runs on MI300 e.g. https://github.com/pytorch/pytorch/actions/runs/16764977800/job/47470050573 ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3164941815))
2025-08-07 16:34:36 +00:00
c4e64467b5 Revert "Add UT for torch.accelerator memory-related API (#155200)"
This reverts commit 4604f0482c2b4a3001b62e5bc5085149a9bb053c.

Reverted https://github.com/pytorch/pytorch/pull/155200 on behalf of https://github.com/jithunnair-amd due to Broke ROCm periodic runs on MI300 e.g. https://github.com/pytorch/pytorch/actions/runs/16764977800/job/47470050573 ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3164941815))
2025-08-07 16:34:36 +00:00
90b78ee50f Move xla jobs to unstable workflow (#159272)
Disables the job on PRs completely, so that we don't litter people's CI signals and use machines unnecessarily.

If you want to run these xla tests, add the ciflow/unstable label to your PR
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159272
Approved by: https://github.com/atalman, https://github.com/malfet
2025-08-07 16:22:52 +00:00
e248719ac0 [DTensor] support _StridedShard in view op (#159656)
**Summary**
Some thoughts on view-op and `_StridedShard` interaction:
1. `_StridedShard` has no impact on sharding (i.e. how tensor is partitioned)
compared to `Shard`. It only changes how shards permute across the devices.
2. `view()` op on DTensor strictly forbids shard redistribution which means if
`view()` may cause shard permutation across devices, it should be rejected.
This is enforced in today's sharding prop for `view()`.
3. Since DTensor `view()` won't introduce any redistribution, it's certain that
`placements` won't change except the inner `dim` attribute of `Shard`
or `_StridedShard`.

Therefore, to support `_StridedShard` in `view()` op, the only change required
is to keep `_StridedShard` as `_StridedShard` in the output spec.

**Test**
`pytest test/distributed/tensor/test_view_ops.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159656
Approved by: https://github.com/wconstab
2025-08-07 15:59:25 +00:00
f60454cce8 S390X: update test dependencies (#158636)
numba currently doesn't build from source due to
https://github.com/numba/numba/pull/10073
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158636
Approved by: https://github.com/malfet
2025-08-07 15:58:30 +00:00
8ab5868a21 Actually run the einops tests in CI (#159776)
The test filter was wrong, it should not start with "test/".

Test Plan:
- wait for CI
- Tested locally with `python test/run_test.py --einops --verbose`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159776
Approved by: https://github.com/atalman, https://github.com/StrongerXi
2025-08-07 15:23:06 +00:00
d20c4c20e6 [CI] Update xpu ci use rolling driver for new features (#158340)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158340
Approved by: https://github.com/seemethere

Co-authored-by: xinan.lin <xinan.lin@intel.com>
2025-08-07 15:18:51 +00:00
83875cdb55 [nativert] Expose ModelRunner to public through pimpl type ModelRunnerHandle. (#159989)
Summary:
Today users outside of pytorch core cannot `#include <torch/nativert/ModelRunner.h>`.

It turns out that we should place a header inside `torch/csrc/api/include/`. Placing every single nativert header here would pollute the namespace a lot and that's not what we want in general. Therefore here we just create a Handle type which hold a pointer to decouple the actual type from header definition.

Test Plan:
CI

Rollback Plan:

Differential Revision: D79751098

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159989
Approved by: https://github.com/dolpm
2025-08-07 14:23:21 +00:00
a53d14d5f8 Revert "unskipped mobilenet_v3 quantization and mobilenet_v2 quantization plus tests from https://github.com/pytorch/pytorch/issues/125438 (#157786)"
This reverts commit 3a2c3c8ed365eb4e4cf4620c25d70b2f70483762.

Reverted https://github.com/pytorch/pytorch/pull/157786 on behalf of https://github.com/albanD due to Breaks lint ([comment](https://github.com/pytorch/pytorch/pull/157786#issuecomment-3164126250))
2025-08-07 13:09:33 +00:00
8cb91e20bc Renaming HAS_XPU to HAS_XPU_AND_TRITON (#159908)
This PR follows up on the discussion in #159399 where @Akabbaj and @janeyx99 mentioned renaming HAS_XPU to HAS_XPU_AND_TRITON for consistency.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159908
Approved by: https://github.com/janeyx99, https://github.com/guangyey
2025-08-07 11:24:44 +00:00
b0df7715e8 Remove benchmark dependencies from regular ROCm CI images (#160047)
Instead, use a new `pytorch-linux-jammy-rocm-n-py3-benchmarks` image for Docker benchmark job.  This addresses 2 issues:

* The current ROCm failures in trunk w.r.t librosa version https://github.com/pytorch/pytorch/actions/runs/16789466749/job/47549950994 that TorchBench pulls in.
* Reduce the size of the regular ROCm CI images by removing TorchBench models, which is needed only for benchmarking jobs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160047
Approved by: https://github.com/malfet, https://github.com/izaitsevfb
2025-08-07 09:26:58 +00:00
422bd6808b dataclass pytree fix (#159916)
Differential Revision: D79687243

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159916
Approved by: https://github.com/XuehaiPan, https://github.com/angelayi
2025-08-07 08:22:41 +00:00
24f43d0da7 [inductor] [cpu] fix the dype hardcoded to int64 in store_reduction (#157904)
## Fixes https://github.com/pytorch/pytorch/issues/157683

## mini repro
* Just copy the code from the issue to reproduce it.
```python
import torch

device = "cpu"

# Input tensors
v2_0 = torch.randn(16, 24, 59, dtype=torch.complex64, device=device)
v3_0 = torch.randn(16, 24, 59, dtype=torch.complex64, device=device)

def my_model(v2_0, v3_0):
    v6_0 = -v3_0
    v4_0 = v2_0 * v3_0
    v1_0 = v4_0.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
    v0_0 = v2_0.to(torch.int32)
    v5_0 = v0_0.amax(dim=0)

    return v6_0, v4_0, v1_0, v0_0, v5_0

v6_0, v4_0, v1_0, v0_0, v5_0 = my_model(v2_0, v3_0)
print("v6_0", v6_0.shape)
print("v4_0", v4_0.shape)

compiled_model = torch.compile(my_model, backend="inductor")

v6_0, v4_0, v1_0, v0_0, v5_0 = compiled_model(v2_0, v3_0)

print("v6_0", v6_0.shape)
print("v4_0", v4_0.shape)
print("v1_0", v1_0.shape)
print("v0_0", v0_0.shape)
print("v5_0", v5_0.shape)

```
error_stack
```
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note: candidate: ‘template<class dst_t, class src_t> std::enable_if_t<(! is_same_v<dst_t, src_t>), at::vec::CPU_CAPABILITY::Vectorized<T> > at::vec::CPU_CAPABILITY::convert(const at::vec::CPU_CAPABILITY::Vectorized<T>&)’
   41 | convert(const Vectorized<src_t>& src) {
      | ^~~~~~~
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note:   template argument deduction/substitution failed:
/tmp/torchinductor_admin/6k/c6kr65o43rlmp2cmkpn5ezewhe5bla4w72hpcrg5biyelrs4skyw.main.cpp:37:99: error: wrong number of template arguments (should be 2, not 4)
   37 |                     auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
```
## summary
**The C++ kernel generated by the Inductor had the wrong data type for the output variable; it should be int32_t instead of int64_t. This incorrect data type led to an incompatible data type conversion, which caused the g++ compilation to fail.**
The original code that caused the problem.
```
def my_model(v2_0, v3_0):
    v6_0 = -v3_0
    v4_0 = v2_0 * v3_0
    v1_0 = v4_0.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
    v0_0 = v2_0.to(torch.int32)
    // The original code that caused the problem.
    v5_0 = v0_0.amax(dim=0)
```

## proof procedure
The c++ kernel generated by inductor:
```c++
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void kernel(const int32_t* in_ptr0,
                       int32_t* out_ptr0)
{
    {
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1416L); x0+=static_cast<int64_t>(16L))
        {
            {
                int32_t tmp_acc0_arr[16];
                for (int i = 0; i < 16; i++)
                {
                    tmp_acc0_arr[i] = std::numeric_limits<int32_t>::min();
                }
                int32_t tmp_acc0 = std::numeric_limits<int32_t>::min();
                at::vec::Vectorized<int32_t> tmp_acc0_vec = at::vec::Vectorized<int32_t>(std::numeric_limits<int32_t>::min());
                for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L))
                {
                    {
                        if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1408L)))
                        {
                            auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<int64_t>(x0 + 1416L*x1), static_cast<int64_t>(16));
                            tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0);
                        }
                        if(C10_UNLIKELY(x0 >= static_cast<int64_t>(1408L) && x0 < static_cast<int64_t>(1416L)))
                        {
                            for (int64_t x0_tail = static_cast<int64_t>(1408L);x0_tail < static_cast<int64_t>(1416L); x0_tail++)
                            {
                                auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail + 1416L*x1)];
                                tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)] = max_propagate_nan(tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)], tmp0);
                            }
                        }
                    }
                }
                if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1408L)))
                {
                   // impossible data type conversion which would caused the g++ compilation to fail.
                    auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
                    int32_t_tmp_acc0_vec.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                }
                if(C10_UNLIKELY(x0 >= static_cast<int64_t>(1408L) && x0 < static_cast<int64_t>(1416L)))
                {
                    for (int64_t x0_tail = static_cast<int64_t>(1408L);x0_tail < static_cast<int64_t>(1416L); x0_tail++)
                    {
                        out_ptr0[static_cast<int64_t>(x0_tail)] = tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)];
                    }
                }
            }
        }
    }
}
```
the compiler complains
```text
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note: candidate: ‘template<class dst_t, class src_t> std::enable_if_t<(! is_same_v<dst_t, src_t>), at::vec::CPU_CAPABILITY::Vectorized<T> > at::vec::CPU_CAPABILITY::convert(const at::vec::CPU_CAPABILITY::Vectorized<T>&)’
   41 | convert(const Vectorized<src_t>& src) {
      | ^~~~~~~
/home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: note:   template argument deduction/substitution failed:
/tmp/torchinductor_admin/6k/c6kr65o43rlmp2cmkpn5ezewhe5bla4w72hpcrg5biyelrs4skyw.main.cpp:37:99: error: wrong number of template arguments (should be 2, not 4)
   37 |                     auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
```
so the following line has the problem
```c++
    // this line means that tmp_acc0_vec should be Vectorized<int64_t>, and it will convert it to Vectorized<int32_t>.
    auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec);
```
The issue is that tmp_acc0_vec is of type Vectorized<int32_t>, but the template parameters expect it to be Vectorized<int64_t> and convert it to Vectorized<int32_t>. This is a conflict: the conversion should not exist, because tmp_acc0_vec is already Vectorized<int32_t>. The following line hardcodes the output variable type to int64, which causes the unnecessary and incorrect type conversion.
d89f30ad45/torch/_inductor/codegen/cpp.py (L2985-L2993)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157904
Approved by: https://github.com/jgong5
2025-08-07 08:03:05 +00:00
aa75e917bd [Export Schema] Remove deviceAllocationMap field (#159653)
Summary:
This field is not used today, and it's not useful either.

The device allocation is configured at model loading time, specified by user.
It shouldn't be part of the model definition.

Test Plan:
CI

Rollback Plan:

Differential Revision: D79385513

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159653
Approved by: https://github.com/zhxchen17
2025-08-07 07:31:42 +00:00
3f1636ebef [audio hash update] update the pinned audio hash (#160046)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160046
Approved by: https://github.com/pytorchbot
2025-08-07 04:16:35 +00:00
c859ba7114 Make onnx export SDPA match aten behavior (#159973)
This PR makes onnx sdpa export match the behavior of aten sdpa when boolean mask is used.
@justinchuby

```python
import onnxruntime as ort
import torch

class ScaledDotProductAttention(torch.nn.Module):
    def forward(self, query, key, value, attn_mask):
        return torch.nn.functional.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask)

model = ScaledDotProductAttention()
attn_mask = torch.ones(2, 4, 8, 8).bool()  # boolean mask for attention
attn_mask[0, 0, 0, :] = False  # masking an entire row (padding token)
query = key = value = torch.randn(2, 4, 8, 16)
output = model(query, key, value, attn_mask)

torch.onnx.export(
    model,
    (query, key, value, attn_mask),
    "scaled_dot_product_attention.onnx",
    input_names=["query", "key", "value", "attn_mask"],
    output_names=["output"],
    dynamo=False,  # or True
)
ort_session = ort.InferenceSession("scaled_dot_product_attention.onnx")

np_inputs = {"query": query.numpy(), "key": key.numpy(), "value": value.numpy(), "attn_mask": attn_mask.numpy()}
onnx_outputs = ort_session.run(None, np_inputs)[0]

torch.testing.assert_close(output, torch.tensor(onnx_outputs), equal_nan=True)
```
fails the assertion because the ort model outputs nans.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159973
Approved by: https://github.com/xadupre, https://github.com/titaiwangms
2025-08-07 04:06:07 +00:00
d4c1a08c89 Relax unclaimed successes in dtype op tests when running under TEST_WITH_DYNAMO/TEST_WITH_INDUCTOR (#159976)
This PR changes the behavior for compile wrapped op tests:
- supported_but_unclaimed_forward
- supported_but_unclaimed_backward

These typically manifest when the op doesn't support inputs of certain dtypes. But under torch.compile, Dynamo/AOTAutograd will trace the graph with FakeTensors, which @ezyang and @eellison tell me need to run decomps before op dispatch. The decomp may map this test to a different op, one that does support the dtype. I suspect all of our failures here are due to decomps, and so I propose to just disable this check for compile.

~~TODO: re-enable all the failed tests.~~ jk there were no failed tests outside of compiled autograd due to this.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159976
Approved by: https://github.com/ezyang
2025-08-07 02:38:45 +00:00
81d72fb1f7 Move smoke binary builds to 3.12 (#159993)
And limit them just to stable CUDA version (as there weren't any recent instances when only one of those jobs failed to build)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159993
Approved by: https://github.com/ngimel
ghstack dependencies: #159986, #159990
2025-08-07 01:59:30 +00:00
d0226719a9 [BE][EZ] Delete remains of split-build logic (#159990)
Hopefully last piece of https://github.com/pytorch/pytorch/issues/138750

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159990
Approved by: https://github.com/atalman
ghstack dependencies: #159986
2025-08-07 01:59:30 +00:00
38d65c6465 Add a USE_NIGHTLY option to setup.py (#159965)
If you run python setup.py develop with USE_NIGHTLY, instead of actually building PyTorch we will just go ahead and download the corresponding nightly version you specified and dump its binaries. This is intended to obsolete tools/nightly.py. There's some UX polish for detecting what the latest nightly is if you pass in a blank string. I only tested on OS X.

Coded with claude code.

Signed-off-by: Edward Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159965
Approved by: https://github.com/malfet
2025-08-07 01:44:20 +00:00
2ba2f598f3 [Dynamo] Add torch.xpu.stream to trace rules (#159844)
# Motivation
Previously, I thought using `with stream:` was sufficient. However, many older scripts still use `torch.xpu.stream` as the context manager. To maintain backward compatibility, I had to include `torch.xpu.stream` in the trace rules.
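A sketch of the older spelling this change keeps traceable (assumes an XPU build and device):

```python
import torch

s = torch.xpu.Stream()
x = torch.ones(4, device="xpu")

@torch.compile
def step(t):
    return t * 2

with torch.xpu.stream(s):    # the older context-manager spelling, now in the trace rules
    step(x)
```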

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159844
Approved by: https://github.com/jansel
2025-08-07 01:35:50 +00:00
1bb5e6c076 update expected results (#159867)
refresh due to https://github.com/pytorch/pytorch/pull/159696

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159867
Approved by: https://github.com/masnesral
2025-08-07 01:18:36 +00:00
8b0be7b65a [Profiler] Fix unexpected C return events (#159574)
The fix in https://github.com/pytorch/pytorch/pull/155446 addressed the "stack empty" issue that's easily reproducible on CPython 3.12.0-4. While this issue can also appear in other versions, it's not as easy to reproduce there.

I recently found a new cause for this problem.

1df5d00145/Python/ceval.c (L5807-L5836)

In the CPython 3.10 implementation, PyTrace_C_CALL and PyTrace_C_RETURN/PyTrace_C_EXCEPTION are supposed to appear in pairs. However, when c_profilefunc is changed, unexpected PyTrace_C_RETURN/PyTrace_C_EXCEPTION events can occur.

Here is the code to reproduce this problem.

```
import threading
import time
import torch

from threading import Event, Lock

lock = Lock()
lock.acquire()

event1 = Event()
event2 = Event()
event3 = Event()

def run():
    event1.set()
    event2.wait()
    lock.acquire()
    event3.set()

threading.Thread(target=run).start()

with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU], with_stack=True):
    event1.wait()
    event2.set()
    time.sleep(1)

with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU], with_stack=True):
    lock.release()
    event3.wait()
```

<img width="1766" height="1250" alt="image" src="https://github.com/user-attachments/assets/6794eeca-7364-429e-91eb-62cdad116bd3" />

To fix this problem, we can record active_frames_ and remaining_start_frames_ for each thread, and when the PyTrace_C_RETURN/PyTrace_C_EXCEPTION event occurs, we can determine whether to record this event based on these two fields.

In reality, even without this fix, the final data appears to be right since the match process can handle this case (it would just result in an exception log being printed).

Do you think the fix is necessary?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159574
Approved by: https://github.com/sraikund16
2025-08-07 01:17:55 +00:00
5cedc5a0ff [BE][PYFMT] migrate PYFMT for torch/[p-z]*/ to ruff format (#144552)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/144552
Approved by: https://github.com/ezyang
2025-08-07 00:09:56 +00:00
fd606a3a91 [dynamo] update pytorch-labs -> meta-pytorch in graph break URLs (#159975)
Related PR: https://github.com/meta-pytorch/compile-graph-break-site/pull/30

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159975
Approved by: https://github.com/Lucaskabela
2025-08-06 23:57:31 +00:00
3daef4d128 [dynamo] Trace nn.Module __delattr__ (#159969)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159969
Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/StrongerXi
2025-08-06 23:43:19 +00:00
cb4b29b754 Revert "[pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#159874)"
This reverts commit 9fd5b5f73589cf08dca60910368cc0f05c7906c8.

Reverted https://github.com/pytorch/pytorch/pull/159874 on behalf of https://github.com/malfet due to Broke lint ([comment](https://github.com/pytorch/pytorch/pull/159874#issuecomment-3161896978))
2025-08-06 23:21:29 +00:00
a6bc296207 [FlexAttention] Update the guard semantics for divisibility (#159884)
We don't add guards unless we know (and another guard has ensured this) that this is a safe optimization

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159884
Approved by: https://github.com/Chillee
2025-08-06 23:12:44 +00:00
64dc30c213 [HOP, map] Rework of map autograd to the new interface (#153343)
This PR reworks the current autograd implementation of map to the new interface.

@pytorchbot label "topic: not user facing"

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153343
Approved by: https://github.com/ydwu4
2025-08-06 23:02:42 +00:00
93da9952a7 gloo: fix building system gloo with CUDA/HIP (#146637)
Fix incorrect linking of Gloo's libraries when building with system Gloo. Previously, either Gloo's native library or Gloo's CUDA library was linked. However, Gloo has changed such that all users of Gloo must link the native library, and can optionally link the CUDA or HIP library for Gloo + CUDA/HIP support.
This had been updated when building/linking with vendored Gloo, but not when using system Gloo.

Fixes: #146239

Reported-by: Adam J Stewart <ajstewart426@gmail.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146637
Approved by: https://github.com/malfet
2025-08-06 22:56:31 +00:00
3a2c3c8ed3 unskipped mobilenet_v3 quantization and mobilenet_v2 quantization plus tests from https://github.com/pytorch/pytorch/issues/125438 (#157786)
These tests now pass on AArch64 in our downstream CI.

`test_quantization.py::TestNumericSuiteEager::test_mobilenet_v2 <- test/quantization/eager/test_numeric_suite_eager.py PASSED [2.4434s] [ 35%]`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157786
Approved by: https://github.com/jerryzh168, https://github.com/malfet
2025-08-06 22:41:07 +00:00
9fd5b5f735 [pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#159874)
Summary: Writing torch.compile worker logs to dedicated_log_rank{RANK} if we're running on mast.

Test Plan:
See: D79456310

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159874
Approved by: https://github.com/c00w
2025-08-06 22:33:04 +00:00
2507ae63f2 Partitioner: Fix to align partition node order with original graph (#157892)
Fixes #157891

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157892
Approved by: https://github.com/ezyang
2025-08-06 22:12:47 +00:00
40c4d61f9a [Dynamo][Better Engineering] Typing torch/_dynamo/guards.py (#159315)
As part of better engineering effort, we would like to improve out type support to improve dev experience in dynamo

This PR adds strict typing support to `torch/_dynamo/guards.py`

Running
```
mypy torch/_dynamo/guards.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Annotated | Lines Total | % lines covered | Funcs Annotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  2030 | 3945 | 51.46% | 70 | 138 | 50.72% |
| This PR | 4055 | 4055 | 100.00% | 138 | 138 | 100.00% |
| Delta    | +2025 | +90 | +48.54% | +68 | 0 | +49.28% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159315
Approved by: https://github.com/williamwen42, https://github.com/Skylion007
2025-08-06 21:52:14 +00:00
a5725965ea Remove unnecessary "# noqa: set_linter" comments (#159467)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159467
Approved by: https://github.com/eellison
2025-08-06 21:31:52 +00:00
289f62ce8a [inductor][ez] fixup scaled_mm (#159948)
Summary:

This reverts the part of #159383 for scaled_mm where now, like before,
we pass through the normal input_nodes (not the triton_input_nodes)
to select_algorithm

- #159383 refactored how kwargs are retrieved
- it introduced this notion of KernelInputs that wrap input_nodes
- scaled_mm uses unsqueezed input nodes for triton to retrieve params
- the issue: it uses a squeezed (regular) bias for select_algorithm
  instead

This fixes that by passing the original input nodes rather
than the triton input nodes.

Test Plan:

```
buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:fp8 -- --exact 'caffe2/test/inductor:fp8 - test_rowwise_scaling_shape_1024,1024,512_has_bias_True_use_fast_accum_True_persistent_matmul_False (caffe2.test.inductor.test_fp8.TestFP8Lowering)'
buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:fp8 -- --exact 'caffe2/test/inductor:fp8 - test_rowwise_scaling_shape_1024,1024,512_has_bias_True_use_fast_accum_True_persistent_matmul_True (caffe2.test.inductor.test_fp8.TestFP8Lowering)'
```

This set of tests was failing, and is passing now

Side note: these tests were failing I believe because the unsqueezed
bias made the ATEN choice no longer eligible, and there is some minor
numerical discrepancy between ATEN and Triton for this. I'm not sure
the test should be written like that, as we're implicitly relying on
ATEN being the choice here.

Differential Revision: [D79717654](https://our.internmc.facebook.com/intern/diff/D79717654)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159948
Approved by: https://github.com/izaitsevfb, https://github.com/eellison
2025-08-06 21:25:48 +00:00
512b4730e3 [EZ] Remove useless cross_compile_arm64 (#159986)
As we don't have any Intel Mac runners in CI for last 2+ years
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159986
Approved by: https://github.com/atalman
2025-08-06 21:01:05 +00:00
d2368aa6f3 [CPUBLAS] add macros for brgemm APIs for versioning (#158629)
**Summary**
Add macros for brgemm, so that callers (e.g., Torchao's cpp kernels) know which APIs are available. This is useful when callers need to work with older versions of PyTorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158629
Approved by: https://github.com/CaoE, https://github.com/Valentine233, https://github.com/ezyang
2025-08-06 20:54:05 +00:00
0afaeb7c4e Improve extract_test_fn (#158637)
The current implementation assumes test functions are resolved as test_module.TestClass.test_fn; however, this does not work for modules nested in directories, e.g. inductor.test_torchinductor.TestClass.test_fn.
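One way to see the difference, as a standalone sketch (not the actual helper): resolving the components from the right keeps nested module paths intact.

```python
test_id = "inductor.test_torchinductor.TestClass.test_fn"

# Naive left-to-right split assumes exactly module.Class.fn and breaks here
parts = test_id.split(".")
assert len(parts) == 4   # the module path itself contains a "."

# Splitting from the right keeps the nested module path together
module_path, class_name, fn_name = test_id.rsplit(".", maxsplit=2)
assert module_path == "inductor.test_torchinductor"
```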
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158637
Approved by: https://github.com/jbschlosser
2025-08-06 20:45:21 +00:00
50580b5053 Add minimal nn.functional.log_softmax support for NestedTensor (#159662)
This only works for the jagged layout and for the non-batch and non-jagged dimensions.

I did this mostly by copy-pasting from the existing softmax implementation, but it seems fairly straightforward and I think it should work.
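A minimal usage sketch (jagged layout, reducing over the last, non-jagged dimension; tensor shapes are illustrative):

```python
import torch
import torch.nn.functional as F

nt = torch.nested.nested_tensor(
    [torch.randn(3, 5), torch.randn(4, 5)], layout=torch.jagged
)
out = F.log_softmax(nt, dim=-1)   # dim=-1 is neither the batch nor the jagged dim
```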
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159662
Approved by: https://github.com/jbschlosser
2025-08-06 20:34:02 +00:00
b8ef60b6bc Enable XNNPACK aarch64 builds (#159762)
Summary:
This fixes the build of TorchScript's XNNPACK dependency for our aarch64 device.

Thanks to andrewjcg for proposing this fix.

Rollback Plan:

Reviewed By: andrewjcg

Differential Revision: D79497613

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159762
Approved by: https://github.com/frankseide, https://github.com/malfet

Co-authored-by: Frank Seide <seide@meta.com>
2025-08-06 20:20:32 +00:00
0de2a45a48 [BE] Merge 3 CUDA build jobs into one (#159890)
Before this change there were build+test jobs:
 - s89 build+tests
 -  sm75 build+distributed_test
 - sm_75 build+pr_time_benchmark test
This change compiles all 3 builds into one (for 2 architectures) and skips testing sm86 as it never found any new regressions that were not found at the same time on sm89
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159890
Approved by: https://github.com/clee2000, https://github.com/seemethere
2025-08-06 20:09:55 +00:00
12a54e4ac1 [Inductor UT][Fix XPU CI] Fix case failures introduced by community. (#159759)
Fixes #159631

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159759
Approved by: https://github.com/EikanWang, https://github.com/jansel
2025-08-06 20:02:20 +00:00
d10e9e4781 [MPS] Remove all pre-MacOS14 logic (#159912)
Delete older enums, checks for MacOS-13.3+ for int64 support, etc

Fixes https://github.com/pytorch/pytorch/issues/159275
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159912
Approved by: https://github.com/manuelcandales
2025-08-06 19:48:12 +00:00
c71950907d [inductor] add _get_inductor_debug_symbol_cflags for debug symbol control. (#159938)
We need inductor debug symbol support for debugging crash cases. When debug symbol generation is turned on:
On Windows, it creates a [module_name].pdb file, which helps debugging with WinDbg.
On Linux, it creates debug sections in the binary file.

I also added a UT for it.

It works well for Windows inductor debugging:
<img width="1648" height="833" alt="image" src="https://github.com/user-attachments/assets/5282a7de-cef3-4a38-9cd4-a0e63482c8b6" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159938
Approved by: https://github.com/jansel, https://github.com/angelayi
2025-08-06 19:31:45 +00:00
6fa3592dc6 Dataloader benchmark script (#159432)
This script adds a simple dataloading benchmark tracking throughput and memory.

The output looks like this
```
System Information:
  PyTorch version: 2.9.0a0+gitf87d117
  PyTorch location: /home/divyanshkhanna/pytorch/torch/__init__.py
  Torchvision version: 0.24.0a0+f52c4f1
  Torchvision location: /home/divyanshkhanna/pytorch/vision/torchvision/__init__.py
  CUDA available: True
  CUDA device: NVIDIA PG509-210
  CPU count: 192
  Physical CPU cores: 96
  Total system memory: 1510.11 GB

Loading dataset from imagenet/val (1 copies)
Dataset size: 50000

--- Benchmarking DataLoader with worker_method=multiprocessing ---
Memory before DataLoader creation: 500.59 MB

Detailed memory information:
  USS (Unique Set Size): 499.00 MB
  PSS (Proportional Set Size): 500.74 MB
  RSS (Resident Set Size): 497.39 MB
Memory after DataLoader creation: 1127.61 MB
Memory increase: 627.02 MB
Starting training loop with 1 epochs (max 100 batches per epoch)
Epoch 1, Batch 10, Time: 0.2910s, Memory: 12044.50 MB
Epoch 1, Batch 20, Time: 0.2909s, Memory: 12185.71 MB
Epoch 1, Batch 30, Time: 0.2909s, Memory: 10654.93 MB
Epoch 1, Batch 40, Time: 0.2909s, Memory: 12378.26 MB
Epoch 1, Batch 50, Time: 0.2907s, Memory: 12402.28 MB
Epoch 1, Batch 60, Time: 0.2909s, Memory: 10559.35 MB
Epoch 1, Batch 70, Time: 0.2907s, Memory: 12644.69 MB
Epoch 1, Batch 80, Time: 0.2909s, Memory: 12654.65 MB
Epoch 1, Batch 90, Time: 0.2909s, Memory: 12727.20 MB
Epoch 1, Batch 100, Time: 0.2908s, Memory: 12722.09 MB

Results:
  Worker method: multiprocessing
  DataLoader init time: 0.1553 seconds
  Average batch time: 0.3408 seconds
  Samples per second: 375.53
  Peak memory usage: 12738.76 MB
  Memory increase: 12238.17 MB
```

> TODO: This script right now is CPU-only friendly and GPU friendly. But it might be worth upgrading it to test against a canonical DistributedDataParallel setup on say a 1x8 node. Or maybe we can keep that as a separate script inside `benchmarks`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159432
Approved by: https://github.com/ramanishsingh
2025-08-06 19:05:19 +00:00
ba37f589d4 Revert "[dynamo] Be consistent with storing func source for UserMethodVariable (#159696)"
This reverts commit ee62177c196d716fc3a2d641370bed8a673a45d3.

Reverted https://github.com/pytorch/pytorch/pull/159696 on behalf of https://github.com/anijain2305 due to broke internal tests ([comment](https://github.com/pytorch/pytorch/pull/159696#issuecomment-3161196192))
2025-08-06 18:41:05 +00:00
44dd3684d2 [AOTI] Fix memory leak from all_reduce (#159818)
Summary: This PR solves two issues:

1. When lowering the all_reduce op, Inductor expects to convert it to the in-place version, all_reduce_, but it was calling ir._AllReduceKernel.create_inplace instead of ir._AllReduce_Kernel.create_inplace. This triggers a tricky bug in AOTI because it generates a cpp call to the functional version aoti_torch_cpu__c10d_functional_all_reduce, but the corresponding wait operation later still waits on the input to aoti_torch_cpu__c10d_functional_all_reduce instead of its output. This leaves an unwaited tensor, leading to a memory leak.

2. Since AOTI now generates the in-place version aoti_torch_cpu__c10d_functional_all_reduce_, the tensor returned from it doesn't get used. It is released when the program exits, so it's not a memory leak, but it unnecessarily holds that tensor, raising the memory high-water mark. This PR generates a tensor delete operation right after the call to aoti_torch_cpu__c10d_functional_all_reduce_.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159818
Approved by: https://github.com/henryhu6, https://github.com/yushangdi
2025-08-06 18:11:14 +00:00
c669b0ab87 Fix execution frame cleanup logic (#158717)
Summary: This fixes a bug in the execution frame cleanup logic. Previously, whenever we hit the time interval to clear out the frames, we removed any cached execution frames beyond the configured minimum number (frameEntry.used was unused). Instead, we only want to clear frames that were NOT USED during the last time interval. This diff refactors the executor to have the correct logic.
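In pseudo-Python (a generic sketch of the described policy, not the actual C++ executor): keep the configured minimum, and beyond that evict only frames that went unused during the last interval.

```python
def cleanup_frames(frames, min_cached, interval_start):
    # frames: list of dicts like {"frame": ..., "last_used": timestamp}
    kept = frames[:min_cached]
    for entry in frames[min_cached:]:
        if entry["last_used"] >= interval_start:   # used during the last interval
            kept.append(entry)
    return kept
```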

Test Plan:
```
buck2 test 'mode/dev-nosan' fbcode//sigmoid/inference/test_gpu:model_runner_test -- ModelRunnerTest.Basic_InterpreterCuda_Multithread_Cleanup --run-disabled --print-passing-details
```

Rollback Plan:

Differential Revision: D78621408

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158717
Approved by: https://github.com/dolpm
2025-08-06 18:04:24 +00:00
d7a855d67d [async-TP] Make scaled-mm + reduce-scatter preserve alignment of scales (#159957)
After https://github.com/pytorch/pytorch/pull/157905 started using cuBLAS for row-wise scaling on CUDA 12.9+, this broke some downstream tests for fp8 which were testing "odd" shapes. After checking in with the cuBLAS team this turned out to be due to the scale tensors' starting addresses not being aligned to 16 bytes. PyTorch storages are always aligned at 256 bytes, hence this came from a "slicing" of the scale tensor being done inside async-TP when chunking a matmul in order to overlap it with reduce-scatter.
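The failure mode is easy to see in eager (a standalone sketch): the storage start is well aligned, but a sliced view of the scale tensor need not be 16-byte aligned.

```python
import torch

scales = torch.randn(1024, dtype=torch.float32)   # freshly allocated storage is well aligned
chunk = scales[3:]                                 # the kind of slice chunking can produce

print(scales.data_ptr() % 16)   # 0
print(chunk.data_ptr() % 16)    # 12 -> start address is not 16-byte aligned
```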

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159957
Approved by: https://github.com/vkuzo, https://github.com/danielvegamyhre
2025-08-06 17:42:26 +00:00
4c01991b38 [DCP][Prototype] Checkpoint replication via PGTransport (#157963) (#159801)
Summary:

### PR Context

Introduce simple replication logic via PGTransport. The goal is to showcase a working prototype of replication via PGTransport, in this impl we assume world_sizes are equal allowing us to create perfect bi-directional pairs for the purpose of choosing replica "partners".

Test Plan:
CI

Rollback Plan:

Differential Revision: D79590797

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159801
Approved by: https://github.com/saumishr
2025-08-06 16:52:03 +00:00
a4b07fe8f6 [AOTI] Add more default options to compile_standalone (#158560)
Summary: When compiling for standalone, make embed_kernel_binary and emit_multi_arch_kernel default to True, and add a default name for model_name_for_generated_files to make the generated cpp project easier to understand. Also improved the weights object file naming to be more readable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158560
Approved by: https://github.com/yushangdi
2025-08-06 15:59:27 +00:00
d87161c3c8 [Easy] Fix wrong propagation of fallback_ops_dict in gen_aoti_c_shim (#159904)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159904
Approved by: https://github.com/janeyx99
2025-08-06 15:09:18 +00:00
79eca4677b [precompile] Skip serializing unnecessary objects for guards. (#158926)
Summary:
The following types of objects don't need to be serialized for precompile:
1. PyCapsule, because we don't guard on C-binding objects in meaningful ways.
2. Code objects, because we only id-match on these, and id matches will always be dropped for precompile.
3. Nested function objects, since we also ban CLOSURE_MATCH.
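
A hedged sketch of the kind of filter this describes (the helper name and exact checks are illustrative, not the actual precompile code):

```python
import types

def should_serialize_for_precompile(obj) -> bool:
    # Mirrors the three categories listed above.
    if type(obj).__name__ == "PyCapsule":          # C-binding handles: no meaningful guards
        return False
    if isinstance(obj, types.CodeType):            # only id-matched; ids never survive reload
        return False
    if isinstance(obj, types.FunctionType) and obj.__closure__ is not None:
        return False                               # nested functions: CLOSURE_MATCH is banned
    return True
```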

Test Plan:
buck run mode/opt test/dynamo:test_dynamo -- -k test_skipped_objects

Rollback Plan:

Differential Revision: D78816888

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158926
Approved by: https://github.com/jamesjwu
2025-08-06 15:00:28 +00:00
2855688a1d Revert "Replace C array with std::array in formatSockAddr (#159812)"
This reverts commit e7feedf6a9bb346ad205796aa4084c8dcfb18072.

Reverted https://github.com/pytorch/pytorch/pull/159812 on behalf of https://github.com/malfet due to Looks like it broke distribtued tests, see 2231c3ca3a/1 ([comment](https://github.com/pytorch/pytorch/pull/159812#issuecomment-3160513656))
2025-08-06 14:55:48 +00:00
2231c3ca3a [CI][CD] Fix install_nvshem function (#159907)
When one builds CD docker, all CUDA dependencies must be installed into `/usr/local/cuda/` folder

Test plan: Looks at the binary build logs, for example [here](https://github.com/pytorch/pytorch/actions/runs/16768141521/job/47477380147?pr=159907):
```
2025-08-06T05:58:00.7347471Z -- NVSHMEM_HOME set to:  ''
2025-08-06T05:58:00.7348378Z -- NVSHMEM wheel installed at:  ''
2025-08-06T05:58:00.7392528Z -- NVSHMEM_HOST_LIB:  '/usr/local/cuda/lib64/libnvshmem_host.so'
2025-08-06T05:58:00.7393251Z -- NVSHMEM_DEVICE_LIB:  '/usr/local/cuda/lib64/libnvshmem_device.a'
2025-08-06T05:58:00.7393792Z -- NVSHMEM_INCLUDE_DIR:  '/usr/local/cuda/include'
2025-08-06T05:58:00.7394252Z -- NVSHMEM found, building with NVSHMEM support
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159907
Approved by: https://github.com/Skylion007, https://github.com/ngimel
2025-08-06 14:44:37 +00:00
c03a734ba1 [OpenReg] Disable automatic inclusion of data files (#159845)
# Background

After I built torch_openreg, I noticed that the wheel package contained the stub.c file under the csrc directory, which is not used at runtime.

# Motivation

This PR aims to remove the stub.c file and any other unused files when running torch_openreg.

**Changes:**

- Setting **include_package_data** keyword to false in the setup function

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159845
Approved by: https://github.com/albanD
2025-08-06 10:35:13 +00:00
98316e5896 [WOQ] Add CUDA kernel for _weight_int8pack_mm (#159325)
**Summary**
This issue proposes implementing a CUDA kernel for aten._weight_int8pack_mm, a weight-only quantized (WOQ) linear operation that is currently only supported on CPU. On CUDA, the fallback path uses an unfused .mul().sum() pattern in quantization.py, which is less efficient for inference. https://github.com/pytorch/pytorch/issues/158849

**Motivation**
A fused GPU kernel for aten._weight_int8pack_mm would:
- Eliminate reliance on the .mul().sum() fallback in quantization.py
- Improve performance for quantized inference on CUDA
- Extend Inductor’s GPU quantization support across more workloads

**Implementation**
- Implement a Triton kernel for:
```
out[b, n] = sum_k(x[b, k] * w[n, k]) * scale[n]

where:
x: [B, K] float32
w: [N, K] int8
scale: [N] float32
out: [B, N] float32
```
- Integrate the kernel with register_woq_mm_ops() in torch/_inductor/quantized_lowerings.py
- Route it conditionally in quantization.py where GPU currently falls back to .mul().sum()
- Add unit tests comparing results to the reference fallback path
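
For reference, the math above can be expressed as a plain PyTorch sketch (this mirrors the fallback semantics; it is not the Triton kernel):

```python
import torch

def weight_int8pack_mm_ref(x, w_int8, scale):
    # out[b, n] = sum_k(x[b, k] * w[n, k]) * scale[n]
    return (x @ w_int8.to(x.dtype).t()) * scale

B, K, N = 4, 64, 32
x = torch.randn(B, K)
w = torch.randint(-128, 128, (N, K), dtype=torch.int8)
scale = torch.rand(N)
print(weight_int8pack_mm_ref(x, w, scale).shape)  # torch.Size([4, 32])
```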

Test Plan:
```
buck2 run 'fbcode//mode/opt' :linalg test_linalg.TestLinalgCUDA.test__int8_mm_m_64_k_64_n_64_compile_True_slice_True_cuda
```
Log: P1882799769

```
buck2 test 'fbcode//mode/opt' caffe2/test:linalg
```
https://www.internalfb.com/intern/testinfra/testconsole/testrun/6755399722424741/

Benchmark Results:
```
**[Shape B=256, K=1024, N=512]**
CPU and CUDA outputs match
Max abs diff: 2.59e-04, max rel diff: 0.75
CPU: 144.14 ms, CUDA: 303.67 µs
Speedup: ×474.6

**[Shape B=512, K=2048, N=1024]**
CPU and CUDA outputs match
Max abs diff: 5.49e-04, max rel diff: 0.15
CPU: 1173.27 ms, CUDA: 2.40 ms
Speedup: ×488.5
```
Rollback Plan:

Differential Revision: D79042656

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159325
Approved by: https://github.com/danielvegamyhre, https://github.com/jerryzh168
2025-08-06 10:28:08 +00:00
23cf241039 [aoti][mps] Initialize mps kernels first (#159753)
In some cases we have mps kernels which are reused across higher-order-op subgraphs and the toplevel code. However, currently we initialize the variable for the mps kernel the first time we use it, which causes an issue if we first encounter the mps kernel within a subgraph, since the kernel will then only be initialized within the subgraph scope. For instance:
```
if ...
    auto mps_lib_0_func = ...
    mps_lib_0_func->run()

// since we already used mps_lib_0 once, we don't re-initialize it
mps_lib_0_func->run()  // error, mps_lib_0_func not initialized
```

So the solution we took here is to initialize all the kernels at the beginning:
```
const std::shared_ptr<at::native::mps::MetalKernelFunction> get_mps_lib_0() {
    static const auto func = mps_lib_0.getKernelFunction("generated_kernel");
    return func;
}
AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() {
    static const auto handle = AOTIMetalKernelFunctionHandle(get_mps_lib_0().get());
    return handle;
}
...
if ...
    get_mps_lib_0()->run()

get_mps_lib_0()->run()  // success
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159753
Approved by: https://github.com/malfet
ghstack dependencies: #159456, #159695
2025-08-06 07:54:29 +00:00
e7feedf6a9 Replace C array with std::array in formatSockAddr (#159812)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159812
Approved by: https://github.com/Skylion007
2025-08-06 07:44:29 +00:00
dad2a05bec [DTensor] Set up DTensorContinuousTestBase (#159885)
Also migrate `test_common_rules.py` since it was a short file

`python test/distributed/tensor/test_common_rules.py`

Before:
Ran 10 tests in 91.516s
After:
Ran 10 tests in 5.604s

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159885
Approved by: https://github.com/ezyang
2025-08-06 07:40:31 +00:00
0495cab545 Wire in pt2_triton_builds (#159897)
Summary:
This allows us to start seeing the failure rate on these models (and
potentially alert on it).

Test Plan:
```
FORCE_LOG_TRITON_BUILDS_TO_PROD=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 buck2 run @//mode/opt :compile 2>&1 | tee out
```
P1889607054

Waiting for scuba table to generate, but manual logging show it should show up at https://fburl.com/scuba/pt2_triton_builds_inc_archive/7852kt8h soon.

Rollback Plan:

Reviewed By: masnesral

Differential Revision: D79308333

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159897
Approved by: https://github.com/masnesral
2025-08-06 07:39:51 +00:00
abfe403981 [AIDIR] Internal util function to insert MLHub debugging insight for dynamic shape (#159391)
Summary:
This feature is Meta internal only
Add a util function to put dynamic shape-related suggestions into MLHubDebugInsightService, which will then be surfaced to users in MLHub.

The rollout will be controlled by JK.

Test Plan:

MAST job aps-omnifmv3_dev_baseline_test-a34fdccf21

 {F1980593060}

* If you're not able to see the insight, please add yourself to this gk 'mlhub_debugging_insights_dev_visibility'
* The URL link should route to a new Job Inspector page that will provide details and straightforward instructions on how to configure the ds. The page is currently still in development so here we use the general PT2 compile JI page.
* Test fails because of the export checks. I'll export after addressing all the comments from reviewers.

Rollback Plan:

Reviewed By: pianpwk

Differential Revision: D78526522

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159391
Approved by: https://github.com/jingsh
2025-08-06 07:39:39 +00:00
1690c0c3a0 [Reland] Migrate ScalarType to headeronly (#159911)
The non ghstack version of #159416, to make sure we don't get reverted again
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159911
Approved by: https://github.com/mikaylagawarecki
2025-08-06 07:36:37 +00:00
e9d27aa8fd [CUDA 13] CMake/Dependencies: no need to call find_package(CUB) (#159854)
CUB library is the part of CCCL of the CUDA Toolkit 13. If CUDA Found, CUB is found as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159854
Approved by: https://github.com/eqy
2025-08-06 06:03:58 +00:00
2457e62c90 Revert "Set PYTHONHOME for inductor subprocesses using torch (#159382)"
This reverts commit fe8984a9f43bde10d1956abe7cb40710ed7ceed2.

Reverted https://github.com/pytorch/pytorch/pull/159382 on behalf of https://github.com/malfet due to Broke MacOS testing see d0fccbc99c/1 ([comment](https://github.com/pytorch/pytorch/pull/159382#issuecomment-3157455367))
2025-08-06 05:30:20 +00:00
d0fccbc99c [CI] Delete sm86 tests from pull (#159903)
And delete sm89+cuda12.4 builds from periodic (as sm86+legacy driver should be enough)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159903
Approved by: https://github.com/huydhn
2025-08-06 05:16:55 +00:00
3461988a4b [audio hash update] update the pinned audio hash (#159823)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159823
Approved by: https://github.com/pytorchbot
2025-08-06 05:02:35 +00:00
9764981116 Pass fw/bw compilers to aot_export_joint_with_descriptors (#159814)
Allow overriding nop compilers with real ones when using this flow.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159814
Approved by: https://github.com/fmassa
2025-08-06 04:50:56 +00:00
704594eb23 [Dynamo] make HOPs hashable (#159910)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159910
Approved by: https://github.com/yf225
2025-08-06 04:02:17 +00:00
bfc27cf468 [Distributed] Fix @parametrize on unordered iterable in distributed test (#159793)
seems to fix https://github.com/pytorch/pytorch/issues/145807

sets aren't ordered so `@parametrize` can cause two processes to spawn with different settings

originally debugged thanks to @k-artem, see https://github.com/pytorch/pytorch/issues/145807#issuecomment-2971009451

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159793
Approved by: https://github.com/Skylion007, https://github.com/wconstab
2025-08-06 03:51:42 +00:00
311f74089a remove print (#159917)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159917
Approved by: https://github.com/laithsakka
2025-08-06 03:48:23 +00:00
14c7358c64 Enable fr_trace to read local traces from multiple hosts. (#159490)
Summary: For training jobs particularly from GenAI, NCCL trace dumps are generated in the format of `<hostname>.pci3_rank_<rank>`. For multi-node training jobs, the hostname varies across traces. The current prefix matching logic can't handle this case.
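
A hypothetical sketch of rank-based matching (the helper name and regex are illustrative, not the actual fr_trace change):

```python
import os
import re

def find_traces_by_rank(trace_dir):
    # Group trace files by their trailing rank suffix instead of a fixed filename
    # prefix, so "<hostname>.pci3_rank_<rank>" matches regardless of hostname.
    ranks = {}
    for name in os.listdir(trace_dir):
        m = re.search(r"rank_(\d+)$", name)
        if m:
            ranks.setdefault(int(m.group(1)), []).append(os.path.join(trace_dir, name))
    return ranks
```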

Test Plan:
Create a local folder `dumps` and several empty files: `host0.pci3_rank_0`, `host0.pci3_rank_1`, `host1.pci3_rank_0`, `host1.pci3_rank_1` inside it. Then run
```
buck2 run fbcode//caffe2/fb/flight_recorder:fr_trace -- trace_dir dumps
```

Before this diff, fr_trace cannot locate any trace files, giving the following assertion error:
```
AssertionError: no files loaded from /home/tianhaoh/dumps with prefix pci3_rank_
```

After this diff, fr_trace is able to locate the trace files, resulting in exceptions like
```
    dump = pickle.load(infile)
           ^^^^^^^^^^^^^^^^^^^
EOFError: Ran out of input
```
(since the trace files are fake and empty).

Rollback Plan:

Differential Revision: D79224727

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159490
Approved by: https://github.com/fduwjj
2025-08-06 03:15:34 +00:00
8ce81bcee1 [Torch Package] Make get names of OrderedImporters support fallback to importers (#155743)
Summary:
OrderedImporters is supposed to be an importer which tries out every single importer in self._importers. However the get_name API does not follow this behavior and only uses the get_name from the basic Importer class.
This change updates the OrderedImporters get_name API so that it tries the get_name API of every single importer.

Differential Revision: D76463252

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155743
Approved by: https://github.com/jcwchen, https://github.com/jingsh
2025-08-06 02:26:10 +00:00
4604f0482c Add UT for torch.accelerator memory-related API (#155200)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155200
Approved by: https://github.com/albanD
ghstack dependencies: #138222, #152932
2025-08-06 02:22:18 +00:00
15f1173e5d Add unified memory APIs for torch.accelerator (#152932)
# Motivation
The following API will be put under torch.accelerator
- empty_cache
- max_memory_allocated
- max_memory_reserved
- memory_allocated
- memory_reserved
- memory_stats
- reset_accumulated_memory_stats
- reset_peak_memory_stats
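
Assuming these APIs land under `torch.accelerator` exactly as listed above, usage becomes device-agnostic, e.g.:

```python
import torch

if torch.accelerator.is_available():
    torch.accelerator.empty_cache()
    torch.accelerator.reset_peak_memory_stats()
    # ... run a workload ...
    print(torch.accelerator.max_memory_allocated())
    print(torch.accelerator.memory_stats())
```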

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152932
Approved by: https://github.com/albanD
ghstack dependencies: #138222
2025-08-06 02:22:18 +00:00
e16c48ae97 [BE] Fix type hint in AOTIRunnerUtil (#159577)
Not sure why it was labelled as a list in the first place. In test_aot_inductor.py, I scanned a few use cases and they are tuples as well.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159577
Approved by: https://github.com/Skylion007
2025-08-06 01:20:45 +00:00
f7a66da5f9 Add DeviceAllocator as the base device allocator (#138222)
# Motivation
In line with [RFC] [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories, such as HuggingFace ([so much if-else conditional code](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code)). We would like to introduce a generic API set under the torch.accelerator namespace to generalize these use cases.

<div align="center">
<table>
<tr>
<td> Device-specific memory APIs torch.xxx.foo</td> <td> Device-agnostic memory APIs torch.accelerator.foo</td>
</tr>
<tr>
<td>

```python
torch.xxx.empty_cache
```

</td>
<td>

```python
torch.accelerator.empty_cache
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.reset_peak_memory_stats
```

</td>
<td>

```python
torch.accelerator.reset_peak_memory_stats
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.reset_accumulated_memory_stats
```

</td>
<td>

```python
torch.accelerator.reset_accumulated_memory_stats
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.memory_stats
```

</td>
<td>

```python
torch.accelerator.memory_stats
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.memory_allocated
```

</td>
<td>

```python
torch.accelerator.memory_allocated
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.max_memory_allocated
```

</td>
<td>

```python
torch.accelerator.max_memory_allocated
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.memory_reserved
```

</td>
<td>

```python
torch.accelerator.memory_reserved
```

</td>
</tr>

<tr>
<td>

```python
torch.xxx.max_memory_reserved
```

</td>
<td>

```python
torch.accelerator.max_memory_reserved
```

</td>
</tr>

</table>
</div>

# Solution
This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit. This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `GetDeviceAllocator(allocator)->empty_cache()`.
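
A conceptual sketch of that call path in Python (illustration only; the real classes are C++): each backend implements a common allocator interface, and the device-agnostic frontend dispatches to whichever backend is active.

```python
class DeviceAllocatorSketch:                # stand-in for the C++ DeviceAllocator base class
    def empty_cache(self):
        raise NotImplementedError

class CUDAAllocatorSketch(DeviceAllocatorSketch):
    def empty_cache(self):
        print("releasing cached CUDA blocks")

class XPUAllocatorSketch(DeviceAllocatorSketch):
    def empty_cache(self):
        print("releasing cached XPU blocks")

_allocators = {"cuda": CUDAAllocatorSketch(), "xpu": XPUAllocatorSketch()}

def get_device_allocator(device_type: str) -> DeviceAllocatorSketch:
    return _allocators[device_type]

# torch.accelerator.empty_cache() would resolve the current accelerator and then do:
get_device_allocator("cuda").empty_cache()
```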

Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222
Approved by: https://github.com/albanD, https://github.com/Camyll
2025-08-06 00:40:29 +00:00
3eb3da9b4b [dynamo][guards] Skip ID_MATCH guard on self.__class__.__closure__ (#159888)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159888
Approved by: https://github.com/williamwen42
2025-08-06 00:36:43 +00:00
3ddfd46bd2 Cut a version of TORCH_ERROR_CODE_CHECK in headeronly from AOTI (#159604)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159604
Approved by: https://github.com/albanD, https://github.com/desertfire
2025-08-06 00:29:56 +00:00
6a82da392e [export] Fix generated schema for C++20/23 (#159871)
Summary: Fixing the issue from https://github.com/pytorch/pytorch/issues/159838

Test Plan:
buck run caffe2/:export_update_schema -- --prefix /data/users/$USER/fbsource/fbcode/caffe2/

Rollback Plan:

Differential Revision: D79647167

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159871
Approved by: https://github.com/malfet
2025-08-06 00:23:05 +00:00
22bedc429f Extract some HOP utils to be importable (#159705)
Useful helper function for stage 1 export -> manual partitioner -> stage 2 compile users

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159705
Approved by: https://github.com/zou3519
ghstack dependencies: #159134
2025-08-05 23:59:47 +00:00
49abc0e3f8 [Take 2] Setup TorchBench in Docker (#159300)
Fix and reland https://github.com/pytorch/pytorch/pull/158613, I keep `checkout_install_torchbench` in `.ci/pytorch/macos-test.sh` script because it's still used there, and there is no Docker.

### Testing

MacOS perf nightly run https://github.com/pytorch/pytorch/actions/runs/16580798470

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159300
Approved by: https://github.com/ZainRizvi
2025-08-05 23:47:42 +00:00
1052604acd fix logging setup issue for Windows.. (#159887)
When we set up the logging config following the guide: https://docs.pytorch.org/docs/stable/logging.html
Such as:
    TORCH_LOGS="+schedule,+inductor,+output_code"
On Linux, it shows as:
```cmd
declare -x SSH_TTY="/dev/pts/0"
declare -x TERM="xterm"
declare -x TORCH_LOGS="+schedule,+inductor,+output_code"
declare -x USER="xu"
```
On Windows, it shows as:
```cmd
TORCHINDUCTOR_WINDOWS_TESTS=1
TORCH_LOGS="+schedule,+inductor,+output_code"
UCRTVersion=10.0.22000.0
```
On Linux, the quotes are shown by default, while Windows does not show them.
Besides that, Windows automatically adds quotes when processing env vars.

On Linux, we will get variable: "+schedule,+inductor,+output_code"
On Windows, we will get variable: '"+schedule,+inductor,+output_code"'

So, we need to remove the outer quotes on Windows.
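
A minimal sketch of that normalization (the helper name is hypothetical; the real change lives in the inductor Windows path):

```python
import os

def get_env_without_outer_quotes(name: str, default: str = "") -> str:
    val = os.environ.get(name, default)
    # On Windows we may receive '"+schedule,+inductor,+output_code"' with literal quotes.
    if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
        val = val[1:-1]
    return val

print(get_env_without_outer_quotes("TORCH_LOGS"))
```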

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159887
Approved by: https://github.com/angelayi
2025-08-05 23:44:38 +00:00
fe8984a9f4 Set PYTHONHOME for inductor subprocesses using torch (#159382)
Summary:
This is needed for subprocesses that are trying to call back into torch
functionality, i.e. anything that's also setting `PYTHONPATH`.  There are more
`sys.executable` subprocesses in torch/ but it seems like they're fine.

Test Plan: Local inference runs.

Reviewed By: aorenste

Differential Revision: D79124705

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159382
Approved by: https://github.com/aorenste
2025-08-05 23:32:48 +00:00
74a754aae9 Add meta kernel for sdpa_math_for_mps (#159695)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159695
Approved by: https://github.com/malfet
ghstack dependencies: #159456
2025-08-05 22:27:06 +00:00
b1ec088113 [mps] Turn on inductor dynamic shapes tests (#159456)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159456
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-08-05 22:27:06 +00:00
fb35a9ea4a [export] Improve error messages (#159881)
Originally, if the PT2 errored when loading, we would try to load using the old loader to handle BC issues. However, this hides the error message when an up-to-date PT2 fails to load for some other reason.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159881
Approved by: https://github.com/yushangdi
2025-08-05 22:26:48 +00:00
8034b2a732 [inductor] Add TLParse artifact for logging runtime of collective and compute ops (#159730)
Summary:

- debug.py: Added log_runtime_estimates() function to dump runtime estimation data as structured tlparse artifacts in JSON format
- test_structured_trace.py: Added comprehensive test coverage with testing compute and collective ops

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159730
Approved by: https://github.com/yushangdi
ghstack dependencies: #159190
2025-08-05 22:06:32 +00:00
64cc6f06b1 [Inductor] Revert minimal changes to avoid internal test failures (#159809)
The diff/PR https://github.com/pytorch/pytorch/pull/159211 caused a bunch of test failures for the graph compiler (T232684410), but I couldn't figure out a forward fix so far. So with this diff/PR, I'm proposing to revert the minimal changes needed to resolve the test failures.

I'll continue the debugging, and re-land the reverted changes once we find out a forward fix.

Differential Revision: [D79221721](https://our.internmc.facebook.com/intern/diff/D79221721/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159809
Approved by: https://github.com/blaine-rister, https://github.com/eellison
2025-08-05 22:05:26 +00:00
410812763b Revert "[Inductor][Triton] Support TMA before strict 3.4 cutoff (#159777)"
This reverts commit bbc0df1094b5a4dcd2cce83f8402127b07913231.

Reverted https://github.com/pytorch/pytorch/pull/159777 on behalf of https://github.com/izaitsevfb due to breaking inductor test on ROCm ([comment](https://github.com/pytorch/pytorch/pull/159777#issuecomment-3156770098))
2025-08-05 22:00:24 +00:00
bdb07a2bc5 [Cutlass] Allow offsets to be passed as arguments to kernel (#159761)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159761
Approved by: https://github.com/henrylhtsang
ghstack dependencies: #159760
2025-08-05 21:59:07 +00:00
8085edc8f9 [autograd] torch._C._set_view_replay_enabled state leaking into other tests (#159840)
This was causing view_fns to pop up in tests that ran after `TestAutograd.test_view_replay_enabled`, where it isn't used as a context manager. It is unclear to me why we would want `_force_original_view_tracking` to mutate global state on `__init__` rather than on `__enter__`; that could be an alternative fix.
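
For illustration only (not the torch internals), the alternative fix mentioned is the usual context-manager pattern of deferring global-state writes from `__init__` to `__enter__`, roughly:

```python
class force_view_tracking_sketch:           # hypothetical stand-in for the real class
    _enabled = False                        # stand-in for the global toggle

    def __init__(self, mode: bool):
        self.mode = mode                    # record the request; no side effects here

    def __enter__(self):
        self.prev = force_view_tracking_sketch._enabled
        force_view_tracking_sketch._enabled = self.mode
        return self

    def __exit__(self, *exc):
        force_view_tracking_sketch._enabled = self.prev
```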

FIXES https://github.com/pytorch/pytorch/issues/156306 https://github.com/pytorch/pytorch/issues/156289 https://github.com/pytorch/pytorch/issues/156265 https://github.com/pytorch/pytorch/issues/156209
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159840
Approved by: https://github.com/albanD
2025-08-05 21:57:49 +00:00
882d50c5bf [C10] Add Scalar::isUnsigned() method (#159877)
Returns true if the Scalar holds an unsigned integral value, with the implications of the `Tag::HAS_u` semantics.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159877
Approved by: https://github.com/Skylion007, https://github.com/ezyang
2025-08-05 21:43:21 +00:00
b52a4d0821 [ez][CI] Remove some unused docker images (#159171)
Removes unused docker images from the docker build workflow
Then removes unused definitions in build.sh

The only one I left is the vllm one because I'm pretty sure it's going to be used in the future

I assume everything not mentioned is old and we forgot to remove them
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159171
Approved by: https://github.com/yangw-dev
2025-08-05 21:31:53 +00:00
a45a840926 [CI] Disable check-labels and check_mergeability (#159900)
See https://github.com/pytorch/pytorch/issues/159825
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159900
Approved by: https://github.com/clee2000
2025-08-05 21:16:12 +00:00
9b953bb3fb [BE] Update TensorPipe pin (#159834)
No functional changes, just:
- Update C++ standard to C++17
- Update `cmake` min version to 3.18
- Update `libuv` dependency to 1.51 (to move its cmake min version to 3.10)
- Replace boost optional implementation with `std::optional` wrapper
- Make it compilable with gcc-14.x and later by including `cstddef` in a few headers
- Avoid using deprecated enums for MacOS builds

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159834
Approved by: https://github.com/Skylion007
2025-08-05 20:45:09 +00:00
eb25a95a6e Fix inductor memory estimation when a single buf has multiple mutations. Add runtime verification of mem tracking (#159569)
With fsdp, we sometimes have multiple, non-overlapping views of a single buffer which are all mutated. Previously we considered the original buffer as an allocation, and made the mutated buffer the deallocation. With multiple mutations of the same buffer, we need to consider the original buffer as deallocated only when all of its aliases die (and avoid double counting the input buffer size). See comment inline:

```
    When an operation mutates a buffer in-place, the scheduler creates a new buffer name
    to track the "before" and "after" states, even though they share the same memory.
    The mutated buffer represents a rename with zero allocation and deallocation cost.
    During dependency tracking, we transfer dependencies from the mutated name back to
    the original buffer, ensuring the original memory is only freed when all aliases
    are done.
    This handles cases where a buffer has multiple non-overlapping aliases - rather than
    trying to assign free costs to individual aliases, we forward all alias dependencies
    to the original buffer.
    Consider:
        buf0 = op0()
        buf1 = mutation_op_(buf0)
        del buf0
        ...
        op(buf1)
        del buf1
    The only memory events are the creation prior to op0, and the deletion following buf1.
```

As @IvanKobzarev 's logs in https://github.com/pytorch/pytorch/pull/158361/files#diff-e173a1d52aff49959c9f6d17ecc09946d8a616fc5909df884e62a15e1ebd1d41R1776-R1807 show, it can be a bit of a pain to pinpoint which part of our memory calculation is incorrect.

This pr also adds a runtime verifier `config.test_configs.track_memory_lifecycle` which tracks buffer allocation and deallocation, and errors if their lifetime does not match our expectations.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159569
Approved by: https://github.com/IvanKobzarev
2025-08-05 19:58:11 +00:00
9884d0351e [CUDA] Decrease launch bounds of CTCLoss backward for blackwell (#159522)
Otherwise we see `CUDA error: too many resources requested for launch`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159522
Approved by: https://github.com/janeyx99
2025-08-05 19:26:25 +00:00
d7c83972d5 tools: Add mode to find python automatically (#159820)
Add support for automatically finding Python interpreters in manylinux
environments to our wheel building script. Scaffolding for sequential builds

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159820
Approved by: https://github.com/malfet
2025-08-05 19:19:22 +00:00
e06b110f73 [Testing] Add MPS to NATIVE_DEVICES (#153835)
This would allow me to enable more opinfo tests against the MPS device eventually. It was supposed to be a very simple change, but actually required minor adjustments to lots of test files, namely:
- Introduce `all_mps_types_and` that is very similar to `all_types_and`, but skips `float64`
- Decorate lots of tests with `@dtypesIfMPS(*all_mps_types())`
- Skip `test_from_dlpack_noncontinguous` as it currently crashes (need to be fixed)
- Add lots of `expectedFailureIfMPS`
- Delete all `@onlyNativeDeviceTypesAnd("mps")`

<sarcasm> I love how well documented this variable is </sarcasm>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153835
Approved by: https://github.com/Skylion007
2025-08-05 18:57:35 +00:00
0ba09a6d34 fix link for tutorial of inductor on windows (#159853)
fix link issue from https://docs.pytorch.org/tutorials/prototype/inductor_windows.html to https://docs.pytorch.org/tutorials/unstable/inductor_windows.html due to structure change with pr https://github.com/pytorch/tutorials/pull/3489
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159853
Approved by: https://github.com/sekyondaMeta

Co-authored-by: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com>
Co-authored-by: Zesheng Zong <zesheng.zong@outlook.com>
2025-08-05 18:37:47 +00:00
aeb5321b63 Allow controlling PG backend and options via init_device_mesh (#159371)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159371
Approved by: https://github.com/wconstab, https://github.com/fduwjj, https://github.com/wanchaol
2025-08-05 12:44:14 +00:00
625108ede2 [inductor] consolidate common GEMM triton param retrieval (#159383)
# Why

- Make loop iteration simpler
- Have a common spot to make modifications that affect
  all the GEMM Triton templates, avoiding missed spots

# What

- pull out common logic of taking the BaseConfig objects
  and turning them into kwargs to feed into maybe_append_choice
  for Triton GEMM templates

Differential Revision: [D79186962](https://our.internmc.facebook.com/intern/diff/D79186962)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159383
Approved by: https://github.com/jansel
2025-08-05 11:42:25 +00:00
09e5a93fcb Improve graph output alias with subclass error message (#159619)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159619
Approved by: https://github.com/albanD
2025-08-05 06:47:31 +00:00
908c5cc4c0 Generalize torch._C._set_allocator_settings to be generic (#156175)
# Motivation
This PR moves the implementation of `torch.cuda.memory._set_allocator_settings` to `torch._C._accelerator_setAllocatorSettings`.
Since the original API was intended as a temporary/internal utility, I am not exposing the new function as a public API.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156175
Approved by: https://github.com/albanD
ghstack dependencies: #159629, #150312, #156165
2025-08-05 04:08:42 +00:00
c1145852a5 Deprecate overlapping functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156165
Approved by: https://github.com/albanD
ghstack dependencies: #159629, #150312
2025-08-05 04:08:42 +00:00
ae1a706444 Refactor CUDAAllocatorConfig to reuse AcceleratorAllocatorConfig (#150312)
# Motivation
Refactor `CUDAAllocatorConfig` to reuse `AcceleratorAllocatorConfig` and `ConfigTokenizer`. We will deprecate those options that overlap with `AcceleratorAllocatorConfig` in a following PR and keep them only for BC.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150312
Approved by: https://github.com/albanD
ghstack dependencies: #159629
2025-08-05 04:08:04 +00:00
56d19a5ced Fix AllocatorConfig potential SIO issue (#159629)
# Motivation
As @ScottTodd identified in this [comment](https://github.com/pytorch/pytorch/pull/150312#issuecomment-3141524874), using STL containers like `std::string` and `std::unordered_set` at static init time can cause static initialization order issues. This PR is based on and modified from his original PR: https://github.com/pytorch/pytorch/pull/159607. I’m stacking this PR here to help facilitate the landing and validation process.

Co-authored-by: @ScottTodd
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159629
Approved by: https://github.com/ScottTodd, https://github.com/albanD
2025-08-05 04:07:51 +00:00
b6c53383fe [Dynamo][Better Engineering] Type annotation for torch/_dynamo/output_graph.py (#159602)
As part of the better engineering effort, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to `torch/_dynamo/output_graph.py`

Running
```
mypy torch/_dynamo/output_graph.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Annotated | Lines Total | % lines covered | Funcs Annotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  2163 | 4792 | 45.14% | 121 | 268 | 45.15% |
| This PR | 4818 | 4818 | 100.00% | 268 | 268 | 100.00% |
| Delta    | +2655 | +26 | +54.84% | +147 | 0 | +54.85% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159602
Approved by: https://github.com/Skylion007
2025-08-05 03:50:54 +00:00
4fd5fabee9 skip XPU for dataloader CPU only unit test (#159811)
Fixes [#159802](https://github.com/pytorch/pytorch/issues/159802)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159811
Approved by: https://github.com/izaitsevfb
2025-08-05 03:44:01 +00:00
bbc0df1094 [Inductor][Triton] Support TMA before strict 3.4 cutoff (#159777)
Summary: Inductor's 3.4 Triton release is the most commonly used variant of Triton, but if someone is working with an alternative version of Triton this may not match. This moves the version check from 3.4 Triton to any variant that has support for the TMA APIs.

Test Plan:
Relying on CI. Should be a NFC.

Rollback Plan:

Reviewed By: davidberard98

Differential Revision: D79378792

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159777
Approved by: https://github.com/davidberard98
2025-08-05 03:29:13 +00:00
33ec6e3e9a Remove pin on libuv from instructions (#159504)
This package doesn't exist at conda-forge and causes some confusion for users.
see https://anaconda.org/conda-forge/libuv/files?version=1.39.0

libuv is quite stable, so the newer versions should be fine. we build with them anyway at conda-forge.

see: https://github.com/conda-forge/libuv-feedstock/issues/80

Hopefully this can help future users.

Fixes https://github.com/conda-forge/libuv-feedstock/issues/80

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159504
Approved by: https://github.com/seemethere
2025-08-05 03:18:42 +00:00
efc4b460b3 Add cascade sum support for Inductor CPP backend (#156296)
Fixes #154703

Add cascade summation support for the Inductor CPP backend to improve precision for large summations.

Currently, Inductor CPP does a direct reduction for sum. As shown in #154703, when the size of the sum is large and the degree of parallelism is small, direct reduction causes an intolerable precision loss:
```
extern "C"  void kernel(float* in_out_ptr0,
                       const float* in_ptr0)
{
    auto out_ptr0 = in_out_ptr0;
    {
        {
            float tmp_acc0 = 0;
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
            for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3000000000L); x0+=static_cast<int64_t>(16L))
            {
                {
                    if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(3000000000L)))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                        tmp_acc0_vec = tmp_acc0_vec + tmp0;
                    }
                }
            }
            tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float, 1>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec);
            out_ptr0[static_cast<int64_t>(0L)] = static_cast<float>(tmp_acc0);
        }
    }
    {
        {
            {
                auto tmp0 = out_ptr0[static_cast<int64_t>(0L)];
                auto tmp1 = static_cast<float>(3000000000.0);
                auto tmp2 = tmp0 / tmp1;
                in_out_ptr0[static_cast<int64_t>(0L)] = tmp2;
            }
        }
    }
}
```

After adding cascade sum support:

```
extern "C"  void kernel(float* in_out_ptr0,
                       const float* in_ptr0)
{
    auto out_ptr0 = in_out_ptr0;
    {
        {
            float tmp_acc0 = 0;
            at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
            at::vec::Vectorized<float> masked_tmp_acc0_vec = at::vec::Vectorized<float>(0);
            CascadeSumHelper<float, 65536> scalar_cascade_helper0(static_cast<int64_t>(3000000000L));
            CascadeSumHelper<at::vec::Vectorized<float>, 65536> cascade_helper0(static_cast<int64_t>(187500000L));
            CascadeSumHelper<at::vec::Vectorized<float>, 65536> masked_cascade_helper0(static_cast<int64_t>(0L));
            for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(3000000000L); x0+=static_cast<int64_t>(16L))
            {
                {
                    if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(3000000000L)))
                    {
                        auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
                        tmp_acc0_vec = cascade_sum_combine(tmp0, &cascade_helper0);
                    }
                }
            }
            tmp_acc0 = cascade_sum_final(&scalar_cascade_helper0);
            tmp_acc0_vec = cascade_sum_final(&cascade_helper0);
            masked_tmp_acc0_vec = cascade_sum_final(&masked_cascade_helper0);
            tmp_acc0 = tmp_acc0 + at::vec::vec_reduce_all<float, 1>([](at::vec::Vectorized<float>& x, at::vec::Vectorized<float>& y) { return x + y; }, tmp_acc0_vec + masked_tmp_acc0_vec);
            out_ptr0[static_cast<int64_t>(0L)] = static_cast<float>(tmp_acc0);
        }
    }
    {
        {
            {
                auto tmp0 = out_ptr0[static_cast<int64_t>(0L)];
                auto tmp1 = static_cast<float>(3000000000.0);
                auto tmp2 = tmp0 / tmp1;
                in_out_ptr0[static_cast<int64_t>(0L)] = tmp2;
            }
        }
    }
}
```
This will inevitably reduce performance when cascade sum is turned on.
For the case shown in #154703, performance is reduced by ~3%.
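
The precision effect is easy to reproduce outside Inductor. A small standalone sketch (float16 so the effect shows at a tiny size) comparing a single running accumulator with chunked, cascade-style accumulation:

```python
import numpy as np

x = np.ones(4096, dtype=np.float16)

# Direct reduction: one running accumulator, analogous to the original codegen.
naive = np.float16(0.0)
for v in x:
    naive += v            # stalls at 2048.0, where adding 1.0 no longer registers

# Cascade: sum fixed-size chunks first, then combine the (few) partial sums.
partials = []
for start in range(0, x.size, 256):
    s = np.float16(0.0)
    for v in x[start:start + 256]:
        s += v
    partials.append(s)
cascade = sum(partials, np.float16(0.0))

print(float(naive), float(cascade))   # 2048.0 vs 4096.0
```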

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156296
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
2025-08-05 02:54:32 +00:00
1ca8388442 [BE][MPS] Remove unused size12 variable (#159832)
Fixes following compilation warning
```
/Users/nshulga/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Pooling.metal:433:8: warning: unused variable 'size12' [-Wunused-variable]
  auto size12 = input_sizes[1] * input_sizes[2];
       ^
1 warning generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159832
Approved by: https://github.com/dcci
2025-08-05 02:32:06 +00:00
b69497351d [nativert] force resize to zero. (#159683)
Summary:
This was quite a miserable bug: there are a few kernels that don't explicitly resize outputs to zero, which led to some weird UB.

Rollback Plan:

Differential Revision: D79476454

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159683
Approved by: https://github.com/SherlockNoMad, https://github.com/henryoier
2025-08-05 02:25:31 +00:00
482f069c41 [C10D] fix slow init due to repeated dns resolution failure (#159596)
It can be very slow to repeatedly hit DNS resolution failure, but
it's very helpful to have DNS names in logs by default. So we try to use DNS,
but if we hit a transient failure we just disable it for the remainder of the
job, logging IP addresses instead.

Fixes #159007

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159596
Approved by: https://github.com/d4l3k
2025-08-05 02:15:26 +00:00
85d931f29e Use uppercase OR when checking for system XNNPACK (#159527)
This PR fixes `cmake/Dependencies.cmake` to work when compiling with `USE_SYSTEM_XNNPACK=ON` by changing a lowercase `or` to an uppercase `OR`.

---

For a personal project, I was building pytorch with a customized build of XNNPACK. When trying to do so I encountered the following error:

```
CMake Error at cmake/Dependencies.cmake:566 (if):
  if given arguments:

    "NOT" "XNNPACK_LIBRARY" "or" "NOT" "microkernels-prod_LIBRARY"

  Unknown arguments specified
Call Stack (most recent call first):
  CMakeLists.txt:868 (include)
```

Upon making the change in this PR (changing `or` to `OR`), the process continued as expected.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159527
Approved by: https://github.com/janeyx99
2025-08-05 02:10:53 +00:00
8a2f53c523 Recursively sync fbgemm submodules before build (#159477)
ROCm inductor benchmark builds are failing at the fbgemm build stage https://ossci-raw-job-status.s3.amazonaws.com/log/46800456622
```
2025-07-27T08:00:32.3443858Z /var/lib/jenkins/pytorch/fbgemm/src/RowWiseSparseAdagradFused.cc:389:18: error: no matching function for call to ‘asmjit::v1_17::x86::Vec::Vec(uint32_t)’
2025-07-27T08:00:32.3444080Z   389 |         x86::Xmm partial_sum_xmm(partial_sum_vreg.id());
```

It looks like asmjit fails to build; this seems to be due to the submodules of fbgemm not being updated after checking out the new commit.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159477
Approved by: https://github.com/pruthvistony, https://github.com/eqy
2025-08-05 02:00:54 +00:00
b59b61a099 Add avg_pool3d backward pass for MPS (#159089)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159089
Approved by: https://github.com/malfet
2025-08-05 01:55:38 +00:00
57ab39f7e4 Update torch-xpu-ops commit pin (#159621)
Update the torch-xpu-ops commit pin to [intel/torch-xpu-ops@1f7a57](1f7a57f507), which includes:

- Add Template Parameter to the function `gpu_kernel` for Controlling Broadcasting Vectorization
- Add optional NaN checks to XCCL
- Fix NllLossForwardReduce2DKernelFunctor accuracy
- Extend the existing communication logging to include the reduction operation for collective calls
- [Reland] Install xpu codegen header to torch/include
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159621
Approved by: https://github.com/EikanWang
2025-08-05 01:46:15 +00:00
182975e01a [Dynamo] Enable torch function dispatch on HOPs (#159708)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159708
Approved by: https://github.com/zou3519, https://github.com/XilunWu
ghstack dependencies: #159707
2025-08-05 01:43:22 +00:00
9f8cfe7476 [Dynamo] Fix arg ordering in tf modes (#159707)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159707
Approved by: https://github.com/zou3519
2025-08-05 01:43:21 +00:00
e273ff028a Fix failing test (#159800)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159800
Approved by: https://github.com/aorenste
2025-08-05 00:28:51 +00:00
5e0fc2c9a9 [AOTI] don't allow int32 indices if {non-inf, > int32_max} upper bound is provided (#159433)
**Motivation / Context**: (what I _think_ is happening here)

In "eager"/just-in-time PT2 usage, dynamo/inductor will guard on whether indices fit in int32 or not. So it's generally safe in Inductor code to rely on the example values for symbolic ints in order to determine whether indices fit in int32, because the indices will be guarded on anyway; and if the inputs ever increase to `>int32_max`, dynamo will cause a recompilation.

But with AOTI, those int32 guards aren't respected; so if the example input is `< int32_max` but can be `> int32_max` during future execution, then the future execution might fail / IMA.

**Solution space**

Export allows users to specify which dimensions are dynamic, and to provide **ranges of valid sizes**.

One solution idea is to always respect the upper bound of the dynamic shape range when doing AOTI; if the index's range includes values `>int32_max`, then don't use the hint and assume that this index doesn't fit in int32.

However, the problem with this is that many users may specify dynamism without specifying a range of values - the upper bound of the range will be set to the default of `inf`. Such use cases could potentially experience a perf regression if we implemented the idea above.

To prevent any such regressions, this implementation will rely solely on the specified range only if the upper bound of the range isn't inf. In other words, we'll ignore the hints/example values for AOTI (and rely only on the specified range) only if the upper bound of the range isn't inf - if users explicitly specify a range that extends past int32, we can be fairly sure that they actually do need values `>int32_max`.

If we continue to see correctness issues even with this implementation, we could consider more aggressively relying on the ranges.
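
A hedged sketch of the resulting decision rule (names are illustrative, not Inductor's actual helpers):

```python
INT32_MAX = 2**31 - 1

def index_fits_int32(hint: int, upper_bound: float) -> bool:
    if upper_bound != float("inf"):
        # A finite user-specified range is authoritative: ignore the example hint.
        return upper_bound <= INT32_MAX
    # Default/unbounded range: keep relying on the example value, as before.
    return hint <= INT32_MAX

print(index_fits_int32(hint=1024, upper_bound=float("inf")))  # True (no behavior change)
print(index_fits_int32(hint=1024, upper_bound=2**40))         # False (range may exceed int32)
```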

Differential Revision: [D79220301](https://our.internmc.facebook.com/intern/diff/D79220301)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159433
Approved by: https://github.com/jingsh, https://github.com/ColinPeppler
2025-08-05 00:17:09 +00:00
bc4b04e058 DeviceCopy should have the same layout as input (#159615)
Summary: Fix https://github.com/pytorch/pytorch/issues/159612

- Fix the meta implementation of `nan_to_num`, it should preserve the stride of the input
- The DeviceCopy IR node should always preserve the input's layout, so we don't end up with a contiguous call during device copy

Test Plan:
```
buck2 run @mode/dev-nosan fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_d2h_copy
```

Rollback Plan:

Differential Revision: D79411407

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159615
Approved by: https://github.com/eellison
2025-08-04 23:56:58 +00:00
6b414f56a4 Revert "[inductor] add lowering for repeat_interleave.Tensor with output size specified (#147160) (#158462)" (#159798)
This reverts commit 305a03727672de42870f956ddf4ad9fa424443e1.

Reason: causes device-side assertion failures when running with this repro (a minimized version of a failure seen in a real model)

```
import torch
def ri(inp, repeats, output_size):
    return torch.repeat_interleave(inp, repeats, output_size=output_size)
inp = torch.arange(0, 4, device="cuda").reshape(-1, 1)
x = torch.tensor([1, 2, 3, 4], device="cuda")
ri_c = torch.compile(ri)
print(ri(inp, x, 10))
print(ri_c(inp, x, 10))
```

which leads to errors like

```
/tmp/torchinductor_dberard/3h/c3hlb22fpptebupstsuhl6kexa6z3upgbnyxln7c24gfcr5747iu.py:30: unknown: block: [0,0,0], thread: [10,0,0] Assertion `index out of bounds: 0 <= tmp5 < 4` failed.
```

Differential Revision: [D79591561](https://our.internmc.facebook.com/intern/diff/D79591561)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159798
Approved by: https://github.com/danzimm
2025-08-04 23:39:20 +00:00
fb8f32ef52 Revert "[mps] Turn on inductor dynamic shapes tests (#159456)"
This reverts commit 19f1f9960db7f29f2110a7f49f06a1a23c651ecf.

Reverted https://github.com/pytorch/pytorch/pull/159456 on behalf of https://github.com/davidberard98 due to Sorry - this causes a merge conflict with https://github.com/pytorch/pytorch/pull/159798, which I'm trying to land with co-dev to resolve a sev ([comment](https://github.com/pytorch/pytorch/pull/159456#issuecomment-3152751821))
2025-08-04 23:11:05 +00:00
7ba996bbaa [Cutlass] Fix wrapper code generation breakage (#159760)
Fixes issues introduced by https://github.com/pytorch/pytorch/pull/159355

The issue got past OSS CI because the H100 tag wasn't added. Not sure how to prevent these kinds of issues in the future; perhaps we should run H100 on Inductor PRs?

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159760
Approved by: https://github.com/angelayi
2025-08-04 23:03:03 +00:00
ddbdcdc710 [cutlass backend][test] Expand FP8 tests to FP16 (#159538)
Differential Revision: [D79317343](https://our.internmc.facebook.com/intern/diff/D79317343/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159538
Approved by: https://github.com/mlazos
2025-08-04 23:01:55 +00:00
19f1f9960d [mps] Turn on inductor dynamic shapes tests (#159456)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159456
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-08-04 22:44:31 +00:00
fd6655a0f5 Feature: Implement support for cudnn_batch_norm_out kernel to replace the autogen approach. (#123020)
Fixes #115611

The autogen kernel may cause a redundant copy, so we developed this kernel to improve efficiency.

Test Case:

```c++
#include <torch/torch.h>
#include <iostream>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

int main() {
    auto input = torch::rand({2, 3, 4, 4}, torch::device(torch::kCUDA));
    auto weight = torch::randn({3}, torch::device(torch::kCUDA));
    auto bias = torch::randn({3}, torch::device(torch::kCUDA));
    auto running_mean = torch::zeros({3}, torch::device(torch::kCUDA));
    auto running_var = torch::ones({3}, torch::device(torch::kCUDA));

    bool training = true;
    double exponential_average_factor = 0.1;
    double epsilon = 1e-5;

    auto output = torch::empty_like(input);
    auto save_mean = torch::empty({3}, torch::device(torch::kCUDA));
    auto save_var = torch::empty({3}, torch::device(torch::kCUDA));
    auto reserve = torch::empty({0}, torch::device(torch::kCUDA)); // empty place-holder

    at::native::cudnn_batch_norm_out(input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon, output, save_mean, save_var, reserve);
    auto outputs = at::native::cudnn_batch_norm(input, weight, bias, running_mean, running_var, training, exponential_average_factor, epsilon);

    bool is_close_output = torch::allclose(output, std::get<0>(outputs));
    bool is_close_save_mean = torch::allclose(save_mean, std::get<1>(outputs));
    bool is_close_save_var = torch::allclose(save_var, std::get<2>(outputs));
    bool is_close_reserve = torch::allclose(reserve, std::get<3>(outputs));

    std::cout << "Is output close: " << is_close_output << std::endl;
    std::cout << "Is save_mean close: " << is_close_save_mean << std::endl;
    std::cout << "Is save_var close: " << is_close_save_var << std::endl;
    std::cout << "Is reserve close: " << is_close_reserve << std::endl;

    return 0;
}
```

Please CC @albanD

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123020
Approved by: https://github.com/andrewor14, https://github.com/eqy, https://github.com/albanD
2025-08-04 22:40:33 +00:00
a7f3bdf550 [Dynamo][Better Engineering] Type coverage for torch/_dynamo/utils.py (#159580)
As part of the better engineering effort, we would like to improve our type support to improve the dev experience in dynamo

This PR adds strict typing support to `torch/_dynamo/utils.py`

Running
```
mypy torch/_dynamo/utils.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Annotated | Lines Total | % lines covered | Funcs Annotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  2163 | 4792 | 45.14% | 121 | 268 | 45.15% |
| This PR | 4818 | 4818 | 100.00% | 268 | 268 | 100.00% |
| Delta    | +2655 | +26 | +54.84% | +147 | 0 | +54.85% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159580
Approved by: https://github.com/williamwen42
2025-08-04 21:51:53 +00:00
510e8b4ae0 [inductor] use writable temp file on windows (#159738)
Use `WritableTempFile` on Windows, reference to: https://github.com/pytorch/pytorch/pull/159342

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159738
Approved by: https://github.com/angelayi, https://github.com/Skylion007
2025-08-04 21:51:02 +00:00
83ba3f1101 Revert "[inductor] allocate non-blocking copy destinations in pinned memory (#155121) (#158758)"
This reverts commit 6085bf7565fec0d2ed26e8590001f09c05adbbe4.

Reverted https://github.com/pytorch/pytorch/pull/158758 on behalf of https://github.com/davidberard98 due to I need to revert #158462 (it causes device-side asserts), and this PR causes a merge conflict in the test file. Sorry about that! ([comment](https://github.com/pytorch/pytorch/pull/158758#issuecomment-3152490371))
2025-08-04 21:47:11 +00:00
1fad16aacb Revert "[inductor] move all cpu scalars using pinned memory for graph partition (#155360) (#158983)"
This reverts commit 444e2381d07a14cb501c00d11f9e63a3f1d2c86e.

Reverted https://github.com/pytorch/pytorch/pull/158983 on behalf of https://github.com/davidberard98 due to I need to revert #158462 (it causes device-side asserts), and this PR causes a merge conflict in the test file. Sorry about that! ([comment](https://github.com/pytorch/pytorch/pull/158758#issuecomment-3152490371))
2025-08-04 21:47:11 +00:00
444e2381d0 [inductor] move all cpu scalars using pinned memory for graph partition (#155360) (#158983)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158983
Approved by: https://github.com/eellison
ghstack dependencies: #158758
2025-08-04 21:42:05 +00:00
6085bf7565 [inductor] allocate non-blocking copy destinations in pinned memory (#155121) (#158758)
Fixes #155121

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158758
Approved by: https://github.com/EikanWang, https://github.com/eellison
2025-08-04 21:22:11 +00:00
8201dbf4bc check driver to be >=12.4 to use fabric handles (#159697)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159697
Approved by: https://github.com/malfet
2025-08-04 21:05:39 +00:00
26d045bb60 Linux py 3.14 wheel builds (#157559)
Related to https://github.com/pytorch/pytorch/issues/156856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157559
Approved by: https://github.com/malfet, https://github.com/albanD
2025-08-04 20:55:19 +00:00
356ac3103a Revert "Stop parsing command line arguments every time common_utils is imported. (#156703)"
This reverts commit 310f901a71e53688866b14bb2f2b4c8eef9979b3.

Reverted https://github.com/pytorch/pytorch/pull/156703 on behalf of https://github.com/izaitsevfb due to breaking tests internally with `assert common_utils.SEED is not None` ([comment](https://github.com/pytorch/pytorch/pull/156703#issuecomment-3152337518))
2025-08-04 20:37:39 +00:00
d4109a0f99 [MPS] Add max_unpool1d/2d/3d (#159789)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159789
Approved by: https://github.com/malfet
2025-08-04 20:00:59 +00:00
7ea789ccfb Revert #156868: Bring back symint check for sharding propagation cache (#159671)
Fixes #159601

Unfortunately #156868 introduced a couple regressions (see #159590 and #159601). This reverts the commit while I am working on a permanent fix. This means the `in_compiled_autograd_initial_trace` global flag will be removed and the `_are_we_tracing()` will instead be replaced with the symint preprocessing step during sharding prop post init.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159671
Approved by: https://github.com/xmfan
2025-08-04 19:58:48 +00:00
7e8197e34d Revert "Migrate ScalarType to headeronly (#159416)"
This reverts commit 1371a98b0e727f8a8916dd473b6dd0cff78c0449.

Reverted https://github.com/pytorch/pytorch/pull/159416 on behalf of https://github.com/izaitsevfb due to breaking internal builds, see D79452481 ([comment](https://github.com/pytorch/pytorch/pull/159416#issuecomment-3152138508))
2025-08-04 19:55:09 +00:00
50eac811a6 [typing] Constrain OrderedSet generic to be Hashable (#159684)
Ran across this typing bug while creating an OrderedSet from a type I didn't realize wasn't hashable, which failed at runtime. With this constraint, typing would've failed pre-runtime.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159684
Approved by: https://github.com/Skylion007
2025-08-04 18:08:01 +00:00
4e0f179d0b Update the signature and test of torch.hamming_window() (#152682)
Fixes #146590

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152682
Approved by: https://github.com/albanD
2025-08-04 17:50:42 +00:00
36e59d9b12 [c10d][nvshmem] fix missing override compilation error for nvshmem symmetric code (#159557)
Summary:
Fix error when compiling nvshmem code section `NVSHMEMSymmetricMemory.cu` with BUCK

```
fbcode/caffe2/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu:154:20: error: 'get_buffer' overrides a member function but is not marked 'override' [-Werror,-Winconsistent-missing-override]
  154 | virtual at::Tensor get_buffer(int
      |                    ^
fbcode/caffe2/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp:56:20: note: overridden virtual function is here
   56 | virtual at::Tensor get_buffer(int rank, c10::IntArrayRef sizes, c10::ScalarType dtype, int64_t storage_offset) = 0;
```

Test Plan:
Build test + CI

Rollback Plan:

Differential Revision: D78813586

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159557
Approved by: https://github.com/kwen2501
2025-08-04 17:46:30 +00:00
fc340d0ca3 [export] Allow comparing device w/o index with device w/ index (#159665)
In the case where we have an expected device "cuda" and a given device "cuda:0", I think we should succeed.
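
A hypothetical helper sketching the relaxed comparison (not the exact export code):

```python
import torch

def devices_match(expected: torch.device, given: torch.device) -> bool:
    # An index-less expected device ("cuda") should match any index
    # of the same device type ("cuda:0", "cuda:1", ...).
    if expected.type != given.type:
        return False
    return expected.index is None or expected.index == given.index

assert devices_match(torch.device("cuda"), torch.device("cuda:0"))
assert not devices_match(torch.device("cpu"), torch.device("cuda:0"))
```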
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159665
Approved by: https://github.com/yushangdi
2025-08-04 17:00:07 +00:00
53e47af0f7 [dynamo][guards] Read the attr name from GetAttrGuardAccessor (#159754)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159754
Approved by: https://github.com/jansel
ghstack dependencies: #159752
2025-08-04 16:51:27 +00:00
66ad881fc7 [dynamo][guards][refactor] Simplify type extraction from GuardManager (#159752)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159752
Approved by: https://github.com/jansel
2025-08-04 16:51:27 +00:00
1d3eef27ac [ROCm CI] Migrate to MI325 Capacity (#159649)
Migrate mi300s to gfx942.

Related to https://github.com/pytorch/pytorch/pull/159059

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159649
Approved by: https://github.com/huydhn
2025-08-04 16:48:12 +00:00
dd95900cec [AOTI] normalize_path_separator file path for Windows. (#159726)
`normalize_path_separator` file path for Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159726
Approved by: https://github.com/angelayi, https://github.com/jansel
2025-08-04 15:57:19 +00:00
1cdd665526 fix test_verbose_logs_dynamic_shapes with MSVC (#159573)
The `typeid` operator produces different output with different compilers. There is a good example on [cppreference](https://www.en.cppreference.com/w/cpp/language/typeid.html).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159573
Approved by: https://github.com/angelayi, https://github.com/jansel
2025-08-04 15:56:53 +00:00
7cb2dcd2dd [c10d][nvshmem] modify is_nvshmem_available runtime check to work with static-linked library (#159558) (#159561)
Summary:

Currently this function relies on us linking `libnvshmem_device.a` statically and loading `libnvshmem_host.so` at runtime. When `libnvshmem.a` (the two combined into one library) is linked statically, this check fails. Add a section that checks at runtime whether a symbol from the host API exists, to detect whether nvshmem is linked statically.

Test Plan:
CI + sample run

Rollback Plan:

Differential Revision: D79177525

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159561
Approved by: https://github.com/kwen2501
2025-08-04 15:40:29 +00:00
e5a81aa7ba Fix conversion of values in libtorch agnostic tests (#155115)
Due to the different byte order on s390x (big-endian), when copying data the value has to be placed into the last bytes to ensure that an int32_t converted to int64_t keeps the same value. The same has to be done when converting back.
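
A rough illustration with `struct` (not the test code itself):

```python
import struct

value = 0x11223344  # an int32 value stored into an int64 slot

le = struct.pack("<q", value)  # little-endian: payload in the first 4 bytes
be = struct.pack(">q", value)  # big-endian (s390x): payload in the last 4 bytes

assert le[:4] == struct.pack("<i", value) and le[4:] == b"\x00" * 4
assert be[4:] == struct.pack(">i", value) and be[:4] == b"\x00" * 4
```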

This change fixes test
TestLibtorchAgnosticCPU::test_my_ones_like_cpu
from
cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py on s390x.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155115
Approved by: https://github.com/huydhn
2025-08-04 13:40:22 +00:00
3e2aa4b0e3 Update pin to include Python 3.14 support (#159725)
Update the Triton pin to the top of the rel/3.4 branch: https://github.com/triton-lang/triton/tree/rel/3.4. This is the same as the release/3.4.x branch but also includes Python 3.14 support.

This should unblock enablement of Python 3.14 support in this PR: https://github.com/pytorch/pytorch/pull/157559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159725
Approved by: https://github.com/davidberard98
2025-08-04 13:30:12 +00:00
6646461764 S390X: fix detection of magic number placeholder in inductor (#157784)
This change fixes multiple tests in
test/inductor/test_aot_inductor_arrayref.py
such as
test_cond_with_parameters_cpu_with_stack_allocation,
test_issue_140766_cpu_with_stack_allocation,
test_model_modified_weights_cpu_with_stack_allocation,
test_nested_tensor_from_jagged_cpu_with_stack_allocation.

Enable tests in test/inductor/test_aot_inductor_arrayref.py

This change is split off from https://github.com/pytorch/pytorch/pull/150116

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157784
Approved by: https://github.com/huydhn
2025-08-04 12:42:31 +00:00
f74da2a136 [xla hash update] update the pinned xla hash (#159758)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159758
Approved by: https://github.com/pytorchbot
2025-08-04 11:21:45 +00:00
d35b27dde5 [CUDA] Add some more missing @serialTest decorators (#159672)
Seems to fix #159663

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159672
Approved by: https://github.com/Skylion007
2025-08-04 07:44:35 +00:00
a9dc1566d4 [MTIA Aten Backend] Migrate arange.start_out (#159540)
Differential Revision: [D79317519](https://our.internmc.facebook.com/intern/diff/D79317519/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159540
Approved by: https://github.com/malfet, https://github.com/nautsimon
2025-08-04 07:38:05 +00:00
33a1996714 Fix perf downgrad by reverting template use in use_mkldnn_matmul (#159024)
This PR fixes the performance degradation by reverting the template use in `use_mkldnn_matmul` from #157520. Fixes https://github.com/pytorch/pytorch/issues/159031 and https://github.com/pytorch/pytorch/issues/159551.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159024
Approved by: https://github.com/mingfeima
2025-08-04 05:49:46 +00:00
ee62177c19 [dynamo] Be consistent with storing func source for UserMethodVariable (#159696)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159696
Approved by: https://github.com/jansel
ghstack dependencies: #159534
2025-08-04 05:12:44 +00:00
64cbaa876c [dynamo][guards] Make class members go through obj.__class__.__dict__ (#159534)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159534
Approved by: https://github.com/jansel
2025-08-04 05:12:44 +00:00
4516c59f5f [dynamo][source] Add special source for __code__ and __closure__ (#159722)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159722
Approved by: https://github.com/jansel
2025-08-04 05:02:05 +00:00
8bc843a9ec [vllm hash update] update the pinned vllm hash (#159610)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159610
Approved by: https://github.com/pytorchbot
2025-08-04 04:06:09 +00:00
e39a62c70d Fix warnings in triton_helpers.py (#159719)
```
  /home/jansel/pytorch/torch/_inductor/runtime/triton_helpers.py:152: UserWarning: Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '|' instead
    equal |= a_isnan and b_isnan
```
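
The fix is to use elementwise `&` instead of the Python `and`; roughly the same pattern in plain PyTorch:

```python
import torch

a = torch.tensor([float("nan"), 1.0, 2.0])
b = torch.tensor([float("nan"), 1.0, 3.0])

a_isnan, b_isnan = torch.isnan(a), torch.isnan(b)
equal = a == b
equal |= a_isnan & b_isnan  # elementwise '&', no deprecation warning
```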

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159719
Approved by: https://github.com/Skylion007
2025-08-04 03:21:09 +00:00
978e3a9142 refresh expected results (#159727)
Just a regular update due to recent <10% changes; CI is stable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159727
Approved by: https://github.com/anijain2305
2025-08-03 22:47:50 +00:00
e2a5c42e7e [BE][MPS] Build metal kernels of MacOS-14+ (#159733)
This makes the `#if __METAL_VERSION__ >= 310` guards for `bfloat` support unnecessary.
Rename `kernels_bfloat.metallib` to `kernels_basic` and remove the custom build/selection logic.

Part of https://github.com/pytorch/pytorch/issues/159275
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159733
Approved by: https://github.com/dcci
ghstack dependencies: #159731, #159732
2025-08-03 20:53:58 +00:00
5116c49b52 [BE] Remove macos-13 guard from bench_mps_ops (#159732)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159732
Approved by: https://github.com/dcci
ghstack dependencies: #159731
2025-08-03 20:53:58 +00:00
fecdebe385 [CI][MPS] Fix compile benchmark correctness (#159731)
By passing the `fullgraph=True` attribute and increasing the cache size limit to 2**16.

Otherwise, the compiler might decide to fall back to eager to avoid recompilations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159731
Approved by: https://github.com/dcci
2025-08-03 20:53:50 +00:00
e136a9175b [BE] Fix dev warning in Dependencies.cmake (#159702)
Namely
```
CMake Warning (dev) in cmake/Dependencies.cmake:
  A logical block opening on the line

    /Users/nshulga/git/pytorch/pytorch/cmake/Dependencies.cmake:261 (if)

  closes on the line

    /Users/nshulga/git/pytorch/pytorch/cmake/Dependencies.cmake:263 (endif)

  with mis-matching arguments.
```

Introduced by https://github.com/pytorch/pytorch/pull/143846

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159702
Approved by: https://github.com/cyyever, https://github.com/Skylion007
2025-08-03 18:45:07 +00:00
9a680e14b7 [bucketing] Reduce CPU overhead for reduce_scatter_merge_fn_to_trace (#159723)
The previous implementation was creating `n_gpu * n_tensors` intermediate tensors, which was adding a lot of CPU overhead, especially given that Inductor was generating a number of individual tensor copy kernels for `torch.cat`.

This PR changes the implementation so that only `n_tensors` are created, making the CPU overhead proportional to the number of tensors being bucketed.
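
A rough sketch of the idea with hypothetical names (not the actual `reduce_scatter_merge_fn_to_trace` code); it assumes each tensor's numel is divisible by the world size:

```python
import torch

def merge_for_reduce_scatter(tensors: list[torch.Tensor], world_size: int) -> torch.Tensor:
    # One reshape per tensor (n_tensors views) and a single cat, instead of
    # building n_gpu * n_tensors per-rank slices and concatenating them.
    per_rank = [t.reshape(world_size, -1) for t in tensors]
    # Row r of the result holds rank r's shard of every tensor, so flattening
    # gives the rank-major layout a bucketed reduce_scatter expects.
    return torch.cat(per_rank, dim=1).reshape(-1)
```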

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159723
Approved by: https://github.com/IvanKobzarev
2025-08-03 09:16:55 +00:00
805a102beb Revert "[dynamo][guards] Make class members go through obj.__class__.__dict__ (#159534)"
This reverts commit 1616777cd2a3170ff76afa3e7860b0969420c445.

Reverted https://github.com/pytorch/pytorch/pull/159534 on behalf of https://github.com/malfet due to Broke some inductor test and lint among other things, see 9c18901bfd/1 ([comment](https://github.com/pytorch/pytorch/pull/159534#issuecomment-3146983186))
2025-08-03 04:58:32 +00:00
6e8d705a22 Revert "[dynamo] Be consistent with storing func source for UserMethodVariable (#159696)"
This reverts commit be71000ff5292293d1976f313218e2df4d5046d3.

Reverted https://github.com/pytorch/pytorch/pull/159696 on behalf of https://github.com/malfet due to Broke some inductor test and lint among other things, see 9c18901bfd/1 ([comment](https://github.com/pytorch/pytorch/pull/159534#issuecomment-3146983186))
2025-08-03 04:58:32 +00:00
9c18901bfd [MTIA Aten Backend] Migrate all.out (#159539)
Differential Revision: [D79317033](https://our.internmc.facebook.com/intern/diff/D79317033/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159539
Approved by: https://github.com/malfet
ghstack dependencies: #159098
2025-08-03 02:08:35 +00:00
a29ed5e1ac Add torch compile force disable caches alias (#158072)
A bunch of people keep thinking the current alias only disables the Inductor cache because it has "inductor" in the name. Let's globalize the name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158072
Approved by: https://github.com/ezyang
2025-08-02 23:23:17 +00:00
d2792f51b2 [bucketing] Use max of input/output size for bucketing (#159717)
The output of a reduce_scatter is n_gpu times smaller than its input, while the output of an all_gather is n_gpu times larger than its input. This means that with the current heuristic for bucketing reduce_scatter, we would need to use a bucket size that is n_gpu times larger than the one for all_gather, making it GPU-dependent and less intuitive. This PR proposes to instead use the max of the input and output sizes, so that one can use the same bucket_size value for both passes.
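
A hypothetical sketch of the heuristic (names and signature are illustrative only):

```python
def effective_bucket_bytes(numel: int, elem_size: int, world_size: int, kind: str) -> int:
    # Size the bucket by the larger of the collective's input and output,
    # so the same bucket_size threshold works for both passes.
    in_bytes = numel * elem_size
    if kind == "reduce_scatter":
        out_bytes = in_bytes // world_size  # output is world_size times smaller
    else:  # all_gather
        out_bytes = in_bytes * world_size   # output is world_size times larger
    return max(in_bytes, out_bytes)
```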

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159717
Approved by: https://github.com/wconstab
2025-08-02 22:42:22 +00:00
be71000ff5 [dynamo] Be consistent with storing func source for UserMethodVariable (#159696)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159696
Approved by: https://github.com/jansel
ghstack dependencies: #159186, #159534
2025-08-02 21:40:38 +00:00
3f86076775 gc before warming up benchmarking (#159670)
#158649 turned off automatic GCs during cudagraph recording. This is causing a small uptick in some internal benchmark numbers because of memory the benchmark leaves around before it starts, so GC before warming up the model.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159670
Approved by: https://github.com/oulgen
2025-08-02 19:37:24 +00:00
1616777cd2 [dynamo][guards] Make class members go through obj.__class__.__dict__ (#159534)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159534
Approved by: https://github.com/jansel
ghstack dependencies: #159186
2025-08-02 18:04:35 +00:00
38895c0ac2 Update RuntimeError message in is_nonzero(input) method from bool to Boolean (#159712)
RuntimeError message updated in is_nonzero(input) method from bool to Boolean.

**Case 1:**
t = torch.tensor([])
torch.is_nonzero(t)

**Case 2:**
t = torch.tensor([1,2])
torch.is_nonzero(t)

**Existing Error message in documentation:**

for case 1: RuntimeError: bool value of Tensor with no values is ambiguous
for case 2: RuntimeError: bool value of Tensor with more than one value is ambiguous

**Proposed Error message in documentation:**

for case 1: RuntimeError: Boolean value of Tensor with no values is ambiguous
for case 2: RuntimeError: Boolean value of Tensor with more than one value is ambiguous

Fixes #159710
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159712
Approved by: https://github.com/malfet
2025-08-02 17:23:45 +00:00
310f901a71 Stop parsing command line arguments every time common_utils is imported. (#156703)
Last PR in the series to re-submit https://github.com/pytorch/pytorch/pull/134592 as smaller PRs:

https://github.com/pytorch/pytorch/pull/154612
https://github.com/pytorch/pytorch/pull/154628
https://github.com/pytorch/pytorch/pull/154715
https://github.com/pytorch/pytorch/pull/154716
https://github.com/pytorch/pytorch/pull/154725
https://github.com/pytorch/pytorch/pull/154728

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156703
Approved by: https://github.com/clee2000
2025-08-02 16:38:54 +00:00
e11b1cd97e [ROCm] fix nightly wheel due to rocBLAS environment variable (#159570)
Fixes #159070

The TunableOp failure is due to missing rocBLAS files in our manywheels packaging. This bug has been present since June 7-8 time frame. It was caused by a typo in the rocBLAS environment variable that stores the list of files. It was introduced in this PR: https://github.com/pytorch/pytorch/pull/155388

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159570
Approved by: https://github.com/malfet
2025-08-02 06:54:43 +00:00
b599d91738 Log autotune choices and benchmark result to scuba/chrome trace (#159496)
Summary:
Report the kernel choices and benchmark data to better understand how kernels are selected and the performance gap between the best kernel (likely a CUDA kernel) and Triton kernels.

**Example**

Event: mm_template_autotuning
Column: autotune_choices

```json
{
  "num_choices": 52,
  "num_triton_choices": 19,
  "best_kernel": "cutlass_f6c25cf2",
  "best_kernel_desc": "cutlass3x_sm90_tensorop_gemm_f16_f16_f32_void_f16_128x256x64_2x1x1_0_tnn_align8_stream_k_warpspecialized_cooperative_epi_tma swizzle=8",
  "best_time": 0.6283040046691895,
  "best_triton_pos": 26,
  "best_triton_time": 0.6832960247993469,
  "best_triton_kernel": "triton_mm_17",
  "best_triton_kernel_desc": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0"
}
```

Test Plan:
```
TORCHINDUCTOR_MAX_AUTOTUNE_REPORT_CHOICES_STATS =1 buck2 run //scripts/wychi:test_autotune_mm 2>&1 > /tmp/mylog.txt
```

Rollback Plan:

Differential Revision: D79235037

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159496
Approved by: https://github.com/masnesral
2025-08-02 05:34:17 +00:00
fd6a6658c3 Enable _int_mm on Intel GPU (#157769)
# Motivation

This PR enables `_int_mm` on Intel GPU. `_int_mm` is used by int8 quantization in torchao.
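
For reference, a minimal usage sketch, assuming an Intel GPU exposed as the `xpu` device:

```python
import torch

a = torch.randint(-128, 127, (32, 64), dtype=torch.int8, device="xpu")
b = torch.randint(-128, 127, (64, 32), dtype=torch.int8, device="xpu")
c = torch._int_mm(a, b)  # int8 x int8 -> int32 matmul
assert c.dtype == torch.int32
```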

# Model Test Result:
We ran meta-llama/Llama-3.1-8B-Instruct on Intel GPU and A100 using torchao int8 dynamic quantization. The model config is as below:
Precision : torch.bfloat16
quantization configuration : Int8DynamicActivationInt8WeightConfig
dataset : wikitext

Result:
The perplexity values for Intel GPU and A100 are 9.582953453063965 and 9.57755184173584, respectively.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157769
Approved by: https://github.com/EikanWang, https://github.com/desertfire
2025-08-02 05:16:01 +00:00
04973496a8 [audio hash update] update the pinned audio hash (#159611)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159611
Approved by: https://github.com/pytorchbot
2025-08-02 05:15:47 +00:00
1548b011ea Fix rand_like decomposition to preserve strides (#159294)
Summary: Like https://github.com/pytorch/pytorch/pull/158898, the rand_like variants are not preserving strides. Followed the pattern established in https://github.com/pytorch/pytorch/pull/158898.
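
A quick check mirroring the intent of the unit test (not its exact code):

```python
import torch

x = torch.randn(4, 6).transpose(0, 1)             # non-contiguous, strides (1, 6)
assert torch.rand_like(x).stride() == x.stride()  # eager preserves strides

compiled = torch.compile(lambda t: torch.rand_like(t))
assert compiled(x).stride() == x.stride()  # fails before this PR, passes after
```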

Test Plan: New unit test (fails before this PR; but fixed after)

Differential Revision: [D79472604](https://our.internmc.facebook.com/intern/diff/D79472604)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159294
Approved by: https://github.com/eellison
2025-08-02 03:54:41 +00:00
e57a92734d [export] Fix nn_module_stack of assert_tensor_metadata nodes (#159625)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159625
Approved by: https://github.com/yushangdi
2025-08-02 02:52:42 +00:00
79ff3b320b Back out "[ez] get rid of unused var" (#159677)
Summary: Turns out I added this to reduce how often we'd call try_update_max_size_at_index when a new maximum is found before the replan is called. Oops.

Test Plan:
backout

Rollback Plan:

Differential Revision: D79474114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159677
Approved by: https://github.com/georgiaphillips
2025-08-02 01:50:16 +00:00
426f249f20 Fix launch grid calculation (#159497)
Summary:

The launch grid calculation code uses a Python trick to achieve CeilDiv() through negated integer division with FloorDiv(). This is language-dependent behaviour that does not apply to all languages.

In the FXIR backend we undo this behaviour and replace the expression with a CeilDiv() operation, so the computation is correct regardless of the language used. We do not change the original computation directly, as that leads to a performance degradation.
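
The trick in question, as a standalone illustration:

```python
import math

def ceil_div_python_trick(a: int, b: int) -> int:
    # Python's // floors toward negative infinity, so negating twice yields
    # ceiling division. C-family integer division truncates toward zero,
    # so the same expression is not portable to other backends.
    return -(a // -b)

assert ceil_div_python_trick(10, 4) == math.ceil(10 / 4) == 3
assert ceil_div_python_trick(12, 4) == 3
```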

Test Plan:
CI

Rollback Plan:

Differential Revision: D79275534

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159497
Approved by: https://github.com/blaine-rister
2025-08-02 01:12:58 +00:00
d33a484763 Use boxed_nop_preserve_node_meta for aot_export_joint_with_descriptors (#159545)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159545
Approved by: https://github.com/xmfan, https://github.com/wconstab
ghstack dependencies: #159336, #159337
2025-08-02 00:33:41 +00:00
a81ffbc5f5 improve shape checks for grouped_mm (#159666)
Check that the contraction dimension matches between tensors if it is known, and do device-side checks for correct offsets.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159666
Approved by: https://github.com/danielvegamyhre, https://github.com/eqy
2025-08-02 00:12:25 +00:00
465fe4d9f7 Enable sample nightly PT2 benchmark on B200 (#158011)
Per the discussion with @nWEIdia, this resumes the work on https://github.com/pytorch/pytorch/pull/157870 to enable PT2 benchmark on B200

### Testing

https://github.com/pytorch/pytorch/actions/runs/16615101382

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158011
Approved by: https://github.com/nWEIdia, https://github.com/atalman
2025-08-01 23:47:44 +00:00
9477af1063 fix compilation on cuda < 12.3 (#159657)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159657
Approved by: https://github.com/kwen2501
2025-08-01 23:40:55 +00:00
dcc36e38bb [Graph Breaks] Remove unsupported Additional Info field (#159658)
A race condition when landing PR #158800 caused us to add this field even though it is deprecated, so remove it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159658
Approved by: https://github.com/williamwen42
2025-08-01 23:25:50 +00:00
efd78584a8 [EZ] Add linux-aarch64.yml workflow to the viable/strict blocking set (#159668)
Since it's required to be run on every PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159668
Approved by: https://github.com/malfet
2025-08-01 23:19:08 +00:00
135762ea20 Unpin helion (#159579)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159579
Approved by: https://github.com/jansel
2025-08-01 23:08:06 +00:00
e2ee9cfaa2 [NativeRT] Turn on enableStaticCPUKernels by default (#159422)
Summary: As title.

Test Plan:
Needs manual testing on production models.

Rollback Plan:

Differential Revision: D78747742

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159422
Approved by: https://github.com/dolpm
2025-08-01 22:27:07 +00:00
06d28de17a Update CK Kernel generation and update ck submodule (#157964)
Changes required to reduce the number of CK kernels generated. This change depends on https://github.com/ROCm/composable_kernel/pull/2480 being merged first.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157964
Approved by: https://github.com/842974287
2025-08-01 22:24:27 +00:00
df9720b8b5 [MTIA Aten Backend] Migrate all foreach ops (#159098)
# Context

See the first PR https://github.com/pytorch/pytorch/pull/153670

# This diff

 Migrate all foreach operators to in-tree, including:
  - _foreach_abs
  - _foreach_abs_
  - _foreach_add.List
  - _foreach_add_.List
  - _foreach_add_.Scalar
  - _foreach_add_.Tensor
  - _foreach_addcmul.Scalar
  - _foreach_addcmul_.Scalar
  - _foreach_copy
  - _foreach_copy_
  - _foreach_mul.List
  - _foreach_mul_.List
  - _foreach_mul_.Scalar
  - _foreach_mul.Tensor
  - _foreach_mul_.Tensor
  - _foreach_norm.Scalar
  - _foreach_sqrt_

Differential Revision: [D78913847](https://our.internmc.facebook.com/intern/diff/D78913847/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159098
Approved by: https://github.com/malfet
2025-08-01 22:10:12 +00:00
85e74d5ace [inductor] Add logging for distributed collective ops for multi‑rank diagnostics (#159190)
This change introduces structured logging of the collective communication schedule, enabling downstream tools (e.g. TLParse) to ingest and analyze per‑rank collective‐order information for multi‑rank jobs.

- Iterates over scheduler.nodes, filters for _CollectiveKernel nodes
- Extracts each op’s python_kernel_name
- Emits a structured JSON payload under the inductor_collective_schedule artifact name
- Dumps the full schedule list to collective_schedule.json via the PyTorch trace‑structured artifact
- Added comprehensive unit tests for collective schedule tracing: Created test_collective_schedule_empty() and test_collective_schedule_real() tests to verify structured trace logging works correctly for both empty collective schedules and real collective operations (like all_reduce and wait_tensor from _c10d_functional ops).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159190
Approved by: https://github.com/yushangdi, https://github.com/xmfan
2025-08-01 21:51:42 +00:00
0450f05658 Output tensor meta data for FX graph node (#159311)
The FX graph segment in CompiledFxGraph does not include tensor metadata, for example tensor shape, stride, data type, and device. The AI system co-design team requested that this information be included in the FX graph segment so they can use it to project performance on different hardware.
This diff modifies Graph::Node::format_node to include the tensor metadata.
Before this DIFF, the triton kernel FX graph segment looks like the following:
```
# %mm : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=mm]
# %arg2_1 : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=arg2_1]
# %sin : Tensor "f32[4, 4][4, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%mm,), kwargs = {})
# %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%sin, [1, 0]), kwargs = {})
# %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg2_1, 1111), kwargs = {})
# %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%permute_1, %mul), kwargs = {})
# %cos : cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cos.default](args = (%add,), kwargs = {})
# return %cos
```

After this DIFF:

```
# %mm : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=mm]
# %arg2_1 : Tensor "f32[4, 4][4, 1]cuda:0" = PlaceHolder[target=arg2_1]
# %sin : Tensor "f32[4, 4][4, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%mm,), kwargs = {})
# %permute_1 : Tensor "f32[4, 4][1, 4]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%sin, [1, 0]), kwargs = {})
# %mul : Tensor "f32[4, 4][4, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%arg2_1, 1111), kwargs = {})
# %add : Tensor "f32[4, 4][1, 4]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%permute_1, %mul), kwargs = {})
# %cos : Tensor "f32[4, 4][1, 4]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.cos.default](args = (%add,), kwargs = {})
# return %cos
```
If format_node can not be changed, I can copy the code to caffe2/torch/_inductor/utils.py.

Differential Revision: D77973076

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159311
Approved by: https://github.com/angelayi
2025-08-01 21:40:29 +00:00
595a65f5c2 [dynamo] Replace unimplemented with unimplemented_v2 in torch/_dynamo/variables/script_object.py (#159343)
Fixes part of #147913

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159343
Approved by: https://github.com/williamwen42

Co-authored-by: William Wen <william.wen42@gmail.com>
2025-08-01 21:30:41 +00:00
8c6c2e40eb Edit a test case to detect potential bugs in all-gathering noncontiguous inputs in the Gloo backend (#159542)
As suggested in the pull request #158903 by @H-huang, this pull request edits a test case to detect potential bugs in all-gathering noncontiguous inputs in the Gloo backend.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159542
Approved by: https://github.com/d4l3k, https://github.com/H-Huang
2025-08-01 21:20:25 +00:00
32840d19f9 [cutlass backend] skip stream k if shape is dynamic (#159442)
Differential Revision: [D79229210](https://our.internmc.facebook.com/intern/diff/D79229210/)

The motivation is that the workspace size is hard to determine and varies with the shape. What I observed is that sometimes the shape gets smaller but the workspace increases, so it is hard to upper-bound it.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159442
Approved by: https://github.com/ColinPeppler
2025-08-01 20:42:24 +00:00
2040f00112 [BE][Easy] respect os.environ in subprocess calls in tools/nightly.py (#159572)
Respect parent shell's envvars, such as `UV_INDEX_STRATEGY`, `http{,s}_proxy`, etc.
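
A sketch of the pattern with a hypothetical command (not the exact `tools/nightly.py` code):

```python
import os
import subprocess

# Start from the parent shell's environment so settings like
# UV_INDEX_STRATEGY or https_proxy are inherited, then add overrides.
env = {**os.environ, "PIP_DISABLE_PIP_VERSION_CHECK": "1"}
subprocess.run(["python", "-m", "pip", "--version"], env=env, check=True)
```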

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159572
Approved by: https://github.com/Skylion007
2025-08-01 20:40:31 +00:00
c137f9da0b [Dynamo][Better Engineering] Add type coverage to dynamo/compiled_autograd.py (#159518)
As part of the better engineering effort, we would like to improve our type support to improve the dev experience in dynamo.

This PR adds strict typing support to `torch/_dynamo/compiled_autograd.py`

Running
```
mypy torch/_dynamo/compiled_autograd.py --linecount-report /tmp/coverage_log
```

| -------- | Lines Annotated | Lines Total | % lines covered | Funcs Annotated | Funcs Total | % funcs covered |
| -------- | ------- | -------- | ------- | ------- | ------- | ------- |
| Main  |  425 | 1553 | 27.37% | 17 | 62 | 27.42% |
| This PR | 1623 | 1623 | 100.00% | 62 | 62 | 100.00% |
| Delta    | +1198| +0 | +72.63% | +45 | 0 | +72.58% |

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159518
Approved by: https://github.com/xmfan
2025-08-01 20:24:58 +00:00
5e8b95605f [PP] Support OVERLAP_F_B computation type (#158978)
Some changes to validation code and visualizer to support a new computation type that will be used in DualPipeV (see https://github.com/pytorch/pytorch/pull/159591)

The IR looks like:

```
[0F0, 0F1, 0F2, 0F3, 0F4, 0F5, 0F6, 7F0, 7I0, 7W0, 7F1, 7I1, 7W1, 7F2, 7I2, 7W2, 7F3, (0F7;7B3)OVERLAP_F_B, (7F4;0B0)OVERLAP_F_B, (0F8;7B4)OVERLAP_F_B, (7F5;0B1)OVERLAP_F_B, (0F9;7B5)OVERLAP_F_B, (7F6;0B2)OVERLAP_F_B, 7B6, (7F7;0B3)OVERLAP_F_B, 7B7, (7F8;0B4)OVERLAP_F_B, 7B8, (7F9;0B5)OVERLAP_F_B, 7B9, 0I6, 0W6, 0I7, 0W7, 0I8, 0W8, 0I9, 0W9]
[1F0, 1F1, 1F2, 1F3, 1F4, 6F0, 1F5, 6F1, 6I0, 6W0, 6F2, 6I1, 6W1, 6F3, (1F6;6B2)OVERLAP_F_B, (6F4;1B0)OVERLAP_F_B, (1F7;6B3)OVERLAP_F_B, (6F5;1B1)OVERLAP_F_B, (1F8;6B4)OVERLAP_F_B, (6F6;1B2)OVERLAP_F_B, (1F9;6B5)OVERLAP_F_B, (6F7;1B3)OVERLAP_F_B, 6B6, (6F8;1B4)OVERLAP_F_B, 6B7, (6F9;1B5)OVERLAP_F_B, 6B8, 1B6, 6I9, 1I7, 6W9, 1I8, 1W7, 1I9, 1W8, 1W9]
[2F0, 2F1, 2F2, 5F0, 2F3, 5F1, 2F4, 5F2, 5I0, 5W0, 5F3, (2F5;5B1)OVERLAP_F_B, (5F4;2B0)OVERLAP_F_B, (2F6;5B2)OVERLAP_F_B, (5F5;2B1)OVERLAP_F_B, (2F7;5B3)OVERLAP_F_B, (5F6;2B2)OVERLAP_F_B, (2F8;5B4)OVERLAP_F_B, (5F7;2B3)OVERLAP_F_B, (2F9;5B5)OVERLAP_F_B, (5F8;2B4)OVERLAP_F_B, 5B6, (5F9;2B5)OVERLAP_F_B, 5B7, 2B6, 5B8, 2I7, 5I9, 2I8, 2W7, 2I9, 5W9, 2W8, 2W9]
[3F0, 4F0, 3F1, 4F1, 3F2, 4F2, 3F3, 4F3, 3F4, 4B0, (4F4;3B0)OVERLAP_F_B, (3F5;4B1)OVERLAP_F_B, (4F5;3B1)OVERLAP_F_B, (3F6;4B2)OVERLAP_F_B, (4F6;3B2)OVERLAP_F_B, (3F7;4B3)OVERLAP_F_B, (4F7;3B3)OVERLAP_F_B, (3F8;4B4)OVERLAP_F_B, (4F8;3B4)OVERLAP_F_B, (3F9;4B5)OVERLAP_F_B, (4F9;3B5)OVERLAP_F_B, 4B6, 3B6, 4B7, 3B7, 4I8, 3I8, 4I9, 3I9, 4W8, 3W8, 4W9, 3W9]
```

In this PR, the schedule execution will just treat the OVERLAP_F_B as two separate operations of F and B (so there is no actual overlap). The next step is to allow users to create a custom function to plug in what this operation does.

814629043a/torch/distributed/pipelining/schedules.py (L1205-L1216)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158978
Approved by: https://github.com/wconstab
2025-08-01 20:22:30 +00:00
8ea86a6e31 Actually test STD_TORCH_CHECK, add testfile to CMake (#159603)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159603
Approved by: https://github.com/Skylion007, https://github.com/albanD
2025-08-01 19:53:41 +00:00
acad808545 Revert "[inductor] consolidate common GEMM triton param retrieval (#159383)"
This reverts commit e7cc42df58a86bee05944f6e80c535aa1d099443.

Reverted https://github.com/pytorch/pytorch/pull/159383 on behalf of https://github.com/jataylo due to sorry but rocm CI is broken due to this PR ([comment](https://github.com/pytorch/pytorch/pull/159383#issuecomment-3145604831))
2025-08-01 19:49:21 +00:00
c687446374 Revert "Fix rand_like decomposition to preserve strides (#159294)"
This reverts commit 2c46922ce4b33c39b1c48c302604805510a3f889.

Reverted https://github.com/pytorch/pytorch/pull/159294 on behalf of https://github.com/yangw-dev due to breaking internal test ([comment](https://github.com/pytorch/pytorch/pull/159294#issuecomment-3145541845))
2025-08-01 19:19:51 +00:00
dd22ba09b4 [C10D] Document barrier interaction with device_id (#159389)
Addresses #159262

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159389
Approved by: https://github.com/malfet, https://github.com/H-Huang, https://github.com/kwen2501, https://github.com/fduwjj
2025-08-01 18:12:21 +00:00
c0e0126399 Remove unused input parameter in ExpandableSegment (#159356)
# Motivation
While refactoring the caching allocator, I noticed that the `ExpandableSegment` constructor on CUDA had an unused parameter. This change removes that unused argument to avoid potential confusion.

# Additional Context
I noticed that `ExpandableSegment` is defined in cpp file, so it should be safe to make this change.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159356
Approved by: https://github.com/ngimel, https://github.com/albanD
ghstack dependencies: #159159
2025-08-01 17:47:51 +00:00
1409 changed files with 40358 additions and 55786 deletions

View File

@ -92,6 +92,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
@ -208,7 +209,7 @@ if __name__ == "__main__":
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars = "MAX_JOBS=5 " + build_vars
build_vars += "MAX_JOBS=5 "
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")

View File

@ -438,9 +438,7 @@ def build_torchvision(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -495,9 +493,7 @@ def build_torchdata(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -553,9 +549,7 @@ def build_torchtext(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
@ -613,9 +607,7 @@ def build_torchaudio(
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += (
f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"
)
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

View File

@ -76,7 +76,6 @@ ADD ./common/install_mnist.sh install_mnist.sh
RUN bash ./install_mnist.sh
FROM base as all_cuda
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9

View File

@ -76,6 +76,9 @@ elif [[ "$image" == *cuda*linter* ]]; then
elif [[ "$image" == *linter* ]]; then
# Use a separate Dockerfile for linter to keep a small image size
DOCKERFILE="linter/Dockerfile"
elif [[ "$image" == *riscv* ]]; then
# Use RISC-V specific Dockerfile
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
fi
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
@ -144,16 +147,6 @@ case "$tag" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12
@ -164,39 +157,6 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
@ -219,19 +179,7 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
else
@ -245,7 +193,9 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
;;
pytorch-linux-noble-rocm-alpha-py3)
ANACONDA_PYTHON_VERSION=3.12
@ -257,7 +207,6 @@ case "$tag" in
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
INDUCTOR_BENCHMARKS=yes
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
;;
pytorch-linux-jammy-xpu-2025.0-py3)
@ -357,6 +306,9 @@ case "$tag" in
SKIP_LLVM_SRC_BUILD_INSTALL=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-noble-riscv64-py3.12-gcc14)
GCC_VERSION=14
;;
*)
# Catch-all for builds that are not hardcoded.
VISION=yes
@ -477,7 +429,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
fi
if [ -n "$GCC_VERSION" ]; then
if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
if [[ "$image" == *riscv* ]]; then
# Check RISC-V cross-compilation toolchain version
if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
echo "RISC-V GCC_VERSION=$GCC_VERSION, but:"
drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version
exit 1
fi
elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
echo "GCC_VERSION=$GCC_VERSION, but:"
drun gcc --version
exit 1

View File

@ -0,0 +1,2 @@
transformers==4.54.0
soxr==0.5.0

View File

@ -1 +0,0 @@
243e186efbf7fb93328dd6b34927a4e8c8f24395

View File

@ -1 +1 @@
ae324eeac8e102a2b40370e341460f3791353398
0958dc9b2bb815e428f721f9da599dab0dc1c5d7

View File

@ -1 +1 @@
11ec6354315768a85da41032535e3b7b99c5f706
f7888497a1eb9e98d4c07537f0d0bcfe180d1363

View File

@ -66,8 +66,9 @@ function do_cpython_build {
ln -s pip3 ${prefix}/bin/pip
fi
# install setuptools since python 3.12 is required to use distutils
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
# packaging is needed to create symlink since wheel no longer provides needed information
${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0
local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")
ln -sf ${prefix} /opt/python/${abi_tag}
}

View File

@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.9
NVSHMEM_VERSION=3.3.20
function install_cuda {
version=$1
@ -62,14 +62,16 @@ function install_nvshmem {
mkdir -p "${tmpdir}" && cd "${tmpdir}"
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
suffix=".tar.xz"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/
tar xf "${filename}${suffix}"
cp -a "${filename}/include/"* /usr/local/cuda/include/
cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
# cleanup
cd ..

View File

@ -5,9 +5,7 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
local version
commit=$(get_pinned_commit huggingface)
pip_install "git+https://github.com/huggingface/transformers@${commit}"
pip_install -r huggingface-requirements.txt
}
function install_timm() {
@ -15,11 +13,34 @@ function install_timm() {
commit=$(get_pinned_commit timm)
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up
conda_run pip uninstall -y torch torchvision triton
}
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
chown -R jenkins torchbench
chown -R jenkins /opt/conda
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton torchao

View File

@ -103,5 +103,5 @@ fi
# It depends on torch and triton. We don't want to install
# triton and torch from production on Docker CI images
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
pip_install helion==0.0.10 --no-deps
pip_install helion --no-deps
fi

View File

@ -34,18 +34,27 @@ function install_ubuntu() {
# The xpu-smi packages
apt-get install -y flex bison xpu-smi
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
apt-get install -y intel-ocloc
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Compute and Media Runtimes
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
else # rolling driver
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
fi
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
apt-get install -y ${XPU_PACKAGES}
@ -130,11 +139,11 @@ function install_sles() {
}
# Default use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
# Use GPU driver rolling releases
XPU_DRIVER_VERSION=""
# Default use GPU driver rolling releases
XPU_DRIVER_VERSION=""
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.0

View File

@ -63,11 +63,12 @@ lark==0.12.0
#Pinned versions: 0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
#librosa depends on numba; disable it for s390x while numba is disabled too
#mkl #this breaks linux-bionic-rocm4.5-py3.7
#Description: Intel oneAPI Math Kernel Library
@ -110,14 +111,15 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
#numpy
#Description: Provides N-dimensional arrays and linear algebra
@ -307,7 +309,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:
z3-solver==4.15.1.0
z3-solver==4.15.1.0 ; platform_machine != "s390x"
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
@ -361,7 +363,6 @@ pwlf==2.2.1
#Pinned versions: 2.2.1
#test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd

View File

@ -0,0 +1,155 @@
# Cross-compilation Docker container for RISC-V architecture
ARG UBUNTU_VERSION
FROM --platform=linux/amd64 ubuntu:${UBUNTU_VERSION} as base
ARG UBUNTU_VERSION
ENV GCC_VERSION=14
ENV PYTHON_VERSION=3.12.3
ENV DEBIAN_FRONTEND=noninteractive
ENV CC=riscv64-linux-gnu-gcc-${GCC_VERSION}
ENV CXX=riscv64-linux-gnu-g++-${GCC_VERSION}
ENV QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/
ENV SYSROOT=/opt/sysroot
# Install basic dependencies
RUN apt-get update && apt-get install -y \
ninja-build \
autoconf \
automake \
libtool \
patchelf \
ccache \
git \
wget \
python3-pip \
python3-venv \
python-is-python3 \
cmake \
sudo \
lsb-release \
gcc-${GCC_VERSION}-riscv64-linux-gnu \
g++-${GCC_VERSION}-riscv64-linux-gnu \
pkg-config \
&& rm -rf /var/lib/apt/lists/*
# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
FROM base as python
ARG ZLIB_VERSION=1.3.1
ARG FFI_VERSION=3.4.6
ARG BZ2_VERSION=1.0.8
ARG XZ_VERSION=5.4.6
ARG OPENSSL_VERSION=3.2.1
# Set up sysroot directory for dependencies
ENV PKG_CONFIG_PATH=${SYSROOT}/lib/pkgconfig
ENV PKG_CONFIG_SYSROOT_DIR=${SYSROOT}
WORKDIR /opt
# Build zlib (for compression)
RUN echo "--- Building zlib ---" \
&& wget -c https://www.zlib.net/zlib-${ZLIB_VERSION}.tar.gz \
&& tar -xf zlib-${ZLIB_VERSION}.tar.gz --no-same-permissions --no-same-owner \
&& cd zlib-${ZLIB_VERSION}/ \
&& mkdir build && cd build \
&& ../configure --prefix=${SYSROOT} \
&& make -j$(nproc) && make install \
&& cd ../..
# Build libffi (for ctypes module)
RUN echo "--- Building libffi ---" \
&& wget -c https://github.com/libffi/libffi/releases/download/v${FFI_VERSION}/libffi-${FFI_VERSION}.tar.gz \
&& tar -xf libffi-${FFI_VERSION}.tar.gz --no-same-permissions --no-same-owner \
&& cd libffi-${FFI_VERSION}/ \
&& mkdir build && cd build \
&& ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
&& make -j$(nproc) && make install \
&& cd ../..
# Build bzip2 (for bz2 module)
RUN echo "--- Building bzip2 ---" \
&& wget -c https://sourceware.org/pub/bzip2/bzip2-${BZ2_VERSION}.tar.gz \
&& tar -xf bzip2-${BZ2_VERSION}.tar.gz --no-same-permissions --no-same-owner \
&& cd bzip2-${BZ2_VERSION}/ \
&& make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} bzip2 bzip2recover libbz2.a \
&& make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} -f Makefile-libbz2_so \
&& make install PREFIX=${SYSROOT} \
&& cp libbz2.so.${BZ2_VERSION} ${SYSROOT}/lib/ \
&& cd ${SYSROOT}/lib/ \
&& ln -sf libbz2.so.${BZ2_VERSION} libbz2.so.1.0 \
&& ln -sf libbz2.so.1.0 libbz2.so \
&& cd /opt/
# Build xz (for lzma module)
RUN echo "--- Building xz ---" \
&& wget -c https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz \
&& tar -xf xz-${XZ_VERSION}.tar.gz --no-same-permissions --no-same-owner \
&& cd xz-${XZ_VERSION} \
&& mkdir build && cd build \
&& ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
&& make -j$(nproc) && make install \
&& cd ../..
# Build OpenSSL (for ssl module)
RUN echo "--- Building OpenSSL ---" \
&& wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \
&& tar -xf openssl-${OPENSSL_VERSION}.tar.gz --no-same-permissions --no-same-owner \
&& cd openssl-${OPENSSL_VERSION}/ \
&& mkdir build && cd build \
&& ../Configure linux64-riscv64 --prefix=${SYSROOT} \
&& make -j$(nproc) && make install_sw \
&& cd ../..
# Build SQLite3 (for sqlite3 module)
RUN echo "--- Building SQLite3 ---" \
&& wget -c https://www.sqlite.org/2024/sqlite-autoconf-3450200.tar.gz \
&& tar -xf sqlite-autoconf-3450200.tar.gz --no-same-permissions --no-same-owner \
&& cd sqlite-autoconf-3450200 \
&& mkdir build && cd build \
&& ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
&& make -j$(nproc) && make install \
&& cd ../..
# Build and install RISC-V Python with all modules
RUN wget -c https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar -xf Python-${PYTHON_VERSION}.tgz --no-same-permissions --no-same-owner \
&& cd Python-${PYTHON_VERSION} \
&& mkdir build && cd build \
&& ../configure \
--host=riscv64-linux-gnu \
--build=x86_64-linux-gnu \
--prefix=${SYSROOT} \
--enable-shared \
--disable-ipv6 \
--with-build-python=/usr/bin/python3 \
--with-ensurepip=no \
ac_cv_file__dev_ptmx=yes \
ac_cv_file__dev_ptc=no \
&& make -j$(nproc) \
&& make install
FROM base as final
COPY --from=python /opt/sysroot /opt/sysroot
# Install crossenv and cmake
RUN pip install crossenv cmake==4.0.0 --break-system-packages \
&& /usr/bin/python3 -m crossenv ${SYSROOT}/bin/python3 /opt/riscv-cross-env
# Add pip-installed cmake binaries to PATH
ENV PATH="/usr/local/bin:${PATH}"
# Set up cross Python environment
SHELL ["/bin/bash", "-c"]
RUN source /opt/riscv-cross-env/bin/activate \
&& pip install setuptools pyyaml typing_extensions wheel
# Set default environment variables for PyTorch build
ENV Python_ROOT_DIR=${SYSROOT}
ENV OPENSSL_ROOT_DIR=${SYSROOT}
USER jenkins
CMD ["bash"]

View File

@ -96,10 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@ -56,10 +56,10 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@ -96,10 +96,11 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

.ci/lumen_cli/README.md (new file, 31 lines)
View File

@ -0,0 +1,31 @@
# 🔧 Lumen_cli
A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.
## Features
- **Build**
- external projects (e.g. vLLM)
## 📦 Installation
at the root of the pytorch repo
```bash
pip install -e .ci/lumen_cli
```
## Run the cli tool
The cli tool must be used at root of pytorch repo, as example to run build external vllm:
```bash
python -m cli.run build external vllm
```
this will run the build steps with default behaviour for vllm project.
to see help messages, run
```bash
python3 -m cli.run --help
```
## Add customized external build logic
To add a new external build target:
1. Create the build function in the `cli/lib` folder.
2. Register your target and its main build function in `EXTERNAL_BUILD_TARGET_DISPATCH` in `cli/build_cli/register_build.py` (see the sketch below).
3. [optional] Create your CI config file in `.github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml`.
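For illustration, here is a minimal sketch of step 2 with a hypothetical `mylib` target and `MylibBuildRunner` (neither exists in this PR); in the `register_build.py` shown later in this diff, the dispatch table is the `_TARGETS` dict:
```python
# Sketch only: register a hypothetical "mylib" target next to the existing vllm entry
# in cli/build_cli/register_build.py.
from cli.lib.common.cli_helper import BaseRunner, TargetSpec
from cli.lib.core.vllm import VllmBuildRunner


class MylibBuildRunner(BaseRunner):  # hypothetical; real runners live under cli/lib
    """Build mylib (example)."""

    def run(self) -> None:
        print("building mylib with args:", self.args)


_TARGETS: dict[str, TargetSpec] = {
    "vllm": {
        "runner": VllmBuildRunner,
        "help": "Build vLLM using docker buildx.",
    },
    # exposed automatically as: python -m cli.run build external mylib
    "mylib": {
        "runner": MylibBuildRunner,
        "help": "Build mylib (example).",
    },
}
```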

View File

@ -0,0 +1,37 @@
import argparse
import logging
from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
from cli.lib.core.vllm import VllmBuildRunner
logger = logging.getLogger(__name__)
# Maps targets to their argparse configuration and runner
# Each entry adds a new target under `python -m cli.run build external {target}`, dispatched to its build runner
_TARGETS: dict[str, TargetSpec] = {
"vllm": {
"runner": VllmBuildRunner,
"help": "Build vLLM using docker buildx.",
}
# add yours ...
}
def register_build_commands(subparsers: argparse._SubParsersAction) -> None:
build_parser = subparsers.add_parser(
"build",
help="Build related commands",
formatter_class=RichHelp,
)
build_subparsers = build_parser.add_subparsers(dest="build_command", required=True)
overview = "\n".join(
f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
)
external_parser = build_subparsers.add_parser(
"external",
help="Build external targets",
description="Build third-party targets.\n\nAvailable targets:\n" + overview,
formatter_class=RichHelp,
)
register_targets(external_parser, _TARGETS)

View File

@ -0,0 +1,71 @@
"""
Cli Argparser Utility helpers for CLI tasks.
"""
import argparse
from abc import ABC, abstractmethod
try:
from typing import Any, Callable, Required, TypedDict # Python 3.11+
except ImportError:
from typing import Any, Callable, TypedDict
from typing_extensions import Required # Fallback for Python <3.11
class BaseRunner(ABC):
def __init__(self, args: Any) -> None:
self.args = args
@abstractmethod
def run(self) -> None:
"""runs main logics, required"""
# Pretty help: keep newlines + show defaults
class RichHelp(
argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
):
pass
class TargetSpec(TypedDict, total=False):
"""CLI subcommand specification with bA."""
runner: Required[type[BaseRunner]]
help: str
description: str
add_arguments: Callable[[argparse.ArgumentParser], None]
def register_targets(
parser: argparse.ArgumentParser,
target_specs: dict[str, TargetSpec],
common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None,
) -> None:
"""Register target subcommands."""
targets = parser.add_subparsers(
dest="target",
required=True,
metavar="{" + ",".join(target_specs.keys()) + "}",
)
for name, spec in target_specs.items():
desc = spec.get("description") or spec["runner"].__doc__ or ""
p = targets.add_parser(
name,
help=spec.get("help", ""),
description=desc.strip(),
formatter_class=RichHelp,
)
p.set_defaults(
func=lambda args, cls=spec["runner"]: cls(args).run(),
_runner_class=spec["runner"],
)
if "add_arguments" in spec and callable(spec["add_arguments"]):
spec["add_arguments"](p)
if common_args:
common_args(p)

View File

@ -0,0 +1,42 @@
"""
Docker Utility helpers for CLI tasks.
"""
import logging
from typing import Optional
import docker
from docker.errors import APIError, NotFound
logger = logging.getLogger(__name__)
# lazy singleton so we don't reconnect every call
_docker_client: Optional[docker.DockerClient] = None
def _get_client() -> docker.DockerClient:
global _docker_client
if _docker_client is None:
_docker_client = docker.from_env()
return _docker_client
def local_image_exists(
image_name: str, client: Optional[docker.DockerClient] = None
) -> bool:
"""Return True if a local Docker image exists."""
if not image_name:
return False
client = client or _get_client()
try:
client.images.get(image_name)
return True
except (NotFound, APIError) as e:
logger.error(
"Error when checking Docker image '%s': %s",
image_name,
e.explanation if hasattr(e, "explanation") else str(e),
)
return False

View File

@ -0,0 +1,110 @@
"""
Environment Variables and Dataclasses Utility helpers for CLI tasks.
"""
import os
from dataclasses import field, fields, is_dataclass, MISSING
from pathlib import Path
from textwrap import indent
from typing import Optional, Union
from cli.lib.common.utils import str2bool
def get_env(name: str, default: str = "") -> str:
"""Get environment variable with default fallback."""
return os.environ.get(name) or default
def env_path_optional(
name: str,
default: Optional[Union[str, Path]] = None,
resolve: bool = True,
) -> Optional[Path]:
"""Get environment variable as optional Path."""
val = get_env(name) or default
if not val:
return None
path = Path(val)
return path.resolve() if resolve else path
def env_path(
name: str,
default: Optional[Union[str, Path]] = None,
resolve: bool = True,
) -> Path:
"""Get environment variable as Path, raise if missing."""
path = env_path_optional(name, default, resolve)
if not path:
raise ValueError(f"Missing path value for {name}")
return path
def env_bool(
name: str,
default: bool = False,
) -> bool:
val = get_env(name)
if not val:
return default
return str2bool(val)
def env_bool_field(
name: str,
default: bool = False,
):
return field(default_factory=lambda: env_bool(name, default))
def env_path_field(
name: str,
default: Union[str, Path] = "",
*,
resolve: bool = True,
) -> Path:
return field(default_factory=lambda: env_path(name, default, resolve=resolve))
def env_str_field(
name: str,
default: str = "",
) -> str:
return field(default_factory=lambda: get_env(name, default))
def generate_dataclass_help(cls) -> str:
"""Auto-generate help text for dataclass fields."""
if not is_dataclass(cls):
raise TypeError(f"{cls} is not a dataclass")
def get_value(f):
if f.default is not MISSING:
return f.default
if f.default_factory is not MISSING:
try:
return f.default_factory()
except Exception as e:
return f"<error: {e}>"
return "<required>"
lines = [f"{f.name:<22} = {repr(get_value(f))}" for f in fields(cls)]
return indent("\n".join(lines), " ")
def with_params_help(params_cls: type, title: str = "Parameter defaults"):
"""
Class decorator that appends a help table generated from another dataclass
(e.g., VllmBuildParameters) to the decorated class's docstring.
"""
if not is_dataclass(params_cls):
raise TypeError(f"{params_cls} must be a dataclass")
def _decorator(cls: type) -> type:
block = generate_dataclass_help(params_cls)
cls.__doc__ = (cls.__doc__ or "") + f"\n\n{title}:\n{block}"
return cls
return _decorator

View File

@ -0,0 +1,69 @@
"""
Git Utility helpers for CLI tasks.
"""
import logging
from pathlib import Path
from cli.lib.common.path_helper import remove_dir
from git import GitCommandError, RemoteProgress, Repo
logger = logging.getLogger(__name__)
class PrintProgress(RemoteProgress):
"""Simple progress logger for git operations."""
def __init__(self, interval: int = 5):
super().__init__()
self._last_percent = -1
self._interval = interval
def update(self, op_code, cur, max=None, message=""):
msg = self._cur_line or message
if max and cur:
percent = int(cur / max * 100)
if percent != self._last_percent and percent % self._interval == 0:
self._last_percent = percent
logger.info("Progress: %d%% - %s", percent, msg)
elif msg:
logger.info(msg)
def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False):
"""Clone repository with pinned commit and optional submodules."""
dst = dst or target
try:
logger.info("Cloning %s to %s", target, dst)
# Clone and fetch
remove_dir(dst)
r = Repo.clone_from(repo, dst, progress=PrintProgress())
r.git.fetch("--all", "--tags")
# Checkout pinned commit
commit = get_post_build_pinned_commit(target)
logger.info("Checking out pinned commit %s", commit)
r.git.checkout(commit)
# Update submodules if requested
if update_submodules and r.submodules:
logger.info("Updating %d submodule(s)", len(r.submodules))
for sm in r.submodules:
sm.update(init=True, recursive=True, progress=PrintProgress())
logger.info("Successfully cloned %s", target)
return r
except GitCommandError as e:
logger.error("Git operation failed: %s", e)
raise
def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str:
path = Path(prefix) / f"{name}.txt"
if not path.exists():
raise FileNotFoundError(f"Pin file not found: {path}")
return path.read_text(encoding="utf-8").strip()

View File

@ -0,0 +1,14 @@
"""
Logger Utility helpers for CLI tasks.
"""
import logging
import sys
def setup_logging(level: int = logging.INFO):
logging.basicConfig(
level=level,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
stream=sys.stdout,
)

View File

@ -0,0 +1,62 @@
"""Path utility helpers for CLI tasks."""
import logging
import shutil
from pathlib import Path
from typing import Union
logger = logging.getLogger(__name__)
def get_path(path: Union[str, Path], resolve: bool = False) -> Path:
"""Convert to Path object, optionally resolving to absolute path."""
if not path:
raise ValueError("Path cannot be None or empty")
result = Path(path)
return result.resolve() if resolve else result
def ensure_dir_exists(path: Union[str, Path]) -> Path:
"""Create directory if it doesn't exist."""
path_obj = get_path(path)
path_obj.mkdir(parents=True, exist_ok=True)
return path_obj
def remove_dir(path: Union[str, Path, None]) -> None:
"""Remove directory if it exists."""
if not path:
return
path_obj = get_path(path)
if path_obj.exists():
shutil.rmtree(path_obj)
def force_create_dir(path: Union[str, Path]) -> Path:
"""Remove directory if exists, then create fresh empty directory."""
remove_dir(path)
return ensure_dir_exists(path)
def copy(src: Union[str, Path], dst: Union[str, Path]) -> None:
"""Copy file or directory from src to dst."""
src_path = get_path(src, resolve=True)
dst_path = get_path(dst, resolve=True)
if not src_path.exists():
raise FileNotFoundError(f"Source does not exist: {src_path}")
dst_path.parent.mkdir(parents=True, exist_ok=True)
if src_path.is_file():
shutil.copy2(src_path, dst_path)
elif src_path.is_dir():
shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
else:
raise ValueError(f"Unsupported path type: {src_path}")
def is_path_exist(path: Union[str, Path, None]) -> bool:
"""Check if path exists."""
return bool(path and get_path(path).exists())

View File

@ -0,0 +1,79 @@
"""
General Utility helpers for CLI tasks.
"""
import logging
import os
import shlex
import subprocess
import sys
from typing import Optional
logger = logging.getLogger(__name__)
def run_command(
cmd: str,
use_shell: bool = False,
log_cmd: bool = True,
cwd: Optional[str] = None,
env: Optional[dict] = None,
check: bool = True,
) -> int:
"""Run a command with optional shell execution."""
if use_shell:
args = cmd
log_prefix = "[shell]"
executable = "/bin/bash"
else:
args = shlex.split(cmd)
log_prefix = "[cmd]"
executable = None
if log_cmd:
display_cmd = cmd if use_shell else " ".join(args)
logger.info("%s %s", log_prefix, display_cmd)
run_env = {**os.environ, **(env or {})}
proc = subprocess.run(
args,
shell=use_shell,
executable=executable,
stdout=sys.stdout,
stderr=sys.stderr,
cwd=cwd,
env=run_env,
check=False,
)
if check and proc.returncode != 0:
logger.error(
"%s Command failed (exit %s): %s", log_prefix, proc.returncode, cmd
)
raise subprocess.CalledProcessError(
proc.returncode, args if not use_shell else cmd
)
return proc.returncode
def str2bool(value: Optional[str]) -> bool:
"""Convert environment variables to boolean values."""
if not value:
return False
if not isinstance(value, str):
raise ValueError(
f"Expected a string value for boolean conversion, got {type(value)}"
)
value = value.strip().lower()
true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"}
false_value_set = {"0", "false", "f", "no", "n", "off", "disable"}
if value in true_value_set:
return True
if value in false_value_set:
return False
raise ValueError(f"Invalid string value for boolean conversion: {value}")

View File

@ -0,0 +1,263 @@
import logging
import os
import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.docker_helper import local_image_exists
from cli.lib.common.envs_helper import (
env_bool_field,
env_path_field,
env_str_field,
with_params_help,
)
from cli.lib.common.git_helper import clone_external_repo
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
force_create_dir,
get_path,
is_path_exist,
)
from cli.lib.common.utils import run_command
logger = logging.getLogger(__name__)
# Default path for docker build artifacts
_DEFAULT_RESULT_PATH = "./shared"
# Temp folder inside the vllm work directory where torch wheels are copied for the docker build
_VLLM_TEMP_FOLDER = "tmp"
@dataclass
class VllmBuildParameters:
"""
Parameters defining the vllm external input configuration.
Combined with the Docker build args below, they define the vllm build environment.
"""
# USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH.
# Otherwise, the docker build pulls torch nightly during the build
# TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True
use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True)
torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")
# USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE
# Otherwise, pull the Dockerfile's default image remotely
# BASE_IMAGE: name:tag (only needed when use_local_base_image is True)
use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True)
base_image: str = env_str_field("BASE_IMAGE")
# USE_LOCAL_DOCKERFILE: when true ("1"), use a local Dockerfile; requires DOCKERFILE_PATH.
# Otherwise, use vllm's default Dockerfile.nightly_torch for the build.
# DOCKERFILE_PATH: path to the Dockerfile used when use_local_dockerfile is True
use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
dockerfile_path: Path = env_path_field(
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
# --- Build args ----------------------------------------------------------
target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")
tag_name: str = env_str_field("TAG", "vllm-wheels")
cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1")
python_version: str = env_str_field("PYTHON_VERSION", "3.12")
max_jobs: str = env_str_field("MAX_JOBS", "64")
sccache_bucket: str = env_str_field("SCCACHE_BUCKET")
sccache_region: str = env_str_field("SCCACHE_REGION")
torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
def __post_init__(self):
checks = [
(
self.use_torch_whl, # flag
True, # trigger_value
"torch_whls_path", # resource
is_path_exist, # check_func
"TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1",
),
(
self.use_local_base_image,
True,
"base_image",
local_image_exists,
f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1",
),
(
self.use_local_dockerfile,
True,
"dockerfile_path",
is_path_exist,
" DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1",
),
]
for flag, trigger_value, attr_name, check_func, error_msg in checks:
value = getattr(self, attr_name)
if flag == trigger_value:
if not value or not check_func(value):
raise ValueError(error_msg)
else:
logger.info("flag %s is not set", flag)
if not self.output_dir:
raise ValueError("missing required output_dir")
@with_params_help(VllmBuildParameters)
class VllmBuildRunner(BaseRunner):
"""
Build vLLM using docker buildx.
Environment variable options:
"USE_TORCH_WHEEL": "1: use local wheels; 0: pull nightly from pypi",
"TORCH_WHEELS_PATH": "Path to local wheels (when USE_TORCH_WHEEL=1)",
"USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image",
"BASE_IMAGE": "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)",
"USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly",
"DOCKERFILE_PATH": "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)",
"OUTPUT_DIR": "e.g. './shared'",
"TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'",
"CUDA_VERSION": "e.g. '12.8.1'",
"PYTHON_VERSION": "e.g. '3.12'",
"MAX_JOBS": "e.g. '64'",
"SCCACHE_BUCKET": "e.g. 'my-bucket'",
"SCCACHE_REGION": "e.g. 'us-west-2'",
"""
def __init__(self, args=None):
self.work_directory = "vllm"
def run(self):
"""
Main function to run the vllm build:
1. prepare the vllm build environment
2. prepare the docker build command args
3. run docker build
"""
inputs = VllmBuildParameters()
clone_vllm()
self.cp_dockerfile_if_exist(inputs)
# copy torch wheels from the repo root into the vllm workspace, if present
self.cp_torch_whls_if_exist(inputs)
ensure_dir_exists(inputs.output_dir)
cmd = self._generate_docker_build_cmd(inputs)
logger.info("Running docker build: \n %s", cmd)
run_command(cmd, cwd="vllm", env=os.environ.copy())
def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
if not inputs.use_torch_whl:
return ""
tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}"
tmp_path = Path(tmp_dir)
force_create_dir(tmp_path)
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir
def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")
return
dockerfile_path = get_path(inputs.dockerfile_path, resolve=True)
vllm_torch_dockerfile = Path(
f"./{self.work_directory}/docker/Dockerfile.nightly_torch"
)
copy(dockerfile_path, vllm_torch_dockerfile)
def get_result_path(self, path):
"""
Get the absolute path of the result path
"""
if not path:
path = _DEFAULT_RESULT_PATH
abs_path = get_path(path, resolve=True)
return abs_path
def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str:
if not torch_whl_dir:
return ""
return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}"
def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]:
"""
Returns:
- base_image_arg: docker buildx arg string for base image
- final_base_image_arg: docker buildx arg string for vllm-base stage
- pull_flag: --pull=true or --pull=false depending on whether the image exists locally
"""
if not inputs.use_local_base_image:
return "", "", ""
base_image = inputs.base_image
# set both base image and final base image to the same local image
base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}"
final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}"
if local_image_exists(base_image):
pull_flag = "--pull=false"
return base_image_arg, final_base_image_arg, pull_flag
logger.info(
"[INFO] Local image not found:%s will try to pull from remote", {base_image}
)
return base_image_arg, final_base_image_arg, ""
def _generate_docker_build_cmd(
self,
inputs: VllmBuildParameters,
) -> str:
base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args(
inputs
)
torch_arg = self._get_torch_wheel_path_arg(inputs.torch_whls_path)
return textwrap.dedent(
f"""
docker buildx build \
--output type=local,dest={inputs.output_dir} \
-f docker/Dockerfile.nightly_torch \
{pull_flag} \
{torch_arg} \
{base_image_arg} \
{final_base_image_arg} \
--build-arg max_jobs={inputs.max_jobs} \
--build-arg CUDA_VERSION={inputs.cuda_version} \
--build-arg PYTHON_VERSION={inputs.python_version} \
--build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \
--build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \
--build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \
--build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \
--target {inputs.target_stage} \
-t {inputs.tag_name} \
--progress=plain .
"""
).strip()
def clone_vllm():
clone_external_repo(
target="vllm",
repo="https://github.com/vllm-project/vllm.git",
dst="vllm",
update_submodules=True,
)
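For reference, a minimal usage sketch (an assumption, not part of this change) of the environment-driven configuration above; the three `USE_*` flags are set to `0` so that `__post_init__` skips the wheel, base-image, and Dockerfile checks, mirroring the unit tests further below:
```python
# Minimal sketch, assuming lumen_cli is installed (pip install -e .ci/lumen_cli)
# and the snippet is run from the PyTorch repo root.
import os

from cli.lib.core.vllm import VllmBuildParameters

os.environ.update(
    {
        "USE_TORCH_WHEEL": "0",       # pull torch nightly inside the docker build
        "USE_LOCAL_BASE_IMAGE": "0",  # use the Dockerfile's default base image
        "USE_LOCAL_DOCKERFILE": "0",  # use vllm's default Dockerfile.nightly_torch
        "OUTPUT_DIR": "./shared",     # docker buildx local-exporter destination
    }
)

params = VllmBuildParameters()  # fields read the environment at instantiation
print(params.output_dir, params.target_stage, params.tag_name)
```
The same variables drive `python -m cli.run build external vllm`, which instantiates `VllmBuildParameters` inside `VllmBuildRunner.run()`.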

38
.ci/lumen_cli/cli/run.py Normal file
View File

@ -0,0 +1,38 @@
# cli/run.py
import argparse
import logging
from cli.build_cli.register_build import register_build_commands
from cli.lib.common.logger import setup_logging
logger = logging.getLogger(__name__)
def main():
# Define top-level parser
parser = argparse.ArgumentParser(description="Lumen CLI")
subparsers = parser.add_subparsers(dest="command", required=True)
parser.add_argument(
"--log-level", default="INFO", help="Log level (DEBUG, INFO, WARNING, ERROR)"
)
# registers second-level subcommands
register_build_commands(subparsers)
# parse args after all options are registered
args = parser.parse_args()
# setup global logging
setup_logging(getattr(logging, args.log_level.upper(), logging.INFO))
logger.debug("Parsed args: %s", args)
if hasattr(args, "func"):
args.func(args)
else:
parser.print_help()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,22 @@
[project]
name = "lumen-ci"
version = "0.1.0"
dependencies = [
"pyyaml==6.0.2",
"GitPython==3.1.45",
"docker==7.1.0",
"pytest==7.3.2",
]
[tool.setuptools]
packages = ["cli"]
[tool.setuptools.package-dir]
cli = "cli"
[tool.ruff.lint]
# Enable preview mode for linting
preview = true
# Now you can select your preview rules, like RUF048
extend-select = ["RUF048"]

View File

@ -0,0 +1,47 @@
# tests/test_cli.py
import io
import sys
import unittest
from contextlib import redirect_stderr, redirect_stdout
from unittest.mock import patch
from cli.run import main
class TestArgparseCLI(unittest.TestCase):
@patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None)
@patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None)
def test_cli_run_build_external(self, mock_init, mock_run):
from cli.run import main # import after patches if needed
test_args = ["cli.run", "build", "external", "vllm"]
with patch.object(sys, "argv", test_args):
# argparse may call sys.exit on error; capture to avoid test aborts
try:
main()
except SystemExit:
pass
mock_init.assert_called_once() # got constructed
mock_run.assert_called_once_with() # run() called
def test_build_help(self):
test_args = ["cli.run", "build", "--help"]
with patch.object(sys, "argv", test_args):
stdout = io.StringIO()
stderr = io.StringIO()
# --help always raises SystemExit(0)
with self.assertRaises(SystemExit) as cm:
with redirect_stdout(stdout), redirect_stderr(stderr):
main()
self.assertEqual(cm.exception.code, 0)
output = stdout.getvalue()
self.assertIn("usage", output)
self.assertIn("external", output)
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,115 @@
import argparse
import io
import unittest
from contextlib import redirect_stderr
from unittest.mock import patch
from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec
# ---- Dummy runners for unittests----
class FooRunner(BaseRunner):
"""Foo description from docstring."""
def run(self) -> None: # replaced by mock
pass
class BarRunner(BaseRunner):
def run(self) -> None: # replaced by mock
pass
def add_foo_args(p: argparse.ArgumentParser) -> None:
p.add_argument("--x", type=int, required=True, help="x value")
def common_args(p: argparse.ArgumentParser) -> None:
p.add_argument("--verbose", action="store_true", help="verbose flag")
def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(prog="app", formatter_class=RichHelp)
register_targets(
parser=parser,
target_specs=specs,
common_args=common_args,
)
return parser
def get_subparser(
parser: argparse.ArgumentParser, name: str
) -> argparse.ArgumentParser:
subparsers_action = next(
a
for a in parser._subparsers._group_actions # type: ignore[attr-defined]
if isinstance(a, argparse._SubParsersAction)
)
return subparsers_action.choices[name]
class TestRegisterTargets(unittest.TestCase):
def test_metavar_lists_targets(self):
specs: dict[str, TargetSpec] = {
"foo": {"runner": FooRunner, "add_arguments": add_foo_args},
"bar": {"runner": BarRunner},
}
parser = build_parser(specs)
subparsers_action = next(
a
for a in parser._subparsers._group_actions # type: ignore[attr-defined]
if isinstance(a, argparse._SubParsersAction)
)
self.assertEqual(subparsers_action.metavar, "{foo,bar}")
def test_add_arguments_and_common_args_present(self):
specs: dict[str, TargetSpec] = {
"foo": {"runner": FooRunner, "add_arguments": add_foo_args},
}
parser = build_parser(specs)
foo = get_subparser(parser, "foo")
help_text = foo.format_help()
self.assertIn("--x", help_text)
self.assertIn("--verbose", help_text)
def test_runner_constructed_with_ns_and_run_called(self):
specs: dict[str, TargetSpec] = {
"foo": {"runner": FooRunner, "add_arguments": add_foo_args},
}
parser = build_parser(specs)
with (
patch.object(FooRunner, "__init__", return_value=None) as mock_init,
patch.object(FooRunner, "run", return_value=None) as mock_run,
):
ns = parser.parse_args(["foo", "--x", "3", "--verbose"])
ns.func(ns) # set by register_targets
# __init__ received the Namespace
self.assertEqual(mock_init.call_count, 1)
(called_ns,), _ = mock_init.call_args
self.assertIsInstance(called_ns, argparse.Namespace)
# run() called with no args
mock_run.assert_called_once_with()
def test_runner_docstring_used_as_description_when_missing(self):
specs: dict[str, TargetSpec] = {
"foo": {"runner": FooRunner, "add_arguments": add_foo_args},
}
parser = build_parser(specs)
foo = get_subparser(parser, "foo")
help_text = foo.format_help()
self.assertIn("Foo description from docstring.", help_text)
def test_missing_target_raises_systemexit_with_usage(self):
specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}}
parser = build_parser(specs)
buf = io.StringIO()
with self.assertRaises(SystemExit), redirect_stderr(buf):
parser.parse_args([])
err = buf.getvalue()
self.assertIn("usage:", err)
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,75 @@
import unittest
from unittest import mock
from unittest.mock import MagicMock
import docker.errors as derr
from cli.lib.common.docker_helper import _get_client, local_image_exists
class TestDockerImageHelpers(unittest.TestCase):
def setUp(self):
# Reset the singleton in the target module
patcher = mock.patch("cli.lib.common.docker_helper._docker_client", None)
self.addCleanup(patcher.stop)
patcher.start()
def test_local_image_exists_true(self):
# Mock a docker client whose images.get returns an object (no exception)
mock_client = MagicMock()
mock_client.images.get.return_value = object()
ok = local_image_exists("repo:tag", client=mock_client)
self.assertTrue(ok)
def test_local_image_exists_not_found_false(self):
mock_client = MagicMock()
# Raise docker.errors.NotFound
mock_client.images.get.side_effect = derr.NotFound("nope")
ok = local_image_exists("missing:latest", client=mock_client)
self.assertFalse(ok)
def test_local_image_exists_api_error_false(self):
mock_client = MagicMock()
mock_client.images.get.side_effect = derr.APIError("boom", None)
ok = local_image_exists("broken:tag", client=mock_client)
self.assertFalse(ok)
def test_local_image_exists_uses_lazy_singleton(self):
# Patch docker.from_env used by _get_client()
with mock.patch(
"cli.lib.common.docker_helper.docker.from_env"
) as mock_from_env:
mock_docker_client = MagicMock()
mock_from_env.return_value = mock_docker_client
# First call should create and cache the client
c1 = _get_client()
self.assertIs(c1, mock_docker_client)
mock_from_env.assert_called_once()
# Second call should reuse cached client (no extra from_env calls)
c2 = _get_client()
self.assertIs(c2, mock_docker_client)
mock_from_env.assert_called_once() # still once
def test_local_image_exists_without_client_param_calls_get_client_once(self):
# Ensure _get_client is called and cached; local_image_exists should reuse it
with mock.patch("cli.lib.common.docker_helper._get_client") as mock_get_client:
mock_client = MagicMock()
mock_get_client.return_value = mock_client
# 1st call
local_image_exists("repo:tag")
# 2nd call
local_image_exists("repo:tag2")
# local_image_exists should call _get_client each time,
# but _get_client itself caches docker.from_env.
self.assertEqual(mock_get_client.call_count, 2)
self.assertEqual(mock_client.images.get.call_count, 2)
mock_client.images.get.assert_any_call("repo:tag")
mock_client.images.get.assert_any_call("repo:tag2")
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,149 @@
import os
import unittest
from dataclasses import dataclass
from pathlib import Path
from unittest.mock import patch
import cli.lib.common.envs_helper as m
class TestEnvHelpers(unittest.TestCase):
def setUp(self):
# Keep a copy of the original environment to restore later
self._env_backup = dict(os.environ)
def tearDown(self):
# Restore environment to original state
os.environ.clear()
os.environ.update(self._env_backup)
# -------- get_env --------
def test_get_env_unset_returns_default(self):
with patch.dict(os.environ, {}, clear=True):
self.assertEqual(m.get_env("FOO", "default"), "default")
def test_get_env_empty_returns_default(self):
with patch.dict(os.environ, {"FOO": ""}, clear=True):
self.assertEqual(m.get_env("FOO", "default"), "default")
def test_get_env_set_returns_value(self):
with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
self.assertEqual(m.get_env("FOO", "default"), "bar")
def test_get_env_not_exist_returns_default(self):
with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
self.assertEqual(m.get_env("TEST_NOT_EXIST", "default"), "default")
def test_get_env_not_exist_without_default(self):
with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
self.assertEqual(m.get_env("TEST_NOT_EXIST"), "")
# -------- env_bool --------
def test_env_bool_uses_default_when_unset(self):
with patch.dict(os.environ, {}, clear=True):
self.assertTrue(m.env_bool("FLAG", default=True))
self.assertFalse(m.env_bool("FLAG", default=False))
def test_env_bool_uses_str2bool_when_set(self):
# Patch str2bool used by env_bool so we don't depend on its exact behavior
def fake_str2bool(s: str) -> bool:
return s.lower() in {"1", "true", "yes", "on", "y"}
with (
patch.dict(os.environ, {"FLAG": "yEs"}, clear=True),
patch.object(m, "str2bool", fake_str2bool),
):
self.assertTrue(m.env_bool("FLAG", default=False))
# -------- env_path_optional / env_path --------
def test_env_path_optional_unset_returns_none_by_default(self):
with patch.dict(os.environ, {}, clear=True):
self.assertIsNone(m.env_path_optional("P"))
def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self):
with patch.dict(os.environ, {"P": ""}, clear=True):
self.assertIsNone(m.env_path_optional("P"))
def test_env_path_optional_unset_returns_default_str(self):
# default as string; resolve=True by default -> absolute path
default_str = "x/y"
with patch.dict(os.environ, {}, clear=True):
p = m.env_path_optional("P", default=default_str)
self.assertIsInstance(p, Path)
self.assertIsNotNone(p)
if p:
self.assertTrue(p.is_absolute())
self.assertEqual(p.parts[-2:], ("x", "y"))
def test_env_path_optional_unset_returns_default_path_no_resolve(self):
d = Path("z")
with patch.dict(os.environ, {}, clear=True):
p = m.env_path_optional("P", default=d, resolve=False)
self.assertEqual(p, d)
def test_env_path_optional_respects_resolve_true(self):
with patch.dict(os.environ, {"P": "a/b"}, clear=True):
p = m.env_path_optional("P", resolve=True)
self.assertIsInstance(p, Path)
if p:
self.assertTrue(p.is_absolute())
def test_env_path_optional_respects_resolve_false(self):
with patch.dict(os.environ, {"P": "rel/dir"}, clear=True):
p = m.env_path_optional("P", resolve=False)
self.assertEqual(p, Path("rel/dir"))
if p:
self.assertFalse(p.is_absolute())
def test_env_path_raises_when_missing_and_default_none(self):
with patch.dict(os.environ, {}, clear=True):
with self.assertRaises(ValueError):
m.env_path("P", None, resolve=True)
def test_env_path_returns_path_when_present(self):
tmp = Path("./b").resolve()
with patch.dict(os.environ, {"P": str(tmp)}, clear=True):
p = m.env_path("P", None, resolve=True)
self.assertEqual(p, tmp)
# -------- dataclass field helpers --------
def test_dataclass_fields_read_env_at_instantiation(self):
@dataclass
class Cfg:
flag: bool = m.env_bool_field("FLAG", default=False)
out: Path = m.env_path_field("OUT", default="ab", resolve=True)
name: str = m.env_str_field("NAME", default="anon")
# First instantiation
with patch.dict(
os.environ, {"FLAG": "true", "OUT": "outdir", "NAME": "alice"}, clear=True
):
cfg1 = Cfg()
self.assertTrue(cfg1.flag)
self.assertIsInstance(cfg1.out, Path)
self.assertTrue(cfg1.out.is_absolute())
self.assertEqual(cfg1.name, "alice")
cfg1.name = "bob" # change instance value
self.assertEqual(cfg1.name, "bob") # change is reflected
# Change env; new instance should reflect new values
with patch.dict(os.environ, {"FLAG": "false", "NAME": ""}, clear=True):
cfg2 = Cfg()
self.assertFalse(cfg2.flag) # str2bool("false") -> False
self.assertTrue("ab" in str(cfg2.out))
self.assertIsInstance(cfg2.out, Path)
self.assertTrue(cfg2.out.is_absolute())
self.assertEqual(cfg2.name, "anon") # empty -> fallback to default
def test_dataclass_path_field_with_default_value(self):
@dataclass
class C2:
out: Path = m.env_path_field("OUT", default="some/dir", resolve=False)
with patch.dict(os.environ, {}, clear=True):
c = C2()
self.assertEqual(c.out, Path("some/dir"))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,122 @@
# test_path_utils.py
# Run: pytest -q
import os
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory
from cli.lib.common.path_helper import (
copy,
ensure_dir_exists,
force_create_dir,
get_path,
is_path_exist,
remove_dir,
)
class TestPathHelper(unittest.TestCase):
def setUp(self):
self.tmpdir = TemporaryDirectory()
self.tmp_path = Path(self.tmpdir.name)
def tearDown(self):
self.tmpdir.cleanup()
# -------- get_path --------
def test_get_path_returns_path_for_str(self):
# Use relative path to avoid absolute-ness
rel_str = "sub/f.txt"
os.chdir(self.tmp_path)
p = get_path(rel_str, resolve=False)
self.assertIsInstance(p, Path)
self.assertFalse(p.is_absolute())
self.assertEqual(str(p), rel_str)
def test_get_path_resolves(self):
rel_str = "sub/f.txt"
p = get_path(str(self.tmp_path / rel_str), resolve=True)
self.assertTrue(p.is_absolute())
self.assertTrue(str(p).endswith(rel_str))
def test_get_path_with_path_input(self):
p_in = self.tmp_path / "sub/f.txt"
p_out = get_path(p_in, resolve=False)
self.assertTrue(str(p_out) == str(p_in))
def test_get_path_with_none_raises(self):
with self.assertRaises(ValueError):
get_path(None) # type: ignore[arg-type]
def test_get_path_invalid_type_raises(self):
with self.assertRaises(TypeError):
get_path(123) # type: ignore[arg-type]
# -------- ensure_dir_exists / force_create_dir / remove_dir --------
def test_ensure_dir_exists_creates_and_is_idempotent(self):
d = self.tmp_path / "made"
ensure_dir_exists(d)
self.assertTrue(d.exists() and d.is_dir())
ensure_dir_exists(d)
def test_force_create_dir_clears_existing(self):
d = self.tmp_path / "fresh"
(d / "inner").mkdir(parents=True)
(d / "inner" / "f.txt").write_text("x")
force_create_dir(d)
self.assertTrue(d.exists())
self.assertEqual(list(d.iterdir()), [])
def test_remove_dir_none_is_noop(self):
remove_dir(None) # type: ignore[arg-type]
def test_remove_dir_nonexistent_is_noop(self):
ghost = self.tmp_path / "ghost"
remove_dir(ghost)
def test_remove_dir_accepts_str(self):
d = self.tmp_path / "to_rm"
d.mkdir()
remove_dir(str(d))
self.assertFalse(d.exists())
# -------- copy --------
def test_copy_file_to_file(self):
src = self.tmp_path / "src.txt"
dst = self.tmp_path / "out" / "dst.txt"
src.write_text("hello")
copy(src, dst)
self.assertEqual(dst.read_text(), "hello")
def test_copy_dir_to_new_dir(self):
src = self.tmp_path / "srcdir"
(src / "a").mkdir(parents=True)
(src / "a" / "f.txt").write_text("content")
dst = self.tmp_path / "destdir"
copy(src, dst)
self.assertEqual((dst / "a" / "f.txt").read_text(), "content")
def test_copy_dir_into_existing_dir_overwrite_true_merges(self):
src = self.tmp_path / "srcdir"
dst = self.tmp_path / "destdir"
(src / "x").mkdir(parents=True)
(src / "x" / "new.txt").write_text("new")
dst.mkdir()
(dst / "existing.txt").write_text("old")
copy(src, dst)
self.assertEqual((dst / "existing.txt").read_text(), "old")
self.assertEqual((dst / "x" / "new.txt").read_text(), "new")
def test_is_str_path_exist(self):
p = self.tmp_path / "x.txt"
p.write_text("1")
self.assertTrue(is_path_exist(str(p)))
self.assertTrue(is_path_exist(p))
self.assertFalse(is_path_exist(str(self.tmp_path / "missing")))
self.assertFalse(is_path_exist(self.tmp_path / "missing"))
self.assertFalse(is_path_exist(""))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,181 @@
import os
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
import cli.lib.core.vllm as vllm
class TestVllmBuildParameters(unittest.TestCase):
@patch("cli.lib.core.vllm.local_image_exists", return_value=True)
@patch("cli.lib.core.vllm.is_path_exist", return_value=True)
@patch(
"cli.lib.common.envs_helper.env_path_optional",
side_effect=lambda name, default=None, resolve=True: {
"DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"),
"TORCH_WHEELS_PATH": Path("/abs/dist"),
"OUTPUT_DIR": Path("/abs/shared"),
}.get(name, Path(default) if default is not None else None),
)
@patch.dict(
os.environ,
{
"USE_TORCH_WHEEL": "1",
"USE_LOCAL_BASE_IMAGE": "1",
"USE_LOCAL_DOCKERFILE": "1",
"BASE_IMAGE": "my/image:tag",
"DOCKERFILE_PATH": "vllm/Dockerfile",
"TORCH_WHEELS_PATH": "dist",
"OUTPUT_DIR": "shared",
},
clear=True,
)
def test_params_success_normalizes_and_validates(
self, mock_env_path, mock_is_path, mock_local_img
):
params = vllm.VllmBuildParameters()
self.assertEqual(params.torch_whls_path, Path("/abs/dist"))
self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile"))
self.assertEqual(params.output_dir, Path("/abs/shared"))
self.assertEqual(params.base_image, "my/image:tag")
@patch("cli.lib.core.vllm.is_path_exist", return_value=False)
@patch.dict(
os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True
)
def test_params_missing_torch_whls_raises(self, _is_path):
with tempfile.TemporaryDirectory() as td:
os.chdir(td)
with self.assertRaises(ValueError) as cm:
vllm.VllmBuildParameters(
use_local_base_image=False,
use_local_dockerfile=False,
)
err = cm.exception
self.assertIn("TORCH_WHEELS_PATH", str(err))
@patch("cli.lib.core.vllm.local_image_exists", return_value=False)
@patch.dict(
os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True
)
def test_params_missing_local_base_image_raises(self, _local_img):
with tempfile.TemporaryDirectory() as td:
os.chdir(td)
with self.assertRaises(ValueError) as cm:
vllm.VllmBuildParameters(
use_torch_whl=False,
use_local_dockerfile=False,
)
err = cm.exception
self.assertIn("BASE_IMAGE", str(err))
@patch("cli.lib.core.vllm.is_path_exist", return_value=False)
@patch.dict(
os.environ,
{"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"},
clear=True,
)
def test_params_missing_dockerfile_raises(self, _is_path):
with tempfile.TemporaryDirectory() as td:
os.chdir(td)
with self.assertRaises(ValueError) as cm:
vllm.VllmBuildParameters(
use_torch_whl=False,
use_local_base_image=False,
)
err = cm.exception
self.assertIn("DOCKERFILE_PATH", str(err))
@patch("cli.lib.core.vllm.is_path_exist", return_value=False)
@patch.dict(
os.environ,
{"OUTPUT_DIR": ""},
clear=True,
)
def test_params_missing_output_dir(self, _is_path):
with self.assertRaises(FileNotFoundError):
vllm.VllmBuildParameters()
class TestBuildCmdAndRun(unittest.TestCase):
@patch("cli.lib.core.vllm.local_image_exists", return_value=True)
def test_generate_docker_build_cmd_includes_bits(self, _exists):
runner = vllm.VllmBuildRunner()
# Craft inputs that simulate a prepared build
inputs = MagicMock()
inputs.output_dir = Path("/abs/out")
inputs.use_local_base_image = True
inputs.base_image = "img:tag"
inputs.torch_whls_path = Path("./vllm/tmp")
inputs.max_jobs = 64
inputs.cuda_version = "12.8.1"
inputs.python_version = "3.12"
inputs.sccache_bucket = "my-bucket"
inputs.sccache_region = "us-west-2"
inputs.torch_cuda_arch_list = "8.0;9.0"
inputs.target_stage = "export-wheels"
inputs.tag_name = "vllm-wheels"
cmd = runner._generate_docker_build_cmd(inputs)
squashed = " ".join(cmd.split()) # normalize whitespace for matching
self.assertIn("--output type=local,dest=/abs/out", squashed)
self.assertIn("-f docker/Dockerfile.nightly_torch", squashed)
self.assertIn("--pull=false", squashed)
self.assertIn("--build-arg TORCH_WHEELS_PATH=tmp", squashed)
self.assertIn("--build-arg BUILD_BASE_IMAGE=img:tag", squashed)
self.assertIn("--build-arg FINAL_BASE_IMAGE=img:tag", squashed)
self.assertIn("--build-arg max_jobs=64", squashed)
self.assertIn("--build-arg CUDA_VERSION=12.8.1", squashed)
self.assertIn("--build-arg PYTHON_VERSION=3.12", squashed)
self.assertIn("--build-arg USE_SCCACHE=1", squashed)
self.assertIn("--build-arg SCCACHE_BUCKET_NAME=my-bucket", squashed)
self.assertIn("--build-arg SCCACHE_REGION_NAME=us-west-2", squashed)
self.assertIn("--build-arg torch_cuda_arch_list='8.0;9.0'", squashed)
self.assertIn("--target export-wheels", squashed)
self.assertIn("-t vllm-wheels", squashed)
@patch("cli.lib.core.vllm.run_command")
@patch("cli.lib.core.vllm.ensure_dir_exists")
@patch("cli.lib.core.vllm.clone_vllm")
@patch.object(
vllm.VllmBuildRunner,
"_generate_docker_build_cmd",
return_value="docker buildx ...",
)
@patch.dict(
os.environ,
{
# Make __post_init__ validations pass cheaply
"USE_TORCH_WHEEL": "0",
"USE_LOCAL_BASE_IMAGE": "0",
"USE_LOCAL_DOCKERFILE": "0",
"OUTPUT_DIR": "shared",
},
clear=True,
)
def test_run_calls_clone_prepare_and_build(
self, mock_gen, mock_clone, mock_ensure, mock_run
):
# Stub parameters instance so we avoid FS/Docker accesses in run()
params = MagicMock()
params.output_dir = Path("shared")
params.use_local_dockerfile = False
params.use_torch_whl = False
with patch("cli.lib.core.vllm.VllmBuildParameters", return_value=params):
runner = vllm.VllmBuildRunner()
runner.run()
mock_clone.assert_called_once()
mock_ensure.assert_called_once_with(Path("shared"))
mock_gen.assert_called_once_with(params)
mock_run.assert_called_once()
# ensure we run in vllm workdir
_, kwargs = mock_run.call_args
assert kwargs.get("cwd") == "vllm"
if __name__ == "__main__":
unittest.main()

View File

@ -5,10 +5,6 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
case "${GPU_ARCH_TYPE:-BLANK}" in
BLANK)
# Legacy behavior for CircleCI
bash "${SCRIPTPATH}/build_cuda.sh"
;;
cuda)
bash "${SCRIPTPATH}/build_cuda.sh"
;;

View File

@ -138,28 +138,11 @@ fi
echo "Calling setup.py bdist at $(date)"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
time CMAKE_ARGS=${CMAKE_ARGS[@]} \
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
fi
echo "Finished setup.py bdist at $(date)"
# Build libtorch packages
@ -272,10 +255,6 @@ ls /tmp/$WHEELHOUSE_DIR
mkdir -p "/$WHEELHOUSE_DIR"
mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
fi
if [[ -n "$BUILD_PYTHONLESS" ]]; then
mkdir -p /$LIBTORCH_HOUSE_DIR
mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
@ -452,16 +431,8 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then
pushd $PYTORCH_ROOT/test
# Install the wheel for this Python version
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
fi
pip uninstall -y "$TORCH_PACKAGE_NAME"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
fi
pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
# Print info on the libraries installed in this wheel

View File

@ -134,6 +134,7 @@ if [[ $CUDA_VERSION == 12* ]]; then
"/usr/local/cuda/lib64/libnvrtc-builtins.so"
"/usr/local/cuda/lib64/libcufile.so.0"
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
"/usr/local/cuda/lib64/libnvshmem_host.so.3"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
)
@ -152,6 +153,7 @@ if [[ $CUDA_VERSION == 12* ]]; then
"libcudart.so.12"
"libnvrtc.so.12"
"libnvrtc-builtins.so"
"libnvshmem_host.so.3"
"libcufile.so.0"
"libcufile_rdma.so.1"
"libcupti.so.12"

View File

@ -194,7 +194,7 @@ ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $ROCBLAS_OTHER_FILES)
# hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library

View File

@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
# Enable LLVM dependency for TensorExpr testing
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
@ -95,6 +92,27 @@ if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export ACL_ROOT_DIR=/ComputeLibrary
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
if [[ -f /opt/riscv-cross-env/bin/activate ]]; then
# shellcheck disable=SC1091
source /opt/riscv-cross-env/bin/activate
else
echo "Activation file not found"
exit 1
fi
export CMAKE_CROSSCOMPILING=TRUE
export CMAKE_SYSTEM_NAME=Linux
export CMAKE_SYSTEM_PROCESSOR=riscv64
export USE_CUDA=0
export USE_MKLDNN=0
export SLEEF_TARGET_EXEC_USE_QEMU=ON
sudo chown -R jenkins /var/lib/jenkins/workspace /opt
fi
if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
POSSIBLE_JAVA_HOMES=()
POSSIBLE_JAVA_HOMES+=(/usr/local)
@ -176,7 +194,7 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
fi
@ -192,7 +210,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export USE_ASAN=1
export REL_WITH_DEB_INFO=1
export UBSAN_FLAGS="-fno-sanitize-recover=all"
unset USE_LLVM
fi
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@ -213,7 +230,7 @@ fi
# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
@ -258,29 +275,19 @@ else
# XLA test build fails when WERROR=1
# set only when building other architectures
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
python -mpip install numpy==2.0.2
WERROR=1 python setup.py clean
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
python3 tools/packaging/split_wheel.py bdist_wheel
else
WERROR=1 python setup.py bdist_wheel
fi
WERROR=1 python setup.py bdist_wheel
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
python setup.py bdist_wheel
fi
pip_install_whl "$(echo dist/*.whl)"
@ -405,7 +412,7 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
python tools/stats/export_test_times.py
fi
# don't do this for bazel or s390x as they don't use sccache
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
# don't do this for bazel or s390x or riscv64 as they don't use sccache
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
print_sccache_stats
fi

View File

@ -229,7 +229,6 @@ function install_torchrec_and_fbgemm() {
pip_install tabulate # needed for newer fbgemm
pip_install patchelf # needed for rocm fbgemm
pushd /tmp
local wheel_dir=dist/fbgemm_gpu
local found_whl=0
@ -245,7 +244,7 @@ function install_torchrec_and_fbgemm() {
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
git checkout "${fbgemm_commit}" --recurse-submodules
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
@ -264,7 +263,6 @@ function install_torchrec_and_fbgemm() {
done
rm -rf fbgemm
popd
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
@ -283,30 +281,6 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)

View File

@ -157,6 +157,34 @@ test_jit_hooks() {
assert_git_not_dirty
}
# Shellcheck doesn't like it when you pass no arguments to a function
# that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2120
checkout_install_torchbench() {
local commit
commit=$(cat .ci/docker/ci_commit_pins/torchbench.txt)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
popd
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
}
torchbench_setup_macos() {
git clone --recursive https://github.com/pytorch/vision torchvision
git clone --recursive https://github.com/pytorch/audio torchaudio
@ -179,8 +207,6 @@ torchbench_setup_macos() {
USE_OPENMP=0 python setup.py develop
popd
# Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
# shellcheck disable=SC2119,SC2120
checkout_install_torchbench
}

View File

@ -627,6 +627,8 @@ test_perf_for_dashboard() {
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *b200* ]]; then
device=cuda_b200
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -801,6 +803,16 @@ test_dynamo_benchmark() {
if [[ "${TEST_CONFIG}" == *perf_compare* ]]; then
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
# TODO (huydhn): Just smoke test some sample models
if [[ "${TEST_CONFIG}" == *b200* ]]; then
if [[ "${suite}" == "huggingface" ]]; then
export TORCHBENCH_ONLY_MODELS="DistillGPT2"
elif [[ "${suite}" == "timm_models" ]]; then
export TORCHBENCH_ONLY_MODELS="inception_v3"
elif [[ "${suite}" == "torchbench" ]]; then
export TORCHBENCH_ONLY_MODELS="hf_Bert"
fi
fi
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
@ -1039,20 +1051,10 @@ test_libtorch_api() {
mkdir -p $TEST_REPORTS_DIR
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
# On s390x, pytorch is built without llvm.
# Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi
# quantization is not fully supported on s390x yet
@ -1672,43 +1674,34 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
PYTHONPATH=/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio
install_torchvision
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
PYTHONPATH=/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
install_torchaudio
install_torchvision
install_torchao
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
PYTHONPATH=/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
TORCHBENCHPATH=/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@ -61,9 +61,10 @@ if "%USE_XPU%"=="1" (
call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat"
call "C:\Program Files (x86)\Intel\oneAPI\ocloc\latest\env\vars.bat"
if errorlevel 1 exit /b 1
:: Reduce build time. Only have MTL self-hosted runner now
SET TORCH_XPU_ARCH_LIST=xe-lpg
SET USE_KINETO=0
:: Reduce build time
SET TORCH_XPU_ARCH_LIST=bmg
:: Re-setup python env for build
call pip install -r requirements.txt
)
@echo on

View File

@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" (
)
IF "%BUILD_VISION%" == "" (
set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
) ELSE (
set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90

View File

@ -133,6 +133,25 @@ EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
@ -192,9 +211,6 @@ retry brew install libomp
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1
if [[ -n "$CROSS_COMPILE_ARM64" ]]; then
export CMAKE_OSX_ARCHITECTURES=arm64
fi
export USE_MKLDNN=OFF
export USE_QNNPACK=OFF
export BUILD_TEST=OFF
@ -202,16 +218,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 CMAKE_FRESH=1 python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else
python setup.py bdist_wheel -d "$whl_tmp_dir"
fi
python setup.py bdist_wheel -d "$whl_tmp_dir"
echo "Finished setup.py bdist_wheel at $(date)"

View File

@ -65,16 +65,8 @@ fi
if [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions

View File

@ -134,7 +134,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then

View File

@ -23,10 +23,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is a special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"

View File

@ -54,6 +54,7 @@ self-hosted-runner:
- linux.rocm.gpu.2
- linux.rocm.gpu.4
# gfx942 runners
- linux.rocm.gpu.gfx942.1
- linux.rocm.gpu.gfx942.2
- linux.rocm.gpu.gfx942.4
- rocm-docker

View File

@ -0,0 +1,80 @@
# .github/workflows/build-external.yml
name: Build External packages
description: build external packages for PyTorch
inputs:
cuda-arch-list:
description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
type: string
required: true
default: ""
docker-image:
description: Base image to use
type: string
required: true
build-targets:
description: Build targets
type: string
required: true
torch-wheel-dir:
description: Directory containing the built torch wheel
type: string
required: false
default: dist
output-dir:
description: Directory to store build artifact
default: external
type: string
required: false
outputs:
build_time:
description: "Total build time in seconds"
value: ${{ steps.build-external.outputs.build_time }}
output_dir:
description: "Directory where build artifact is stored"
value: ${{ steps.build-external.outputs.output_dir }}
runs:
using: composite
steps:
- name: Build external packages in sequence
id: build-external
env:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir }}
shell: bash
run: |
set -euo pipefail
python3 --version
docker images
START_TIME=$(date +%s)
(
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do
OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
export OUTPUT_DIR
echo "Building external package: $target in directory $OUTPUT_DIR"
python3 -m cli.run build external "$target"
done
END_TIME=$(date +%s)
{
echo "build_time=$((END_TIME - START_TIME))"
if [ -d "$PARENT_OUTPUT_DIR" ]; then
echo "output_dir=$PARENT_OUTPUT_DIR"
fi
} >> "$GITHUB_OUTPUT"

View File

@ -59,11 +59,6 @@ runs:
echo "$msg"
exit 1
fi
if [[ $ngpu -eq 1 ]]; then
echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
echo "$msg"
exit 1
fi
- name: Runner diskspace health check
uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main

View File

@ -24,7 +24,6 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -1 +1 @@
bf305f538005f2e900f8850ed57146024a8bc559
02351a683668dd65bc82343e55245e308eb97b4e

View File

@ -1 +1 @@
ca9e2be3ed6320b51f52f536595cd24e254f8bb2
070da660c1bf9e7a7be8b9efeff4b06f91c7342f

View File

@ -1 +1 @@
29ae4c76c026185f417a25e841d2cd5e65f087a3
095faec1e7b6cc47220181e74ae9cde2605f9b00

View File

@ -0,0 +1,414 @@
# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync with the vllm repo
# The vLLM Dockerfile is used to construct a vLLM image against torch nightly and torch main that can be used directly for testing
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
# BUILD_BASE_IMAGE: used to set up Python and build the xformers and vllm wheels. It can be replaced with a different base image from the local machine;
# by default, it uses the torch-nightly-base stage from this docker image
ARG BUILD_BASE_IMAGE=torch-nightly-base
# FINAL_BASE_IMAGE: used to set up the vllm-installed environment and build flashinfer;
# by default, it uses the official devel-ubuntu22.04 image.
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
#################### TORCH NIGHTLY BASE IMAGE ####################
# A base image for building vLLM with devel ubuntu 22.04; this is mainly used to build vLLM in the vLLM Buildkite CI
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS torch-nightly-base
ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies if they are not already present
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
echo "Installing Python ${PYTHON_VERSION}..." && \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
apt-get update -y && \
apt-get install -y ccache software-properties-common git curl sudo && \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done && \
apt-get update -y && \
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
else \
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
fi \
&& python3 --version && python3 -m pip --version
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
if [ "$current_gcc_version" -lt 10 ]; then \
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
apt-get update && \
apt-get install -y gcc-10 g++-10 && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
else \
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
fi && \
gcc --version && g++ --version
# install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv==0.8.4
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
#################### TORCH NIGHTLY BASE IMAGE ####################
#################### BASE BUILD IMAGE ####################
# A base image for building vLLM with torch nightly or torch wheels
# prepare basic build environment
FROM ${BUILD_BASE_IMAGE} AS base
USER root
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not already installed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# install build and runtime dependencies without stable torch version
RUN python3 use_existing_torch.py
# default mount file as a placeholder; this just avoids the mount error
# change to a different vllm folder if this no longer exists
ARG TORCH_WHEELS_PATH="./requirements"
ARG PINNED_TORCH_VERSION
# Install torch, torchaudio and torchvision based on the input
# if TORCH_WHEELS_PATH is the default "./requirements", it will pull the nightly versions using pip
# otherwise, it will use the wheels from TORCH_WHEELS_PATH on the host machine
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
uv pip install --system "${torch_whl}[opt-einsum]"; \
uv pip install --system "${vision_whl}"; \
uv pip install --system "${audio_whl}"; \
elif [ -n "$PINNED_TORCH_VERSION" ]; then \
echo "[INFO] Installing pinned torch nightly version: $PINNED_TORCH_VERSION"; \
uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
else \
echo "[INFO] Installing torch nightly with latest one"; \
uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
fi
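A hedged sketch of how this stage might be driven from the host; the build-arg names come from the ARG declarations above, while the paths and the pinned version string are illustrative only:

# use prebuilt wheels from a local ./dist folder (expects torch-*.whl plus vision/ and audio/ subfolders)
docker build --build-arg TORCH_WHEELS_PATH=./dist --target base .
# or pin a specific torch nightly (hypothetical version specifier)
docker build --build-arg PINNED_TORCH_VERSION="torch==2.9.0.dev20250816+cu128" --target base .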
# Install numba 0.61.2 for cuda environment
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system numba==0.61.2
# Install common dependencies from vllm common.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# Must be set before installing xformers, so the correct version of xformers is installed.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
# Build xformers with cuda and torch nightly/wheel
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl --verbose
# The build can take a long time, and the torch nightly version fetched from the URL can differ in the next docker stage.
# Track the nightly torch version used in the build, so when we set up the runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
# Image used to build vllm wheel
FROM base AS build
ARG TARGETPLATFORM
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
COPY . .
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
fi
RUN echo "[DEBUG] Listing current directory:" && \
ls -al && \
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
#################### WHEEL BUILD IMAGE ####################
################### VLLM INSTALLED IMAGE ####################
# Set up a clean environment for vLLM for tests and the api server, using ubuntu22.04 with AOT flashinfer
FROM ${FINAL_BASE_IMAGE} AS vllm-base
USER root
# environment preparation starts here
WORKDIR /workspace
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies if they are not already present
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
echo "Installing Python ${PYTHON_VERSION}..." && \
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
apt-get update -y && \
apt-get install -y ccache software-properties-common git curl sudo && \
for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done && \
apt-get update -y && \
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
else \
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
fi \
&& python3 --version && python3 -m pip --version
# Get the torch versions and wheels used in previous stages for consistency
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
RUN echo "[DEBUG] Listing current directory before torch install step:" && \
ls -al && \
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not already installed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
python3 -m pip install uv==0.8.4; \
fi
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Default mount file as a placeholder; this just avoids the mount error
ARG TORCH_WHEELS_PATH="./requirements"
# Install torch, torchaudio and torchvision
# if TORCH_WHEELS_PATH is the default "./requirements", it will install the nightly versions pinned in torch_build_versions.txt using pip
# otherwise, it will use the wheels from TORCH_WHEELS_PATH on the host machine
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
--mount=type=cache,target=/root/.cache/uv \
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
echo "Found: '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
uv pip install --system "${torch_whl}[opt-einsum]"; \
uv pip install --system "${vision_whl}"; \
uv pip install --system "${audio_whl}"; \
else \
echo "[INFO] Installing torch versions from torch_build_versions.txt"; \
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \
fi
# Install the vllm wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/vllm/*.whl --verbose
# Install xformers wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/xformers/*.whl --verbose
# Build flashinfer from source.
ARG torch_cuda_arch_list='8.0;8.9;9.0a'
# install the packages needed to build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install build==1.3.0
RUN pip freeze | grep -E 'setuptools|packaging|build'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Building flashinfer for torch nightly from source takes around 10 mins
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
ARG FLASHINFER_GIT_REF="v0.2.9rc2"
RUN --mount=type=cache,target=/root/.cache/uv \
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer \
&& echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \
&& cd flashinfer \
&& python3 -m flashinfer.aot \
&& python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \
&& cd .. \
&& rm -rf flashinfer
# install flashinfer python
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system wheels/flashinfer/*.whl --verbose
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
################### VLLM INSTALLED IMAGE ####################
#################### UNITTEST IMAGE #############################
FROM vllm-base as test
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
COPY tests/ tests/
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# Install build and runtime dependencies without stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
RUN python3 use_existing_torch.py
# install packages
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
# Workaround for #17068
# pinned commit for v2.2.4
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm"
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
#################### UNITTEST IMAGE #############################
#################### EXPORT STAGE ####################
FROM scratch as export-wheels
# Just copy the wheels we prepared in previous stages
COPY --from=base /workspace/xformers-dist /wheels/xformers
COPY --from=build /workspace/vllm-dist /wheels/vllm
COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python

.github/dependabot.yml vendored Normal file
View File

@ -0,0 +1,20 @@
version: 2
updates:
# Update to the latest transformers version with dependabot
- package-ecosystem: "pip"
directory: "/.ci/docker/ci_commit_pins"
schedule:
interval: "daily"
target-branch: "main"
allow:
- dependency-name: "transformers"
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
labels:
- "dependencies"
- "open source"
- "python"
- "topic: not user facing"
- "module: ci"
- "module: inductor"

View File

@ -22,10 +22,12 @@ ciflow_push_tags:
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/s390
- ciflow/riscv64
- ciflow/slow
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull

View File

@ -1,5 +0,0 @@
# Not pinning certifi so that we can always get the latest certificates
certifi
pip=23.2.1
pkg-config=0.29.2
wheel=0.37.1

View File

@ -1,3 +1,4 @@
#!/bin/bash
set -ex
# If ROCM_HOME isn't available, use ROCM_PATH if set, or fall back to /opt/rocm
@ -50,29 +51,15 @@ do
cp $lib $TRITON_ROCM_DIR/lib/
done
# Required ROCm libraries
if [[ "${MAJOR_VERSION}" == "6" ]]; then
libamdhip="libamdhip64.so.6"
else
libamdhip="libamdhip64.so.5"
fi
# Required ROCm libraries - ROCm 6.0
ROCM_SO=(
"${libamdhip}"
"libhsa-runtime64.so.1"
"libdrm.so.2"
"libdrm_amdgpu.so.1"
"libamdhip64.so"
"libhsa-runtime64.so"
"libdrm.so"
"libdrm_amdgpu.so"
"libamd_comgr.so"
"librocprofiler-register.so"
)
if [[ $ROCM_INT -ge 60400 ]]; then
ROCM_SO+=("libamd_comgr.so.3")
else
ROCM_SO+=("libamd_comgr.so.2")
fi
if [[ $ROCM_INT -ge 60100 ]]; then
ROCM_SO+=("librocprofiler-register.so.0")
fi
for lib in "${ROCM_SO[@]}"
do
@ -94,10 +81,6 @@ do
fi
cp $file_path $TRITON_ROCM_DIR/lib
# When running locally, and not building a wheel, we need to satisfy shared-object requests that don't look for versions
LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME
done
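A small sketch of the LINKNAME transform above, assuming a versioned library name such as libamdhip64.so.6:

lib=libamdhip64.so.6
LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
echo "$LINKNAME"   # prints libamdhip64.so; the ln -sf above makes that unversioned name point at the versioned file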
# Copy Include Files

View File

@ -19,15 +19,13 @@ replace_needed_sofiles() {
find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do
origname=$2
patchedname=$3
if [[ "$origname" != "$patchedname" ]]; then
set +e
origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
ERRCODE=$?
set -e
if [ "$ERRCODE" -eq "0" ]; then
echo "patching $sofile entry $origname to $patchedname"
$PATCHELF_BIN --replace-needed $origname $patchedname $sofile
fi
set +e
origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
ERRCODE=$?
set -e
if [ "$ERRCODE" -eq "0" ]; then
echo "patching $sofile entry $origname to $patchedname"
$PATCHELF_BIN --replace-needed $origname $patchedname $sofile
fi
done
}
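A minimal usage sketch of the helper above with hypothetical argument values (the real call sites are outside this hunk); it relies only on patchelf's --print-needed and --replace-needed options:

PATCHELF_BIN=patchelf
TRITON_ROCM_DIR=triton/backends/amd          # hypothetical location
# rewrite DT_NEEDED entries that start with libamdhip64.so (e.g. libamdhip64.so.6)
# to the unversioned name libamdhip64.so in every bundled .so
replace_needed_sofiles "$TRITON_ROCM_DIR/lib" libamdhip64.so.6 libamdhip64.so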

View File

@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -88,7 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -193,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
"cpu": "libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -273,7 +273,6 @@ def generate_wheels_matrix(
os: str,
arches: Optional[list[str]] = None,
python_versions: Optional[list[str]] = None,
use_split_build: bool = False,
) -> list[dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
@ -315,15 +314,11 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
if use_split_build and (
arch_version not in ["12.6", "12.8", "12.9", "cpu"] or os != "linux"
# TODO: Enable python 3.14 on non-linux OSes
if os not in ["linux", "linux-aarch64", "macos-arm64"] and (
python_version == "3.14" or python_version == "3.14t"
):
raise RuntimeError(
"Split build is only supported on linux with cuda 12* and cpu.\n"
f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
"Please modify the matrix generation to exclude this combination."
)
continue
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
@ -339,7 +334,6 @@ def generate_wheels_matrix(
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": desired_cuda,
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],
@ -372,7 +366,6 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[0],
@ -395,7 +388,6 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version].split(
":"
)[0],

View File

@ -59,9 +59,7 @@ class BinaryBuildWorkflow:
is_scheduled: str = ""
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
use_split_build: bool = False
# Mainly used for libtorch builds
build_variant: str = ""
@ -72,9 +70,6 @@ class BinaryBuildWorkflow:
for item in [self.os, "binary", self.package_type, self.build_variant]
if item != ""
)
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
def generate_workflow_file(self, workflow_template: jinja2.Template) -> None:
output_file_path = (
@ -117,21 +112,6 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
isolated_workflow=True,
),
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# use_split_build=True,
# arches=["11.8", "12.1", "12.4", "cpu"],
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
# isolated_workflow=True,
# ),
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -175,27 +155,11 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["12.6", "12.8", "12.9"],
python_versions=["3.9"],
arches=["12.8"],
python_versions=["3.12"],
),
branches="main",
),
# See https://github.com/pytorch/pytorch/issues/138750
# BinaryBuildWorkflow(
# os=OperatingSystem.LINUX,
# package_type="manywheel",
# build_configs=generate_binary_build_matrix.generate_wheels_matrix(
# OperatingSystem.LINUX,
# arches=["11.8", "12.1", "12.4"],
# python_versions=["3.9"],
# use_split_build=True,
# ),
# ciflow_config=CIFlowConfig(
# labels={LABEL_CIFLOW_PERIODIC},
# ),
# branches="main",
# use_split_build=True,
# ),
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
@ -338,7 +302,6 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
generate_binary_build_matrix.RELEASE,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -351,7 +314,6 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},

Binary file not shown.

View File

@ -262,7 +262,12 @@ def is_exception_branch(branch: str) -> bool:
"""
Branches that get opted out of experiments by default, until they're explicitly enabled.
"""
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
return branch.split("/", maxsplit=1)[0] in {
"main",
"nightly",
"release",
"landchecks",
}
def load_yaml(yaml_text: str) -> Any:

View File

@ -70,6 +70,9 @@ def mock_query(
if key in mocked_queries:
return mocked_queries[key]
# TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved
raise ValueError(f"Key {key} could not be found in gql_mocks")
try:
rc = fallback_function(*args)
except HTTPError as err:

View File

@ -108,10 +108,6 @@ GH_CHECKSUITES_FRAGMENT = """
fragment PRCheckSuites on CheckSuiteConnection {
edges {
node {
app {
name
databaseId
}
workflowRun {
workflow {
name

View File

@ -10,7 +10,7 @@ if "%PY_VERS%" == "3.13t" (
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
)
:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4
dir "%VC_INSTALL_PATH%"

View File

@ -47,9 +47,6 @@ env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SKIP_ALL_TESTS: 0
{%- if cross_compile_arm64 %}
CROSS_COMPILE_ARM64: 1
{% endif %}
!{{ common.concurrency(build_environment) }}
jobs:
@ -113,12 +110,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

View File

@ -25,11 +25,6 @@
DOCKER_IMAGE: !{{ config["container_image"] }}
DOCKER_IMAGE_TAG_PREFIX: !{{ config["container_image_tag_prefix"] }}
{%- endif %}
{%- if config["package_type"] == "manywheel" %}
{%- if config.use_split_build is defined %}
use_split_build: !{{ config["use_split_build"] }}
{%- endif %}
{%- endif %}
{%- if config["package_type"] == "libtorch" %}
{%- if config["libtorch_config"] %}
LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}

View File

@ -26,13 +26,6 @@ on:
default: 240
type: number
description: timeout for the job
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, and build pytorch such that
the python bits are built from the libtorch wheel.
required: false
type: boolean
default: false
ALPINE_IMAGE:
required: false
type: string
@ -117,7 +110,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -142,7 +134,6 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.use_split_build }}"
} >> "${GITHUB_ENV} }}"
- name: List the env
@ -261,7 +252,6 @@ jobs:
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -64,13 +64,6 @@ on:
required: true
type: string
description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, and build pytorch such that
the python bits are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -104,7 +97,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -129,7 +121,6 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
} >> "${GITHUB_ENV} }}"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"

View File

@ -51,13 +51,6 @@ on:
required: false
type: string
description: Desired python version
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, and build pytorch such that
the python bits are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -86,7 +79,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -96,6 +96,13 @@ on:
required: false
type: string
default: ""
build-external-packages:
description: |
If set, build the external packages and save their wheels as artifacts.
Use a comma-separated list of packages to build, e.g. 'vllm,transformers'.
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
@ -287,10 +294,36 @@ jobs:
# comes from https://github.com/pytorch/test-infra/pull/6058
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
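A small sketch of the swap-headroom arithmetic above, assuming the runner reports a hypothetical 15.6 GB:

TOTAL_AVAILABLE_MEMORY_IN_GB="15.6"    # hypothetical value
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
echo "$TOTAL_MEMORY_WITH_SWAP"         # prints 18: the fractional part is dropped, then 3 GB of swap headroom is added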
if [[ ${BUILD_ENVIRONMENT} == *"riscv64"* ]]; then
# EC2 specific setup for RISC-V emulation
# Ensure binfmt_misc is available
echo "Mounting binfmt_misc filesystem"
sudo mount binfmt_misc -t binfmt_misc /proc/sys/fs/binfmt_misc 2>/dev/null || true
echo "QEMU registration: multiarch/qemu-user-static"
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes || true
# Final verification
echo "Checking binfmt_misc status:"
ls -la /proc/sys/fs/binfmt_misc/ 2>/dev/null || echo "Cannot access binfmt_misc directory"
if [ -f /proc/sys/fs/binfmt_misc/qemu-riscv64 ]; then
echo "qemu-riscv64 registration successful"
else
echo "qemu-riscv64 registration failed - proceeding without emulation"
echo "This may cause RISC-V builds to fail"
fi
RISCV_DOCKER_ARGS="--privileged"
else
RISCV_DOCKER_ARGS=
fi
# detached container should get cleaned up by teardown_ec2_linux
# Used for JENKINS_USER and DOCKER_SHELL_CMD, which can be empty
# shellcheck disable=SC2086
container_name=$(docker run \
${RISCV_DOCKER_ARGS} \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e PR_NUMBER \
@ -306,7 +339,6 @@ jobs:
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e USE_SPLIT_BUILD \
-e BUILD_ADDITIONAL_PACKAGES \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
@ -331,6 +363,26 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Build external packages
id: build-external-packages
if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped'
uses: ./.github/actions/build-external-packages
with:
build-targets: ${{ inputs.build-external-packages }}
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
output-dir: external
- name: Move external packages to dist
if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
shell: bash
run: |
src="${{ steps.build-external-packages.outputs.output_dir }}"
if [ -d "$src" ]; then
mkdir -p "dist/$(dirname "$src")"
mv "$src" "dist/$(dirname "$src")/"
fi
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash

View File

@ -96,7 +96,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@ -109,7 +109,7 @@ jobs:
no-sudo: true
- name: Setup Python
if: matrix.runner == 'B200'
if: contains(matrix.runner, 'b200')
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
@ -117,7 +117,7 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && !contains(matrix.runner, 'b200')
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
@ -128,7 +128,7 @@ jobs:
aws-region: us-east-1
- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
if: ${{ inputs.aws-role-to-assume != '' && contains(matrix.runner, 'b200') }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
@ -166,17 +166,17 @@ jobs:
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || contains(matrix.runner, 'b200')) }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && !contains(matrix.runner, 'b200') }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
@ -277,8 +277,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -403,7 +403,7 @@ jobs:
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Authenticate with AWS
if: ${{ matrix.runner == 'B200' }}
if: ${{ contains(matrix.runner, 'b200') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results

View File

@ -136,7 +136,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
"$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -88,6 +88,16 @@ jobs:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- name: Runner check GPU count (distributed jobs)
if: ${{ contains(matrix.config, 'distributed') }}
shell: bash
run: |
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
if [[ $ngpu -lt 4 ]]; then
echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs"
exit 1
fi
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0

View File

@ -51,21 +51,17 @@ jobs:
docker-image-name: [
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.9-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3,
pytorch-linux-noble-rocm-alpha-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12,
pytorch-linux-jammy-py3.9-gcc11,
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks,
@ -78,7 +74,8 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]
include:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11

View File

@ -60,7 +60,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -84,7 +83,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -108,7 +106,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
secrets:
@ -129,14 +126,13 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -156,7 +152,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64-12_9
secrets:
@ -176,7 +171,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -200,7 +194,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -224,7 +217,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
secrets:
@ -245,14 +237,13 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -272,7 +263,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64-12_9
secrets:
@ -292,7 +282,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -316,7 +305,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -340,7 +328,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
secrets:
@ -361,14 +348,13 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -388,7 +374,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64-12_9
secrets:
@ -408,7 +393,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -432,7 +416,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -456,7 +439,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
secrets:
@ -477,14 +459,13 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -504,7 +485,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64-12_9
secrets:
@ -524,7 +504,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -548,7 +527,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -572,7 +550,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-aarch64
secrets:
@ -593,14 +570,13 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -620,7 +596,6 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cuda-aarch64-12_9
secrets:
@ -640,7 +615,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
@ -664,7 +638,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
@ -688,7 +661,6 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-aarch64
secrets:
@ -709,14 +681,13 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
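The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above pack many PEP 508 requirement strings into a single setting, joined with " | " and each guarded by an environment marker. A minimal Python sketch of splitting and inspecting such a value — the helper name is invented and the third-party packaging library is an assumption, not necessarily what the build scripts themselves use:

from packaging.requirements import Requirement  # assumed dependency, used only for illustration

def split_extra_requirements(value: str) -> list[Requirement]:
    # Each chunk between " | " separators is one PEP 508 requirement with a marker.
    return [Requirement(chunk.strip()) for chunk in value.split("|") if chunk.strip()]

sample = (
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
for req in split_extra_requirements(sample):
    print(req.name, str(req.specifier), str(req.marker))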
@ -736,9 +707,230 @@ jobs:
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
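The three manywheel-py3_14-cpu-aarch64 jobs above form the usual build, test, upload chain: each later job lists its predecessors under needs:, so GitHub Actions only starts it once they succeed. A small sketch of resolving that ordering with a topological sort; the job names are taken from this workflow, while the sorting code itself is only illustrative:

from graphlib import TopologicalSorter

# Edges mirror the needs: keys of the jobs above (node -> its prerequisites).
needs = {
    "manywheel-py3_14-cpu-aarch64-build": ["get-label-type"],
    "manywheel-py3_14-cpu-aarch64-test": ["manywheel-py3_14-cpu-aarch64-build", "get-label-type"],
    "manywheel-py3_14-cpu-aarch64-upload": ["manywheel-py3_14-cpu-aarch64-test"],
}

print(list(TopologicalSorter(needs).static_order()))
# prints get-label-type first, then build, then test, then upload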
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml


@ -42,54 +42,7 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
manywheel-py3_9-cuda12_6-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_6-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-build:
manywheel-py3_12-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
@ -103,18 +56,17 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.9"
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
manywheel-py3_12-cuda12_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_8-build
- manywheel-py3_12-cuda12_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
@ -127,56 +79,8 @@ jobs:
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 builds need an sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_9-cuda12_9-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_9
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 builds need an sm_70+ runner

File diff suppressed because it is too large


@ -58,7 +58,6 @@ jobs:
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-rocm6_4
@ -83,7 +82,6 @@ jobs:
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
use_split_build: False
DESIRED_PYTHON: "3.9"
steps:
- name: Setup ROCm


@ -60,7 +60,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -84,7 +83,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -107,7 +105,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
secrets:
@ -127,7 +124,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -151,7 +147,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -174,7 +169,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
secrets:
@ -194,7 +188,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -218,7 +211,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -241,7 +233,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
secrets:
@ -261,7 +252,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -285,7 +275,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -308,7 +297,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
secrets:
@ -328,7 +316,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
ALPINE_IMAGE: "docker.io/s390x/alpine"
@ -352,7 +339,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
@ -375,7 +361,6 @@ jobs:
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
secrets:


@ -115,12 +115,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
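The case statement above derives the interpreter version, extra conda packages, and channels from DESIRED_PYTHON: free-threaded builds add python-freethreading, and 3.14 currently resolves to the 3.14.0rc1 release candidate pulled from the conda-forge python_rc label. A Python mirror of that mapping, purely illustrative — the function name and the idea of assembling the command in Python are not part of the workflow:

def conda_env_args(desired_python: str) -> tuple[str, list[str], list[str]]:
    # Returns (python version, extra packages, conda channels), mirroring the shell case above.
    if desired_python == "3.14t":
        return "3.14.0rc1", ["python-freethreading"], ["conda-forge/label/python_rc", "conda-forge"]
    if desired_python == "3.14":
        return "3.14.0rc1", [], ["conda-forge/label/python_rc", "conda-forge"]
    if desired_python == "3.13t":
        return "3.13", ["python-freethreading"], ["conda-forge"]
    return desired_python, [], []

version, packages, channels = conda_env_args("3.14t")
cmd = ["conda", "create", "-yn", "test_conda_env", f"python={version}", *packages]
for channel in channels:
    cmd += ["-c", channel]
print(" ".join(cmd))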
@ -239,12 +260,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -363,12 +405,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -487,12 +550,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -611,12 +695,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -735,12 +840,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -774,3 +900,293 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda manually; setup-miniconda alters PATH in a way that breaks the Ruby steps we run later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
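The build step above rewrites TORCH_PACKAGE_NAME with ${TORCH_PACKAGE_NAME//-/_}, turning dashes into underscores the way wheel filenames escape them, and the later pip uninstall still matches because pip compares distribution names after PEP 503 normalization. A short illustration with a hypothetical package name (the helper functions are invented for this sketch):

import re

def underscore_name(name: str) -> str:
    # Mirrors the shell substitution ${TORCH_PACKAGE_NAME//-/_}.
    return name.replace("-", "_")

def pep503_normalize(name: str) -> str:
    # Canonical form pip uses when comparing distribution names (PEP 503).
    return re.sub(r"[-_.]+", "-", name).lower()

name = "torch-example-pkg"  # hypothetical value, not the real TORCH_PACKAGE_NAME
print(underscore_name(name))                                              # torch_example_pkg
print(pep503_normalize(underscore_name(name)) == pep503_normalize(name))  # True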
wheel-py3_14-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14"
build_name: wheel-py3_14-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14t"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda manually; setup-miniconda alters PATH in a way that breaks the Ruby steps we run later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14t-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14t-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14t-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14t"
build_name: wheel-py3_14t-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
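The "Test PyTorch wheel" step above can be reproduced outside CI with a minimal local sketch. It assumes a working conda installation whose base environment is already activated, that the built wheel sits in a local ./dist directory (a hypothetical stand-in for PYTORCH_FINAL_PACKAGE_DIR), and that the PyTorch checkout lives in ./pytorch; the commands themselves mirror the run block above.

# Minimal local repro of the smoke-test step (paths are placeholders)
conda create -yn test_conda_env python="3.14.0rc1" -c conda-forge/label/python_rc -c conda-forge
conda activate test_conda_env
pip install ./dist/*.whl numpy -v
python ./pytorch/.ci/pytorch/smoke_test/smoke_test.py --package torchonly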


@@ -4,9 +4,12 @@ on:
pull_request:
paths:
- .github/workflows/h100-cutlass-backend.yml
- torch/_inductor/codegen/cuda/**
- test/inductor/test_cutlass_backend.py
- test/inductor/test_cutlass_evt.py
workflow_dispatch:
schedule:
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
- cron: 22 9,21 * * * # every 12 hours
push:
tags:
- ciflow/h100-cutlass-backend/*
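The schedule change above replaces a once-a-day cron with a twice-a-day one. As a rough sanity check (GitHub Actions evaluates cron in UTC; PDT is UTC-7):

# 22 9 * * *     fires at 09:22 UTC  ->  about 02:22 PDT, once per day (old line)
# 22 9,21 * * *  fires at 09:22 and 21:22 UTC  ->  about 02:22 and 14:22 PDT, i.e. every 12 hours (new line)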


@@ -0,0 +1,154 @@
name: inductor-perf-b200
on:
schedule:
- cron: 0 7 * * 1-6
- cron: 0 7 * * 0
# NB: GitHub has an upper limit of 10 inputs here, so until we sort that
# out, let's try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used by the benchmark
required: false
type: string
default: inductor_huggingface_perf_cuda_b200,inductor_timm_perf_cuda_b200,inductor_torchbench_perf_cuda_b200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Use a bigger runner here because CUDA arch 10.0 is only built for B200
# or newer GPUs, so the build doesn't benefit much from the existing
# compiler cache from trunk. Also use a memory-intensive runner here
# because memory is usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_timm_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
{ config: "inductor_torchbench_perf_cuda_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
selected-test-configs: ${{ inputs.benchmark_configs }}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit
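To make the long dashboard-tag template in the test job above concrete, here is a hedged expansion using the workflow_dispatch defaults declared at the top of this file (training, inference, and cudagraphs default to true, everything else to false, and cudagraphs_low_precision reuses the cudagraphs input):

# Expected dashboard-tag for a default workflow_dispatch run (illustrative only)
echo "training-true-inference-true-default-false-dynamic-false-cudagraphs-true-cppwrapper-false-aotinductor-false-maxautotune-false-freezing_cudagraphs-false-cudagraphs_low_precision-true"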


@@ -85,26 +85,26 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit


@@ -77,25 +77,25 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamo_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamo_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamo_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamo_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_torchbench", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit


@@ -47,8 +47,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit


@@ -93,7 +93,7 @@ jobs:
script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -111,9 +111,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:
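The split above can be reproduced locally by passing the same selector flags straight to lintrunner; this is a sketch that assumes lintrunner is installed and has been initialized with lintrunner init in the repo root:

# Type-checking leg only (mirrors the --take job above)
lintrunner --take MYPY,MYPYSTRICT --all-files
# Everything except the clang and mypy linters (mirrors lintrunner-noclang)
lintrunner --skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files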


@@ -51,37 +51,6 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_4-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.4-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_4-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.4-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_4-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_4-py3_10-gcc11-build:
name: linux-jammy-cuda12.4-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml


@@ -251,68 +251,6 @@ jobs:
build-environment: linux-jammy-py3.13-clang12
docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
timeout-minutes: 600
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed:
name: linux-jammy-cuda12.8-py3.10-gcc11-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed
- target-determination
with:
timeout-minutes: 360
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-build:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test:
name: linux-jammy-cuda12.8-py3.10-gcc11
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build
- target-determination
with:
timeout-minutes: 360
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build:
@@ -329,30 +267,6 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_10-gcc11-bazel-test:
name: linux-jammy-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
@@ -402,37 +316,6 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm89-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm89
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm89-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm89
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm89-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
@@ -459,31 +342,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-2025_1-py3_9-build:
name: linux-jammy-xpu-2025.1-py3.9
uses: ./.github/workflows/_linux-build.yml

.github/workflows/riscv64.yml (new file, 24 lines)

@@ -0,0 +1,24 @@
name: riscv64
on:
push:
tags:
- ciflow/riscv64/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
jobs:
pytorch-linux-noble-riscv64-py3_12-gcc14-cross-build:
if: github.repository_owner == 'pytorch'
name: pytorch-linux-noble-riscv64-py3_12-gcc14-cross-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-noble-riscv64-py3.12-gcc14
docker-image-name: pytorch-linux-noble-riscv64-py3.12-gcc14
runner: linux.2xlarge
secrets: inherit
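Per the on: block above, this workflow runs only for ciflow/riscv64/* tag pushes or a manual dispatch. Two hedged ways to trigger it (the tag number and gh invocation are illustrative, not prescribed by the file):

# Manual dispatch via the GitHub CLI (requires gh auth and repo access)
gh workflow run riscv64.yml
# Or push a ciflow tag for the current commit (12345 is a placeholder)
git push origin HEAD:refs/tags/ciflow/riscv64/12345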


@@ -48,12 +48,12 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.2" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit

.github/workflows/tools-unit-tests.yml (new file, 70 lines)

@@ -0,0 +1,70 @@
name: test-scripts-and-ci-tools
on:
push:
branches:
- main
paths:
- scripts/lumen_cli/**
- .github/workflows/tools-unit-tests.yml
pull_request:
paths:
- scripts/lumen_cli/**
- .github/workflows/tools-unit-tests.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
lumen-cli-unit-tests-python312:
permissions:
contents: read
pull-requests: write
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-latest
steps:
- name: Checkout pytorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: true
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
cache: pip
- name: Run tests
continue-on-error: true
run: |
set -ex
python3 -m venv /tmp/venv
source /tmp/venv/bin/activate
pip install -e .ci/lumen_cli/
pytest -v -s .ci/lumen_cli/tests/*
lumen-cli-compatible-python39:
permissions:
contents: read
pull-requests: write
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-latest
steps:
- name: Checkout pytorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: true
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.9'
cache: 'pip'
- name: Run tests
continue-on-error: true
run: |
set -ex
python3 -m venv /tmp/venv
source /tmp/venv/bin/activate
pip install -e .ci/lumen_cli/

Some files were not shown because too many files have changed in this diff.