Revert "[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 )"

This reverts commit 09cb34c1dce8fe1b880bbf3115d8ddad3401d871.
[CI] Install libuv for Win testing (#163797 )
2025-10-25 16:14:55 +08:00 · 2025-09-24 18:18:00 -07:00 · 2025-09-25 01:10:14 +00:00 · 2025-09-25 00:58:17 +00:00 · 2025-09-25 00:14:31 +00:00 · 2025-09-24 23:59:19 +00:00
480 changed files with 17394 additions and 11734 deletions
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
-        except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203
+        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)
@ -1004,7 +1004,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

-    python_version = args.python_version if args.python_version is not None else "3.9"
+    python_version = args.python_version if args.python_version is not None else "3.10"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -262,13 +262,10 @@ case "$tag" in
    TRITON_CPU=yes
    ;;
  pytorch-linux-jammy-linter)
-    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
-    # We will need to update mypy version eventually, but that's for another day. The task
-    # would be to upgrade mypy to 1.0.0 with Python 3.11
-    PYTHON_VERSION=3.9
+    PYTHON_VERSION=3.10
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
-    PYTHON_VERSION=3.9
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
+    PYTHON_VERSION=3.10
    CUDA_VERSION=12.8.1
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-5ae38bdb0dc066c5823e34dc9797afb9de42c866
+bbb06c0334a6772b92d24bde54956e675c8c6604
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:

-mypy==1.16.0
+mypy==1.16.0 ; platform_system != "Windows"
 # Pin MyPy version because new errors are likely to appear with each release
+# Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
@ -111,8 +112,6 @@ ninja==1.11.1.3
 #Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

-numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
-numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
@ -133,7 +132,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
+numpy==1.22.4; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13"

@ -325,8 +324,6 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting

-# Python-3.9 binaries
-
 PyGithub==2.3.0

 sympy==1.13.3
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,24 +1,23 @@
-sphinx==6.2.1
+sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 7.2.6
-
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-
-sphinx-remove-toctrees==1.0.0.post1
-#Description: This is used to generate PyTorch docs
-#Pinned versions: 1.0.0.post1
+#Pinned versions: 5.3.0
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.

-sphinxcontrib.katex==0.9.10
+sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 0.9.10
+#Pinned versions: 0.8.6

-sphinx_sitemap==2.7.1
+sphinxext-opengraph==0.9.1
+#Description: This is used to generate PyTorch docs
+#Pinned versions: 0.9.1
+
+sphinx_sitemap==2.6.0
 #Description: This is used to generate sitemap for PyTorch docs
-#Pinned versions: 2.7.1
+#Pinned versions: 2.6.0

 matplotlib==3.5.3 ; python_version < "3.13"
 matplotlib==3.6.3 ; python_version >= "3.13"
@ -30,17 +29,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0

-breathe==4.35.0
+breathe==4.34.0
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 4.35.0
+#Pinned versions: 4.34.0

-exhale==0.3.7
+exhale==0.2.3
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.3.7
+#Pinned versions: 0.2.3

-docutils==0.18.1
+docutils==0.16
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.18.1
+#Pinned versions: 0.16

 bs4==0.0.1
 #Description: This is used to generate PyTorch C++ docs
@ -50,24 +49,13 @@ IPython==8.12.0
 #Description: This is used to generate PyTorch functorch docs
 #Pinned versions: 8.12.0

-<<<<<<< HEAD
 myst-nb==0.17.2
 #Description: This is used to generate PyTorch functorch and torch.compile docs.
 #Pinned versions: 0.17.2
-=======
-myst-nb==1.2.0
-#Description: This is used to generate PyTorch functorch docs
-#Pinned versions: 1.2.0
->>>>>>> 195382ce28e (Update)

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
-sphinx-copybutton==0.5.2
-sphinx-design==0.6.1
+sphinx-copybutton==0.5.0
+sphinx-design==0.4.0
 sphinxcontrib-mermaid==1.0.0
-<<<<<<< HEAD
 myst-parser==0.18.1
-=======
-myst-parser==3.0.1
-myst-nb
->>>>>>> 195382ce28e (Update)
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -41,7 +41,6 @@ def sample_vllm_test_library():
                "pytest -v -s basic_correctness/test_cumem.py",
                "pytest -v -s basic_correctness/test_basic_correctness.py",
                "pytest -v -s basic_correctness/test_cpu_offload.py",
-                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
            ],
        },
        "vllm_basic_models_test": {
@ -68,15 +67,12 @@ def sample_vllm_test_library():
                        "-v",
                        "-s",
                        "entrypoints/llm",
-                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
                        "--ignore=entrypoints/llm/test_generate.py",
-                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
                        "--ignore=entrypoints/llm/test_collective_rpc.py",
                    ]
                ),
-                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
-                "pytest -v -s entrypoints/llm/test_generate.py ",
-                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
+                "pytest -v -s entrypoints/llm/test_generate.py",
+                "pytest -v -s entrypoints/offline_mode",
            ],
        },
        "vllm_regression_test": {
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -35,10 +35,11 @@ fi

 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
+  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
-  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
-  # backends (specifically the gloo backend), so test that this case works too
+  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
+  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
 fi
 popd

-python -mpip install -r requirements.txt
-
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1

-python -mpip install --no-input -r requirements.txt
-
 setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@ -59,7 +55,7 @@ test_python_shard() {

  setup_test_python

-  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"
+  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"

  assert_git_not_dirty
 }
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -83,10 +83,6 @@ rm -rf pytorch || true
 pushd "$pt_checkout"
 pushd docs

-# Profile the docs build to see what is taking the longest
-python -m cProfile -o docs_build.prof -m sphinx.cmd.build -b html -d build/doctrees source build/html
-python -c "import pstats; p = pstats.Stats('docs_build.prof'); p.sort_stats('cumtime').print_stats(50)"
-
 # Build the docs
 if [ "$is_main_doc" = true ]; then
  build_docs html || exit $?
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -322,23 +322,29 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
 }

 test_python() {
  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
 }

 test_python_smoke() {
-  # Smoke tests for H100
+  # Smoke tests for H100/B200
  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

+test_python_smoke_b200() {
+  # Targeted smoke tests for B200 - staged approach to avoid too many failures
+  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  assert_git_not_dirty
+}
+
 test_h100_distributed() {
  # Distributed tests at H100
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -384,6 +390,7 @@ test_dynamo_wrapped_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
+    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
@ -1156,6 +1163,12 @@ test_distributed() {
  fi
 }

+test_quantization() {
+  echo "Testing quantization"
+
+  python test/test_quantization.py
+}
+
 test_rpc() {
  echo "Testing RPC C++ tests"
  # NB: the ending test_rpc must match the current function name for the current
@ -1573,7 +1586,7 @@ test_executorch() {
 test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
+        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
        distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

@ -1649,6 +1662,8 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
+elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
+  test_quantization
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
@ -1773,6 +1788,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  test_xpu_bin
 elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
+elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
+  test_python_smoke_b200
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
 elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -137,7 +137,7 @@ sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
  if "%BUILD_ENVIRONMENT%"=="" (
-    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
+    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
  ) else (
    copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

--- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
 ) else (
  set CONDA_PARENT_DIR=C:\Jenkins
 )
-
+set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3

 :: Be conservative here when rolling out the new AMI with conda. This will try
 :: to install conda as before if it couldn't find the conda installation. This
 :: can be removed eventually after we gain enough confidence in the AMI
-if not exist %CONDA_PARENT_DIR%\Miniconda3 (
+if not exist %CONDA_ROOT_DIR% (
  set INSTALL_FRESH_CONDA=1
 )

@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b

-  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
+  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b
 )

 :: Activate conda so that we can use its commands, i.e. conda, python, pip
-call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
+call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
+:: Activate the py_tmp environment so that the commands below run inside it
+call conda activate py_tmp
+
+call pip install -r .ci/docker/requirements-ci.txt
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
 :: build\torch. Rather than changing all these references, making a copy of torch folder
 :: from conda to the current workspace is easier. The workspace will be cleaned up after
 :: the job anyway
-xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\

 pushd .
 if "%VC_VERSION%" == "" (
--- a/.ci/pytorch/win-test-helpers/test_python_shard.bat
+++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat
@ -25,7 +25,7 @@ echo Copying over test times file
 robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"

 echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
 if ERRORLEVEL 1 goto fail

 popd
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+
+# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
+# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
+# scipy from 1.6.3 to 1.10
+# expecttest from 0.1.3 to 0.3.0
+# xdoctest from 1.0.2 to 1.3.0
+python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"

 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.15.1.0
@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0

-# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
-python -m pip install expecttest==0.3.0
-
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -177,8 +177,7 @@ source ~/${desired_python}-build/bin/activate
 retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
 retry brew install libomp

-# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
-# is build as part of tensorpipe submodule
+# For USE_DISTRIBUTED=1 on macOS, we need libuv, which is built as part of the tensorpipe submodule
 export USE_DISTRIBUTED=1

 export USE_MKLDNN=OFF
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -6,6 +6,12 @@ inputs:
  cuda-version:
    description: which cuda version to install, 'cpu' for none
    required: true
+  python-version:
+    required: false
+    type: string
+    default: "3.10"
+    description: |
+      The python version to be used. Will be 3.10 by default

 runs:
  using: composite
@ -38,18 +44,24 @@ runs:
        CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"

        {
+          echo "CONDA=${CONDA}";
          echo "CONDA_RUN=${CONDA} run --no-capture-output";
          echo "CONDA_BUILD=${CONDA} run conda-build";
          echo "CONDA_INSTALL=${CONDA} install";
        } >> "${GITHUB_ENV}"

    - name: Setup Python3
+      env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
      shell: bash
      run: |
        set +e
        set -x

-        PYTHON3=$(${CONDA_RUN} which python3)
+        # Create new py_tmp env with python-version
+        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
+
+        PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
        EXIT_CODE=$?

        if [[ "${EXIT_CODE}" == "0" ]]; then
@ -62,7 +74,7 @@ runs:
          # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
          # is also the Miniconda installation that is Python 2 based, and both can be installed if
          # needed. In both cases, Python binary is just called python
-          PYTHON=$(${CONDA_RUN} which python)
+          PYTHON=$(${CONDA_RUN} -n py_tmp which python)
          EXIT_CODE=$?

          if [[ "${EXIT_CODE}" == "0" ]]; then
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26
+1983609239caaab24ab1ed2bfa2aa92e8c76c1b1
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -525,6 +525,21 @@
  - Lint
  - pull

+- name: typechecking
+  patterns:
+  - 'pyrefly.toml'
+  - 'mypy.ini'
+  - 'mypy-strict.ini'
+  approved_by:
+  - lolpack
+  - maggiemoss
+  - ndmitchell
+  - kinto0
+  mandatory_checks_name:
+  - EasyCLA
+  - Lint
+  - pull
+
 - name: superuser
  patterns:
  - '*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -19,6 +19,7 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/periodic
 - ciflow/periodic-rocm-mi300
+- ciflow/quantization-periodic
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
@ -36,6 +37,7 @@ ciflow_push_tags:
 - ciflow/win-arm64
 - ciflow/h100-symm-mem
 - ciflow/h100-cutlass-backend
+- ciflow/b200
 retryable_workflows:
 - pull
 - trunk
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
            arches=["6.4"],
-            python_versions=["3.9"],
+            python_versions=["3.10"],
        ),
        ciflow_config=CIFlowConfig(
            labels={
@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
-            arches=["12.8"],
+            arches=["13.0"],
            python_versions=["3.12"],
        ),
        branches="main",
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -71,12 +71,15 @@ jobs:
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- elif "s390x" in build_environment %}
      runs_on: linux.s390x
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      timeout-minutes: 420
+      {%- elif config["gpu_arch_type"] == "rocm" %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.24xlarge.ephemeral
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -187,8 +187,6 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        with:
-          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@ -74,11 +74,7 @@ jobs:
          - docs_type: python
            runner: ${{ inputs.runner_prefix }}linux.2xlarge
            # It takes less than 30m to finish python docs unless there are issues
-            timeout-minutes: 60
-          - docs_type: functorch
-            runner: ${{ inputs.runner_prefix }}linux.2xlarge
-            # It takes less than 15m to finish functorch docs unless there are issues
-            timeout-minutes: 15
+            timeout-minutes: 30
    # Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
    # The current name requires updating the database last docs push query from test-infra every time the matrix is updated
    name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
--- a/.github/workflows/_get-changed-files.yml
+++ b/.github/workflows/_get-changed-files.yml
@ -2,6 +2,12 @@ name: Get Changed Files

 on:
  workflow_call:
+    inputs:
+      all_files:
+        description: "Whether to return all files instead of just changed files"
+        required: false
+        type: boolean
+        default: false
    outputs:
      changed-files:
        description: "List of changed files (space-separated) or '*' if not in a PR"
@ -26,17 +32,23 @@ jobs:
            # Get the PR number from the github context
            PR_NUMBER="${{ github.event.number }}"

-            # Use gh CLI to get changed files in the PR with explicit repo
-            CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
+            # Check if all_files is requested
+            if [ "${{ inputs.all_files }}" = "true" ]; then
+              echo "all_files input is true, returning all files"
+              echo "changed-files=*" >> "$GITHUB_OUTPUT"
+            else
+              # Use gh CLI to get changed files in the PR with explicit repo
+              CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

-            if [ -z "$CHANGED_FILES" ]; then
-              echo "No changed files found, setting to '*'"
-              CHANGED_FILES="*"
+              if [ -z "$CHANGED_FILES" ]; then
+                echo "No changed files found, setting to '*'"
+                CHANGED_FILES="*"
+              fi
+
+              echo "Changed files: $CHANGED_FILES"
+              echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
            fi

-            echo "Changed files: $CHANGED_FILES"
-            echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
-
          else
            echo "Not in PR context, setting changed files to '*'"
            echo "changed-files=*" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -151,7 +151,7 @@ jobs:
          BUILD_WHEEL: 1
          MAX_JOBS: 8
          CUDA_VERSION: ${{ inputs.cuda-version }}
-          PYTHON_VERSION: "3.9"
+          PYTHON_VERSION: "3.10"
          SCCACHE_BUCKET: "ossci-compiler-cache"
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SCCACHE_REGION: us-east-1
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -184,7 +184,7 @@ jobs:
        env:
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          INSTALL_WINDOWS_SDK: 1
-          PYTHON_VERSION: 3.9
+          PYTHON_VERSION: "3.10"
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -50,7 +50,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+        py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
        device: ["cuda", "rocm", "xpu", "aarch64"]
        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
@ -108,9 +108,6 @@ jobs:

          # Determine python executable for given version
          case $PY_VERS in
-          3.9)
-            PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
-            ;;
          3.10)
            PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
            ;;
@ -194,7 +191,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+        py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
        device: ["xpu"]
    timeout-minutes: 40
    env:
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -70,7 +70,7 @@ jobs:
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
+          pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
          pytorch-linux-jammy-py3-clang12-executorch,
          pytorch-linux-jammy-py3.12-triton-cpu,
          pytorch-linux-noble-riscv64-py3.12-gcc14
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -62,7 +62,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -128,7 +128,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -174,7 +174,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -220,7 +220,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -265,7 +265,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -331,7 +331,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -377,7 +377,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -423,7 +423,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -468,7 +468,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -534,7 +534,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -580,7 +580,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -626,7 +626,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -671,7 +671,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -737,7 +737,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -783,7 +783,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -829,7 +829,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -874,7 +874,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -940,7 +940,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -986,7 +986,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -1032,7 +1032,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -1077,7 +1077,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -1143,7 +1143,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -1189,7 +1189,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -1235,7 +1235,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -1280,7 +1280,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -1346,7 +1346,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -1392,7 +1392,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -1438,7 +1438,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      runs_on: linux.arm64.r7g.12xlarge.memory
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -333,6 +333,7 @@ jobs:
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: libtorch-rocm6_3-shared-with-deps-release
      build_environment: linux-binary-libtorch
    secrets:
@ -447,6 +448,7 @@ jobs:
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: libtorch-rocm6_4-shared-with-deps-release
      build_environment: linux-binary-libtorch
    secrets:
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -42,7 +42,7 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_12-cuda12_8-build:
+  manywheel-py3_12-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
@ -51,22 +51,22 @@ jobs:
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      DESIRED_CUDA: cu130
+      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
+      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_12-cuda12_8
+      build_name: manywheel-py3_12-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_12-cuda12_8-test:  # Testing
+  manywheel-py3_12-cuda13_0-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - manywheel-py3_12-cuda12_8-build
+      - manywheel-py3_12-cuda13_0-build
      - get-label-type
    uses: ./.github/workflows/_binary-test-linux.yml
    with:
@ -74,13 +74,13 @@ jobs:
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      DESIRED_CUDA: cu130
+      GPU_ARCH_VERSION: "13.0"
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
+      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-cuda12_8
+      build_name: manywheel-py3_12-cuda13_0
      build_environment: linux-binary-manywheel
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -323,6 +323,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -434,6 +435,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -915,6 +917,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_11-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -1026,6 +1029,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_11-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -1507,6 +1511,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_12-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -1618,6 +1623,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_12-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -2099,6 +2105,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -2210,6 +2217,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -2691,6 +2699,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13t-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -2802,6 +2811,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13t-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -3283,6 +3293,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -3394,6 +3405,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -3875,6 +3887,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14t-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -3986,6 +3999,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14t-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -44,7 +44,7 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_9-rocm6_4-build:
+  manywheel-py3_10-rocm6_4-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
@ -58,16 +58,17 @@ jobs:
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-rocm6_4
+      timeout-minutes: 300
+      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel-rocm
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-rocm6_4-test:  # Testing
+  manywheel-py3_10-rocm6_4-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - manywheel-py3_9-rocm6_4-build
+      - manywheel-py3_10-rocm6_4-build
      - get-label-type
    runs-on: linux.rocm.gpu.mi250
    timeout-minutes: 240
@ -82,14 +83,14 @@ jobs:
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
    steps:
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
-          name: manywheel-py3_9-rocm6_4
+          name: manywheel-py3_10-rocm6_4
          path: "${{ runner.temp }}/artifacts/"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -31,6 +31,8 @@ jobs:
    if: github.repository_owner == 'pytorch'
    name: Get changed files
    uses: ./.github/workflows/_get-changed-files.yml
+    with:
+      all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}

  lintrunner-clang:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -53,7 +55,7 @@ jobs:
    with:
      timeout: 120
      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
-      docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
+      docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
      # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
      # to run git rev-parse HEAD~:.ci/docker when a new image is needed
      fetch-depth: 0
@ -264,10 +266,10 @@ jobs:
        with:
          submodules: false
          fetch-depth: 1
-      - name: Setup Python 3.9
+      - name: Setup Python 3.10
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
-          python-version: '3.9'
+          python-version: '3.10'
          architecture: x64
          cache: pip
      - name: Install dependencies
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -127,8 +127,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      # More memory is needed to build with asan
-      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/quantization-periodic.yml
+++ b/.github/workflows/quantization-periodic.yml
@ -0,0 +1,54 @@
+name: quantization-periodic
+
+on:
+  push:
+    tags:
+      - ciflow/quantization-periodic/*
+  workflow_dispatch:
+  schedule:
+    # run weekly
+    - cron: "45 0 * * 0"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  get-default-label-prefix:
+    name: get-default-label-prefix
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  periodic-quantization-build:
+    name: periodic-quantization-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-default-label-prefix
+    with:
+      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '8.9'
+      test-matrix: |
+        { include: [
+          { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+        ]}
+    secrets: inherit
+  periodic-test-quantization:
+    name: periodic-test-quantization
+    uses: ./.github/workflows/_linux-test.yml
+    needs: periodic-quantization-build
+    with:
+      build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
+      test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -140,8 +140,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      # More memory is needed to build with asan
-      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -0,0 +1,76 @@
+# B200 Smoke Tests CI Workflow
+#
+# This workflow runs smoke tests on B200 hardware
+#
+# Flow:
+# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
+# 2. Runs smoke tests on linux.dgx.b200 runner
+# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke_b200() function
+#
+# Triggered by:
+# - Pull requests modifying this workflow file
+# - Manual dispatch
+# - Schedule (every 6 hours)
+# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
+
+name: B200 Smoke Tests
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/test-b200.yml
+  workflow_dispatch:
+  schedule:
+    - cron: 0 4,10,16,22 * * *  # every 6 hours
+  push:
+    tags:
+      - ciflow/b200/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner: linux.12xlarge.memory
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '10.0'
+      test-matrix: |
+        { include: [
+          { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
+        ]}
+      # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
+    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit
--- a/.github/workflows/unstable.yml
+++ b/.github/workflows/unstable.yml
@ -53,27 +53,3 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-py3_9-clang9-xla-build:
-    name: linux-jammy-py3_9-clang9-xla
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.9-clang9-xla
-      docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
-      test-matrix: |
-        { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-py3_9-clang9-xla-test:
-    name: linux-jammy-py3_9-clang9-xla
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-py3_9-clang9-xla-build
-    with:
-      build-environment: linux-jammy-py3.9-clang9-xla
-      docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -49,7 +49,7 @@ init_command = [
    'mccabe==0.7.0',
    'pycodestyle==2.14.0',
    'pyflakes==3.4.0',
-    'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
+    'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
 ]


@ -123,6 +123,7 @@ is_formatter = true
 code = 'MYPY'
 include_patterns = [
    'setup.py',
+    'functorch/dim/**/*.py',
    'torch/**/*.py',
    'torch/**/*.pyi',
    'caffe2/**/*.py',
@ -152,7 +153,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
+    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
    'numpy==2.1.0 ; python_version >= "3.12"',
    'expecttest==0.3.0',
    'mypy==1.16.0',
@ -195,6 +196,7 @@ exclude_patterns = [
    'tools/test/gen_operators_yaml_test.py',
    'tools/test/gen_oplist_test.py',
    'tools/test/test_selective_build.py',
+    'tools/experimental/dynamic_shapes/torchfuzz/**',
 ]
 command = [
    'python3',
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -22,6 +22,7 @@ COMMON_COPTS = [
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
+    "-DUSE_DISTRIBUTED",
    "-DAT_PER_OPERATOR_HEADERS",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
@ -810,7 +811,7 @@ cc_library(
    name = "torch_python",
    srcs = libtorch_python_core_sources
        + if_cuda(libtorch_python_cuda_sources)
-        + libtorch_python_distributed_sources
+        + if_cuda(libtorch_python_distributed_sources)
        + GENERATED_AUTOGRAD_PYTHON,
    hdrs = glob([
        "torch/csrc/generic/*.cpp",
@ -832,36 +833,6 @@ pybind_extension(
    ],
 )

-cc_library(
-    name = "functorch",
-    hdrs = glob([
-        "functorch/csrc/dim/*.h",
-    ]),
-    srcs = glob([
-        "functorch/csrc/dim/*.cpp",
-    ]),
-    deps = [
-        ":aten_nvrtc",
-        ":torch_python",
-        "@pybind11",
-    ],
-)
-
-pybind_extension(
-    name = "functorch/_C",
-    copts=[
-        "-DTORCH_EXTENSION_NAME=_C"
-    ],
-    srcs = [
-        "functorch/csrc/init_dim_only.cpp",
-    ],
-    deps = [
-        ":functorch",
-        ":torch_python",
-        ":aten_nvrtc",
-    ],
-)
-
 cc_binary(
    name = "torch/bin/torch_shm_manager",
    srcs = [
@ -902,7 +873,6 @@ py_library(
    ],
    data = [
        ":torch/_C.so",
-        ":functorch/_C.so",
        ":torch/bin/torch_shm_manager",
    ],
 )
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,4 @@
 cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
-# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)

 # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
 # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -181,9 +180,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
  set(CPU_POWER ON)
 endif()

-# For non-supported platforms, turn USE_DISTRIBUTED off by default.
-# NB: USE_DISTRIBUTED simply disables the backend; distributed code
-# still gets built
+# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
+# tested and likely won't work without additional changes.
 if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED
      OFF
@ -263,11 +261,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
 option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
 cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
-option(USE_DISTRIBUTED "Enable default distributed backends" ON)
+option(USE_DISTRIBUTED "Use distributed" ON)
 cmake_dependent_option(USE_NCCL "Use NCCL" ON
                       "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(USE_XCCL "Use XCCL" ON
-                       "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
+                       "USE_XPU;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
 cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
 cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -439,10 +437,11 @@ if(WIN32)
      PATH_SUFFIXES lib
      NO_DEFAULT_PATH)
    if(NOT libuv_tmp_LIBRARY)
+      set(USE_DISTRIBUTED OFF)
      set(USE_GLOO OFF)
      message(
        WARNING
-          "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
+          "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
          "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
      )
    else()
@ -1391,10 +1390,6 @@ endif()
 include(cmake/Summary.cmake)
 caffe2_print_configuration_summary()

-if(BUILD_FUNCTORCH)
-  add_subdirectory(functorch)
-endif()
-
 # Parse custom debug info
 if(DEFINED USE_CUSTOM_DEBINFO)
  string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1486,4 +1481,4 @@ else()
    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
    ]])
  endif()
-endif()
+endif()
--- a/README.md
+++ b/README.md
@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)

 #### Prerequisites
 If you are installing from source, you will need:
- Python 3.9 or later
+- Python 3.10 or later
 - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
 - Visual Studio or Visual Studio Build Tool (Windows only)

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -317,10 +317,20 @@ IF(USE_FBGEMM_GENAI)
        -greedy-reverse-local-assignment=1
        -fhip-new-launch-api)

+      # Only compile for gfx942 for now.
+      # This is rather hacky, I could not figure out a clean solution :(
+      set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
+      string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
+      if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
+        list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
+      endif()
+      set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
+
      hip_add_library(
        fbgemm_genai STATIC
        ${fbgemm_genai_native_rocm_hip}
        HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
+      set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
      set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
      target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

--- a/aten/src/ATen/DLConvertor.cpp
+++ b/aten/src/ATen/DLConvertor.cpp
@ -401,30 +401,13 @@ T* toDLPackImpl(const Tensor& src) {
  // The following code detects whether the src follows
  // a continuous pattern. If the src follows such pattern (common-case)
  // then we do not need to normalize the strides.
-  bool need_normalize_strides = false;
-  int64_t expected_stride = 1;
-  for (int i = src.dim() - 1; i >= 0; i--) {
-    // detect if we do not meet continuous pattern
-    // and the size is 1, so there is opportunity to normalize
-    if (src.stride(i) != expected_stride && src.size(i) == 1) {
-      need_normalize_strides = true;
-      break;
-    }
-    expected_stride *= src.size(i);
-  }
-
+  bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
  // less common case, try normalizing the strides
  if (need_normalize_strides) {
    // create a new tensor with possibly normalized strides
    // gh-83069
    auto shape = src.sizes();
-    auto strides = src.strides().vec();
-    for (int i = 0; i < src.dim(); i++) {
-      if (shape[i] < 2) {
-        strides[i] = 1;
-      }
-    }
-    view = src.as_strided(shape, strides, src.storage_offset());
+    view = src.as_strided(shape, {1}, src.storage_offset());
  }

  ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1637,9 +1637,7 @@ bool gemm_and_bias(
  if (activation == GEMMAndBiasActivationEpilogue::RELU) {
    epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
  } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
-#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
    epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
-#endif
  }

  if (bias != nullptr) {
@ -1931,7 +1929,6 @@ void scaled_gemm(
    bool use_fast_accum) {
  // Note: see `cublasCommonArgs` for various non-intuitive manupulations
  // of input arguments to this function.
-#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
  const auto computeType = CUBLAS_COMPUTE_32F;
  const auto scaleType = CUDA_R_32F;
  const float alpha_val = 1.0;
@ -2133,8 +2130,6 @@ void scaled_gemm(
      " scaleType ",
      scaleType);
  return;
-#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
-  TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
 }

 void int8_gemm(
--- a/aten/src/ATen/native/Fill.cpp
+++ b/aten/src/ATen/native/Fill.cpp
@ -97,43 +97,38 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
  int64_t nDims = self.dim();
  TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");

-  int64_t height = self.size(0);
-  int64_t width = self.size(1);
+  auto height = self.sym_size(0);
+  auto width = self.sym_size(1);

  if (nDims > 2) {
-    int64_t dim1 = height;
    for (const auto i : c10::irange(1, nDims)) {
-      if (self.size(i) != dim1) {
+      if (self.sym_size(i) != height) {
        TORCH_CHECK(false, "all dimensions of input must be of equal length");
      }
    }
  }

-  int64_t storage_offset = self.storage_offset();
-  std::vector<int64_t> sizes;
-  std::vector<int64_t> strides;
-  int64_t size = std::min(height, width);
+  auto storage_offset = self.sym_storage_offset();
+  auto size = std::min(height, width);

  int64_t stride = 0;
  for (const auto i : c10::irange(nDims)) {
    stride += self.stride(i);
  }
-  strides.push_back(stride);
-  sizes.push_back(size);
+  std::vector<SymInt> strides{stride};
+  std::vector<SymInt> sizes{size};

-  auto main_diag = self.as_strided(sizes, strides, storage_offset);
+  auto main_diag = self.as_strided_symint(sizes, strides, storage_offset);
  main_diag.fill_(fill_value);

  if (wrap && nDims == 2 && height > width + 1) {
-    std::vector<int64_t> wrap_sizes;
+    auto step = width + 1;
+    auto wrap_size = ((self.numel() + step - 1) / step) - size;
+    std::vector<SymInt> wrap_sizes{wrap_size};

-    int64_t step = width + 1;
-    int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
-    wrap_sizes.push_back(wrap_size);
+    auto offset = self.stride(0) * (width + 1);

-    int64_t offset = self.stride(0) * (width + 1);
-
-    auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
+    auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset);
    wrap_diag.fill_(fill_value);
  }

--- a/aten/src/ATen/native/MaxUnpooling.cpp
+++ b/aten/src/ATen/native/MaxUnpooling.cpp
@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
  // Nondeterministic with duplicate indices
  at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");

-  auto oheight = output_size[0];
-  auto owidth = output_size[1];
  TORCH_CHECK(
      indices_.scalar_type() == at::ScalarType::Long,
      "elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu(
                self_.sizes(), " with dimension ", i , " being empty.");
  }

+  auto oheight = output_size[0];
+  auto owidth = output_size[1];
+
  auto memory_format = self_.suggest_memory_format();
  auto self = self_.contiguous(memory_format);
  auto indices = indices_.contiguous(memory_format);
--- a/aten/src/ATen/native/PadNd.cpp
+++ b/aten/src/ATen/native/PadNd.cpp
@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
    for (const auto i : c10::irange((size_t)l_pad)) {
        auto pad_idx = pad.size() - ((i + 1) * 2);
        auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
-        TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
+        TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
                 pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
                 "which is invalid. Check dimension ", l_diff + i, " of your input.");
        new_shape.emplace_back(new_dim);
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@ -1,3 +1,5 @@
+#include <ATen/core/ATen_fwd.h>
+#include <c10/core/ScalarType.h>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
@ -1878,19 +1880,18 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {

  Tensor xtensor = self.expand(padded_size);

-  Tensor result;
+  Tensor urtensor;
  if (self.is_quantized()) {
-    result = at::empty_quantized(target_size, self);
+    urtensor = at::empty_quantized(target_size, self);
  } else {
-    result = at::empty(target_size, self.options());
+    urtensor = at::empty(target_size, self.options());
  }

  // return an empty tensor if one of the repeat dimensions is zero
  if (zero_tensor) {
-    return result;
+    return urtensor;
  }

-  Tensor urtensor = at::alias(result);
  for (const auto i : c10::irange(xtensor.dim())) {
    // can't unfold with step 0, so make sure step is at least 1
    // (it doesn't matter what it is in that case, because the size is 0).
@ -1900,7 +1901,22 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {

  urtensor.copy_(xtensor.expand_as(urtensor));

-  return result;
+  // Combine the dimensions to produce the target_size.
+  // xtensor dims: [a0, ..., ad-1]
+  // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
+  // b dims are produced by unfold.
+  // Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
+  const int64_t n_dims = xtensor.dim();
+  auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
+  auto range_b = range_a + n_dims;
+  auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
+  auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
+  // Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
+  urtensor = urtensor.permute(permutation);
+  // Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
+  urtensor = urtensor.reshape(target_size);
+
+  return urtensor;
 }

 Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
      dtypes[i] = iter.dtype(i);
    }
    auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
+#ifdef USE_ROCM
+    constexpr int grp_sz = 128;
+    launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
+      if (unrl) {
+        auto offsets0 = offset_calc.get(idx);
+        auto offsets1 = offset_calc.get(idx + grp_sz);
+        auto offsets2 = offset_calc.get(idx + grp_sz * 2);
+        auto offsets3 = offset_calc.get(idx + grp_sz * 3);
+        void* out0 = data[0] + offsets0[0];
+        void* out1 = data[0] + offsets1[0];
+        void* out2 = data[0] + offsets2[0];
+        void* out3 = data[0] + offsets3[0];
+        arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
+        arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
+        arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
+        arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
+        c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
+        c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
+        c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
+        c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
+      } else {
+        auto offsets = offset_calc.get(idx);
+        void* out = data[0] + offsets[0];
+        arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
+        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
+      }
+    });
+#else
    launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
      auto offsets = offset_calc.get(idx);
      void* out = data[0] + offsets[0];
      arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
      c10::cast_and_store<arg0_t>(dtypes[0], out, result);
    });
+#endif
  }
 }

--- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
      Op<opmath_t>(),
      alpha.to<opmath_t>());

-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
          /* res_arg_index */ 1>(),
      Op<opmath_t>(),
      scalar.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
  }

  tensor_lists.emplace_back(tensors.vec());
-  tensor_lists.emplace_back(vec_res);
+  tensor_lists.emplace_back(std::move(vec_res));

  using opmath_t = at::opmath_type<T>;
  multi_tensor_apply<2, opmath_t>(
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
          /* res_arg_index */ 1>(),

      Op<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
      Op<opmath_t>(),
      scalar.data_ptr<T>(),
      alpha.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
            scalar.to<opmath_t>());
      });

-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }

 template <template <class> class Op>
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
            Op<opmath_t>());
      });

-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }

 #define FOREACH_POINTWISE_OP_SCALAR(NAME, OP)                           \
--- a/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
    vec_res.emplace_back(at::native::empty_like(t));
  }
  std::vector<std::vector<at::Tensor>> tensor_lists{
-      tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};
+      tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
            LerpFunctor<opmath_t>());
      });

-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }

 void foreach_tensor_lerp_ternary_cuda_(
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
    vec_res.emplace_back(at::native::empty_like(t));
  }
  std::vector<std::vector<at::Tensor>> tensor_lists{
-      tensors1.vec(), tensors2.vec(), vec_res};
+      tensors1.vec(), tensors2.vec(), std::move(vec_res)};

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
            weight.to<opmath_t>());
      });

-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }

 void foreach_tensor_lerp_list_cuda_(
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
    vec_res.emplace_back(at::native::empty_like(t));
  }
  std::vector<std::vector<at::Tensor>> tensor_lists{
-      tensors1.vec(), tensors2.vec(), vec_res};
+      tensors1.vec(), tensors2.vec(), std::move(vec_res)};

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
            LerpFunctor<opmath_t>());
      });

-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }

 void foreach_tensor_lerp_scalarlist_cuda_(
--- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
          /* res_arg_index */ 1>(),
      Op<opmath_t>());

-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename scalar_t, template <class> class Op>
--- a/aten/src/ATen/native/cuda/MaxUnpooling.cu
+++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu
@ -125,8 +125,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
  TORCH_CHECK(
      indices_.scalar_type() == at::ScalarType::Long,
      "elements in indices should be type int64 but got: ", indices_.scalar_type());
-  auto oheight = output_size[0];
-  auto owidth = output_size[1];

  TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
      indices_arg{indices_, "indices_", 3};
@ -149,6 +147,9 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
      output_size.size() == 2,
      "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");

+  auto oheight = output_size[0];
+  auto owidth = output_size[1];
+
  int64_t dimw = 2;
  int64_t dimh = 1;
  int64_t numBatch = 1;
@ -217,9 +218,6 @@ static void max_unpooling3d_shape_check(
    IntArrayRef stride,
    IntArrayRef padding,
    const char *fn_name) {
-  int64_t oT = output_size[0];
-  int64_t oH = output_size[1];
-  int64_t oW = output_size[2];
  TORCH_CHECK(
      indices.scalar_type() == at::ScalarType::Long,
      "elements in indices should be type int64 but got: ", indices.scalar_type());
@ -250,6 +248,10 @@ static void max_unpooling3d_shape_check(
      "strides should be greater than zero, but got stride: ",
      stride);

+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+
  int dimw = 3;
  int dimh = 2;
  int dimt = 1;
@ -402,8 +404,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
    const Tensor& indices_,
    IntArrayRef output_size,
    Tensor& grad_input) {
-  int64_t oheight = output_size[0];
-  int64_t owidth = output_size[1];
  TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
  TORCH_CHECK(
      indices_.scalar_type() == at::ScalarType::Long,
@ -426,6 +426,9 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,

  TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());

+  int64_t oheight = output_size[0];
+  int64_t owidth = output_size[1];
+
  int64_t nInputCols, nInputRows, nInputPlane;

  int dimw = 2;
@ -505,13 +508,14 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
    IntArrayRef padding,
    Tensor& grad_input) {
  TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
-  int64_t oT = output_size[0];
-  int64_t oH = output_size[1];
-  int64_t oW = output_size[2];

  max_unpooling3d_shape_check(
    self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");

+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+
  int batchSize = 0;
  int inputSlices = 0;
  int inputTime = 0;
--- a/aten/src/ATen/native/cuda/Nonzero.cu
+++ b/aten/src/ATen/native/cuda/Nonzero.cu
@ -300,8 +300,6 @@ void nonzero_static_cuda_out_impl(
    int64_t size,
    int64_t fill_value,
    Tensor& out) {
-#if defined(CUDA_VERSION) || defined(USE_ROCM)
-
  Tensor self_contiguous_ = self.contiguous();
  // see comment in nonzero_cuda_out_impl on reqs for out
  bool out_correct_size =
@ -377,9 +375,6 @@ void nonzero_static_cuda_out_impl(
  if (need_to_copy) {
    out.copy_(out_temp);
  }
-#else
-  TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
-#endif
 }

 Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {
--- a/aten/src/ATen/native/cuda/SpectralOps.cpp
+++ b/aten/src/ATen/native/cuda/SpectralOps.cpp
@ -221,22 +221,9 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
  std::optional<CuFFTConfig> uncached_plan;
  const CuFFTConfig * config = nullptr;

-  // Workaround for gh-63152, gh-58724
-  // Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
  // Bluestein's algorithm is only used when a size has large prime factors,
  // sizes with only small prime factors can still be cached
-  bool use_caching = true;
-#ifdef CUFFT_VERSION
-  if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
-    // Only cache plans for transforms with small prime factors
-    use_caching = std::none_of(
-        signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
-      return has_large_prime_factor(dim_size);
-    });
-  }
-#endif
-
-  if (use_caching && plan_cache.max_size() > 0) {
+  if (plan_cache.max_size() > 0) {
    guard.lock();
    if (plan_cache.max_size() > 0) {  // check again after acquiring the lock
      config = &plan_cache.lookup(Params);
--- a/aten/src/ATen/native/im2col_shape_check.h
+++ b/aten/src/ATen/native/im2col_shape_check.h
@ -2,6 +2,7 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/div_rtn.h>
+#include <c10/util/safe_numerics.h>

 namespace at::native {

@ -54,6 +55,14 @@ inline void col2im_shape_check(

  int64_t batch_dim = (ndim == 3) ? 0 : -1;
  int64_t n_input_plane = input.size(batch_dim + 1);
+  uint64_t prod_kernel_size = 1;
+
+  TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
+            "Given kernel_width = ",
+            kernel_width,
+            " and kernel_height = ",
+            kernel_height,
+            " the product of kernel_width and kernel_height overflowed.");

  if (n_input_plane % (kernel_width * kernel_height) != 0) {
    TORCH_CHECK(false,
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
@ -14,7 +14,7 @@ struct EmbeddingBagParams {
  ::c10::metal::array<idx_type_t, 2> output_strides;
  ::c10::metal::array<idx_type_t, 2> max_indices_strides;

-  idx_type_t per_sample_weights_strides;
+  idx_type_t per_sample_weights_stride;

  idx_type_t num_indices;
  idx_type_t num_bags;
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
@ -23,54 +23,72 @@ struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
 template <EmbeddingBagMode M, typename T>
 struct ReductionOp {
  inline opmath_t<T> operator()(
-      T weight_val,
+      opmath_t<T> weight_val,
      opmath_t<T> out_val,
-      uint32_t per_sample_weights_index,
-      constant T* per_sample_weights,
-      uint32_t per_sample_weights_strides);
-};
-
-template <typename T>
-struct ReductionOp<EmbeddingBagMode::SUM, T> {
-  inline opmath_t<T> operator()(
-      T weight_val,
-      opmath_t<T> out_val,
-      uint32_t per_sample_weights_index,
-      constant T* per_sample_weights,
-      uint32_t per_sample_weights_strides) {
-    if (per_sample_weights_strides) {
-      T per_sample_weight = per_sample_weights
-          [per_sample_weights_strides * per_sample_weights_index];
-      return static_cast<opmath_t<T>>(per_sample_weight) *
-          static_cast<opmath_t<T>>(weight_val) +
-          out_val;
-    } else {
-      return static_cast<opmath_t<T>>(weight_val) + out_val;
-    }
-  }
-};
-
-template <typename T>
-struct ReductionOp<EmbeddingBagMode::MEAN, T> {
-  inline opmath_t<T> operator()(
-      T weight_val,
-      opmath_t<T> out_val,
-      uint32_t,
-      constant T*,
-      uint32_t) {
-    return static_cast<opmath_t<T>>(weight_val) + out_val;
+      bool is_first) {
+    return weight_val + out_val;
  }
 };

 template <typename T>
 struct ReductionOp<EmbeddingBagMode::MAX, T> {
  inline opmath_t<T> operator()(
-      T weight_val,
+      opmath_t<T> weight_val,
      opmath_t<T> out_val,
-      uint32_t,
-      constant T*,
-      uint32_t) {
-    return max(static_cast<opmath_t<T>>(weight_val), out_val);
+      bool is_first) {
+    return (is_first || weight_val > out_val) ? weight_val : out_val;
+  }
+};
+
+// Primary template, selected for any mode that has no dedicated
+// specialization. It performs no weighting: the embedding value is handed
+// back untouched and the per-sample-weight arguments are ignored.
+template <EmbeddingBagMode M, typename T>
+struct MaybeApplyPerSampleWeight {
+  inline opmath_t<T> operator()(
+      opmath_t<T> weight_val,
+      uint32_t /*per_sample_weights_index*/,
+      constant T* /*per_sample_weights*/,
+      uint32_t /*per_sample_weights_stride*/) {
+    return weight_val;
+  }
+};
+
+// SUM-mode specialization: scales the embedding value by the per-sample
+// weight found at `per_sample_weights_index`, stepping through the weights
+// buffer with `per_sample_weights_stride`. A stride of zero is treated as
+// "no per-sample weights", in which case the value is returned unchanged and
+// the buffer is never read.
+template <typename T>
+struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
+  inline opmath_t<T> operator()(
+      opmath_t<T> weight_val,
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_stride) {
+    // Guard clause: nothing to apply when the stride is zero.
+    if (per_sample_weights_stride == 0) {
+      return weight_val;
+    }
+    const auto sample_offset =
+        per_sample_weights_stride * per_sample_weights_index;
+    // Promote the stored weight to the accumulation type before multiplying.
+    const auto scale =
+        static_cast<opmath_t<T>>(per_sample_weights[sample_offset]);
+    return scale * weight_val;
+  }
+};
+
+// Primary template, selected for any mode that has no dedicated
+// specialization. It is a deliberate no-op: every argument is ignored and
+// `max_idx` is left exactly as the caller passed it.
+template <EmbeddingBagMode M, typename T, typename I>
+struct MaybeCalcMaxIndex {
+  inline void operator()(
+      opmath_t<T> /*weight_val*/,
+      opmath_t<T> /*out_val*/,
+      bool /*is_first*/,
+      thread I& /*max_idx*/,
+      I /*weight_idx*/,
+      bool /*pad*/) {
+    // Intentionally empty.
+  }
+};
+
+// MAX-mode specialization: keeps `max_idx` pointing at the index of the
+// current maximum. `max_idx` is overwritten with `weight_idx` only when the
+// entry is not padding (`pad` is false) and it either is flagged as the first
+// entry (`is_first`) or its value is strictly greater than `out_val`.
+// In every other case `max_idx` keeps its previous value.
+template <typename T, typename I>
+struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
+  inline void operator()(
+      opmath_t<T> weight_val,
+      opmath_t<T> out_val,
+      bool is_first,
+      thread I& max_idx,
+      I weight_idx,
+      bool pad) {
+    const bool becomes_max = is_first || weight_val > out_val;
+    if (!pad && becomes_max) {
+      max_idx = weight_idx;
+    }
+  }
+};

@ -96,6 +114,30 @@ struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
  }
 };

+// Primary template, selected for any mode that has no dedicated
+// specialization. It is a deliberate no-op: nothing is written to the
+// max-indices buffer and all arguments are ignored.
+template <EmbeddingBagMode M, typename I>
+struct MaybeWriteMaxIndex {
+  inline void operator()(
+      device I* /*max_indices*/,
+      const constant ::c10::metal::array<uint32_t, 2>& /*max_indices_strides*/,
+      uint32_t /*bag_idx*/,
+      uint32_t /*feature_idx*/,
+      I /*max_idx*/) {
+    // Intentionally empty.
+  }
+};
+
+// MAX-mode specialization: stores `max_idx` into `max_indices` at the element
+// addressed by (`bag_idx`, `feature_idx`), using the two entries of
+// `max_indices_strides` as the per-dimension strides.
+template <typename I>
+struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
+  inline void operator()(
+      device I* max_indices,
+      const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
+      uint32_t bag_idx,
+      uint32_t feature_idx,
+      I max_idx) {
+    // Flattened offset of the (bag, feature) element in the strided buffer.
+    const auto flat_offset = bag_idx * max_indices_strides[0] +
+        feature_idx * max_indices_strides[1];
+    max_indices[flat_offset] = max_idx;
+  }
+};
+
 template <EmbeddingBagMode M, typename T, typename I>
 void embedding_bag_impl(
    constant T* weight,
@ -112,7 +154,7 @@ void embedding_bag_impl(
  auto num_bags = params.num_bags;
  auto feature_size = params.feature_size;
  auto padding_idx = params.padding_idx;
-  auto per_sample_weights_strides = params.per_sample_weights_strides;
+  auto per_sample_weights_stride = params.per_sample_weights_stride;
  constant auto& output_strides = params.output_strides;
  constant auto& weight_strides = params.weight_strides;
  constant auto& max_indices_strides = params.max_indices_strides;
@ -120,8 +162,6 @@ void embedding_bag_impl(
  auto bag_idx = tid / feature_size;
  auto feature_idx = tid % feature_size;

-  output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
-
  uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
  bool is_last_bag = bag_idx + 1 == num_bags;
  uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
@ -131,28 +171,37 @@ void embedding_bag_impl(
  auto out_val = ReductionOpInit<M, T>()();

  uint32_t bag_size_ = 0;
+  I max_idx = 0;

  for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
       indices_idx++) {
    I weight_idx = indices[indices_idx];
    bool pad = (weight_idx == padding_idx);
-    T weight_val = weight
-        [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
-         feature_idx * weight_strides[1]];
+    auto weight_val = static_cast<opmath_t<T>>(
+        weight
+            [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
+             feature_idx * weight_strides[1]]);

+    weight_val = MaybeApplyPerSampleWeight<M, T>()(
+        weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
+
+    auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
+
+    MaybeCalcMaxIndex<M, T, I>()(
+        weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
+
+    out_val = pad ? out_val : new_out_val;
+    offset2bag[indices_idx] = bag_idx;
    bag_size_ += static_cast<uint32_t>(!pad);
-
-    auto tmp_val = ReductionOp<M, T>()(
-        weight_val,
-        out_val,
-        indices_idx,
-        per_sample_weights,
-        per_sample_weights_strides);
-
-    out_val = pad ? out_val : tmp_val;
  }

-  *output = ReductionOpFinal<M, T>()(out_val, bag_size_);
+  output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] =
+      ReductionOpFinal<M, T>()(out_val, bag_size_);
+
+  bag_size[bag_idx] = bag_size_;
+
+  MaybeWriteMaxIndex<M, I>()(
+      max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
 }

 #define DISPATCH_IMPL(MODE)        \
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@ -198,7 +198,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,

    if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
      inputNDArray = getMPSNDArray(input_t, inputShape);
-      outputNDArray = getMPSNDArray(*output, outputShape);
+      outputNDArray = getMPSNDArray(output_t, outputShape);
    }

    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -302,7 +302,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
      }
    }
    auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
-                                           : Placeholder(cachedGraph->outputTensor_, *output);
+                                           : Placeholder(cachedGraph->outputTensor_, output_t);

    NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
        [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
@ -315,7 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
    runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
  }

-  return *output;
+  return output_t;
 }

 Tensor _mps_convolution(const Tensor& input_t,
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -66,11 +66,12 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
  int64_t num_indices = indices.size(0);
  int64_t num_bags = offsets.size(0);
  if (include_last_offset) {
+    TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
    num_bags -= 1;
  }
  int64_t feature_size = weight.size(1);

-  auto bag_size = at::empty(offsets.sizes(), indices.options());
+  auto bag_size = at::empty({num_bags}, indices.options());
  auto offset2bag = at::empty({indices.size(0)}, indices.options());
  auto output = at::empty({num_bags, feature_size}, weight.options());

@ -94,7 +95,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
  }

  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
-  params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
+  params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;

  params.num_indices = num_indices;
  params.num_bags = num_bags;
--- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
+++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
@ -20,6 +20,7 @@
 #include <ATen/ops/baddbmm_native.h>
 #include <ATen/ops/bmm_native.h>
 #include <ATen/ops/cholesky_native.h>
+#include <ATen/ops/eye_native.h>
 #include <ATen/ops/linalg_cholesky_ex_native.h>
 #include <ATen/ops/linalg_inv_ex_native.h>
 #include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -496,26 +497,24 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
  using namespace mps;
  TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
  TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
-  using CachedGraph = MPSUnaryCachedGraph;

-  MPSStream* stream = getCurrentMPSStream();
  info.zero_();
-
  if (A.numel() == 0) {
    return;
  }

-  if (!result.is_contiguous()) {
-    result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
-  }
  auto A_sizes = A.sizes();
  int ndim = A.dim();

-  Tensor LU = empty_like(A);
-  Tensor identity = zeros_like(A);
+  Tensor LU = empty_like(A, MemoryFormat::Contiguous);
+  Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A);
  Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
-  (ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
-  linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
+  // need to do this to keep the strides of the result tensor
+  // mps's solve expects row major layout, while inductor
+  // expects result to be column major
+  Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
+  linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
+  result.copy_(tmp);
 }

 static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@ -519,6 +519,13 @@ static void max_unpool_out_mps_template(const Tensor& input,
                                        Tensor& output,
                                        const int32_t pooling_dims,
                                        const std::string& op_name) {
+  TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
+              op_name,
+              "There should be exactly ",
+              pooling_dims,
+              " elements but got ",
+              output_size_.size());
+
  auto dims = input.dim();
  auto leading_dims = input.dim() - pooling_dims;

--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -3858,7 +3858,7 @@
  device_check: NoCheck   # TensorIterator
  structured: True
  dispatch:
-    CPU, CUDA: aminmax_out
+    CPU, CUDA, MTIA: aminmax_out
    MPS: aminmax_out_mps

 - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3909,7 @@
 - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
-    CPU, CUDA: amax_out
+    CPU, CUDA, MTIA: amax_out
    MPS: amax_out_mps

 # Return: (Tensor output, Tensor indices)
@ -4090,7 +4090,7 @@
 - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
-    CPU, CUDA: amin_out
+    CPU, CUDA, MTIA: amin_out
    MPS: amin_out_mps

 # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
--- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
+++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@ -64,7 +64,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  // create sparse descriptor, dtype
  cusparseLtMatDescriptor_t sparse_input_descriptor;
  cudaDataType type;
-  auto compression_factor = 9;

  #ifdef USE_ROCM
  TORCH_CHECK(isHipSparseLtSupported());
@ -73,7 +72,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  switch (sparse_input.scalar_type()) {
    case at::ScalarType::Char:
      type = CUDA_R_8I;
-      compression_factor = 10;
      break;
    case at::ScalarType::Half:
      type = CUDA_R_16F;
@ -89,7 +87,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
 #if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM)
    case at::ScalarType::Float8_e4m3fn:
      type = CUDA_R_8F_E4M3;
-      compression_factor = 10;
      break;
 #endif
    default:
@ -97,10 +94,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
      break;
  }

-  // create a new compressed tensor with the same dtype as
-  auto compressed_tensor =
-      sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);
-
  TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
      &handle,
      &sparse_input_descriptor,
@ -121,6 +114,15 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
      &compressed_size,
      &compressed_buffer_size));

+  // create a new compressed tensor with the same dtype as the input,
+  // and with packed data/metadata stored in an array with original
+  // number of rows, and sufficient columns to provide compressed_size
+  // buffer (in bytes)
+  size_t orig_m = sparse_input.size(0);
+  size_t div = orig_m * sparse_input.itemsize();
+  size_t new_n = (compressed_size + div - 1) / div; // ceil
+  auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
+
  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -165,7 +167,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
  cudaDataType output_type;
  cudaDataType C_type;
  cusparseComputeType compute_type;
-  auto compression_factor = 9;

  #ifdef USE_ROCM
  TORCH_CHECK(isHipSparseLtSupported());
@ -177,7 +178,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
      output_type = CUDA_R_8I;
      C_type = CUDA_R_8I;
      compute_type = CUSPARSE_COMPUTE_32I;
-      compression_factor = 10;
      break;

 // cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUT_16F
@ -210,7 +210,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
      output_type = CUDA_R_8F_E4M3;
      C_type = CUDA_R_16F;
      compute_type = CUSPARSE_COMPUTE_32F;
-      compression_factor = 10;
      break;
 #endif
 // cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -300,9 +299,10 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
    }
  }

+  TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S
  int64_t k = dense_B.size(0);
  int64_t n = dense_B.size(1);
-  int64_t m = (compressed_A.numel() * 16 / compression_factor) / k;
+  int64_t m = compressed_A.size(0);

  // initialize sparse descriptor
  cusparseLtMatDescriptor_t sparse_input_descriptor;
--- a/aten/src/ATen/test/cuda_allocator_test.cpp
+++ b/aten/src/ATen/test/cuda_allocator_test.cpp
@ -5,51 +5,6 @@

 #include <ATen/test/allocator_clone_test.h>

-#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
-
 TEST(AllocatorTestCUDA, test_clone) {
  test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
-
-static int called_dummy_free_0 = 0;
-static int called_dummy_free_1 = 0;
-
-void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
-void dummy_free_0(void* data, size_t size, int device, void* stream) {
-  called_dummy_free_0++;
-}
-void dummy_free_1(void* data, size_t size, int device, void* stream) {
-  called_dummy_free_1++;
-}
-
-// Tests that data_ptrs have their respective deleters
-// when mixing allocators
-TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
-  // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
-  auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
-  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
-  at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
-
-  // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
-  auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
-  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
-  at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
-
-  // Manually use a's deleter
-  auto* ctx = a.storage().data_ptr().get_context();
-  a.storage().data_ptr().get_deleter()(ctx);
-  a.storage().mutable_data_ptr().release_context();
-
-  // a's deleter is dummy_free_0
-  // dummy_free_0 should be called above, so called_dummy_free_0 should be 1
-  ASSERT_TRUE(called_dummy_free_0 == 1);
-
-  // Manually use b's deleter
-  ctx = b.storage().data_ptr().get_context();
-  b.storage().data_ptr().get_deleter()(ctx);
-  b.storage().mutable_data_ptr().release_context();
-
-  // b's deleter is dummy_free_1
-  // dummy_free_1 should be called above, so called_dummy_free_1 should be 1
-  ASSERT_TRUE(called_dummy_free_1 == 1);
-}
--- a/benchmarks/dynamo/check_accuracy.py
+++ b/benchmarks/dynamo/check_accuracy.py
@ -78,6 +78,8 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "google/gemma-3-4b-it",
                "openai/whisper-tiny",
                "Qwen/Qwen3-0.6B",
+                "mistralai/Mistral-7B-Instruct-v0.3",
+                "openai/gpt-oss-20b",
            }
        )

--- a/benchmarks/dynamo/check_graph_breaks.py
+++ b/benchmarks/dynamo/check_graph_breaks.py
@ -61,6 +61,8 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
                "google/gemma-3-4b-it",
                "openai/whisper-tiny",
                "Qwen/Qwen3-0.6B",
+                "mistralai/Mistral-7B-Instruct-v0.3",
+                "openai/gpt-oss-20b",
            }
        )

--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@ -187,3 +187,11 @@ openai/whisper-tiny,fail_to_run,0


 Qwen/Qwen3-0.6B,fail_to_run,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,fail_to_run,0
+
+
+
+openai/gpt-oss-20b,fail_to_run,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0


 Qwen/Qwen3-0.6B,pass_due_to_skip,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
+
+
+
+openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0


 Qwen/Qwen3-0.6B,pass_due_to_skip,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
+
+
+
+openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass_due_to_skip,0


 Qwen/Qwen3-0.6B,pass_due_to_skip,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
+
+
+
+openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@ -191,3 +191,11 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
+
+
+
+mistralai/Mistral-7B-Instruct-v0.3,pass,0
+
+
+
+openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -3580,18 +3580,10 @@ def process_caching_precompile():
    )
    from torch._dynamo.precompile_context import PrecompileContext

-    # Serialize all callables, clear PrecompileContext
-    # TODO: put this under torch.compiler API once ready
-    serialized = PrecompileContext.serialize()
-    PrecompileContext.clear()
-    if serialized is not None:
-        artifacts, info = serialized
-        print(
-            f"Saving {len(info.precompile_dynamo_artifacts)} Precompile Artifact(s)..."
-        )
-        results = PrecompileContext.deserialize(artifacts)
-        assert results is not None
-        PrecompileContext.populate_caches(results)
+    debug_info = PrecompileContext.save_to_dynamo_cache()
+    print(
+        f"Saved {len(debug_info['dynamo'])} precompile artifacts with {len(debug_info['backends'])} backends"
+    )


 def process_entry(rank, runner, original_dir, args):
--- a/benchmarks/dynamo/huggingface.yaml
+++ b/benchmarks/dynamo/huggingface.yaml
@ -11,6 +11,8 @@ skip:
    - GPTJForQuestionAnswering
    # Model too big
    - google/gemma-3-4b-it
+    - openai/gpt-oss-20b
+    - mistralai/Mistral-7B-Instruct-v0.3

  device:
    cpu:
@ -19,6 +21,8 @@ skip:
      - google/gemma-3-4b-it
      - openai/whisper-tiny
      - Qwen/Qwen3-0.6B
+      - mistralai/Mistral-7B-Instruct-v0.3
+      - openai/gpt-oss-20b

  control_flow:
    - AllenaiLongformerBase
@ -79,6 +83,8 @@ batch_size:
    google/gemma-3-4b-it: 8
    openai/whisper-tiny: 8
    Qwen/Qwen3-0.6B: 8
+    mistralai/Mistral-7B-Instruct-v0.3: 8
+    openai/gpt-oss-20b: 8


 tolerance:
--- a/benchmarks/dynamo/huggingface_llm_models.py
+++ b/benchmarks/dynamo/huggingface_llm_models.py
@ -99,4 +99,6 @@ HF_LLM_MODELS: dict[str, Benchmark] = {
    "google/gemma-3-4b-it": TextGenerationBenchmark,
    "openai/whisper-tiny": WhisperBenchmark,
    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
+    "mistralai/Mistral-7B-Instruct-v0.3": TextGenerationBenchmark,
+    "openai/gpt-oss-20b": TextGenerationBenchmark,
 }
--- a/benchmarks/dynamo/huggingface_models_list.txt
+++ b/benchmarks/dynamo/huggingface_models_list.txt
@ -51,3 +51,5 @@ google/gemma-2-2b,8
 google/gemma-3-4b-it,8
 openai/whisper-tiny,8
 Qwen/Qwen3-0.6B,8
+mistralai/Mistral-7B-Instruct-v0.3, 8
+openai/gpt-oss-20b, 8
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
 # for targets in subfolders
 ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"

-C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
+C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"

 # a dictionary maps third party library name to fbsource and oss target
 THIRD_PARTY_LIBS = {
@ -948,7 +948,6 @@ def define_buck_targets(
            [
                ("torch/csrc/api/include", "torch/**/*.h"),
                ("", "torch/csrc/**/*.h"),
-                ("", "torch/csrc/**/*.hpp"),
                ("", "torch/nativert/**/*.h"),
                ("", "torch/headeronly/**/*.h"),
                ("", "torch/script.h"),
@ -2048,7 +2047,6 @@ def define_buck_targets(
                ("", "caffe2/utils/*.h"),
                ("", "caffe2/core/*.h"),
                ("", "torch/csrc/*.h"),
-                ("", "torch/csrc/*.hpp"),
                ("", "torch/csrc/api/include/torch/*.h"),
                ("", "torch/csrc/autograd/*.h"),
                ("", "torch/csrc/autograd/*/*.h"),
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
    is_le<sizeof(autograd_meta_),         16,  FieldNameEnum::autograd_meta_>();
    is_le<sizeof(extra_meta_),            16,  FieldNameEnum::extra_meta_>();
    are_equal<sizeof(version_counter_),    8,  FieldNameEnum::version_counter_>();
-    are_equal<sizeof(pyobj_slot_),         8,  FieldNameEnum::pyobj_slot_>();
+    are_equal<sizeof(pyobj_slot_),   16,  FieldNameEnum::pyobj_slot_>();
    are_equal<sizeof(sizes_and_strides_), 88,  FieldNameEnum::sizes_and_strides_>();
    are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
    are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
--- a/c10/core/impl/PyInterpreterHooks.h
+++ b/c10/core/impl/PyInterpreterHooks.h
@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {

  // Get the PyInterpreter instance
  // Stub implementation throws error when Python is not available
-  // We return nullptr rather than throwing an error since there are bits of c10
-  // that expect an empty PyObjectSlot when python is not available.
  virtual PyInterpreter* getPyInterpreter() const {
-    return nullptr;
+    TORCH_CHECK(
+        false,
+        "PyTorch was compiled without Python support. "
+        "Cannot access Python interpreter from C++.");
  }
 };

--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@ -2,7 +2,7 @@

 namespace c10::impl {

-PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
+PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}

 PyObjectSlot::~PyObjectSlot() {
  maybe_destroy_pyobj();
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {

 void PyObjectSlot::maybe_destroy_pyobj() {
  if (owns_pyobj()) {
-    TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
+    TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
    TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
-    (*getGlobalPyInterpreter())
+    (*pyobj_interpreter_.load(std::memory_order_acquire))
        ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
    // NB: this destructor can only be entered when there are no
    // references to this C++ object (obviously), NOR any references
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
 }

 PyInterpreter* PyObjectSlot::pyobj_interpreter() {
-  return getGlobalPyInterpreter();
+  return pyobj_interpreter_.load(std::memory_order_acquire);
 }

 PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
 }

 PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
-  auto interpreter = getGlobalPyInterpreter();
+  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
  if (interpreter) {
    return *interpreter;
  }
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@ -6,17 +6,10 @@
 #include <c10/util/python_stub.h>
 #include <optional>

+#include <atomic>
+
 namespace c10::impl {

-// Function pointer type for getting the global interpreter
-using GetPyInterpreterFn = PyInterpreter* (*)();
-
-// Global function pointer (set by csrc initialization)
-C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
-
-// Helper function to get the global interpreter
-C10_API PyInterpreter* getGlobalPyInterpreter();
-
 struct C10_API PyObjectSlot {
 public:
  PyObjectSlot();
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION.  Make sure to clean up after
  // PyObject if necessary!
  void init_pyobj(PyObject* pyobj) {
+    pyobj_interpreter_.store(
+        getGlobalPyInterpreter(), std::memory_order_relaxed);
    pyobj_ = pyobj;
  }

@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {

  // @todo alban: I'm not too sure what's going on here, we can probably delete
  // it but it's worthwhile making sure
-  std::optional<PyObject*> check_pyobj() const {
-    impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
-    if (interpreter == nullptr || pyobj_ == nullptr) {
+  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
+    impl::PyInterpreter* interpreter =
+        pyobj_interpreter_.load(std::memory_order_acquire);
+    if (interpreter == nullptr) {
      return std::nullopt;
    }
-    if (c10::impl::HermeticPyObjectTLS::get_state()) {
+
+    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
      return std::nullopt;
+    } else {
+      return _unchecked_untagged_pyobj();
    }
-    return _unchecked_untagged_pyobj();
  }

  PyInterpreter& load_pyobj_interpreter() const;
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
  void set_owns_pyobj(bool b);

 private:
+  // This field contains the interpreter tag for this object.  See
+  // Note [Python interpreter tag] for general context
+  //
+  // Note [Memory ordering on Python interpreter tag]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // What memory_order do we need when accessing this atomic?  We don't
+  // need a single total modification order (as provided by
+  // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
+  // transition from nullptr to a non-null pointer and never changes afterwards.
+  // Because there is only one modification, it trivially already has a total
+  // modification order (e.g., we don't need fences or locked instructions on
+  // x86)
+  //
+  // In fact, one could make a reasonable argument that relaxed reads are OK,
+  // due to the presence of external locking (GIL) to ensure that interactions
+  // with other data structures are still correctly synchronized, so that
+  // we fall in the "Single-Location Data Structures" case as described in
+  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
+  // However, on x86, it doesn't matter if I use acquire or relaxed on the load
+  // as I get the same assembly in both cases.  So I just use the more
+  // conservative acquire (which will impede compiler optimizations but I don't
+  // care)
+  std::atomic<PyInterpreter*> pyobj_interpreter_;
+
  // This field contains a reference to a PyObject representing this Tensor.
  // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
  // PyObject for it and set this field.  This field does not have to be
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@ -14,7 +14,6 @@ namespace c10::cuda::CUDACachingAllocator::CudaMallocAsync {
 using namespace c10::CachingAllocator;
 using namespace c10::CachingDeviceAllocator;

-#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
 // CUDA device allocator that uses cudaMallocAsync to implement
 // the same interface as CUDACachingAllocator.cpp.

@ -926,13 +925,4 @@ CUDAAllocator* allocator() {
  return &device_allocator;
 }

-#else
-// NOLINTNEXTLINE(misc-use-internal-linkage)
-CUDAAllocator* allocator() {
-  TORCH_CHECK(false, "Cannot use CudaMallocAsyncAllocator with cuda < 11.4.");
-  return nullptr;
-}
-
-#endif
-
 } // namespace c10::cuda::CUDACachingAllocator::CudaMallocAsync
--- a/c10/ovrsource_defs.bzl
+++ b/c10/ovrsource_defs.bzl
@ -18,9 +18,9 @@ cuda_supported_platforms = [

 def define_c10_ovrsource(name, is_mobile):
    if is_mobile:
-        pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
+        pp_flags = ["-DC10_MOBILE=1"]
    else:
-        pp_flags = ["-DC10_USE_GLOG"]
+        pp_flags = []

    oxx_static_library(
        name = name,
--- a/c10/util/ExclusivelyOwnedTensorTraits.h
+++ b/c10/util/ExclusivelyOwnedTensorTraits.h
@ -35,26 +35,26 @@ struct ExclusivelyOwnedTensorTraits {
    // incremented.
    const bool isUndefined = toDestroy == UndefinedTensorImpl::singleton();
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        toDestroy->refcount_ == 1 || (toDestroy->refcount_ == 0 && isUndefined),
+        toDestroy->refcount() == 1 ||
+            (toDestroy->refcount() == 0 && isUndefined),
        "ExclusivelyOwned<Tensor> destroyed with isUndefined ",
        isUndefined,
        " and refcount ",
-        toDestroy->refcount_,
+        toDestroy->refcount(),
        ", expected 1 or, if isUndefined, 0!");
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        toDestroy->weakcount_ == 1 ||
-            (toDestroy->weakcount_ == 0 &&
+        toDestroy->weakcount() == 1 ||
+            (toDestroy->weakcount() == 0 &&
             toDestroy == UndefinedTensorImpl::singleton()),
        "ExclusivelyOwned<Tensor> destroyed with isUndefined ",
        isUndefined,
        " and weakcount ",
-        toDestroy->weakcount_,
+        toDestroy->weakcount(),
        ", expected 1 or, if isUndefined, 0!");
    if (!isUndefined) {
 #ifndef NDEBUG
      // Needed to pass the debug assertions in ~intrusive_ptr_target.
-      toDestroy->refcount_ = 0;
-      toDestroy->weakcount_ = 0;
+      toDestroy->combined_refcount_.store(0, std::memory_order_relaxed);
 #endif
      delete toDestroy;
    }
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@ -27,7 +27,78 @@ struct DontIncreaseRefcount {};
 } // namespace raw

 namespace detail {
-constexpr uint32_t kImpracticallyHugeReferenceCount = 0x0FFFFFFF;
+constexpr uint64_t kImpracticallyHugeReferenceCount = 0x0FFFFFFF;
+constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
+    (kImpracticallyHugeReferenceCount << 32);
+constexpr uint64_t kReferenceCountOne = 1;
+constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
+constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
+
+template <class TTarget>
+struct intrusive_target_default_null_type final {
+  static constexpr TTarget* singleton() noexcept {
+    return nullptr;
+  }
+};
+
+template <class TTarget, class ToNullType, class FromNullType>
+TTarget* assign_ptr_(TTarget* rhs) {
+  if (FromNullType::singleton() == rhs) {
+    return ToNullType::singleton();
+  } else {
+    return rhs;
+  }
+}
+
+inline uint32_t refcount(uint64_t combined_refcount) {
+  return static_cast<uint32_t>(combined_refcount);
+}
+
+inline uint32_t weakcount(uint64_t combined_refcount) {
+  return static_cast<uint32_t>(combined_refcount >> 32);
+}
+
+// The only requirement for refcount increment is that it happens-before
+// decrement, so no additional memory ordering is needed.
+inline uint64_t atomic_combined_refcount_increment(
+    std::atomic<uint64_t>& combined_refcount,
+    uint64_t inc) {
+  return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
+}
+
+inline uint32_t atomic_refcount_increment(
+    std::atomic<uint64_t>& combined_refcount) {
+  return detail::refcount(atomic_combined_refcount_increment(
+      combined_refcount, kReferenceCountOne));
+}
+
+inline uint32_t atomic_weakcount_increment(
+    std::atomic<uint64_t>& combined_refcount) {
+  return detail::weakcount(atomic_combined_refcount_increment(
+      combined_refcount, kWeakReferenceCountOne));
+}
+
+// The requirement is that all modifications to the managed object happen-before
+// invocation of the managed object destructor, and that allocation of the
+// managed object storage happens-before deallocation of the storage.
+//
+// To get this ordering, all non-final decrements must synchronize-with the
+// final decrement. So all non-final decrements have to store-release while the
+// final decrement has to load-acquire, either directly or with the help of
+// fences. But it's easiest just to have all decrements be acq-rel. And it turns
+// out, on modern architectures and chips, it's also fastest.
+inline uint64_t atomic_combined_refcount_decrement(
+    std::atomic<uint64_t>& combined_refcount,
+    uint64_t dec) {
+  return combined_refcount.fetch_sub(dec, std::memory_order_acq_rel) - dec;
+}
+
+inline uint32_t atomic_weakcount_decrement(
+    std::atomic<uint64_t>& combined_refcount) {
+  return detail::weakcount(atomic_combined_refcount_decrement(
+      combined_refcount, kWeakReferenceCountOne));
+}
+
 } // namespace detail

 /**
@ -80,8 +151,14 @@ class C10_API intrusive_ptr_target {
  //    atomically increment the use count, if it is greater than 0.
  //    If it is not, you must report that the storage is dead.
  //
-  mutable std::atomic<uint32_t> refcount_;
-  mutable std::atomic<uint32_t> weakcount_;
+  // We use a single combined count for refcount and weakcount so that
+  // we can atomically operate on both at the same time for performance
+  // and defined behaviors.
+  //
+  mutable std::atomic<uint64_t> combined_refcount_;
+  static_assert(sizeof(std::atomic<uint64_t>) == 8);
+  static_assert(alignof(std::atomic<uint64_t>) == 8);
+  static_assert(std::atomic<uint64_t>::is_always_lock_free);

  template <typename T, typename NullType>
  friend class intrusive_ptr;
@ -126,16 +203,16 @@ class C10_API intrusive_ptr_target {
        // caller of unsafe_adapt_non_heap_allocated wanted to
        // use). We choose our reference count such that the count
        // will not dip below kImpracticallyHugeReferenceCount regardless.
-        refcount_.load() == 0 ||
-            refcount_.load() >= detail::kImpracticallyHugeReferenceCount,
+        refcount() == 0 ||
+            refcount() >= detail::kImpracticallyHugeReferenceCount,
        "Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it; refcount was ",
-        refcount_.load());
+        refcount());
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        // See ~intrusive_ptr for optimization that will frequently result in 1
        // at destruction time.
-        weakcount_.load() == 1 || weakcount_.load() == 0 ||
-            weakcount_.load() == detail::kImpracticallyHugeReferenceCount - 1 ||
-            weakcount_.load() == detail::kImpracticallyHugeReferenceCount,
+        weakcount() == 1 || weakcount() == 0 ||
+            weakcount() == detail::kImpracticallyHugeReferenceCount - 1 ||
+            weakcount() == detail::kImpracticallyHugeReferenceCount,
        "Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(pop)
@ -144,7 +221,7 @@ class C10_API intrusive_ptr_target {
 #endif
  }

-  constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}
+  constexpr intrusive_ptr_target() noexcept : combined_refcount_(0) {}

  // intrusive_ptr_target supports copy and move: but refcount and weakcount
  // don't participate (since they are intrinsic properties of the memory
@ -177,54 +254,17 @@ class C10_API intrusive_ptr_target {
   * destructed), this function WILL NOT be called.
   */
  virtual void release_resources() {}
-};

-namespace detail {
-template <class TTarget>
-struct intrusive_target_default_null_type final {
-  static constexpr TTarget* singleton() noexcept {
-    return nullptr;
+  uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
+    return detail::refcount(combined_refcount_.load(order));
+  }
+
+  uint32_t weakcount(
+      std::memory_order order = std::memory_order_relaxed) const {
+    return detail::weakcount(combined_refcount_.load(order));
  }
 };

-template <class TTarget, class ToNullType, class FromNullType>
-TTarget* assign_ptr_(TTarget* rhs) {
-  if (FromNullType::singleton() == rhs) {
-    return ToNullType::singleton();
-  } else {
-    return rhs;
-  }
-}
-
-// The only requirement for refcount increment is that it happens-before
-// decrement, so no additional memory ordering is needed.
-inline uint32_t atomic_refcount_increment(std::atomic<uint32_t>& refcount) {
-  return refcount.fetch_add(1, std::memory_order_relaxed) + 1;
-}
-
-inline uint32_t atomic_weakcount_increment(std::atomic<uint32_t>& weakcount) {
-  return weakcount.fetch_add(1, std::memory_order_relaxed) + 1;
-}
-
-// The requirement is that all modifications to the managed object happen-before
-// invocation of the managed object destructor, and that allocation of the
-// managed object storage happens-before deallocation of the storage.
-//
-// To get this ordering, all non-final decrements must synchronize-with the
-// final decrement. So all non-final decrements have to store-release while the
-// final decrement has to load-acquire, either directly or with the help of
-// fences. But it's easiest just to have all decrements be acq-rel. And it turns
-// out, on modern architectures and chips, it's also fastest.
-inline uint32_t atomic_refcount_decrement(std::atomic<uint32_t>& refcount) {
-  return refcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
-}
-
-inline uint32_t atomic_weakcount_decrement(std::atomic<uint32_t>& weakcount) {
-  return weakcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
-}
-
-} // namespace detail
-
 template <class TTarget, class NullType>
 class weak_intrusive_ptr;

@ -275,7 +315,7 @@ class intrusive_ptr final {
  void retain_() {
    if (target_ != NullType::singleton()) {
      uint32_t new_refcount =
-          detail::atomic_refcount_increment(target_->refcount_);
+          detail::atomic_refcount_increment(target_->combined_refcount_);
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
          new_refcount != 1,
          "intrusive_ptr: Cannot increase refcount after it reached zero.");
@ -284,41 +324,25 @@ class intrusive_ptr final {

  void reset_() noexcept {
    if (target_ != NullType::singleton()) {
-#if defined(__linux__) && (defined(__aarch64__) || defined(__x86_64__))
-      if constexpr (
-          std::atomic<uint64_t>::is_always_lock_free &&
-          std::atomic<uint32_t>::is_always_lock_free &&
-          sizeof(std::atomic<uint64_t>) == 8 &&
-          sizeof(std::atomic<uint32_t>) == 4) {
-        auto both_counts_ =
-            reinterpret_cast<std::atomic<uint64_t>*>(&target_->refcount_);
-        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-            (reinterpret_cast<std::uintptr_t>(both_counts_) %
-             sizeof(std::atomic<uint64_t>)) == 0 &&
-            (reinterpret_cast<std::uintptr_t>(&target_->weakcount_) -
-             reinterpret_cast<std::uintptr_t>(both_counts_)) ==
-                sizeof(std::atomic<uint32_t>));
-        // 0x100000001ULL is a 64-bit number combination of both the refcount_
-        // and weakcount_ being 1.
-        constexpr uint64_t unique_ref_ = 0x100000001ULL;
-        if (both_counts_->load(std::memory_order_acquire) == unique_ref_) {
-          // Both counts are 1, so there are no weak references and
-          // we are releasing the last strong reference. No other
-          // threads can observe the effects of this target_ deletion
-          // call (e.g. calling use_count()) without a data race.
-          target_->refcount_.store(0, std::memory_order_relaxed);
-          delete target_;
-          return;
-        }
+      if (target_->combined_refcount_.load(std::memory_order_acquire) ==
+          detail::kUniqueRef) {
+        // Both counts are 1, so there are no weak references and
+        // we are releasing the last strong reference. No other
+        // threads can observe the effects of this target_ deletion
+        // call (e.g. calling use_count()) without a data race.
+        target_->combined_refcount_.store(0, std::memory_order_relaxed);
+        delete target_;
+        return;
      }
-#endif

-      if (detail::atomic_refcount_decrement(target_->refcount_) == 0) {
+      auto combined_refcount = detail::atomic_combined_refcount_decrement(
+          target_->combined_refcount_, detail::kReferenceCountOne);
+      if (detail::refcount(combined_refcount) == 0) {
+        bool should_delete =
+            (combined_refcount == detail::kWeakReferenceCountOne);
        // See comment above about weakcount. As long as refcount>0,
        // weakcount is one larger than the actual number of weak references.
        // So we need to decrement it here.
-        bool should_delete =
-            target_->weakcount_.load(std::memory_order_acquire) == 1;
        if (!should_delete) {
          // justification for const_cast: release_resources is basically a
          // destructor and a destructor always mutates the object, even for
@ -326,8 +350,8 @@ class intrusive_ptr final {
          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
          const_cast<std::remove_const_t<TTarget>*>(target_)
              ->release_resources();
-          should_delete =
-              detail::atomic_weakcount_decrement(target_->weakcount_) == 0;
+          should_delete = detail::atomic_weakcount_decrement(
+                              target_->combined_refcount_) == 0;
        }
        if (should_delete) {
          delete target_;
@ -354,12 +378,12 @@ class intrusive_ptr final {
      // `mov`, whereas an atomic increment does a lock-prefixed `add`, which is
      // much more expensive: https://godbolt.org/z/eKPzj8.)
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-          target_->refcount_ == 0 && target_->weakcount_ == 0,
+          target_->combined_refcount_.load(std::memory_order_relaxed) == 0,
          "intrusive_ptr: Newly-created target had non-zero refcounts. Does its "
          "constructor do something strange like incref or create an "
          "intrusive_ptr from `this`?");
-      target_->refcount_.store(1, std::memory_order_relaxed);
-      target_->weakcount_.store(1, std::memory_order_relaxed);
+      target_->combined_refcount_.store(
+          detail::kUniqueRef, std::memory_order_relaxed);
    }
  }

@ -482,14 +506,14 @@ class intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return 0;
    }
-    return target_->refcount_.load(std::memory_order_relaxed);
+    return target_->refcount(std::memory_order_relaxed);
  }

  uint32_t weak_use_count() const noexcept {
    if (target_ == NullType::singleton()) {
      return 0;
    }
-    return target_->weakcount_.load(std::memory_order_relaxed);
+    return target_->weakcount(std::memory_order_relaxed);
  }

  bool unique() const noexcept {
@ -518,8 +542,8 @@ class intrusive_ptr final {
   */
  static intrusive_ptr reclaim(TTarget* owning_ptr) {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        owning_ptr == NullType::singleton() ||
-            owning_ptr->refcount_.load() == 0 || owning_ptr->weakcount_.load(),
+        owning_ptr == NullType::singleton() || owning_ptr->refcount() == 0 ||
+            owning_ptr->weakcount(),
        "TTarget violates the invariant that refcount > 0  =>  weakcount > 0");
    return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{});
  }
@ -590,11 +614,11 @@ class intrusive_ptr final {
 #ifdef NDEBUG
    expected_decrefs = 0;
 #endif
-    result.target_->refcount_.store(
-        detail::kImpracticallyHugeReferenceCount + expected_decrefs,
+    result.target_->combined_refcount_.store(
+        detail::refcount(
+            detail::kImpracticallyHugeReferenceCount + expected_decrefs) |
+            detail::kImpracticallyHugeWeakReferenceCount,
        std::memory_order_relaxed);
-    result.target_->weakcount_.store(
-        detail::kImpracticallyHugeReferenceCount, std::memory_order_relaxed);
    return result;
  }

@ -611,7 +635,7 @@ class intrusive_ptr final {
  static intrusive_ptr unsafe_reclaim_from_nonowning(TTarget* raw_ptr) {
    // See Note [Stack allocated intrusive_ptr_target safety]
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-        raw_ptr == NullType::singleton() || raw_ptr->refcount_.load() > 0,
+        raw_ptr == NullType::singleton() || raw_ptr->refcount() > 0,
        "intrusive_ptr: Can only reclaim pointers that are owned by someone");
    auto ptr = reclaim(raw_ptr); // doesn't increase refcount
    ptr.retain_();
@ -745,7 +769,7 @@ class weak_intrusive_ptr final {
  void retain_() {
    if (target_ != NullType::singleton()) {
      uint32_t new_weakcount =
-          detail::atomic_weakcount_increment(target_->weakcount_);
+          detail::atomic_weakcount_increment(target_->combined_refcount_);
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
          new_weakcount != 1,
          "weak_intrusive_ptr: Cannot increase weakcount after it reached zero.");
@ -754,7 +778,7 @@ class weak_intrusive_ptr final {

  void reset_() noexcept {
    if (target_ != NullType::singleton() &&
-        detail::atomic_weakcount_decrement(target_->weakcount_) == 0) {
+        detail::atomic_weakcount_decrement(target_->combined_refcount_) == 0) {
      // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDelete)
      delete target_;
    }
@ -887,7 +911,7 @@ class weak_intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return 0;
    }
-    return target_->refcount_.load(
+    return target_->refcount(
        std::memory_order_relaxed); // refcount, not weakcount!
  }

@ -895,7 +919,7 @@ class weak_intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return 0;
    }
-    return target_->weakcount_.load(std::memory_order_relaxed);
+    return target_->weakcount(std::memory_order_relaxed);
  }

  bool expired() const noexcept {
@ -906,16 +930,17 @@ class weak_intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return intrusive_ptr<TTarget, NullType>();
    } else {
-      auto refcount = target_->refcount_.load(std::memory_order_relaxed);
+      auto combined_refcount =
+          target_->combined_refcount_.load(std::memory_order_relaxed);
      do {
-        if (refcount == 0) {
+        if (detail::refcount(combined_refcount) == 0) {
          // Object already destructed, no strong references left anymore.
          // Return nullptr.
          return intrusive_ptr<TTarget, NullType>();
        }
-      } while (!target_->refcount_.compare_exchange_weak(
-          refcount,
-          refcount + 1,
+      } while (!target_->combined_refcount_.compare_exchange_weak(
+          combined_refcount,
+          combined_refcount + detail::kReferenceCountOne,
          std::memory_order_acquire,
          std::memory_order_relaxed));

@ -952,9 +977,9 @@ class weak_intrusive_ptr final {
    // if refcount == 0, weakcount only must be >0.
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        owning_weak_ptr == NullType::singleton() ||
-            owning_weak_ptr->weakcount_.load() > 1 ||
-            (owning_weak_ptr->refcount_.load() == 0 &&
-             owning_weak_ptr->weakcount_.load() > 0),
+            owning_weak_ptr->weakcount() > 1 ||
+            (owning_weak_ptr->refcount() == 0 &&
+             owning_weak_ptr->weakcount() > 0),
        "weak_intrusive_ptr: Can only weak_intrusive_ptr::reclaim() owning pointers that were created using weak_intrusive_ptr::release().");
    return weak_intrusive_ptr(owning_weak_ptr);
  }
@ -1033,7 +1058,7 @@ namespace intrusive_ptr {
 // NullType::singleton to this function
 inline void incref(intrusive_ptr_target* self) {
  if (self) {
-    detail::atomic_refcount_increment(self->refcount_);
+    detail::atomic_refcount_increment(self->combined_refcount_);
  }
 }

@ -1067,7 +1092,7 @@ inline uint32_t use_count(intrusive_ptr_target* self) {
 namespace weak_intrusive_ptr {

 inline void incref(weak_intrusive_ptr_target* self) {
-  detail::atomic_weakcount_increment(self->weakcount_);
+  detail::atomic_weakcount_increment(self->combined_refcount_);
 }

 inline void decref(weak_intrusive_ptr_target* self) {
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -540,9 +540,11 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
    ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
  )

-  append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
-  if(NOT WIN32)
-    append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
+    endif()
  endif()
 endif()

@ -573,30 +575,32 @@ if(USE_CUDA)
    list(APPEND Caffe2_GPU_SRCS
      ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
  endif()
-  append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
-  if(NOT WIN32)
-    append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
-    set_source_files_properties(
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
-      ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
-      PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
-    )
-  endif()
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
+      set_source_files_properties(
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
+        ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
+        PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
+      )
+    endif()

-  set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
-  # Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
-  if(CMAKE_COMPILER_IS_GNUCXX)
-    set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
-  endif()
-  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
-    set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
+    set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
+    # Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
+    if(CMAKE_COMPILER_IS_GNUCXX)
+      set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
+    endif()
+    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
+      set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
+    endif()
  endif()
  set_source_files_properties(
    ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -629,9 +633,11 @@ if(USE_ROCM)
    list(APPEND Caffe2_HIP_SRCS
      ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
  endif()
-  append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
-  if(NOT WIN32)
-    append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
+    endif()
  endif()
  # caffe2_nvrtc's stubs to driver APIs are useful for HIP.
  # See NOTE [ ATen NVRTC Stub and HIP ]
@ -1352,10 +1358,12 @@ if(BUILD_TEST)
    add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
    add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
    add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
-    add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
-    if(NOT WIN32)
-      add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
-      add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
+    if(USE_DISTRIBUTED)
+      add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
+      if(NOT WIN32)
+        add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
+        add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
+      endif()
    endif()
    if(NOT NO_API)
      add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1460,40 +1468,46 @@ if(BUILD_LITE_INTERPRETER)
  endif()
 endif()

-if(USE_GLOO AND USE_C10D_GLOO)
-  target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
-endif()
-if(USE_UCC AND USE_C10D_UCC)
-  target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
-  if(USE_CUDA)
-    target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
+
+# Pass USE_DISTRIBUTED to torch_cpu, as some code in jit/pickler.cpp and
+# jit/unpickler.cpp needs to be compiled only when USE_DISTRIBUTED is set.
+if(USE_DISTRIBUTED)
+  target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
  endif()
-endif()
-if(USE_NCCL AND USE_C10D_NCCL)
-  if(USE_ROCM)
-    target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
-  else()
-    target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
+  if(USE_UCC AND USE_C10D_UCC)
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
+    if(USE_CUDA)
+      target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
+    endif()
  endif()
-endif()
-if(USE_MPI AND USE_C10D_MPI)
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    set_source_files_properties(
-      "${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
-      PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
+  if(USE_NCCL AND USE_C10D_NCCL)
+    if(USE_ROCM)
+      target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
+    else()
+      target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
+    endif()
+  endif()
+  if(USE_MPI AND USE_C10D_MPI)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      set_source_files_properties(
+        "${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
+        PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
+    endif()
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
+  endif()
+  # Pass USE_RPC in order to reduce use of
+  # #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
+  # need to be removed when RPC is supported
+  if(NOT WIN32)
+    target_compile_definitions(torch_cpu PUBLIC USE_RPC)
+  endif()
+  # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
+  # can only be compiled when USE_TENSORPIPE is set.
+  if(USE_TENSORPIPE)
+    target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
  endif()
-  target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
-endif()
-# Pass USE_RPC in order to reduce use of
-# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
-# need to be removed when RPC is supported
-if(NOT WIN32)
-  target_compile_definitions(torch_cpu PUBLIC USE_RPC)
-endif()
-# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
-# can only be compiled with USE_TENSORPIPE is set.
-if(USE_TENSORPIPE)
-  target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
 endif()

 if(NOT INTERN_BUILD_MOBILE)
--- a/caffe2/perfkernels/batch_box_cox_sve128.cc
+++ b/caffe2/perfkernels/batch_box_cox_sve128.cc
@ -114,14 +114,20 @@ inline float32x4_t vexpq_f32(float32x4_t x) {

  auto poly = svset_neonq(svundef_f32(), vfmaq_f32(scale, p12345, scale));

+  auto pHigh = svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input);
+  auto pLow = svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input);
+
+  auto bound = svsel_f32(
+      pHigh,
+      inf,
+      zero);
+
+  auto pCombined = svorr_b_z(svptrue_b8(), pLow, pHigh);
+
  // Handle underflow and overflow.
  poly = svsel_f32(
-      svcmplt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), min_input),
-      zero,
-      poly);
-  poly = svsel_f32(
-      svcmpgt_f32(svptrue_b8(), svset_neonq(svundef_f32(), x), max_input),
-      inf,
+      pCombined,
+      bound,
      poly);

  return svget_neonq(poly);
--- a/caffe2/perfkernels/batch_box_cox_vec.h
+++ b/caffe2/perfkernels/batch_box_cox_vec.h
@ -73,19 +73,6 @@ void box_cox_zero_lambda(
  }
 }

-template <typename T>
-at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
-    at::vec::Vectorized<T> data,
-    at::vec::Vectorized<T> lambda1,
-    at::vec::Vectorized<T> lambda2,
-    at::vec::Vectorized<T> k_eps) {
-  auto sum = data + lambda2;
-  auto max = at::vec::max(sum, k_eps);
-  auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
-  auto pow = max.pow(lambda1);
-  return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
-}
-
 template <typename T>
 void box_cox_nonzero_lambda(
    int64_t D,
@ -101,18 +88,21 @@ void box_cox_nonzero_lambda(
  auto k_eps_vec = Vec(k_eps);
  for(; j + VLEN < D; j += VLEN) {
    auto data = Vec::loadu(data_ptr + j);
-    auto lambda1 = Vec::loadu(lambda1_ptr + j);
    auto lambda2 = Vec::loadu(lambda2_ptr + j);
-    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j);
+    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
+    auto pow = max.pow(lambda1);
+    auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
    res.store(out + j);
  }
-  if (j < D) {
-    auto remaining = D - j;
-    auto data = Vec::loadu(data_ptr + j, remaining);
-    auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
-    auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
-    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
-    res.store(out + j, remaining);
+  for ( ;j < D; ++j) {
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
+    auto pow = std::pow(max, lambda1_ptr[j]);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
  }
 }
 #else
--- a/Show More
+++ b/Show More