update

update
update
2025-10-23 23:04:52 +08:00 · 2025-09-23 07:09:52 -07:00 · 2025-09-23 06:59:49 -07:00 · 2025-09-22 23:17:03 -07:00 · 2025-09-22 23:16:21 -07:00 · 2025-09-22 23:14:10 -07:00
464 changed files with 17121 additions and 5960 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
    export USE_SYSTEM_NCCL=1
@ -46,6 +45,5 @@ else
        export USE_NVIDIA_PYPI_LIBS=1
    fi

-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -317,7 +317,7 @@ if __name__ == "__main__":
    ).decode()

    print("Building PyTorch wheel")
-    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    build_vars = ""
    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
    if enable_cuda:
        build_vars += "MAX_JOBS=5 "
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -262,13 +262,10 @@ case "$tag" in
    TRITON_CPU=yes
    ;;
  pytorch-linux-jammy-linter)
-    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
-    # We will need to update mypy version eventually, but that's for another day. The task
-    # would be to upgrade mypy to 1.0.0 with Python 3.11
-    PYTHON_VERSION=3.9
+    PYTHON_VERSION=3.10
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
-    PYTHON_VERSION=3.9
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
+    PYTHON_VERSION=3.10
    CUDA_VERSION=12.8.1
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-56392aa978594cc155fa8af48cd949f5b5f1823a
+e0dda9059d082537cee36be6c5e4fe3b18c880c0
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@ -1,2 +1,2 @@
-transformers==4.54.0
+transformers==4.56.0
 soxr==0.5.0
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-5ae38bdb0dc066c5823e34dc9797afb9de42c866
+bbb06c0334a6772b92d24bde54956e675c8c6604
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -42,22 +42,27 @@ install_pip_dependencies() {
  # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
  # numba and scipy version used in PyTorch CI
  conda_run pip uninstall -y numba scipy
+  # Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
+  pip_install yaspin==3.1.0

  popd
 }

 setup_executorch() {
-  pushd executorch
-
  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"

  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
-  popd
 }

-clone_executorch
-install_buck2
-install_conda_dependencies
-install_pip_dependencies
-setup_executorch
+if [ $# -eq 0 ]; then
+  clone_executorch
+  install_buck2
+  install_conda_dependencies
+  install_pip_dependencies
+  pushd executorch
+  setup_executorch
+  popd
+else
+  "$@"
+fi
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:

-mypy==1.16.0
+mypy==1.16.0 ; platform_system != "Windows"
 # Pin MyPy version because new errors are likely to appear with each release
+# Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -41,7 +41,6 @@ def sample_vllm_test_library():
                "pytest -v -s basic_correctness/test_cumem.py",
                "pytest -v -s basic_correctness/test_basic_correctness.py",
                "pytest -v -s basic_correctness/test_cpu_offload.py",
-                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
            ],
        },
        "vllm_basic_models_test": {
@ -68,15 +67,12 @@ def sample_vllm_test_library():
                        "-v",
                        "-s",
                        "entrypoints/llm",
-                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
                        "--ignore=entrypoints/llm/test_generate.py",
-                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
                        "--ignore=entrypoints/llm/test_collective_rpc.py",
                    ]
                ),
-                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
-                "pytest -v -s entrypoints/llm/test_generate.py ",
-                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
+                "pytest -v -s entrypoints/llm/test_generate.py",
+                "pytest -v -s entrypoints/offline_mode",
            ],
        },
        "vllm_regression_test": {
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1550,14 +1550,10 @@ test_executorch() {
  install_torchvision
  install_torchaudio

+  INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
+
  pushd /executorch
-
-  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
-
-  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
-  # from the PR
-  bash .ci/scripts/setup-linux.sh --build-tool cmake
+  "${INSTALL_SCRIPT}" setup_executorch

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
@ -1571,10 +1567,6 @@ test_executorch() {

  popd

-  # Test torchgen generated code for Executorch.
-  echo "Testing ExecuTorch op registration"
-  "$BUILD_BIN_DIR"/test_edge_op_registration
-
  assert_git_not_dirty
 }

--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -137,7 +137,7 @@ sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
  if "%BUILD_ENVIRONMENT%"=="" (
-    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
+    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
  ) else (
    copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

--- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
 ) else (
  set CONDA_PARENT_DIR=C:\Jenkins
 )
-
+set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3

 :: Be conservative here when rolling out the new AMI with conda. This will try
 :: to install conda as before if it couldn't find the conda installation. This
 :: can be removed eventually after we gain enough confidence in the AMI
-if not exist %CONDA_PARENT_DIR%\Miniconda3 (
+if not exist %CONDA_ROOT_DIR% (
  set INSTALL_FRESH_CONDA=1
 )

@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b

-  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
+  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b
 )

 :: Activate conda so that we can use its commands, i.e. conda, python, pip
-call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
+call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
+:: Switch to the py_tmp conda environment so that its python and pip are the ones used from here on
+call conda activate py_tmp
+
+call pip install -r .ci/docker/requirements-ci.txt
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
 :: build\torch. Rather than changing all these references, making a copy of torch folder
 :: from conda to the current workspace is easier. The workspace will be cleaned up after
 :: the job anyway
-xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\

 pushd .
 if "%VC_VERSION%" == "" (
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+
+# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
+# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
+# scipy from 1.6.3 to 1.10
+# expecttest from 0.1.3 to 0.3.0
+# xdoctest from 1.0.2 to 1.3.0
+python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"

 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.15.1.0
@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0

-# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
-python -m pip install expecttest==0.3.0
-
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.github/actions/reuse-old-whl/reuse_old_whl.py
+++ b/.github/actions/reuse-old-whl/reuse_old_whl.py
@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
        change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")

        for file in Path(f"artifacts/dist/{old_stem}").glob(
-            "*.dist-info/**",
+            "*.dist-info/*",
        ):
            change_content_to_new_version(file)

--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -6,6 +6,12 @@ inputs:
  cuda-version:
    description: which cuda version to install, 'cpu' for none
    required: true
+  python-version:
+    required: false
+    type: string
+    default: "3.10"
+    description: |
+      The python version to be used. Will be 3.10 by default

 runs:
  using: composite
@ -38,18 +44,24 @@ runs:
        CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"

        {
+          echo "CONDA=${CONDA}";
          echo "CONDA_RUN=${CONDA} run --no-capture-output";
          echo "CONDA_BUILD=${CONDA} run conda-build";
          echo "CONDA_INSTALL=${CONDA} install";
        } >> "${GITHUB_ENV}"

    - name: Setup Python3
+      env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
      shell: bash
      run: |
        set +e
        set -x

-        PYTHON3=$(${CONDA_RUN} which python3)
+        # Create new py_tmp env with python-version
+        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp
+
+        PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
        EXIT_CODE=$?

        if [[ "${EXIT_CODE}" == "0" ]]; then
@ -62,7 +74,7 @@ runs:
          # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
          # is also the Miniconda installation that is Python 2 based, and both can be installed if
          # needed. In both cases, Python binary is just called python
-          PYTHON=$(${CONDA_RUN} which python)
+          PYTHON=$(${CONDA_RUN} -n py_tmp which python)
          EXIT_CODE=$?

          if [[ "${EXIT_CODE}" == "0" ]]; then
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-d119fc86140785e7efc8f125c17153544d1e0f20
+090197034faf3b193c4467cedeb9281e3078892d
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -130,3 +130,6 @@
 - torch/csrc/inductor/aoti_include/**
 - torchgen/aoti/**
 - torchgen/gen_aoti_c_shim.py
+
+"ciflow/vllm":
+- .github/ci_commit_pins/vllm.txt
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
            arches=["6.4"],
-            python_versions=["3.9"],
+            python_versions=["3.10"],
        ),
        ciflow_config=CIFlowConfig(
            labels={
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -187,8 +187,6 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        with:
-          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
--- a/.github/workflows/_get-changed-files.yml
+++ b/.github/workflows/_get-changed-files.yml
@ -2,6 +2,12 @@ name: Get Changed Files

 on:
  workflow_call:
+    inputs:
+      all_files:
+        description: "Whether to return all files instead of just changed files"
+        required: false
+        type: boolean
+        default: false
    outputs:
      changed-files:
        description: "List of changed files (space-separated) or '*' if not in a PR"
@ -26,17 +32,23 @@ jobs:
            # Get the PR number from the github context
            PR_NUMBER="${{ github.event.number }}"

-            # Use gh CLI to get changed files in the PR with explicit repo
-            CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
+            # Check if all_files is requested
+            if [ "${{ inputs.all_files }}" = "true" ]; then
+              echo "all_files input is true, returning all files"
+              echo "changed-files=*" >> "$GITHUB_OUTPUT"
+            else
+              # Use gh CLI to get changed files in the PR with explicit repo
+              CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')

-            if [ -z "$CHANGED_FILES" ]; then
-              echo "No changed files found, setting to '*'"
-              CHANGED_FILES="*"
+              if [ -z "$CHANGED_FILES" ]; then
+                echo "No changed files found, setting to '*'"
+                CHANGED_FILES="*"
+              fi
+
+              echo "Changed files: $CHANGED_FILES"
+              echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
            fi

-            echo "Changed files: $CHANGED_FILES"
-            echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
-
          else
            echo "Not in PR context, setting changed files to '*'"
            echo "changed-files=*" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -151,7 +151,7 @@ jobs:
          BUILD_WHEEL: 1
          MAX_JOBS: 8
          CUDA_VERSION: ${{ inputs.cuda-version }}
-          PYTHON_VERSION: "3.9"
+          PYTHON_VERSION: "3.10"
          SCCACHE_BUCKET: "ossci-compiler-cache"
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SCCACHE_REGION: us-east-1
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -184,7 +184,7 @@ jobs:
        env:
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          INSTALL_WINDOWS_SDK: 1
-          PYTHON_VERSION: 3.9
+          PYTHON_VERSION: "3.10"
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -70,9 +70,8 @@ jobs:
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
-          # Executorch pin needs update
-          # pytorch-linux-jammy-py3-clang12-executorch,
+          pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
+          pytorch-linux-jammy-py3-clang12-executorch,
          pytorch-linux-jammy-py3.12-triton-cpu,
          pytorch-linux-noble-riscv64-py3.12-gcc14
        ]
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -44,7 +44,7 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_9-rocm6_4-build:
+  manywheel-py3_10-rocm6_4-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
@ -58,16 +58,16 @@ jobs:
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-rocm6_4
+      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel-rocm
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-rocm6_4-test:  # Testing
+  manywheel-py3_10-rocm6_4-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - manywheel-py3_9-rocm6_4-build
+      - manywheel-py3_10-rocm6_4-build
      - get-label-type
    runs-on: linux.rocm.gpu.mi250
    timeout-minutes: 240
@ -82,14 +82,14 @@ jobs:
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
    steps:
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
-          name: manywheel-py3_9-rocm6_4
+          name: manywheel-py3_10-rocm6_4
          path: "${{ runner.temp }}/artifacts/"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -31,6 +31,8 @@ jobs:
    if: github.repository_owner == 'pytorch'
    name: Get changed files
    uses: ./.github/workflows/_get-changed-files.yml
+    with:
+      all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}

  lintrunner-clang:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -53,7 +55,7 @@ jobs:
    with:
      timeout: 120
      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
-      docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
+      docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
      # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
      # to run git rev-parse HEAD~:.ci/docker when a new image is needed
      fetch-depth: 0
@ -264,10 +266,10 @@ jobs:
        with:
          submodules: false
          fetch-depth: 1
-      - name: Setup Python 3.9
+      - name: Setup Python 3.10
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
-          python-version: '3.9'
+          python-version: '3.10'
          architecture: x64
          cache: pip
      - name: Install dependencies
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -127,8 +127,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      # More memory is needed to build with asan
-      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
@ -318,32 +316,6 @@ jobs:
        ]}
    secrets: inherit

-  linux-jammy-py3-clang12-executorch-build:
-    if: false  # Docker build needs pin update
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
-      test-matrix: |
-        { include: [
-          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-py3-clang12-executorch-test:
-    name: linux-jammy-py3-clang12-executorch
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-py3-clang12-executorch-build
-    if: false # Has been broken for a while
-    with:
-      build-environment: linux-jammy-py3-clang12-executorch
-      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
-    secrets: inherit
-
  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
    name: cuda12.8-py3.10-gcc9-sm75
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -140,8 +140,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      # More memory is needed to build with asan
-      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -259,3 +259,27 @@ jobs:
      docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
      test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
    secrets: inherit
+
+  linux-jammy-py3-clang12-executorch-build:
+    name: linux-jammy-py3-clang12-executorch
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3-clang12-executorch
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
+      test-matrix: |
+        { include: [
+          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-py3-clang12-executorch-test:
+    name: linux-jammy-py3-clang12-executorch
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-py3-clang12-executorch-build
+    with:
+      build-environment: linux-jammy-py3-clang12-executorch
+      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/unstable.yml
+++ b/.github/workflows/unstable.yml
@ -53,27 +53,3 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-py3_9-clang9-xla-build:
-    name: linux-jammy-py3_9-clang9-xla
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.9-clang9-xla
-      docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
-      test-matrix: |
-        { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
-        ]}
-    secrets: inherit
-
-  linux-jammy-py3_9-clang9-xla-test:
-    name: linux-jammy-py3_9-clang9-xla
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-py3_9-clang9-xla-build
-    with:
-      build-environment: linux-jammy-py3.9-clang9-xla
-      docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -36,6 +36,8 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
+      # When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
+      allow-reuse-old-whl: false
      build-additional-packages: "vision audio"
      build-external-packages: "vllm"
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
--- a/.gitignore
+++ b/.gitignore
@ -259,6 +259,9 @@ gen
 .pytest_cache
 aten/build/*

+# Linker scripts for prioritized text optimization
+cmake/linker_script.ld
+
 # Bram
 plsdontbreak

--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -123,6 +123,7 @@ is_formatter = true
 code = 'MYPY'
 include_patterns = [
    'setup.py',
+    'functorch/dim/**/*.py',
    'torch/**/*.py',
    'torch/**/*.pyi',
    'caffe2/**/*.py',
@ -195,6 +196,7 @@ exclude_patterns = [
    'tools/test/gen_operators_yaml_test.py',
    'tools/test/gen_oplist_test.py',
    'tools/test/test_selective_build.py',
+    'tools/experimental/dynamic_shapes/torchfuzz/**',
 ]
 command = [
    'python3',
@ -964,7 +966,6 @@ exclude_patterns = [
    'test/jit/**',  # should be run through test/test_jit.py
    'test/ao/sparsity/**',  # should be run through test/test_ao_sparsity.py
    'test/fx/**',  # should be run through test/test_fx.py
-    'test/bottleneck_test/**',  # excluded by test/run_test.py
    'test/package/**',  # excluded by test/run_test.py
    'test/distributed/argparse_util_test.py',
    'test/distributed/bin/test_script.py',
@ -1410,8 +1411,6 @@ exclude_patterns = [
    'torch/utils/benchmark/utils/timer.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
    'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
-    'torch/utils/bottleneck/__init__.py',
-    'torch/utils/bottleneck/__main__.py',
    'torch/utils/bundled_inputs.py',
    'torch/utils/checkpoint.py',
    'torch/utils/collect_env.py',
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,4 @@
 cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
-# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)

 # Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
 # sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -380,6 +379,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
                       OFF "USE_CUDA" OFF)
 cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
                        "CPU_AARCH64" OFF)
+# Prioritized text linker script: ON by default on AArch64 Linux; the option is available on AArch64, x86 and ppc64le builds.
+set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
+if(LINUX AND CPU_AARCH64)
+  set(USE_PRIORITIZED_TEXT_DEFAULT ON)
+endif()
+cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
+  "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)

 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
@ -657,6 +663,11 @@ endif(MSVC)

 string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

+# Set linker max-page-size to 64KiB on AArch64 Linux
+if(LINUX AND CPU_AARCH64)
+  add_link_options_if_supported("-z,max-page-size=0x10000")
+endif()
+
 # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
 # applicable to mobile are disabled by this variable. Setting
 # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -1421,3 +1432,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
  install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
          DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()
+
+# Prioritized text link optimization: build the main libraries with one
+# section per function/data object and link them with a linker script that
+# tools/setup_helpers/generate_linker_script.py produces from
+# cmake/prioritized_text.txt.
+if(USE_PRIORITIZED_TEXT_FOR_LD)
+  set(PRIORITIZED_TEXT_COMPILE_OPTIONS
+    $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
+    $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
+  )
+  # Directory-level compile options only initialize targets that are created
+  # after this call. Keep them for any later targets, and additionally attach
+  # them to the already-defined link targets in the loop below.
+  add_compile_options(${PRIORITIZED_TEXT_COMPILE_OPTIONS})
+
+  # NOTE(review): the generated script is written into the source tree;
+  # consider moving it under CMAKE_BINARY_DIR if nothing else relies on this path.
+  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
+  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
+
+  # Regenerate the linker script whenever the generator or its input changes.
+  add_custom_command(
+    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
+    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
+    DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
+    COMMENT "Generating prioritized text linker files"
+    VERBATIM
+  )
+
+  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+
+  # Start from an empty list so that a value inherited from an enclosing scope
+  # or the cache cannot leak into the set of targets.
+  set(LINKER_OPT_TARGETS "")
+  if(BUILD_PYTHON)
+    list(APPEND LINKER_OPT_TARGETS torch_python)
+  endif()
+
+  if(NOT BUILD_LIBTORCHLESS)
+    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
+    if(USE_CUDA)
+      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
+    endif()
+    if(USE_XPU)
+      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
+    endif()
+    if(USE_ROCM)
+      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
+    endif()
+  endif()
+
+  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
+    if(TARGET ${tgt})
+      # The script must exist before the target links, and a changed script
+      # must trigger a relink (LINK_DEPENDS).
+      add_dependencies("${tgt}" generate_linker_script)
+      target_compile_options(${tgt} PRIVATE ${PRIORITIZED_TEXT_COMPILE_OPTIONS})
+      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
+      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+    else()
+      message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
+    endif()
+  endforeach()
+
+else()
+  # The optimization is disabled: remind AArch64 Linux users that it is the
+  # recommended configuration.
+  if(LINUX AND CPU_AARCH64)
+    message(WARNING [[
+    It is strongly recommended to enable linker script optimization for all AArch64 Linux builds.
+    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+    ]])
+  endif()
+endif()
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
 }

 bool Context::allowTF32CuDNN(const std::string& op) const {
-  if (op.size() == 0){
+  if (op.empty()){
    bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
    bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
    TORCH_CHECK(
@ -281,9 +281,6 @@ bool Context::userEnabledOverrideableSDP() const {

 static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
 static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
-#ifdef USE_ROCM
-static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
-#endif

 bool Context::checkCuBLASConfigDeterministic() {
  // If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -343,12 +340,6 @@ void Context::setImmediateMiopen(bool b) {
 }

 bool Context::allowTF32CuBLAS() const {
-#ifdef USE_ROCM
-    const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
-    if (allow_tf32 != true) {
-      return false;
-    }
-#endif
  bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
  bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
  TORCH_CHECK(
@ -362,14 +353,6 @@ bool Context::allowTF32CuBLAS() const {
 }

 void Context::setAllowTF32CuBLAS(bool b) {
-#ifdef USE_ROCM
-  const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
-  if (allow_tf32 != true) {
-    C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
-                              << "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
-    return;
-  }
-#endif
  float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
  setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
 }
@ -443,7 +426,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
    std::string msg;
    auto iterp = _fp32_precisions.find(backend);
    TORCH_CHECK(iterp != _fp32_precisions.end());
-    for (auto p : iterp->second) {
+    for (const auto& p : iterp->second) {
      msg += p;
      msg += " ";
    }
--- a/aten/src/ATen/DLConvertor.cpp
+++ b/aten/src/ATen/DLConvertor.cpp
@ -401,30 +401,13 @@ T* toDLPackImpl(const Tensor& src) {
  // The following code detects whether the src follows
  // a continuous pattern. If the src follows such pattern (common-case)
  // then we do not need to normalize the strides.
-  bool need_normalize_strides = false;
-  int64_t expected_stride = 1;
-  for (int i = src.dim() - 1; i >= 0; i--) {
-    // detect if we do not meet continuous pattern
-    // and the size is 1, so there is opportunity to normalize
-    if (src.stride(i) != expected_stride && src.size(i) == 1) {
-      need_normalize_strides = true;
-      break;
-    }
-    expected_stride *= src.size(i);
-  }
-
+  bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
  // less common case, try normalizing the strides
  if (need_normalize_strides) {
    // create a new tensor with possibly normalized strides
    // gh-83069
    auto shape = src.sizes();
-    auto strides = src.strides().vec();
-    for (int i = 0; i < src.dim(); i++) {
-      if (shape[i] < 2) {
-        strides[i] = 1;
-      }
-    }
-    view = src.as_strided(shape, strides, src.storage_offset());
+    view = src.as_strided(shape, {1}, src.storage_offset());
  }

  ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
  : c10::TensorImpl(
      c10::DispatchKeySet(DispatchKey::Functionalize),
      view_value.dtype(),
-      view_value.device()
+      base->storage().data_ptr().device()
    ),
    value_(view_value),
    is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@ -485,7 +485,10 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI


 c10::Device FunctionalTensorWrapper::device_custom() const {
-  return value_.unsafeGetTensorImpl()->device();
+  // The storage pointer already uses the underlying tensor custom device (if
+  // applicable) to extract the device. So, we don't have to recurse again by
+  // doing value_.unsafeGetTensorImpl()->device().
+  return storage().data_ptr().device();
 }
 at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
  return value_.unsafeGetTensorImpl()->sizes();
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1637,9 +1637,7 @@ bool gemm_and_bias(
  if (activation == GEMMAndBiasActivationEpilogue::RELU) {
    epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
  } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
-#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
    epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
-#endif
  }

  if (bias != nullptr) {
@ -1931,7 +1929,6 @@ void scaled_gemm(
    bool use_fast_accum) {
  // Note: see `cublasCommonArgs` for various non-intuitive manupulations
  // of input arguments to this function.
-#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
  const auto computeType = CUBLAS_COMPUTE_32F;
  const auto scaleType = CUDA_R_32F;
  const float alpha_val = 1.0;
@ -1954,8 +1951,8 @@ void scaled_gemm(
  #if ROCM_VERSION >= 70000
            if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
                // TODO: add constraints based on hipblaslt internals
-                TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
-                           "Matrix dimensions must be multiples of 32 for MX format. "
+                TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
+                           "M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
                           "Got m=", m, ", n=", n, ", k=", k);
            }
  #endif
@ -2133,8 +2130,6 @@ void scaled_gemm(
      " scaleType ",
      scaleType);
  return;
-#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
-  TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
 }

 void int8_gemm(
--- a/aten/src/ATen/native/EmbeddingBag.h
+++ b/aten/src/ATen/native/EmbeddingBag.h
@ -1,3 +1,4 @@
+#pragma once
 #include <ATen/core/Tensor.h>
 #include <ATen/Config.h>
 #include <cstdint>
--- a/aten/src/ATen/native/MaxUnpooling.cpp
+++ b/aten/src/ATen/native/MaxUnpooling.cpp
@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
  // Nondeterministic with duplicate indices
  at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");

-  auto oheight = output_size[0];
-  auto owidth = output_size[1];
  TORCH_CHECK(
      indices_.scalar_type() == at::ScalarType::Long,
      "elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu(
                self_.sizes(), " with dimension ", i , " being empty.");
  }

+  auto oheight = output_size[0];
+  auto owidth = output_size[1];
+
  auto memory_format = self_.suggest_memory_format();
  auto self = self_.contiguous(memory_format);
  auto indices = indices_.contiguous(memory_format);
--- a/aten/src/ATen/native/PadNd.cpp
+++ b/aten/src/ATen/native/PadNd.cpp
@ -73,7 +73,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value)
    for (const auto i : c10::irange((size_t)l_pad)) {
        auto pad_idx = pad.size() - ((i + 1) * 2);
        auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1];
-        TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
+        TORCH_CHECK(new_dim >= 0, "The input size ", input_sizes[l_diff + i], ", plus negative padding ",
                 pad[pad_idx], " and ", pad[pad_idx + 1], " resulted in a negative output size, "
                 "which is invalid. Check dimension ", l_diff + i, " of your input.");
        new_shape.emplace_back(new_dim);
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -1138,9 +1138,14 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
 bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
   // TODO: We might want to enforce some structure on the shapes of the scale
   // tensors
-  return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
-      && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
-      && scale.is_contiguous());
+  // FP8 path: float8 data with Float8_e8m0fnu scales. The expected scale count
+  // is round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4).
+  bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
+      && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
+  // Packed FP4 path (ROCm builds only): same scale-count formula, except that
+  // t.size(1) is doubled first. Presumably this is because each
+  // Float4_e2m1fn_x2 element packs two 4-bit values (see the packed_factor
+  // comment in _scaled_mm_out_cuda) - confirm.
+  bool is_packed_fp4_path = false;
+#ifdef USE_ROCM
+  is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
+      && scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
+#endif
+  // Either layout additionally requires a contiguous scale tensor.
+  return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
 }

 bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1381,9 +1386,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
    TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");

-    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
-                mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
-                "Matrix dimensions must be multiples of 32 for block-wise scaling");
+    int packed_factor = 1;
+    if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
+      // For float4 data type, each byte stores two 4-bit floating-point values,
+      // effectively packing two elements into one byte.
+      packed_factor = 2;
+    }
+    TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
+                mat2.size(1) % 16 == 0,
+                "M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");

    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
                out.scalar_type() == ScalarType::Half,
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpList.cu
@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
      Op<opmath_t>(),
      alpha.to<opmath_t>());

-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalar.cu
@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
          /* res_arg_index */ 1>(),
      Op<opmath_t>(),
      scalar.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarList.cu
@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
  }

  tensor_lists.emplace_back(tensors.vec());
-  tensor_lists.emplace_back(vec_res);
+  tensor_lists.emplace_back(std::move(vec_res));

  using opmath_t = at::opmath_type<T>;
  multi_tensor_apply<2, opmath_t>(
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
          /* res_arg_index */ 1>(),

      Op<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
+++ b/aten/src/ATen/native/cuda/ForeachBinaryOpScalarTensor.cu
@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
      Op<opmath_t>(),
      scalar.data_ptr<T>(),
      alpha.to<opmath_t>());
-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename T, template <class> class Op>
--- a/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachPointwiseOp.cu
@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
            scalar.to<opmath_t>());
      });

-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }

 template <template <class> class Op>
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
            Op<opmath_t>());
      });

-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }

 #define FOREACH_POINTWISE_OP_SCALAR(NAME, OP)                           \
--- a/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachTernaryOp.cu
@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
    vec_res.emplace_back(at::native::empty_like(t));
  }
  std::vector<std::vector<at::Tensor>> tensor_lists{
-      tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};
+      tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
            LerpFunctor<opmath_t>());
      });

-  return tensor_lists[3];
+  return std::move(tensor_lists[3]);
 }

 void foreach_tensor_lerp_ternary_cuda_(
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
    vec_res.emplace_back(at::native::empty_like(t));
  }
  std::vector<std::vector<at::Tensor>> tensor_lists{
-      tensors1.vec(), tensors2.vec(), vec_res};
+      tensors1.vec(), tensors2.vec(), std::move(vec_res)};

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
            weight.to<opmath_t>());
      });

-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }

 void foreach_tensor_lerp_list_cuda_(
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
    vec_res.emplace_back(at::native::empty_like(t));
  }
  std::vector<std::vector<at::Tensor>> tensor_lists{
-      tensors1.vec(), tensors2.vec(), vec_res};
+      tensors1.vec(), tensors2.vec(), std::move(vec_res)};

  AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
      at::ScalarType::Half,
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
            LerpFunctor<opmath_t>());
      });

-  return tensor_lists[2];
+  return std::move(tensor_lists[2]);
 }

 void foreach_tensor_lerp_scalarlist_cuda_(
--- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
+++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu
@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
          /* res_arg_index */ 1>(),
      Op<opmath_t>());

-  return tensor_lists[1];
+  return std::move(tensor_lists[1]);
 }

 template <typename scalar_t, template <class> class Op>
--- a/aten/src/ATen/native/cuda/MaxUnpooling.cu
+++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu
@ -125,8 +125,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
  TORCH_CHECK(
      indices_.scalar_type() == at::ScalarType::Long,
      "elements in indices should be type int64 but got: ", indices_.scalar_type());
-  auto oheight = output_size[0];
-  auto owidth = output_size[1];

  TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
      indices_arg{indices_, "indices_", 3};
@ -149,6 +147,9 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
      output_size.size() == 2,
      "There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");

+  auto oheight = output_size[0];
+  auto owidth = output_size[1];
+
  int64_t dimw = 2;
  int64_t dimh = 1;
  int64_t numBatch = 1;
@ -217,9 +218,6 @@ static void max_unpooling3d_shape_check(
    IntArrayRef stride,
    IntArrayRef padding,
    const char *fn_name) {
-  int64_t oT = output_size[0];
-  int64_t oH = output_size[1];
-  int64_t oW = output_size[2];
  TORCH_CHECK(
      indices.scalar_type() == at::ScalarType::Long,
      "elements in indices should be type int64 but got: ", indices.scalar_type());
@ -250,6 +248,10 @@ static void max_unpooling3d_shape_check(
      "strides should be greater than zero, but got stride: ",
      stride);

+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+
  int dimw = 3;
  int dimh = 2;
  int dimt = 1;
@ -402,8 +404,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
    const Tensor& indices_,
    IntArrayRef output_size,
    Tensor& grad_input) {
-  int64_t oheight = output_size[0];
-  int64_t owidth = output_size[1];
  TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
  TORCH_CHECK(
      indices_.scalar_type() == at::ScalarType::Long,
@ -426,6 +426,9 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,

  TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());

+  int64_t oheight = output_size[0];
+  int64_t owidth = output_size[1];
+
  int64_t nInputCols, nInputRows, nInputPlane;

  int dimw = 2;
@ -505,13 +508,14 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
    IntArrayRef padding,
    Tensor& grad_input) {
  TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
-  int64_t oT = output_size[0];
-  int64_t oH = output_size[1];
-  int64_t oW = output_size[2];

  max_unpooling3d_shape_check(
    self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");

+  int64_t oT = output_size[0];
+  int64_t oH = output_size[1];
+  int64_t oW = output_size[2];
+
  int batchSize = 0;
  int inputSlices = 0;
  int inputTime = 0;
--- a/aten/src/ATen/native/cuda/Nonzero.cu
+++ b/aten/src/ATen/native/cuda/Nonzero.cu
@ -300,8 +300,6 @@ void nonzero_static_cuda_out_impl(
    int64_t size,
    int64_t fill_value,
    Tensor& out) {
-#if defined(CUDA_VERSION) || defined(USE_ROCM)
-
  Tensor self_contiguous_ = self.contiguous();
  // see comment in nonzero_cuda_out_impl on reqs for out
  bool out_correct_size =
@ -377,9 +375,6 @@ void nonzero_static_cuda_out_impl(
  if (need_to_copy) {
    out.copy_(out_temp);
  }
-#else
-  TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
-#endif
 }

 Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {
--- a/aten/src/ATen/native/cuda/Shape.cu
+++ b/aten/src/ATen/native/cuda/Shape.cu
@ -226,6 +226,38 @@ __global__ void CatArrayBatchedCopy_contig(
    }
 }

+
+// Vectorized batched copy for cat: every thread moves `alignment`-byte vectors
+// (elems_per_vec elements of T each) from one input tensor into `output`.
+//   output       - destination buffer, addressed in bytes
+//   inputs       - per-input metadata; blockIdx.y selects the entry used
+//   os           - output sizes/strides fed to CatArrIndexToOffset; the launch
+//                  site in parallel_cat passes them rescaled to vector units
+//   concatDim    - dimension along which the inputs are concatenated
+//   trailingSize - multiplier applied to the per-input offset and dimSize
+//                  before they are converted to vector units
+// NOTE(review): assumes every pointer and slice size is a multiple of
+// `alignment` bytes; parallel_cat appears to guarantee this through its
+// isInOutAligned checks - confirm before adding new callers.
+template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
+__global__ void CatArrayBatchedCopy_vectorized(
+    char* output,
+    CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
+    TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
+    const int concatDim,
+    IndexType trailingSize) {
+
+    IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
+    // Number of `alignment`-byte vectors in the input handled by this block row.
+    IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
+
+    if(tid >= nElements) return;
+
+    const char * data = (char*)inputs.input[blockIdx.y];
+    // Per-input offset and extent along the concat dimension, scaled by
+    // trailingSize and expressed in vectors rather than elements.
+    IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
+    IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
+    int64_t dataOffset = (int64_t)offset  * alignment; // in bytes
+
+    // Grid-stride loop: each thread advances by the total thread count in x.
+    IndexType stride = gridDim.x * blockDim.x;
+
+    while( tid < nElements){
+      // Map the linear vector index within the input to a byte offset in the output.
+      int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
+                    os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
+      auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
+      at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
+      tid += stride;
+    }
+}
+
+
+
 /*
  Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
  to improve memory bandwidth throughput.
@ -296,12 +328,27 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
  scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
  CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
  TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
+  // If all batches are contiguous we can call a specialized implementation
+  // which requires the input tensor addresses to be aligned to a
+  // 16 Byte boundary.
+
+  constexpr bool isContig = stride_size == 1;
+  bool isAligned = true;
+  constexpr int alignment = 16;

  // Next, let's initialize the size, stride arrays for the output Tensor.
+  // for contig case, we'll canonicalize output strides, so that
+  // we don't have arbitrary strides for dims of size 0
+  size_t stride0 = 1;
  if (memory_format == c10::MemoryFormat::Contiguous) {
-    for (int i = 0; i < nDims; ++i) {
+    for (int i = nDims - 1; i >= 0; --i) {
      outputParam.tensorSize[i] = out.size(i);
-      outputParam.tensorStride[i] = out.stride(i);
+      if (isContig) {
+        outputParam.tensorStride[i] = stride0;
+        stride0 *= out.size(i);
+      } else {
+        outputParam.tensorStride[i] = out.stride(i);
+      }
    }
  } else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
    // permute the semantics of dims from NCHW to NHWC so that the input
@ -320,12 +367,15 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i

  at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();

-  // If all batches are contiguous we can call a specialized implementation
-  // which requires the input tensor addresses to be aligned to a
-  // 16 Byte boundary.

-  bool isContig = true;
-  bool isAligned = true;
+  // for channels last computing slice size correctly is much more involved, so we never send it
+  // on the fully vectorized path
+  // we need output stride in cat dimension to be multiple of alignment,
+  // if we ever use it to compute offsets
+  // for catting in 0th dimension it doesn't matter
+  bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
+                        memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
+                        outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
  unsigned int max_elements_per_tensor = 0;

  // Now we loop
@ -341,6 +391,16 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
      // high-dimensional tensor
      if (inputs[i+batchCounter].get().numel() > 0) {
        dimSize = inputs[i+batchCounter].get().size(dimension);
+        if (isInOutAligned) {
+          auto t = inputs[i+batchCounter].get();
+          // similarly to output stride, we cannot trust stride value to
+          // determine slice size if the corresponding dimension is 1
+          // we have to multiply all the subsequent sizes
+          int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
+             t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
+          slice_size *= sizeof(scalar_t);
+          isInOutAligned &= (slice_size % alignment == 0);
+        }
      }

      catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -351,10 +411,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
 #ifdef USE_ROCM
      // On ROCm, CatArrayBatchedCopy_contig is faster
      isAligned = false;
+      isInOutAligned = false;
 #else
      // If at least one of the inputs is not aligned, we can't call the
      // CatArrayBatchedCopy_alignedK_contig
      isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
+      isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
 #endif

      if (stride_size > 1) {
@ -365,7 +427,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
          catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
        }
        catMetaData.isContiguous[batchCounter] = false;
-        isContig = false;
      } else {
        catMetaData.isContiguous[batchCounter] = true;
      }
@ -388,10 +449,13 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
          max_elements_per_tensor, batchCounter);
 #else
    dim3 applyBlock, catGrid;
-    if (isContig && sizeof(scalar_t) > 2) {
+    if (isInOutAligned) {
+      std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
+        max_elements_per_tensor, batchCounter);
+    } else if (isContig && isAligned && sizeof(scalar_t) > 2) {
      std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
          max_elements_per_tensor, batchCounter);
-    } else if (isContig && sizeof(scalar_t) == 2) {
+    } else if (isContig && isAligned && sizeof(scalar_t) == 2) {
      std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
          max_elements_per_tensor, batchCounter);
    } else {
@ -399,6 +463,30 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
      getCatGrid(batchCounter, catGrid);
    }
 #endif
+    int32_t trailingSize;
+    TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
+    if (isInOutAligned) {
+      // in this case we can and should flatten the tensors after the cat dim
+      // we want to view the tensors as if consisting of `alignment`-sized elements
+      // however, we might not be able to cleanly divide just the last dim -
+      // it might not be the multiple of alignment.
+      // however, we know that the full concatted slice is multiple of alignment,
+      // so if we flatten all the dims after and including concat dim,
+      // it will be divisible by alignment
+      // then we need to divide last out size by elems_per_vec,
+      // and divide all strides except last by elems_per_vec (last stride is 1 always)
+      // for input, we will fix up the sizes and strides in the kernel directly
+      kernelOutputParam = outputParam;
+      nDims = dimension + 1;
+      constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
+      auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
+      kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
+      trailingSize = outputParam.tensorStride[dimension];
+      kernelOutputParam.tensorStride[dimension] = 1;
+      for (int i = 0; i < dimension; ++i) {
+        kernelOutputParam.tensorStride[i] /= elems_per_vec;
+      }
+    }

    if (memory_format != c10::MemoryFormat::Contiguous) {
      switch (dimension) {
@ -413,7 +501,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
    }
    // Template Declarations for dim = 1, 2, 3, 4
 #define HANDLE_CASE(DIMS) \
-    if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
+    if (isInOutAligned) {\
+      constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
+      CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
+      catGrid, applyBlock, 0, stream.stream()>>>(\
+        (char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
+    } else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
      CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
          catGrid, applyBlock, 0, stream.stream()>>>(\
              data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
--- a/aten/src/ATen/native/cuda/SpectralOps.cpp
+++ b/aten/src/ATen/native/cuda/SpectralOps.cpp
@ -221,22 +221,9 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
  std::optional<CuFFTConfig> uncached_plan;
  const CuFFTConfig * config = nullptr;

-  // Workaround for gh-63152, gh-58724
-  // Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
  // Bluestein's algorithm is only used when a size has large prime factors,
  // sizes with only small prime factors can still be cached
-  bool use_caching = true;
-#ifdef CUFFT_VERSION
-  if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
-    // Only cache plans for transforms with small prime factors
-    use_caching = std::none_of(
-        signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
-      return has_large_prime_factor(dim_size);
-    });
-  }
-#endif
-
-  if (use_caching && plan_cache.max_size() > 0) {
+  if (plan_cache.max_size() > 0) {
    guard.lock();
    if (plan_cache.max_size() > 0) {  // check again after acquiring the lock
      config = &plan_cache.lookup(Params);
--- a/aten/src/ATen/native/im2col_shape_check.h
+++ b/aten/src/ATen/native/im2col_shape_check.h
@ -2,6 +2,7 @@
 #include <ATen/core/Tensor.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/div_rtn.h>
+#include <c10/util/safe_numerics.h>

 namespace at::native {

@ -54,6 +55,14 @@ inline void col2im_shape_check(

  int64_t batch_dim = (ndim == 3) ? 0 : -1;
  int64_t n_input_plane = input.size(batch_dim + 1);
+  uint64_t prod_kernel_size = 1;
+
+  TORCH_CHECK(!c10::mul_overflows(static_cast<uint64_t>(kernel_width), static_cast<uint64_t>(kernel_height), &prod_kernel_size),
+            "Given kernel_width = ",
+            kernel_width,
+            " and kernel_height = ",
+            kernel_height,
+            " the product of kernel_width and kernel_height overflowed.");

  if (n_input_plane % (kernel_width * kernel_height) != 0) {
    TORCH_CHECK(false,
--- a/aten/src/ATen/native/mkldnn/xpu/Blas.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp
@ -559,4 +559,60 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
      at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
  return _int_mm_out_xpu(self, mat2, result);
 }
+
+// Weight-only int8 matmul for XPU.
+//
+// A      : 2-D float/half/bfloat16 activation of shape (M, K), unit stride on
+//          the last dimension.
+// B      : 2-D contiguous int8 weight of shape (N, K).
+// scales : 1-D tensor with N entries, forwarded to oneDNN as the weight scales.
+//
+// Returns a freshly allocated (M, N) tensor created with A's options.
+// Raises (via TORCH_CHECK) when any of the shape/dtype requirements above is
+// violated.
+Tensor _weight_int8pack_mm_xpu(
+    const Tensor& A,
+    const Tensor& B,
+    const Tensor& scales) {
+  // Validate ranks before reading any sizes so that malformed inputs hit the
+  // descriptive checks below instead of a generic out-of-range size() error.
+  TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
+  TORCH_CHECK(B.dim() == 2, __func__, " : expect B to be 2D tensor.");
+
+  auto M = A.size(0);
+  auto N = B.size(0);
+  auto K = A.size(1);
+
+  TORCH_CHECK(
+      A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
+      __func__,
+      " : expect A to be either 32-bit or 16-bit float tensor.");
+  TORCH_CHECK(
+      A.stride(1) == 1,
+      __func__,
+      " : A must be contiguous on the last dimension.");
+  TORCH_CHECK(B.dtype() == kChar, __func__, " : expect B to be int8 tensor.");
+  TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous.");
+  TORCH_CHECK(B.size(1) == K, __func__, " : expect B.size(1) == ", K);
+
+  TORCH_CHECK(
+      scales.dim() == 1 && scales.size(0) == N,
+      __func__,
+      " : expect scales to be 1d tensor with size ",
+      N);
+
+  auto C = at::empty({M, N}, A.options());
+
+  // --- Launch kernel ---
+  // No bias and no weight zero points are used: both are passed as undefined
+  // tensors.
+  Tensor bias = at::Tensor();
+  Tensor mat2_zero_points = at::Tensor();
+  // quantized_matmul is handed a non-const Tensor handle for the scales.
+  Tensor non_const_scales = scales;
+  auto post_op_args = torch::List<std::optional<at::Scalar>>();
+
+  at::native::onednn::quantized_matmul(
+      A.contiguous(),
+      1.0,
+      0,
+      B,
+      non_const_scales,
+      mat2_zero_points,
+      bias,
+      C,
+      1.0,
+      0,
+      C.scalar_type(),
+      /*other*/ std::nullopt,
+      /*other scale*/ 1.0,
+      /*other zp*/ 0,
+      /*binary post op*/ "none",
+      /*binary alpha*/ 1.0,
+      /*post_op_name*/ "none",
+      post_op_args,
+      /*post_op_algorithm*/ "none",
+      /*m2_trans*/ false);
+
+  return C;
+}
 } // namespace at::native
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
@ -110,8 +110,9 @@ void quantized_matmul(
  // [Note] Quantized Matrix Multiplication at XPU
  // The following code integrates oneDNN quantized gemm. The quantization
  // config we support:
-  // activation: s8&u8; per tensor calibrated; symmetric&asymmetric
-  // weight: s8; per_tensor/per_channel calibrated; symmetric
+  // activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
+  //   symmetric&asymmetric
+  // weight: s8; per_tensor/per_channel calibrated; symmetric
  auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
  construct_attr_by_post_op(
      binary_post_op,
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
@ -0,0 +1,25 @@
+#pragma once
+#include <c10/metal/common.h>
+
+#ifdef __METAL__
+enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
+#else
+#include <ATen/native/EmbeddingBag.h>
+using at::native::EmbeddingBagMode;
+#endif
+
+template <typename idx_type_t = uint32_t> // idx_type_t: integer type used for every stride and size field
+struct EmbeddingBagParams { // argument block handed from the host launcher to the embedding_bag Metal kernel
+  ::c10::metal::array<idx_type_t, 2> weight_strides; // strides of the 2-D weight tensor
+  ::c10::metal::array<idx_type_t, 2> output_strides; // strides of the 2-D output tensor
+  ::c10::metal::array<idx_type_t, 2> max_indices_strides; // strides of max_indices; the host fills this only for MAX mode
+
+  idx_type_t per_sample_weights_strides; // stride(0) of per_sample_weights; 0 means none were supplied
+
+  idx_type_t num_indices; // length of the indices tensor
+  idx_type_t num_bags; // number of bags, i.e. rows of the output
+  idx_type_t feature_size; // weight.size(1): number of values per embedding row
+
+  EmbeddingBagMode mode; // SUM, MEAN or MAX; the kernel switches on this value
+  int64_t padding_idx; // indices equal to this value are left out of the reduction
+};
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
@ -0,0 +1,212 @@
+#include <ATen/native/mps/kernels/EmbeddingBag.h>
+#include <c10/metal/utils.h>
+#include <metal_array>
+#include <metal_stdlib>
+
+using namespace metal;
+using namespace c10::metal;
+
+// Starting value of the per-thread accumulator for reduction mode M.
+// The primary template covers every mode without a specialization
+// (SUM and MEAN) and starts the accumulation from zero.
+template <EmbeddingBagMode M, typename T>
+struct ReductionOpInit {
+  inline opmath_t<T> operator()() {
+    return static_cast<opmath_t<T>>(0);
+  }
+};
+
+// MAX starts from negative infinity, the identity element of max().
+template <typename T>
+struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
+  inline opmath_t<T> operator()() {
+    const opmath_t<T> lowest = static_cast<opmath_t<T>>(-INFINITY);
+    return lowest;
+  }
+};
+
+// Folds one embedding value into the running accumulator for mode M.
+// Only the specializations below are defined; the primary template merely
+// fixes the call signature they all share.
+template <EmbeddingBagMode M, typename T>
+struct ReductionOp {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_strides);
+};
+
+// SUM: accumulator + value, with the value optionally scaled by its
+// per-sample weight.
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::SUM, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_strides) {
+    // A zero stride means no per-sample weights: plain accumulation.
+    if (per_sample_weights_strides == 0) {
+      return static_cast<opmath_t<T>>(weight_val) + out_val;
+    }
+
+    const T scale = per_sample_weights
+        [per_sample_weights_strides * per_sample_weights_index];
+    return static_cast<opmath_t<T>>(scale) *
+        static_cast<opmath_t<T>>(weight_val) +
+        out_val;
+  }
+};
+
+// MEAN: accumulates exactly like an unweighted SUM; the division by the bag
+// size is done by ReductionOpFinal.
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::MEAN, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t /*per_sample_weights_index*/,
+      constant T* /*per_sample_weights*/,
+      uint32_t /*per_sample_weights_strides*/) {
+    const opmath_t<T> addend = static_cast<opmath_t<T>>(weight_val);
+    return addend + out_val;
+  }
+};
+
+// MAX: keeps the larger of the accumulator and the new value.
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::MAX, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t /*per_sample_weights_index*/,
+      constant T* /*per_sample_weights*/,
+      uint32_t /*per_sample_weights_strides*/) {
+    const opmath_t<T> candidate = static_cast<opmath_t<T>>(weight_val);
+    return max(candidate, out_val);
+  }
+};
+
+// Converts the finished accumulator into the stored output value.
+// Default (used by SUM): cast back to T; the element count is ignored.
+template <EmbeddingBagMode M, typename T>
+struct ReductionOpFinal {
+  inline T operator()(opmath_t<T> val, uint32_t /*count*/) {
+    return static_cast<T>(val);
+  }
+};
+
+// MEAN: divide the sum by the number of accumulated entries; a bag with no
+// entries yields 0.
+template <typename T>
+struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
+  inline T operator()(opmath_t<T> val, uint32_t count) {
+    if (count == 0) {
+      return static_cast<T>(0);
+    }
+    return static_cast<T>(val / count);
+  }
+};
+
+// MAX: a bag with no entries yields 0 instead of the -infinity start value.
+template <typename T>
+struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
+  inline T operator()(opmath_t<T> val, uint32_t count) {
+    if (count == 0) {
+      return static_cast<T>(0);
+    }
+    return static_cast<T>(val);
+  }
+};
+
+template <EmbeddingBagMode M, typename T, typename I> // M: reduction mode, T: weight/output type, I: index/offset type
+void embedding_bag_impl( // per-thread body: reduces one feature of one bag into a single output element
+    constant T* weight,
+    constant I* indices,
+    constant I* offsets,
+    constant T* per_sample_weights,
+    device T* output,
+    device I* offset2bag, // not accessed in this function
+    device I* bag_size, // not accessed in this function
+    device I* max_indices, // not accessed in this function
+    constant EmbeddingBagParams<uint32_t>& params,
+    uint tid) {
+  auto num_indices = params.num_indices;
+  auto num_bags = params.num_bags;
+  auto feature_size = params.feature_size;
+  auto padding_idx = params.padding_idx;
+  auto per_sample_weights_strides = params.per_sample_weights_strides;
+  constant auto& output_strides = params.output_strides;
+  constant auto& weight_strides = params.weight_strides;
+  constant auto& max_indices_strides = params.max_indices_strides; // bound but unused below
+
+  auto bag_idx = tid / feature_size; // tid enumerates (bag, feature) pairs, feature varying fastest
+  auto feature_idx = tid % feature_size;
+
+  output += bag_idx * output_strides[0] + feature_idx * output_strides[1]; // advance to this thread's output element
+
+  uint32_t offsets_end = min(bag_idx + 1, num_bags - 1); // clamped so the offsets read below never uses an index >= num_bags
+  bool is_last_bag = bag_idx + 1 == num_bags;
+  uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
+  uint32_t indices_end = is_last_bag * (num_indices) + // branch-free select: the last bag runs to num_indices,
+      (!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end])); // any other bag ends where the next one starts
+
+  auto out_val = ReductionOpInit<M, T>()();
+
+  uint32_t bag_size_ = 0; // number of non-padding entries seen in this bag
+
+  for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
+       indices_idx++) {
+    I weight_idx = indices[indices_idx];
+    bool pad = (weight_idx == padding_idx);
+    T weight_val = weight // loaded even for padding entries; the result is discarded below when pad is true
+        [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
+         feature_idx * weight_strides[1]];
+
+    bag_size_ += static_cast<uint32_t>(!pad);
+
+    auto tmp_val = ReductionOp<M, T>()(
+        weight_val,
+        out_val,
+        indices_idx,
+        per_sample_weights,
+        per_sample_weights_strides);
+
+    out_val = pad ? out_val : tmp_val; // keep the previous accumulator for padding entries
+  }
+
+  *output = ReductionOpFinal<M, T>()(out_val, bag_size_); // finalize (e.g. divide for MEAN) and store
+}
+
+#define DISPATCH_IMPL(MODE)        \
+  return embedding_bag_impl<MODE>( \
+      weight,                      \
+      indices,                     \
+      offsets,                     \
+      per_sample_weights,          \
+      output,                      \
+      offset2bag,                  \
+      bag_size,                    \
+      max_indices,                 \
+      params,                      \
+      tid)
+
+template <typename T, typename I> // kernel entry point: forwards to the embedding_bag_impl instantiation matching params.mode
+kernel void embedding_bag(
+    constant T* weight [[buffer(0)]],
+    constant I* indices [[buffer(1)]],
+    constant I* offsets [[buffer(2)]],
+    constant T* per_sample_weights [[buffer(3)]],
+    device T* output [[buffer(4)]],
+    device I* offset2bag [[buffer(5)]],
+    device I* bag_size [[buffer(6)]],
+    device I* max_indices [[buffer(7)]],
+    constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
+    uint tid [[thread_position_in_grid]]) {
+  switch (params.mode) { // turns the runtime mode into a compile-time template argument
+    case EmbeddingBagMode::SUM:
+      DISPATCH_IMPL(EmbeddingBagMode::SUM); // expands to a return statement, so there is no fall-through
+    case EmbeddingBagMode::MEAN:
+      DISPATCH_IMPL(EmbeddingBagMode::MEAN);
+    case EmbeddingBagMode::MAX:
+      DISPATCH_IMPL(EmbeddingBagMode::MAX);
+  }
+}
+
+#define REGISTER_EMBEDDING_BAG_OP(T, I)                             \
+  template [[host_name("embedding_bag_" #T "_" #I)]]                \
+  kernel void embedding_bag<T, I>(                                  \
+      constant T * weight [[buffer(0)]],                            \
+      constant I * indices [[buffer(1)]],                           \
+      constant I * offsets [[buffer(2)]],                           \
+      constant T * per_sample_weights [[buffer(3)]],                \
+      device T * output [[buffer(4)]],                              \
+      device I * offset2bag [[buffer(5)]],                          \
+      device I * bag_size [[buffer(6)]],                            \
+      device I * max_indices [[buffer(7)]],                         \
+      constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
+      uint tid [[thread_position_in_grid]]);
+
+REGISTER_EMBEDDING_BAG_OP(float, int); // explicit instantiation exported as "embedding_bag_float_int"
+REGISTER_EMBEDDING_BAG_OP(float, long); // exported as "embedding_bag_float_long"
+REGISTER_EMBEDDING_BAG_OP(half, int); // exported as "embedding_bag_half_int"
+REGISTER_EMBEDDING_BAG_OP(half, long); // exported as "embedding_bag_half_long"
+REGISTER_EMBEDDING_BAG_OP(bfloat, int); // exported as "embedding_bag_bfloat_int"
+REGISTER_EMBEDDING_BAG_OP(bfloat, long); // exported as "embedding_bag_bfloat_long"
--- a/aten/src/ATen/native/mps/operations/Convolution.mm
+++ b/aten/src/ATen/native/mps/operations/Convolution.mm
@ -198,7 +198,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,

    if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
      inputNDArray = getMPSNDArray(input_t, inputShape);
-      outputNDArray = getMPSNDArray(*output, outputShape);
+      outputNDArray = getMPSNDArray(output_t, outputShape);
    }

    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -302,7 +302,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
      }
    }
    auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
-                                           : Placeholder(cachedGraph->outputTensor_, *output);
+                                           : Placeholder(cachedGraph->outputTensor_, output_t);

    NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
        [[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
@ -315,7 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
    runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
  }

-  return *output;
+  return output_t;
 }

 Tensor _mps_convolution(const Tensor& input_t,
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -0,0 +1,179 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/TensorUtils.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/mps/MPSProfiler.h>
+#include <ATen/native/EmbeddingBag.h>
+#include <ATen/native/Pool.h>
+#include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/kernels/EmbeddingBag.h>
+
+#include <fmt/format.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/_embedding_bag_forward_only_native.h>
+#include <ATen/ops/_embedding_bag_native.h>
+#include <ATen/ops/empty.h>
+#endif
+
+namespace at::native {
+
+#ifndef PYTORCH_JIT_COMPILE_SHADERS
+static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
+#else
+#include <ATen/native/mps/EmbeddingBag_metallib.h>
+#endif
+
+namespace {
+
+// Brings indices and offsets to their common promoted scalar type.
+// A tensor that already has that type is returned as-is; otherwise it is
+// converted with toType(). Returns {indices, offsets} in that order.
+std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
+  const auto targetType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
+
+  Tensor promotedIndices = indices;
+  if (indices.scalar_type() != targetType) {
+    promotedIndices = indices.toType(targetType);
+  }
+
+  Tensor promotedOffsets = offsets;
+  if (offsets.scalar_type() != targetType) {
+    promotedOffsets = offsets.toType(targetType);
+  }
+
+  return {promotedIndices, promotedOffsets};
+}
+
+} // namespace
+
+namespace mps {
+
+// Host-side launcher for the embedding_bag Metal kernel.
+//
+// weight                 : 2-D embedding table.
+// indices_ / offsets_    : 1-D int32/int64 tensors; promoted to a common type
+//                          before use.
+// scale_grad_by_freq,
+// sparse                 : accepted for signature compatibility; not read here.
+// mode                   : EmbeddingBagMode value (SUM / MEAN / MAX).
+// per_sample_weights_opt : optional per-index weights; only stride(0) is
+//                          forwarded to the kernel.
+// include_last_offset    : when true the last entry of offsets is not a bag,
+//                          so the number of bags is offsets.size(0) - 1.
+// padding_idx            : index value the kernel excludes from the reduction.
+//
+// Returns (output, offset2bag, bag_size, max_indices). output has shape
+// {num_bags, feature_size}. The other three tensors are allocated here and
+// bound to the kernel.
+// NOTE(review): the kernel visible in this change does not write offset2bag,
+// bag_size or max_indices - confirm whether callers rely on their contents.
+//
+// Raises (via TORCH_CHECK / safe_downcast) on wrong ranks or dtypes, and when
+// a size or stride does not fit the 32-bit fields of EmbeddingBagParams.
+static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
+    const Tensor& weight,
+    const Tensor& indices_,
+    const Tensor& offsets_,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    bool sparse,
+    const std::optional<Tensor>& per_sample_weights_opt,
+    bool include_last_offset,
+    int64_t padding_idx) {
+  TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
+  TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
+  TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());
+
+  Tensor indices, offsets;
+  std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
+  auto indices_arg = TensorArg(indices, "indices", 1);
+  checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
+  auto offsets_arg = TensorArg(offsets, "offsets", 1);
+  checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
+  checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
+
+  int64_t num_indices = indices.size(0);
+  int64_t num_bags = offsets.size(0);
+  if (include_last_offset) {
+    num_bags -= 1;
+  }
+  int64_t feature_size = weight.size(1);
+
+  auto bag_size = at::empty(offsets.sizes(), indices.options());
+  auto offset2bag = at::empty({indices.size(0)}, indices.options());
+  auto output = at::empty({num_bags, feature_size}, weight.options());
+
+  Tensor max_indices;
+
+  if (mode == EmbeddingBagMode::MAX) {
+    max_indices = at::empty({num_bags, feature_size}, indices.options());
+  } else {
+    max_indices = at::empty({0}, indices.options());
+  }
+
+  EmbeddingBagParams<uint32_t> params;
+
+  for (const auto dim : c10::irange(weight.dim())) {
+    params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
+    params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));
+
+    if (mode == EmbeddingBagMode::MAX) {
+      params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
+    }
+  }
+
+  // Every 32-bit field is filled through safe_downcast so that an
+  // out-of-range int64 value raises instead of being silently truncated.
+  // NOTE(review): the kernel treats a stride of 0 as "no per-sample weights",
+  // so a defined per_sample_weights tensor whose stride(0) is 0 would be
+  // ignored - confirm whether such inputs can reach this point.
+  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
+  params.per_sample_weights_strides =
+      use_per_sample_weights ? safe_downcast<uint32_t, int64_t>(per_sample_weights_opt->stride(0)) : 0;
+
+  params.num_indices = safe_downcast<uint32_t, int64_t>(num_indices);
+  params.num_bags = safe_downcast<uint32_t, int64_t>(num_bags);
+  params.feature_size = safe_downcast<uint32_t, int64_t>(feature_size);
+  params.mode = static_cast<EmbeddingBagMode>(mode);
+  params.padding_idx = padding_idx;
+
+  // One thread per output element.
+  auto num_threads = output.numel();
+  MPSStream* stream = getCurrentMPSStream();
+
+  dispatch_sync_with_rethrow(stream->queue(), ^() {
+    @autoreleasepool {
+      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
+      // The pipeline name must match a host_name registered in the shader.
+      auto pipeline_state = lib.getPipelineStateForFunc(
+          fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));
+
+      getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
+      [computeEncoder setComputePipelineState:pipeline_state];
+      // Argument order corresponds to buffer(0)..buffer(8) in the kernel.
+      mtl_setArgs(computeEncoder,
+                  weight,
+                  indices,
+                  offsets,
+                  use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
+                  output,
+                  offset2bag,
+                  bag_size,
+                  max_indices,
+                  params);
+
+      mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
+      getMPSProfiler().endProfileKernel(pipeline_state);
+    }
+  });
+
+  return std::tuple<Tensor, Tensor, Tensor, Tensor>(
+      std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
+}
+
+} // namespace mps
+
+// MPS entry point for _embedding_bag: hands every argument, unchanged and in
+// the same order, to mps::_embedding_bag_mps_impl and returns its result.
+std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    bool sparse,
+    const std::optional<Tensor>& per_sample_weights_opt,
+    bool include_last_offset,
+    int64_t padding_idx) {
+  auto results = mps::_embedding_bag_mps_impl(
+      weight,
+      indices,
+      offsets,
+      scale_grad_by_freq,
+      mode,
+      sparse,
+      per_sample_weights_opt,
+      include_last_offset,
+      padding_idx);
+  return results;
+}
+
+// MPS entry point for _embedding_bag_forward_only: delegates to
+// _embedding_bag_mps with the arguments unchanged and in the same order.
+std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    bool sparse,
+    const std::optional<Tensor>& per_sample_weights_opt,
+    bool include_last_offset,
+    int64_t padding_idx) {
+  auto results = _embedding_bag_mps(
+      weight,
+      indices,
+      offsets,
+      scale_grad_by_freq,
+      mode,
+      sparse,
+      per_sample_weights_opt,
+      include_last_offset,
+      padding_idx);
+  return results;
+}
+
+} // namespace at::native
--- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
+++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm
@ -20,6 +20,7 @@
 #include <ATen/ops/baddbmm_native.h>
 #include <ATen/ops/bmm_native.h>
 #include <ATen/ops/cholesky_native.h>
+#include <ATen/ops/eye_native.h>
 #include <ATen/ops/linalg_cholesky_ex_native.h>
 #include <ATen/ops/linalg_inv_ex_native.h>
 #include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -496,26 +497,24 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
  using namespace mps;
  TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
  TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
-  using CachedGraph = MPSUnaryCachedGraph;

-  MPSStream* stream = getCurrentMPSStream();
  info.zero_();
-
  if (A.numel() == 0) {
    return;
  }

-  if (!result.is_contiguous()) {
-    result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
-  }
  auto A_sizes = A.sizes();
  int ndim = A.dim();

-  Tensor LU = empty_like(A);
-  Tensor identity = zeros_like(A);
+  Tensor LU = empty_like(A, MemoryFormat::Contiguous);
+  Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A);
  Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
-  (ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
-  linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
+  // need to do this to keep the strides of the result tensor
+  // mps's solve expects row major layout, while inductor
+  // expects result to be column major
+  Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
+  linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
+  result.copy_(tmp);
 }

 static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {
--- a/aten/src/ATen/native/mps/operations/Pooling.mm
+++ b/aten/src/ATen/native/mps/operations/Pooling.mm
@ -519,6 +519,13 @@ static void max_unpool_out_mps_template(const Tensor& input,
                                        Tensor& output,
                                        const int32_t pooling_dims,
                                        const std::string& op_name) {
+  // output_size_ must name exactly one extent per pooled dimension. The ": "
+  // keeps op_name from running straight into the message text.
+  TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
+              op_name,
+              ": There should be exactly ",
+              pooling_dims,
+              " elements but got ",
+              output_size_.size());
+
  auto dims = input.dim();
  auto leading_dims = input.dim() - pooling_dims;

@ -534,6 +541,18 @@ static void max_unpool_out_mps_template(const Tensor& input,
  output.resize_(output_size, memory_format);
  output.fill_(0);

+  if (indices.defined() && indices.numel() > 0) {
+    auto output_image_size = c10::multiply_integers(output_size_);
+
+    int64_t min_idx = indices.min().item<int64_t>();
+    int64_t max_idx = indices.max().item<int64_t>();
+
+    if (min_idx < 0 || max_idx >= output_image_size) {
+      int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
+      TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
+    }
+  }
+
  id<MTLDevice> device = MPSDevice::getInstance()->device();
  MPSStream* mpsStream = getCurrentMPSStream();
  const auto numThreads = input.numel();
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2351,6 +2351,7 @@
  dispatch:
    CPU: _embedding_bag_forward_only_cpu
    CUDA: _embedding_bag_forward_only_cuda
+    MPS: _embedding_bag_forward_only_mps
  autogen: _embedding_bag_forward_only.out

 - func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@ -2372,6 +2373,7 @@
  dispatch:
    CPU: _embedding_bag_cpu
    CUDA: _embedding_bag_cuda
+    MPS: _embedding_bag_mps
  autogen: _embedding_bag.out
  tags: core

@ -4241,6 +4243,7 @@
    CPU: _weight_int8pack_mm_cpu
    CUDA: _weight_int8pack_mm_cuda
    MPS: _weight_int8pack_mm_mps
+    XPU: _weight_int8pack_mm_xpu

 - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
  python_module: sparse
@ -10846,6 +10849,7 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
    CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
+    MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
  autogen: _foreach_maximum.Scalar_out

 # foreach_minimum/maximum dispatches to clamp_max/min
--- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
+++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@ -64,7 +64,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  // create sparse descriptor, dtype
  cusparseLtMatDescriptor_t sparse_input_descriptor;
  cudaDataType type;
-  auto compression_factor = 9;

  #ifdef USE_ROCM
  TORCH_CHECK(isHipSparseLtSupported());
@ -73,7 +72,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  switch (sparse_input.scalar_type()) {
    case at::ScalarType::Char:
      type = CUDA_R_8I;
-      compression_factor = 10;
      break;
    case at::ScalarType::Half:
      type = CUDA_R_16F;
@ -89,7 +87,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
 #if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602 && !defined(USE_ROCM)
    case at::ScalarType::Float8_e4m3fn:
      type = CUDA_R_8F_E4M3;
-      compression_factor = 10;
      break;
 #endif
    default:
@ -97,10 +94,6 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
      break;
  }

-  // create a new compressed tensor with the same dtype as
-  auto compressed_tensor =
-      sparse_input.new_empty(sparse_input.numel() * compression_factor / 16);
-
  TORCH_CUDASPARSE_CHECK(cusparseLtStructuredDescriptorInit(
      &handle,
      &sparse_input_descriptor,
@ -121,6 +114,15 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
      &compressed_size,
      &compressed_buffer_size));

+  // create a new compressed tensor with the same dtype as the input,
+  // and with packed data/metadata stored in an array with original
+  // number of rows, and sufficient columns to provide compressed_size
+  // buffer (in bytes)
+  size_t orig_m = sparse_input.size(0);
+  size_t div = orig_m * sparse_input.itemsize(); // bytes taken by one column across all orig_m rows
+  size_t new_n = (compressed_size + div - 1) / div; // ceiling division: round up so orig_m * new_n elements cover compressed_size bytes
+  auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
+
  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  auto compressedBufferPtr = allocator.allocate(compressed_buffer_size);
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@ -165,7 +167,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
  cudaDataType output_type;
  cudaDataType C_type;
  cusparseComputeType compute_type;
-  auto compression_factor = 9;

  #ifdef USE_ROCM
  TORCH_CHECK(isHipSparseLtSupported());
@ -177,7 +178,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
      output_type = CUDA_R_8I;
      C_type = CUDA_R_8I;
      compute_type = CUSPARSE_COMPUTE_32I;
-      compression_factor = 10;
      break;

 // cuSPARSELt v0.5.2 onwards changes CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUT_16F
@ -210,7 +210,6 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
      output_type = CUDA_R_8F_E4M3;
      C_type = CUDA_R_16F;
      compute_type = CUSPARSE_COMPUTE_32F;
-      compression_factor = 10;
      break;
 #endif
 // cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -300,9 +299,10 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
    }
  }

+  TORCH_INTERNAL_ASSERT(compressed_A.dim() == 2); // encoded M x S
  int64_t k = dense_B.size(0);
  int64_t n = dense_B.size(1);
-  int64_t m = (compressed_A.numel() * 16 / compression_factor) / k;
+  int64_t m = compressed_A.size(0);

  // initialize sparse descriptor
  cusparseLtMatDescriptor_t sparse_input_descriptor;
--- a/aten/src/ATen/test/cuda_allocator_test.cpp
+++ b/aten/src/ATen/test/cuda_allocator_test.cpp
@ -5,51 +5,6 @@

 #include <ATen/test/allocator_clone_test.h>

-#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
-
 TEST(AllocatorTestCUDA, test_clone) {
  test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
-
-static int called_dummy_free_0 = 0;
-static int called_dummy_free_1 = 0;
-
-void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
-void dummy_free_0(void* data, size_t size, int device, void* stream) {
-  called_dummy_free_0++;
-}
-void dummy_free_1(void* data, size_t size, int device, void* stream) {
-  called_dummy_free_1++;
-}
-
-// Tests that data_ptrs have their respective deleters
-// when mixing allocators
-TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
-  // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
-  auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
-  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
-  at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
-
-  // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
-  auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
-  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
-  at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
-
-  // Manually use a's deleter
-  auto* ctx = a.storage().data_ptr().get_context();
-  a.storage().data_ptr().get_deleter()(ctx);
-  a.storage().mutable_data_ptr().release_context();
-
-  // a's deleter is dummy_free_0
-  // dummy_free_0 should be called above, so called_dummy_free_0 should be 1
-  ASSERT_TRUE(called_dummy_free_0 == 1);
-
-  // Manually use b's deleter
-  ctx = b.storage().data_ptr().get_context();
-  b.storage().data_ptr().get_deleter()(ctx);
-  b.storage().mutable_data_ptr().release_context();
-
-  // b's deleter is dummy_free_1
-  // dummy_free_1 should be called above, so called_dummy_free_1 should be 1
-  ASSERT_TRUE(called_dummy_free_1 == 1);
-}
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@ -174,11 +174,11 @@ YituTechConvBert,pass,0



-meta-llama/Llama-3.2-1B,pass,5
+meta-llama/Llama-3.2-1B,pass,0



-google/gemma-2-2b,pass,5
+google/gemma-2-2b,pass,0



@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0



-openai/whisper-tiny,pass,6
+openai/whisper-tiny,pass,0



-Qwen/Qwen3-0.6B,pass,5
+Qwen/Qwen3-0.6B,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,8
+hf_Reformer,pass,5



@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,25
+hf_Reformer,pass,20



--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@ -170,15 +170,15 @@ YituTechConvBert,pass,0



-meta-llama/Llama-3.2-1B,fail_accuracy,0
+meta-llama/Llama-3.2-1B,fail_to_run,0



-google/gemma-2-2b,fail_accuracy,0
+google/gemma-2-2b,fail_to_run,0



-google/gemma-3-4b-it,fail_accuracy,0
+google/gemma-3-4b-it,fail_to_run,0



@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0



-Qwen/Qwen3-0.6B,fail_accuracy,0
+Qwen/Qwen3-0.6B,fail_to_run,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
@ -138,7 +138,7 @@ hf_Bert_large,pass,0



-hf_BigBird,pass,25
+hf_BigBird,pass,27



@ -158,7 +158,7 @@ hf_Longformer,pass,4



-hf_Reformer,pass,8
+hf_Reformer,pass,5



--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
@ -138,7 +138,7 @@ hf_Bert_large,pass,0



-hf_BigBird,pass,25
+hf_BigBird,pass,27



@ -158,7 +158,7 @@ hf_Longformer,pass,4



-hf_Reformer,pass,8
+hf_Reformer,pass,5



--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
@ -138,7 +138,7 @@ hf_Bert_large,pass,0



-hf_BigBird,pass,25
+hf_BigBird,pass,27



@ -158,7 +158,7 @@ hf_Longformer,pass,4



-hf_Reformer,pass,8
+hf_Reformer,pass,5



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@ -174,11 +174,11 @@ YituTechConvBert,pass,0



-meta-llama/Llama-3.2-1B,pass,5
+meta-llama/Llama-3.2-1B,pass,0



-google/gemma-2-2b,pass,5
+google/gemma-2-2b,pass,0



@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0



-openai/whisper-tiny,pass,6
+openai/whisper-tiny,pass,0



-Qwen/Qwen3-0.6B,pass,5
+Qwen/Qwen3-0.6B,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,8
+hf_Reformer,pass,5



@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,25
+hf_Reformer,pass,20



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
@ -122,7 +122,7 @@ hf_Bert_large,pass,0



-hf_BigBird,pass,25
+hf_BigBird,pass,27



@ -142,7 +142,7 @@ hf_Longformer,pass,4



-hf_Reformer,pass,8
+hf_Reformer,pass,5



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@ -174,11 +174,11 @@ YituTechConvBert,pass,0



-meta-llama/Llama-3.2-1B,pass,5
+meta-llama/Llama-3.2-1B,pass,0



-google/gemma-2-2b,pass,5
+google/gemma-2-2b,pass,0



@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0



-openai/whisper-tiny,pass,6
+openai/whisper-tiny,pass,0



-Qwen/Qwen3-0.6B,pass,5
+Qwen/Qwen3-0.6B,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,8
+hf_Reformer,pass,5



@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,25
+hf_Reformer,pass,20



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@ -174,11 +174,11 @@ YituTechConvBert,pass,0



-meta-llama/Llama-3.2-1B,pass,5
+meta-llama/Llama-3.2-1B,pass,0



-google/gemma-2-2b,pass,5
+google/gemma-2-2b,pass,0



@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0



-openai/whisper-tiny,pass,6
+openai/whisper-tiny,pass,0



-Qwen/Qwen3-0.6B,pass,5
+Qwen/Qwen3-0.6B,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_training.csv
@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5


 YituTechConvBert,pass,5
+
+
+
+meta-llama/Llama-3.2-1B,eager_fail_to_run,0
+
+
+
+google/gemma-2-2b,eager_fail_to_run,0
+
+
+
+google/gemma-3-4b-it,eager_fail_to_run,0
+
+
+
+openai/whisper-tiny,eager_fail_to_run,0
+
+
+
+Qwen/Qwen3-0.6B,eager_fail_to_run,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,8
+hf_Reformer,pass,5



@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,25
+hf_Reformer,pass,20



--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@ -174,11 +174,11 @@ YituTechConvBert,pass,0



-meta-llama/Llama-3.2-1B,pass,5
+meta-llama/Llama-3.2-1B,pass,0



-google/gemma-2-2b,pass,5
+google/gemma-2-2b,pass,0



@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0



-openai/whisper-tiny,pass,6
+openai/whisper-tiny,pass,0



-Qwen/Qwen3-0.6B,pass,5
+Qwen/Qwen3-0.6B,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,8
+hf_Reformer,pass,5



@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0



-hf_Reformer,pass,25
+hf_Reformer,pass,20



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
@ -181,7 +181,7 @@ hf_T5_base,pass,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



@ -205,7 +205,7 @@ llama,pass,0



-llama_v2_7b_16h,model_fail_to_load,0
+llama_v2_7b_16h,pass_due_to_skip,0



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_inductor_torchbench_inference.csv
@ -178,7 +178,7 @@ llama,fail_to_run,0



-llama_v2_7b_16h,model_fail_to_load,0
+llama_v2_7b_16h,pass_due_to_skip,0



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
@ -181,7 +181,7 @@ hf_T5_base,pass,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
@ -198,7 +198,7 @@ llama,pass,0



-llama_v2_7b_16h,model_fail_to_load,0
+llama_v2_7b_16h,pass_due_to_skip,0



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_huggingface_training.csv
@ -171,3 +171,23 @@ XLNetLMHeadModel,pass,5


 YituTechConvBert,pass,5
+
+
+
+meta-llama/Llama-3.2-1B,eager_fail_to_run,0
+
+
+
+google/gemma-2-2b,eager_fail_to_run,0
+
+
+
+google/gemma-3-4b-it,eager_fail_to_run,0
+
+
+
+openai/whisper-tiny,eager_fail_to_run,0
+
+
+
+Qwen/Qwen3-0.6B,eager_fail_to_run,0
--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
@ -181,7 +181,7 @@ hf_T5_base,pass,0



-hf_T5_generate,pass,11
+hf_T5_generate,pass,7



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
@ -198,7 +198,7 @@ llama,pass,0



-llama_v2_7b_16h,model_fail_to_load,0
+llama_v2_7b_16h,pass_due_to_skip,0



--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -3580,18 +3580,10 @@ def process_caching_precompile():
    )
    from torch._dynamo.precompile_context import PrecompileContext

-    # Serialize all callables, clear PrecompileContext
-    # TODO: put this under torch.compiler API once ready
-    serialized = PrecompileContext.serialize()
-    PrecompileContext.clear()
-    if serialized is not None:
-        artifacts, info = serialized
-        print(
-            f"Saving {len(info.precompile_dynamo_artifacts)} Precompile Artifact(s)..."
-        )
-        results = PrecompileContext.deserialize(artifacts)
-        assert results is not None
-        PrecompileContext.populate_caches(results)
+    debug_info = PrecompileContext.save_to_dynamo_cache()
+    print(
+        f"Saved {len(debug_info['dynamo'])} precompile artifacts with {len(debug_info['backends'])} backends"
+    )


 def process_entry(rank, runner, original_dir, args):
--- a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv
+++ b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv
@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1



-add_loop_inductor,compile_time_instruction_count,30280000000,0.1
+add_loop_inductor,compile_time_instruction_count,29660000000,0.1



@ -50,27 +50,27 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1



-aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1
+aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1



-aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1
+aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1



-aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1
+aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1



-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1



-aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1
+aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1



-aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1
+aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1



@ -78,7 +78,7 @@ mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1



-mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1
+mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1



@ -86,4 +86,4 @@ basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1



-basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1
+basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@ -1998,7 +1998,21 @@ def define_buck_targets(
                    third_party("sleef_arm"),
                ],
            }),
-            compiler_flags = get_aten_compiler_flags(),
+            compiler_flags = get_aten_compiler_flags() + select({
+                "DEFAULT": [],
+                "ovr_config//os:android-arm32": [
+                    "-mfpu=vfpv3-d16",
+                    "-march=armv7-a",
+                    "-mthumb",
+                    "-mfpu=neon",
+                ],
+                "ovr_config//os:android-x86_32": [
+                    "-mssse3",
+                ],
+                "ovr_config//os:android-x86_64": [
+                    "-mssse3",
+                ],
+            }),
            exported_preprocessor_flags = get_aten_preprocessor_flags(),
            exported_deps = [
                ":aten_header",
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
    is_le<sizeof(autograd_meta_),         16,  FieldNameEnum::autograd_meta_>();
    is_le<sizeof(extra_meta_),            16,  FieldNameEnum::extra_meta_>();
    are_equal<sizeof(version_counter_),    8,  FieldNameEnum::version_counter_>();
-    are_equal<sizeof(pyobj_slot_),         8,  FieldNameEnum::pyobj_slot_>();
+    are_equal<sizeof(pyobj_slot_),   16,  FieldNameEnum::pyobj_slot_>();
    are_equal<sizeof(sizes_and_strides_), 88,  FieldNameEnum::sizes_and_strides_>();
    are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
    are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
--- a/c10/core/impl/DeviceGuardImplInterface.cpp
+++ b/c10/core/impl/DeviceGuardImplInterface.cpp
@ -1,4 +1,5 @@
 #include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/core/impl/FakeGuardImpl.h>
 #include <array>

 namespace c10::impl {
@ -14,4 +15,26 @@ DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
  device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
 }

+namespace {
+thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
+    nullptr;
+}
+
+void ensureCUDADeviceGuardSet() {
+  constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);
+
+  const DeviceGuardImplInterface* p =
+      device_guard_impl_registry[cuda_idx].load();
+
+  // A non-null `p` indicates that the CUDA guard is already registered,
+  // implying this is a CUDA build.
+  if (p && p->deviceCount() == 0) {
+    // Override the CUDA guard interface with a no-op device guard:
+    // p->deviceCount() == 0 means this is a CUDA build, but no CUDA devices
+    // are available.
+    tls_fake_device_guard = std::make_unique<FakeGuardImpl<DeviceType::CUDA>>();
+    device_guard_impl_registry[cuda_idx].store(tls_fake_device_guard.get());
+  }
+}
+
 } // namespace c10::impl
--- a/c10/core/impl/DeviceGuardImplInterface.h
+++ b/c10/core/impl/DeviceGuardImplInterface.h
@ -6,6 +6,7 @@
 #include <c10/util/Exception.h>

 // Just for C10_ANONYMOUS_VARIABLE
+#include <c10/core/impl/TorchDispatchModeTLS.h>
 #include <c10/util/Registry.h>

 #include <array>
@ -251,7 +252,7 @@ struct C10_API DeviceGuardImplInterface {
 // for devices that don't actually have a concept of device index.  Prominent
 // examples are CPU and Meta.
 template <DeviceType D>
-struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface {
+struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface {
  NoOpDeviceGuardImpl() = default;
  DeviceType type() const override {
    return D;
@ -371,5 +372,7 @@ inline bool hasDeviceGuardImpl(DeviceType type) {
  return device_guard_impl_registry[static_cast<size_t>(type)].load();
 }

+void C10_API ensureCUDADeviceGuardSet();
+
 } // namespace impl
 } // namespace c10
--- a/Show More
+++ b/Show More