Update (base update)

[ghstack-poisoned]
2025-10-24 15:44:58 +08:00 · 2025-09-23 12:20:53 +00:00 · 2025-09-23 12:08:34 +00:00 · 2025-09-22 11:58:17 +00:00
393 changed files with 10528 additions and 9415 deletions
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
-        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
+        except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)
@ -1004,7 +1004,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

-    python_version = args.python_version if args.python_version is not None else "3.10"
+    python_version = args.python_version if args.python_version is not None else "3.9"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -112,6 +112,8 @@ ninja==1.11.1.3
 #Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

+numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
+numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
@ -132,7 +134,7 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.10"
+numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13"

@ -324,6 +326,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting

+# Python-3.9 binaries
+
 PyGithub==2.3.0

 sympy==1.13.3
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -35,11 +35,10 @@ fi

 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
-  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
-  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
-  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+  # NB: we always build with distributed; USE_DISTRIBUTED=0 turns off all
+  # backends (specifically the gloo backend), so test that this case works too
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
 fi
 popd

+python -mpip install -r requirements.txt
+
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1

+python -mpip install --no-input -r requirements.txt
+
 setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@ -55,7 +59,7 @@ test_python_shard() {

  setup_test_python

-  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"
+  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"

  assert_git_not_dirty
 }
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -322,29 +322,23 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
 }

 test_python() {
  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
  assert_git_not_dirty
 }

 test_python_smoke() {
-  # Smoke tests for H100/B200
+  # Smoke tests for H100
  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

-test_python_smoke_b200() {
-  # Targeted smoke tests for B200 - staged approach to avoid too many failures
-  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  assert_git_not_dirty
-}
-
 test_h100_distributed() {
  # Distributed tests at H100
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -390,7 +384,6 @@ test_dynamo_wrapped_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --exclude-aot-dispatch-tests \
-    --exclude-quantization-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose \
    --upload-artifacts-while-running
@ -1163,12 +1156,6 @@ test_distributed() {
  fi
 }

-test_quantization() {
-  echo "Testing quantization"
-
-  python test/test_quantization.py
-}
-
 test_rpc() {
  echo "Testing RPC C++ tests"
  # NB: the ending test_rpc must match the current function name for the current
@ -1586,7 +1573,7 @@ test_executorch() {
 test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops profiler/test_memory_profiler \
+        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
        distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

@ -1662,8 +1649,6 @@ elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
-elif [[ "$TEST_CONFIG" == 'quantization' ]]; then
-  test_quantization
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
  # TODO: run some C++ tests
  echo "no-op at the moment"
@ -1788,8 +1773,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  test_xpu_bin
 elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
-elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
-  test_python_smoke_b200
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
 elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
--- a/.ci/pytorch/win-test-helpers/test_python_shard.bat
+++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat
@ -25,7 +25,7 @@ echo Copying over test times file
 robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"

 echo Run nn tests
-python run_test.py --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
+python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
 if ERRORLEVEL 1 goto fail

 popd
--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
 retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
 retry brew install libomp

-# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
+# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
+# is built as part of the tensorpipe submodule
 export USE_DISTRIBUTED=1

 export USE_MKLDNN=OFF
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -59,7 +59,7 @@ runs:
        set -x

        # Create new py_tmp env with python-version
-        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
+        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp

        PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
        EXIT_CODE=$?
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-1983609239caaab24ab1ed2bfa2aa92e8c76c1b1
+090197034faf3b193c4467cedeb9281e3078892d
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -525,21 +525,6 @@
  - Lint
  - pull

- name: typechecking
-  patterns:
-  - 'pyrefly.toml'
-  - 'mypy.ini'
-  - 'mypy-strict.ini'
-  approved_by:
-  - lolpack
-  - maggiemoss
-  - ndmitchell
-  - kinto0
-  mandatory_checks_name:
-  - EasyCLA
-  - Lint
-  - pull
-
 - name: superuser
  patterns:
  - '*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -19,7 +19,6 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/periodic
 - ciflow/periodic-rocm-mi300
- ciflow/quantization-periodic
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
@ -37,7 +36,6 @@ ciflow_push_tags:
 - ciflow/win-arm64
 - ciflow/h100-symm-mem
 - ciflow/h100-cutlass-backend
- ciflow/b200
 retryable_workflows:
 - pull
 - trunk
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -155,7 +155,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
-            arches=["13.0"],
+            arches=["12.8"],
            python_versions=["3.12"],
        ),
        branches="main",
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -71,15 +71,12 @@ jobs:
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- elif "s390x" in build_environment %}
      runs_on: linux.s390x
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      timeout-minutes: 420
-      {%- elif config["gpu_arch_type"] == "rocm" %}
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.24xlarge.ephemeral
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -50,7 +50,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
        device: ["cuda", "rocm", "xpu", "aarch64"]
        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
@ -108,6 +108,9 @@ jobs:

          # Determine python executable for given version
          case $PY_VERS in
+          3.9)
+            PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
+            ;;
          3.10)
            PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python
            ;;
@ -191,7 +194,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
        device: ["xpu"]
    timeout-minutes: 40
    env:
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -35,7 +35,6 @@ jobs:
      contents: write
    outputs:
      pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
-      pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
@ -54,12 +53,8 @@ jobs:
          tag_or_branch="${tag_or_branch#refs/heads/}"
          # replace directory separators with _ in branch name
          tag_or_branch="${tag_or_branch//\//_}"
-          torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
-          {
-            echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
-            echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
-            echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
-          } >> "$GITHUB_ENV"
+          echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
+          echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
      - name: Checkout optional submodules
        run: python3 tools/optional_submodules.py
      - name: Copy docs requirements for inclusion
@ -69,47 +64,30 @@ jobs:
          cp .ci/docker/requirements-docs.txt docs/requirements.txt
      - name: Create source distribution
        run: |
-          # Create new folder with specified name so extracting the archive yields that
-          rm -rf "/tmp/$PT_RELEASE_NAME"
-          cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
-          mv "/tmp/$PT_RELEASE_NAME" .
-          # Cleanup
-          rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
-          find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
-          # Create archive
-          tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
-          echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
-      - name: Create PEP 517 compatible source distribution
-        run: |
-          pip install build==1.2.2.post1 || exit 1
-          python -m build --sdist || exit 1
-          cd dist || exit 1
+            # Create new folder with specified name so extracting the archive yields that
+            rm -rf "/tmp/$PT_RELEASE_NAME"
+            cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
+            mv "/tmp/$PT_RELEASE_NAME" .
+            # Cleanup
+            rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
+            find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
+            # Create archive
+            tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
+            echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
      - name: Upload source distribution for release
        if: ${{ github.event_name == 'release' }}
        uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
        with:
-          files: |
-            ${{ env.PT_RELEASE_FILE }}
-            ${{ env.PT_PEP517_RELEASE_FILE }}
-      - name: Upload source distribution to GHA artifacts  # for release tags
+          files: ${{env.PT_RELEASE_FILE}}
+      - name: Upload source distribution to GHA artifacts for release tags
        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
        with:
          name: ${{ env.PT_RELEASE_FILE }}
          path: ${{ env.PT_RELEASE_FILE }}
-      - name: Upload PEP 517 source distribution to GHA artifacts  # for release tags
-        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
-        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
-        with:
-          name: ${{ env.PT_PEP517_RELEASE_FILE }}
-          path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
      - name: Set output
        id: release_name
-        run: |
-          {
-            echo "pt_release_name=${{ env.PT_RELEASE_FILE }}";
-            echo "pt_pep517_release_name=${{ env.PT_PEP517_RELEASE_FILE }}";
-          } >> "${GITHUB_OUTPUT}"
+        run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"

  upload_source_code_to_s3:
    if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
@ -125,9 +103,6 @@ jobs:
      - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
        with:
          name: ${{ needs.release.outputs.pt_release_name }}
-      - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
-        with:
-          name: ${{ needs.release.outputs.pt_pep517_release_name }}
      - name: Configure AWS credentials(PyTorch account)
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
@ -138,9 +113,7 @@ jobs:
          s3-bucket: pytorch
          s3-prefix: source_code/test
          if-no-files-found: warn
-          path: |
-            ${{ needs.release.outputs.pt_release_name }}
-            ${{ needs.release.outputs.pt_pep517_release_name }}
+          path: ${{ needs.release.outputs.pt_release_name }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -62,7 +62,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -128,7 +128,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -174,7 +174,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -220,7 +220,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -265,7 +265,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -331,7 +331,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -377,7 +377,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -423,7 +423,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -468,7 +468,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -534,7 +534,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -580,7 +580,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -626,7 +626,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -671,7 +671,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -737,7 +737,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -783,7 +783,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -829,7 +829,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -874,7 +874,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -940,7 +940,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -986,7 +986,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -1032,7 +1032,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -1077,7 +1077,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -1143,7 +1143,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -1189,7 +1189,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -1235,7 +1235,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
@ -1280,7 +1280,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
@ -1346,7 +1346,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
@ -1392,7 +1392,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
@ -1438,7 +1438,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.r7g.12xlarge.memory
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -333,7 +333,6 @@ jobs:
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: libtorch-rocm6_3-shared-with-deps-release
      build_environment: linux-binary-libtorch
    secrets:
@ -448,7 +447,6 @@ jobs:
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: libtorch-rocm6_4-shared-with-deps-release
      build_environment: linux-binary-libtorch
    secrets:
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -42,7 +42,7 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_12-cuda13_0-build:
+  manywheel-py3_12-cuda12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
@ -51,22 +51,22 @@ jobs:
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu130
-      GPU_ARCH_VERSION: "13.0"
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: "12.8"
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_12-cuda13_0
+      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_12-cuda13_0-test:  # Testing
+  manywheel-py3_12-cuda12_8-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - manywheel-py3_12-cuda13_0-build
+      - manywheel-py3_12-cuda12_8-build
      - get-label-type
    uses: ./.github/workflows/_binary-test-linux.yml
    with:
@ -74,13 +74,13 @@ jobs:
      PACKAGE_TYPE: manywheel
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu130
-      GPU_ARCH_VERSION: "13.0"
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: "12.8"
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda13.0
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-cuda13_0
+      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -323,7 +323,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -435,7 +434,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -917,7 +915,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_11-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -1029,7 +1026,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_11-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -1511,7 +1507,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_12-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -1623,7 +1618,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_12-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -2105,7 +2099,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_13-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -2217,7 +2210,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_13-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -2699,7 +2691,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_13t-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -2811,7 +2802,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_13t-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -3293,7 +3283,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_14-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -3405,7 +3394,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_14-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -3887,7 +3875,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_14t-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -3999,7 +3986,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_14t-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -60,7 +60,6 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel-rocm
    secrets:
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -127,8 +127,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      # More memory is needed to build with asan
-      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/quantization-periodic.yml
+++ b/.github/workflows/quantization-periodic.yml
@ -1,54 +0,0 @@
-name: quantization-periodic
-
-on:
-  push:
-    tags:
-      - ciflow/quantization-periodic/*
-  workflow_dispatch:
-  schedule:
-    # run weekly
-    - cron: "45 0 * * 0"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  get-default-label-prefix:
-    name: get-default-label-prefix
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-      opt_out_experiments: lf
-
-  periodic-quantization-build:
-    name: periodic-quantization-build
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-default-label-prefix
-    with:
-      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '8.9'
-      test-matrix: |
-        { include: [
-          { config: "quantization", shard: 1, num_shards: 1, runner: "${{ needs.get-default-label-prefix.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-        ]}
-    secrets: inherit
-  periodic-test-quantization:
-    name: periodic-test-quantization
-    uses: ./.github/workflows/_linux-test.yml
-    needs: periodic-quantization-build
-    with:
-      build-environment: linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      docker-image: ${{ needs.periodic-quantization-build.outputs.docker-image }}
-      test-matrix: ${{ needs.periodic-quantization-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -140,8 +140,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      # More memory is needed to build with asan
-      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -1,76 +0,0 @@
-# B200 Smoke Tests CI Workflow
-#
-# This workflow runs smoke tests on B200 hardware
-#
-# Flow:
-# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
-# 2. Runs smoke tests on linux.dgx.b200 runner
-# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
-#
-# Triggered by:
-# - Pull requests modifying this workflow file
-# - Manual dispatch
-# - Schedule (every 6 hours)
-# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
-
-name: B200 Smoke Tests
-
-on:
-  pull_request:
-    paths:
-      - .github/workflows/test-b200.yml
-  workflow_dispatch:
-  schedule:
-    - cron: 0 4,10,16,22 * * *  # every 6 hours
-  push:
-    tags:
-      - ciflow/b200/*
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-
-  get-label-type:
-    if: github.repository_owner == 'pytorch'
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-      cuda-arch-list: '10.0'
-      test-matrix: |
-        { include: [
-          { config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
-        ]}
-      # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
-    secrets: inherit
-
-  linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-    uses: ./.github/workflows/_linux-test.yml
-    needs:
-      - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
-      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -82,7 +82,6 @@ torch/return_types.pyi
 torch/nn/functional.pyi
 torch/utils/data/datapipes/datapipe.pyi
 torch/csrc/autograd/generated/*
-torch/csrc/functionalization/generated/*
 torch/csrc/lazy/generated/*.[!m]*
 torch_compile_debug/
 # Listed manually because some files in this directory are not generated
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -49,7 +49,7 @@ init_command = [
    'mccabe==0.7.0',
    'pycodestyle==2.14.0',
    'pyflakes==3.4.0',
-    'torchfix==0.4.0 ; python_version >= "3.10" and python_version < "3.13"',
+    'torchfix==0.4.0 ; python_version >= "3.9" and python_version < "3.13"',
 ]


@ -153,7 +153,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
+    'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
    'numpy==2.1.0 ; python_version >= "3.12"',
    'expecttest==0.3.0',
    'mypy==1.16.0',
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -22,7 +22,6 @@ COMMON_COPTS = [
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
-    "-DUSE_DISTRIBUTED",
    "-DAT_PER_OPERATOR_HEADERS",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
@ -91,8 +90,6 @@ generated_cpu_cpp = [
    "aten/src/ATen/NativeMetaFunctions.h",
    "aten/src/ATen/RegistrationDeclarations.h",
    "aten/src/ATen/VmapGeneratedPlumbing.h",
-    "aten/src/ATen/ViewMetaClasses.h",
-    "aten/src/ATen/ViewMetaClasses.cpp",
    "aten/src/ATen/core/aten_interned_strings.h",
    "aten/src/ATen/core/enum_tag.h",
    "aten/src/ATen/core/TensorBody.h",
@ -813,7 +810,7 @@ cc_library(
    name = "torch_python",
    srcs = libtorch_python_core_sources
        + if_cuda(libtorch_python_cuda_sources)
-        + if_cuda(libtorch_python_distributed_sources)
+        + libtorch_python_distributed_sources
        + GENERATED_AUTOGRAD_PYTHON,
    hdrs = glob([
        "torch/csrc/generic/*.cpp",
@ -835,6 +832,36 @@ pybind_extension(
    ],
 )

+cc_library(
+    name = "functorch",
+    hdrs = glob([
+        "functorch/csrc/dim/*.h",
+    ]),
+    srcs = glob([
+        "functorch/csrc/dim/*.cpp",
+    ]),
+    deps = [
+        ":aten_nvrtc",
+        ":torch_python",
+        "@pybind11",
+    ],
+)
+
+pybind_extension(
+    name = "functorch/_C",
+    copts=[
+        "-DTORCH_EXTENSION_NAME=_C"
+    ],
+    srcs = [
+        "functorch/csrc/init_dim_only.cpp",
+    ],
+    deps = [
+        ":functorch",
+        ":torch_python",
+        ":aten_nvrtc",
+    ],
+)
+
 cc_binary(
    name = "torch/bin/torch_shm_manager",
    srcs = [
@ -875,6 +902,7 @@ py_library(
    ],
    data = [
        ":torch/_C.so",
+        ":functorch/_C.so",
        ":torch/bin/torch_shm_manager",
    ],
 )
@ -1077,7 +1105,6 @@ test_suite(
        "aten/src/ATen/templates/LazyNonNativeIr.h",
        "aten/src/ATen/templates/RegisterDispatchKey.cpp",
        "aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
-        "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
        "aten/src/ATen/native/native_functions.yaml",
        "aten/src/ATen/native/tags.yaml",
        "aten/src/ATen/native/ts_native_functions.yaml",
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -180,8 +180,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
  set(CPU_POWER ON)
 endif()

-# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
-# tested and likely won't work without additional changes.
+# For non-supported platforms, turn USE_DISTRIBUTED off by default.
+# NB: USE_DISTRIBUTED=OFF simply disables the default backends; distributed
+# code still gets built
 if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED
      OFF
@ -261,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
 option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
 cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
-option(USE_DISTRIBUTED "Use distributed" ON)
+option(USE_DISTRIBUTED "Enable default distributed backends" ON)
 cmake_dependent_option(USE_NCCL "Use NCCL" ON
                       "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(USE_XCCL "Use XCCL" ON
-                       "USE_XPU;UNIX;NOT APPLE" OFF)
+                       "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
 cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
 cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -437,11 +438,10 @@ if(WIN32)
      PATH_SUFFIXES lib
      NO_DEFAULT_PATH)
    if(NOT libuv_tmp_LIBRARY)
-      set(USE_DISTRIBUTED OFF)
      set(USE_GLOO OFF)
      message(
        WARNING
-          "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
+          "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
          "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
      )
    else()
@ -1390,6 +1390,10 @@ endif()
 include(cmake/Summary.cmake)
 caffe2_print_configuration_summary()

+if(BUILD_FUNCTORCH)
+  add_subdirectory(functorch)
+endif()
+
 # Parse custom debug info
 if(DEFINED USE_CUSTOM_DEBINFO)
  string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,61 +1,20 @@
 # Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html

-# Include individual top-level files
-include CITATION.cff
-include CODEOWNERS
-include Dockerfile
-include LICENSE
-include MANIFEST.in
-include Makefile
-include NOTICE
-include .bc-linter.yml
-include .clang-format .clang-tidy
-include .cmakelintrc
-include .coveragerc
-include .dockerignore
-include .editorconfig
-include .flake8
-include .gdbinit
-include .lintrunner.toml
-include .lldbinit
-include codex_setup.sh
-include docker.Makefile
-include pyrefly.toml
-include ubsan.supp
-
-# Include bazel and BUCK related files
-include BUILD.bazel BUCK.oss
-include WORKSPACE
-include *.bzl
-include .bazelignore .bazelrc .bazelversion
-
-# Include general configuration files
-include *.ini
-# Include important top-level information
-include *.md
-# Include technical text files at the moment, comprises
-# version.txt, CMakeLists.txt, requirements.txt
-include *.txt
-
-# Include ctags configuration
-include .ctags.d/*.ctags
-
-# Include subfolders completely
-graft .devcontainer
-graft .vscode
+# Include source files in SDist
+include CMakeLists.txt
+include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
+include BUCK BUCK.*
+include requirements*.txt
+include version.txt
+include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
+include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
 graft android
 graft aten
-graft benchmarks
 graft binaries
 graft c10
 graft caffe2
 graft cmake
-graft docs
 graft functorch
-graft ios
-graft mypy_plugins
-graft scripts
-graft test
 graft third_party
 graft tools
 graft torch
@ -63,37 +22,29 @@ graft torchgen
 # FIXME: torch-xla build during codegen will fail if include this file in wheel
 exclude torchgen/BUILD.bazel

-# The following exclusions omit parts from third-party dependencies that
-# contain invalid symlinks[1] and that are not needed for pytorch, such as
-# bindings for unused languages
-prune third_party/flatbuffers/java
-prune third_party/flatbuffers/kotlin
-prune third_party/ittapi/rust
-prune third_party/nccl/pkg/debian
-prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-*
-
-# The following document is also an invalid symlink[1] and superfluous
-exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
-
-# Omit autogenerated code
-prune torchgen/packaged
-
-# Omit caches, compiled, and scm related content
-prune */__pycache__
-prune **/.github
-prune **/.gitlab
-global-exclude *.o *.obj *.so *.dylib *.a *.pxd *.dll *.lib
-global-exclude *.py[cod] *.swp *~
-global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
-global-exclude .gitlab-ci.yml
+# Misc files and directories in SDist
+include *.md
+include CITATION.cff
+include LICENSE NOTICE
+include mypy*.ini
+graft benchmarks
+graft docs
+graft mypy_plugins
+graft scripts

 # Misc files needed for custom setuptools command
 include .gitignore
 include .gitmodules

-# [1] Invalid symlinks for the purposes of Python source distributions are,
-# according to the source distribution format[2] links pointing outside the
-# destination directory or links with a `..` component, which is those of
-# concern here.
+# Include test suites in SDist
+graft test
+include pytest.ini
+include .coveragerc

-# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features
+# Prune generated/compiled files
+prune torchgen/packaged
+prune */__pycache__
+global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
+
+prune */.git
+global-exclude .git *~ *.swp
--- a/README.md
+++ b/README.md
@ -161,7 +161,7 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)

 #### Prerequisites
 If you are installing from source, you will need:
- Python 3.10 or later
+- Python 3.9 or later
 - A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
 - Visual Studio or Visual Studio Build Tool (Windows only)

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -317,20 +317,10 @@ IF(USE_FBGEMM_GENAI)
        -greedy-reverse-local-assignment=1
        -fhip-new-launch-api)

-      # Only compile for gfx942 for now.
-      # This is rather hacky, I could not figure out a clean solution :(
-      set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
-      string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
-      if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
-        list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
-      endif()
-      set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
-
      hip_add_library(
        fbgemm_genai STATIC
        ${fbgemm_genai_native_rocm_hip}
        HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
-      set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
      set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
      target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

--- a/aten/src/ATen/Dispatch.h
+++ b/aten/src/ATen/Dispatch.h
@ -4,8 +4,8 @@
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Half.h>
-#include <c10/util/Metaprogramming.h>
 #include <c10/util/complex.h>
+#include <torch/headeronly/util/Metaprogramming.h>

 #ifdef __CUDACC__
 #include <cuda.h> // For CUDA_VERSION
--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@ -9,6 +9,11 @@

 namespace at::functionalization {

+ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
+  if (out_idx == this->out_index) return *this;
+  return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
+}
+
 // Note [Functionalization: Alias Removal Part 2]
 // See Note [Functionalization: Alias Removal] for more details.
 // This function applies a single update from one of the views to the StorageImpl.
@ -37,12 +42,12 @@ namespace at::functionalization {
 static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
  at::Tensor t = update.new_val;
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-  if (update.view_metas.empty()) { return t; }
+  if (update.view_metas.empty()) return t;

  std::vector<at::Tensor> tmp_values({base});
  tmp_values.reserve(update.view_metas.size());
  for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
-    at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
+    at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
    // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
    // All of these ops require additional information to recover the sizes of the original tensor.
    // If need to, we could probably apply this optimization and only bother computing tmp_values
@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
    tmp_values.push_back(std::move(next_view));
  }
  for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
+    int64_t out_idx = update.view_metas[i].out_index;
    // Each view inverse is implemented in ViewInverses.cpp.
-    t = update.view_metas[i]->reverse(tmp_values[i], t);
+    t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  return t;
@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
 }

-void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
+void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
  TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");

  if (metas.size() > 1) {
    for (size_t i = 1; i < metas.size(); ++i) {
      // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
-      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
+      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
 "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
 " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
 "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "
--- a/aten/src/ATen/FunctionalStorageImpl.h
+++ b/aten/src/ATen/FunctionalStorageImpl.h
@ -8,89 +8,44 @@ namespace at::functionalization {

 // See Note [Functionalization Pass In Core]

-enum class InverseReturnMode {
-  /// Specifies that functional inverses should always return a view.
-  AlwaysView,
-  /// Specifies that functional inverses should always return a non-view / copy.
-  NeverView,
-  /// Specifies that functional inverses should return a view unless a (copying)
-  /// scatter
-  /// inverse exists, in which case that will be used instead.
-  /// This avoids as_strided() calls that can be difficult for subclasses to
-  /// handle.
-  ViewOrScatterInverse,
-};
-
-#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
-  static const char* name() {                 \
-    return #TYPE;                             \
-  }
-
-#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
-  using SerializableTuple = std::tuple<__VA_ARGS__>
-
 // ViewMeta is a class used by the functionalization pass to navigate between
 // a base tensor and a view tensor.
 // For example, if I call `b = a.view1(...)`
-// the functionalization pass will generate and store a ViewMeta specialization
-// for `view1` operation on b that looks like:
+// the functionalization pass will generate and store a ViewMeta on b that looks
+// like:
 //
-// struct TORCH_API view1_ViewMeta : public ViewMeta {
-//   FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
-//   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
-//       bool /* reapply_views */,
-//       const std::vector<int64_t>&);
-//
-//   view1_ViewMeta(const SerializableTuple& tpl)
-//       : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
-//
-//   view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
-//       : ViewMeta(/*has_symbolic_inputs=*/false),
-//         reapply_views(reapply_views),
-//         size(size) {}
-//
-//   Tensor forward(const Tensor& base) override {
-//       return base.view1(...);
+// ViewMeta(
+//   [<captures>](const Tensor& base, int64_t mutated_view_idx) {
+//     return base.view1(...);
+//   },
+//   [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
+//   int64_t mutated_view_idx) -> at::Tensor {
+//     return at::functionalization::impl::view1_inverse(base, mutated_view,
+//     ...);
 //   }
 //
-//   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
-//       return at::functionalization::impl::view1_inverse(base, mutated_view,
-//       ...);
-//   }
+// The forward_fn lambda describes how to replay view1 on a tensor.
 //
-//   SerializableTuple to_serializable_tuple() {
-//     return std::make_tuple(reapply_views, size);
-//   }
-//
-//   bool reapply_views;
-//   std::vector<int64_t> size;
-// };
-//
-// The forward function describes how to replay view1 on a tensor.
-//
-// The reverse function describes how, given a tensor that is already a view,
+// The reverse_fn lambda describes, given a tensor that is already a view,
 // how to get the corresponding base tensor. See Note [Functionalization Pass:
 // View Inverses] for details.
-//
-// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
-// representing the `ViewMeta` instance state. Methods that take in/return such
-// a type are used for supporting pickle serialization.
 struct ViewMeta {
  ViewMeta(
+      std::function<Tensor(const Tensor&, int64_t)> forward,
+      std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
      bool has_symbolic_inputs,
      bool is_multi_output = false,
      bool is_as_strided = false,
      int64_t out_idx = 0)
-      : out_index(out_idx),
+      : forward_fn(std::move(forward)),
+        reverse_fn(std::move(reverse)),
+        out_index(out_idx),
        is_multi_output(is_multi_output),
        is_as_strided(is_as_strided),
        has_symbolic_inputs(has_symbolic_inputs) {}

-  virtual ~ViewMeta() = default;
-
-  virtual Tensor forward(const Tensor& base) = 0;
-  virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
-
+  std::function<Tensor(const Tensor&, int64_t)> forward_fn;
+  std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
  // See Note [out_idx in ViewMeta]
  int64_t out_index;

@ -102,17 +57,10 @@ struct ViewMeta {
  // Tells us if this view operation has any symbolic inputs
  bool has_symbolic_inputs;

-  // Returns a new ViewMeta with the same forward/reverse
+  // Returns a copy of the current ViewMeta, if out_idx matches the current
+  // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
  // functions, but a new out index.
-  //
-  // This method should be implemented by those `ViewMeta` that have more than
-  // one output.
-  virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
-    TORCH_CHECK_NOT_IMPLEMENTED(
-        false,
-        "ViewMeta::to_out_index not implemented. ",
-        "Likely because there's only one output.");
-  }
+  ViewMeta to_out_idx(int64_t out_idx);
 };

 // FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const at::Tensor new_val;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
-    const std::vector<std::shared_ptr<ViewMeta>> view_metas;
+    const std::vector<ViewMeta> view_metas;
  };

  explicit FunctionalStorageImpl(const Tensor& value);

  void add_update(
      const Tensor& updated_val,
-      const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
+      const std::vector<ViewMeta>& view_metas);
  bool apply_updates();
  const Tensor& base() {
    return base_;
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
 // - view_value: The output tensor that we need to wrap.
 // - base: The "base" of the view that `view_value` was generated from.
 // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
-FunctionalTensorWrapper::FunctionalTensorWrapper(
-    const Tensor& view_value,
-    const FunctionalTensorWrapper* base,
-    const std::shared_ptr<functionalization::ViewMeta>& meta)
-    : c10::TensorImpl(
-          c10::DispatchKeySet(DispatchKey::Functionalize),
-          view_value.dtype(),
-          base->storage().data_ptr().device()),
-      value_(view_value),
-      is_multi_output_view_(
-          base->is_multi_output_view_ || meta->is_multi_output),
-      was_storage_changed_(base->was_storage_changed_),
-      is_symbolic_(base->is_symbolic_) {
+FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
+  : c10::TensorImpl(
+      c10::DispatchKeySet(DispatchKey::Functionalize),
+      view_value.dtype(),
+      base->storage().data_ptr().device()
+    ),
+    value_(view_value),
+    is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
+    was_storage_changed_(base->was_storage_changed_),
+    is_symbolic_(base->is_symbolic_)
+{
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
  TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
  set_constructor_metadata();
@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
      view_metas_ = base->view_metas_;  // copy
  }
  view_metas_.push_back(meta);
-  maybe_mark_symbolic(meta.get());
+  maybe_mark_symbolic(meta);
  storage_ = base->storage_; // alias this tensor's storage with the base tensor's
 }

+
 functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
  return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
 }
@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
 }

 // See Note [Functionalization Pass - Inplace View Ops]
-void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
+void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
  view_metas_.push_back(meta);
  // Manually track the fact that this tensor received a metadata mutation!
  has_metadata_mutation_ = true;
  // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
-  maybe_mark_symbolic(meta.get());
+  maybe_mark_symbolic(meta);
  // Note [Functionalization Pass - Inplace View Ops]
  // So, these ops are special - they're mutation AND view ops. They get special codegen.
  // An example is transpose_, e.g. `a.transpose_()`
  // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
  at::AutoDispatchSkipFunctionalize guard;
-  value_ = meta->forward(value_);
+  value_ = meta.forward_fn(value_, meta.out_index);
  TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
 }

@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
  regenerate_from_base();
 }

-const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
-  return view_metas_;
+Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
+  auto t = base;
+
+  // Replay every recorded view, in order, on top of the given base tensor
+  for (auto& view_meta: view_metas_) {
+    t = view_meta.forward_fn(t, view_meta.out_index);
+  }
+
+  return t;
 }

 void FunctionalTensorWrapper::regenerate_from_base() {
@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
  auto t = storage_impl->base();

  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-  t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
+  t = apply_view_metas(t);
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));

  replace_(t, /*from_lazy_regenerate=*/true);
@ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
 }

 bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
-  if (t_list.empty()) { return false; }
+  if (t_list.empty()) return false;
  auto functional_count = 0;
  for (const auto i : c10::irange(t_list.size())) {
    auto const & e= t_list[i];
-    if (!e.has_value() || !e->defined()) { continue; }
+    if (!e.has_value() || !e->defined()) continue;
    if (isFunctionalTensor(e)) {
      ++functional_count;
    }
@ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {

 template <typename T>
 static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
-  if (list.size() == 0) { return false; }
+  if (list.size() == 0) return false;
  auto functional_count = 0;
  for (const auto& tensor : list) {
-    if (!tensor.defined()) { continue; }
+    if (!tensor.defined()) continue;
    if (isFunctionalTensor(tensor)) {
      ++functional_count;
    }
@ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
  functional_base_impl->freeze_storage();
 }

-Tensor create_functional_tensor_with_view_meta(
-    const at::Tensor& view_to_wrap,
-    const at::Tensor& base,
-    const std::shared_ptr<functionalization::ViewMeta>& meta,
-    int64_t out_idx) {
+Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
  auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
-  auto meta_ = meta;
  if (out_idx != 0) {
    // Note [out_idx in ViewMeta]
    // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
    // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
-    meta_ = meta->to_out_index(out_idx);
+    meta = meta.to_out_idx(out_idx);
  }
-  return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
+  return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
 }

-std::vector<Tensor> create_functional_tensor_with_view_meta(
-    ITensorListRef view_to_wrap,
-    const at::Tensor& base,
-    const std::shared_ptr<functionalization::ViewMeta>& meta) {
+std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
  std::vector<Tensor> outputs(view_to_wrap.size());
  int64_t i = 0;
  for (const auto& tensor : view_to_wrap) {
@ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
  return outputs;
 }

-void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
+void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
  auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
  self_impl->mutate_view_meta(meta);
 }

-Tensor apply_view_meta_sequence(
-    const Tensor& base,
-    const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
-  Tensor r = base;
-  for (auto& vm : sequence) {
-    r = vm->forward(r);
-  }
-  return r;
-}
-
 // Note [Propagating strides in the functionalization pass]
 // In order to properly compute stride information, the functionalization pass
 // calls each {view} reference implementations with meta tensors.
@ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
    const auto& ivalue = returns[idx];
    if (ivalue.isTensor()) {
      const auto& t = ivalue.toTensor();
-      if (!t.defined()) { continue; }
+      if (!t.defined()) continue;
      at::functionalization::impl::sync(t);
      auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
      (*stack)[returns_begin + idx] = t_new;
--- a/aten/src/ATen/FunctionalTensorWrapper.h
+++ b/aten/src/ATen/FunctionalTensorWrapper.h
@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  explicit FunctionalTensorWrapper(
      const Tensor& view_value,
      const FunctionalTensorWrapper* base,
-      const std::shared_ptr<functionalization::ViewMeta>& meta);
+      const functionalization::ViewMeta& meta);

  // Get the underlying, actual tensor, that doesn't know anything about
  // functionalization.
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
        ->are_all_mutations_under_no_grad_or_inference_mode();
  }

-  void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
-    is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
+  void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
+    is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
  }

  bool is_symbolic() const {
    return is_symbolic_;
  }

-  // Retrieves the ViewMeta sequence of this tensor.
-  const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
-      const;
+  // Applies the forward_fn of every ViewMeta collected in the current instance,
+  // in order, to some other base tensor and returns the result.
+  Tensor apply_view_metas(const Tensor& base);

  // Sync's the underlying tensor with its alias, if it's out of date. This
  // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  // from the base tensor. This method is used by inplace-view ops like
  // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
  // tensor by replaying the views off of the alias.
-  void mutate_view_meta(
-      const std::shared_ptr<at::functionalization::ViewMeta>& meta);
+  void mutate_view_meta(const at::functionalization::ViewMeta& meta);

  // Custom implementation of self.set_(src)
  void set__impl(const FunctionalTensorWrapper* other);
@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  bool is_symbolic_ = false;

  size_t generation_ = 0;
-  std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;
+  std::vector<at::functionalization::ViewMeta> view_metas_;

 protected:
  static void copy_tensor_metadata(
@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
 Tensor create_functional_tensor_with_view_meta(
    const Tensor& view_to_wrap,
    const Tensor& base,
-    const std::shared_ptr<functionalization::ViewMeta>& meta,
+    functionalization::ViewMeta meta,
    int64_t out_idx = 0);
 std::vector<Tensor> create_functional_tensor_with_view_meta(
    ITensorListRef view_to_wrap,
    const Tensor& base,
-    const std::shared_ptr<functionalization::ViewMeta>& meta);
+    const functionalization::ViewMeta& meta);

 void mutate_view_meta(
    const Tensor& self,
-    const std::shared_ptr<functionalization::ViewMeta>& meta);
-
-TORCH_API Tensor apply_view_meta_sequence(
-    const Tensor& base,
-    const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
+    const functionalization::ViewMeta& meta);

 void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
 void set_sizes_strides_offset(
--- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp
+++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp
@ -1,5 +1,3 @@
-#include <ATen/FunctionalizeFallbackKernel.h>
-
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/EmptyTensor.h>
@ -9,6 +7,7 @@
 #include <torch/library.h>
 #include <c10/util/irange.h>
 #include <c10/util/strides.h>
+#include <ATen/EmptyTensor.h>

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/ATen.h>
@ -29,31 +28,6 @@
 #include <utility>
 #endif

-namespace at::functionalization {
-
-Tensor resize__ViewMeta::forward(const Tensor& base) {
-  if (reapply_views) {
-    return base.as_strided(size, c10::contiguous_strides(size));
-  } else {
-    return at::as_strided_copy(base, size, c10::contiguous_strides(size));
-  }
-}
-
-Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
-  return base.as_strided_scatter(
-      mutated_view, size, c10::contiguous_strides(size));
-}
-
-Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
-  return at::_unsafe_view_symint(base, size);
-}
-
-Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
-  return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
-}
-
-} // namespace at::functionalization
-
 namespace {
  void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
    const auto& schema = op.schema();
@ -132,9 +106,7 @@ namespace {
      const auto& ivalue = returns[idx];
      if (ivalue.isTensor() && should_wrap_outputs) {
        const auto& t = ivalue.toTensor();
-        if (!t.defined()) {
-          continue;
-        }
+        if (!t.defined()) continue;
        auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
        (*stack)[returns_begin + idx] = t_new;
      } else if (ivalue.isTensorList() && should_wrap_outputs) {
@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
  // The output of resizing is equivalent to taking a slice of a larger tensor.
  // We have to emulate this "slicing" with an as_strided call.
  auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
-  auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
-      reapply_views, size.vec());
+  at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
+    [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
+      if (reapply_views) {
+        return base.as_strided(size, c10::contiguous_strides(size));
+      } else {
+        return at::as_strided_copy(base, size, c10::contiguous_strides(size));
+      }
+    },
+    [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
+      return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
+    },
+    /*has_symbolic_inputs=*/false
+  );
  at::functionalization::impl::mutate_view_meta(self, view_meta);
  return self;
 }
@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
    tmp_output = at::_unsafe_view_symint(self_, size);
  }

-  bool has_symbolic_inputs = std::any_of(
-      size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
-  auto view_meta =
-      std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
-          has_symbolic_inputs, size.vec());
+  bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
+
+  at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
+    [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
+      return at::_unsafe_view_symint(base, size);
+    },
+    [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
+      return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
+    },
+    /*has_symbolic_inputs=*/has_symbolic_inputs
+  );

  auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
  // See  Note [Propagating strides in the functionalization pass]
--- a/aten/src/ATen/FunctionalizeFallbackKernel.h
+++ b/aten/src/ATen/FunctionalizeFallbackKernel.h
@ -1,58 +0,0 @@
-#pragma once
-
-#include <ATen/FunctionalStorageImpl.h>
-
-namespace at::functionalization {
-
-// `ViewMeta` implementation for `resize_` operation.
-struct TORCH_API resize__ViewMeta : public ViewMeta {
-  FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
-  FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
-      bool /* reapply_views */,
-      const std::vector<int64_t>&);
-
-  resize__ViewMeta(const SerializableTuple& tpl)
-      : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
-
-  resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
-      : ViewMeta(/*has_symbolic_inputs=*/false),
-        reapply_views(reapply_views),
-        size(size) {}
-
-  Tensor forward(const Tensor& base) override;
-  Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
-
-  SerializableTuple to_serializable_tuple() {
-    return std::make_tuple(reapply_views, size);
-  }
-
-  bool reapply_views;
-  std::vector<int64_t> size;
-};
-
-// `ViewMeta` implementation for `_unsafe_view` operation.
-struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
-  FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
-  FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
-      bool /* has_symbolic_inputs */,
-      const std::vector<c10::SymInt>&);
-
-  _unsafe_view_ViewMeta(const SerializableTuple& tpl)
-      : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
-
-  _unsafe_view_ViewMeta(
-      bool has_symbolic_inputs,
-      const std::vector<c10::SymInt>& size)
-      : ViewMeta(has_symbolic_inputs), size(size) {}
-
-  Tensor forward(const Tensor& base) override;
-  Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
-
-  SerializableTuple to_serializable_tuple() {
-    return std::make_tuple(has_symbolic_inputs, size);
-  }
-
-  std::vector<c10::SymInt> size;
-};
-
-} // namespace at::functionalization
--- a/aten/src/ATen/NestedTensorImpl.h
+++ b/aten/src/ATen/NestedTensorImpl.h
@ -7,8 +7,8 @@
 #include <c10/core/TensorImpl.h>
 #include <c10/util/ArrayRef.h>
 #include <c10/util/Exception.h>
-#include <c10/util/Metaprogramming.h>
 #include <c10/util/irange.h>
+#include <torch/headeronly/util/Metaprogramming.h>

 namespace at::native {
 struct NestedTensorImpl;
--- a/aten/src/ATen/core/ATen_pch.h
+++ b/aten/src/ATen/core/ATen_pch.h
@ -113,7 +113,7 @@
 #include <c10/util/IdWrapper.h>
 #include <c10/util/Logging.h>
 #include <c10/util/MaybeOwned.h>
-#include <c10/util/Metaprogramming.h>
+#include <torch/headeronly/util/Metaprogramming.h>
 #include <c10/util/Optional.h>
 #include <c10/util/Registry.h>
 #include <c10/util/SmallVector.h>
@ -122,9 +122,9 @@
 #include <c10/util/Type.h>
 #include <c10/util/TypeCast.h>
 #include <c10/util/TypeIndex.h>
-#include <c10/util/TypeList.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <c10/util/TypeSafeSignMath.h>
-#include <c10/util/TypeTraits.h>
+#include <torch/headeronly/util/TypeTraits.h>
 #include <c10/util/UniqueVoidPtr.h>
 #include <c10/util/accumulate.h>
 #include <c10/util/bit_cast.h>
--- a/aten/src/ATen/core/Dict.h
+++ b/aten/src/ATen/core/Dict.h
@ -2,8 +2,8 @@

 #include <c10/macros/Macros.h>
 #include <c10/macros/Export.h>
-#include <c10/util/TypeTraits.h>
-#include <c10/util/TypeList.h>
+#include <torch/headeronly/util/TypeTraits.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/order_preserving_flat_hash_map.h>
 #include <optional>
--- a/aten/src/ATen/core/List.h
+++ b/aten/src/ATen/core/List.h
@ -4,8 +4,8 @@
 #include <ATen/core/jit_type_base.h>
 #include <c10/macros/Macros.h>
 #include <c10/macros/Export.h>
-#include <c10/util/TypeTraits.h>
-#include <c10/util/TypeList.h>
+#include <torch/headeronly/util/TypeTraits.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10/util/ArrayRef.h>
 #include <optional>
--- a/aten/src/ATen/core/boxing/KernelFunction.h
+++ b/aten/src/ATen/core/boxing/KernelFunction.h
@ -4,8 +4,8 @@
 #include <ATen/core/boxing/BoxedKernel.h>
 #include <ATen/core/stack.h>
 #include <c10/core/DispatchKeySet.h>
-#include <c10/util/TypeList.h>
 #include <c10/util/intrusive_ptr.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <atomic>
 #include <memory>
 #include <type_traits>
--- a/aten/src/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h
+++ b/aten/src/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h
@ -1,6 +1,6 @@
 #pragma once

-#include <c10/util/TypeTraits.h>
+#include <torch/headeronly/util/TypeTraits.h>

 namespace c10::impl {

--- a/aten/src/ATen/core/boxing/impl/boxing.h
+++ b/aten/src/ATen/core/boxing/impl/boxing.h
@ -9,7 +9,7 @@

 #include <ATen/core/boxing/BoxedKernel.h>

-#include <c10/util/Metaprogramming.h>
+#include <torch/headeronly/util/Metaprogramming.h>
 #include <type_traits>

 namespace c10::impl {
--- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
+++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@ -4,9 +4,9 @@
 #include <ATen/core/boxing/OperatorKernel.h>
 #include <ATen/core/ivalue.h>
 #include <ATen/core/stack.h>
-#include <c10/util/Metaprogramming.h>
-#include <c10/util/TypeList.h>
 #include <c10/util/intrusive_ptr.h>
+#include <torch/headeronly/util/Metaprogramming.h>
+#include <torch/headeronly/util/TypeList.h>

 #include <utility>

--- a/aten/src/ATen/core/dispatch/CppSignature.h
+++ b/aten/src/ATen/core/dispatch/CppSignature.h
@ -2,8 +2,8 @@

 #include <c10/core/DispatchKeySet.h>
 #include <c10/macros/Macros.h>
-#include <c10/util/Metaprogramming.h>
 #include <c10/util/Type.h>
+#include <torch/headeronly/util/Metaprogramming.h>
 #include <typeindex>

 namespace c10::impl {
--- a/aten/src/ATen/core/dispatch/OperatorEntry.h
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.h
@ -7,8 +7,8 @@
 #include <c10/core/DispatchKey.h>
 #include <c10/core/PyHandleCache.h>
 #include <c10/core/SafePyObject.h>
-#include <c10/util/Metaprogramming.h>
 #include <c10/util/flat_hash_map.h>
+#include <torch/headeronly/util/Metaprogramming.h>

 #include <ATen/core/dispatch/CppSignature.h>
 #include <ATen/core/dispatch/OperatorOptions.h>
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@ -7,7 +7,7 @@
 #include <ATen/core/symbol.h>
 #include <ATen/core/type_factory.h>
 #include <ATen/core/qualified_name.h>
-#include <c10/util/TypeList.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <optional>
 #include <c10/core/SymFloat.h>
 #include <c10/core/SymBool.h>
--- a/aten/src/ATen/core/op_registration/infer_schema.h
+++ b/aten/src/ATen/core/op_registration/infer_schema.h
@ -6,7 +6,7 @@
 */

 #include <ATen/core/function_schema.h>
-#include <c10/util/Metaprogramming.h>
+#include <torch/headeronly/util/Metaprogramming.h>

 namespace c10 {
 namespace detail::infer_schema {
--- a/aten/src/ATen/functorch/BatchRulesDynamic.cpp
+++ b/aten/src/ATen/functorch/BatchRulesDynamic.cpp
@ -8,7 +8,7 @@
 #include <ATen/functorch/BatchRulesHelper.h>
 #include <ATen/functorch/BatchedFallback.h>
 #include <ATen/core/dispatch/Dispatcher.h>
-#include <c10/util/Metaprogramming.h>
+#include <torch/headeronly/util/Metaprogramming.h>

 // This file contains batching rules for operations that return Tensors of
 // dynamic shape. We generally don't support those with vmap so we raise
--- a/aten/src/ATen/functorch/BatchRulesHelper.h
+++ b/aten/src/ATen/functorch/BatchRulesHelper.h
@ -5,7 +5,7 @@
 // LICENSE file in the root directory of this source tree.
 #pragma once

-#include <c10/util/TypeList.h>
+#include <torch/headeronly/util/TypeList.h>

 #include <ATen/ATen.h>
 #include <ATen/Operators.h>
--- a/aten/src/ATen/native/CPUFallback.h
+++ b/aten/src/ATen/native/CPUFallback.h
@ -4,7 +4,7 @@
 #include <ATen/core/stack.h>
 #include <ATen/core/boxing/KernelFunction.h>
 #include <ATen/core/dispatch/Dispatcher.h>
-#include <c10/util/Metaprogramming.h>
+#include <torch/headeronly/util/Metaprogramming.h>
 #include <torch/library.h>

 namespace at::native {
--- a/aten/src/ATen/native/Fill.cpp
+++ b/aten/src/ATen/native/Fill.cpp
@ -97,38 +97,43 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) {
  int64_t nDims = self.dim();
  TORCH_CHECK(nDims >= 2, "dimensions must larger than 1");

-  auto height = self.sym_size(0);
-  auto width = self.sym_size(1);
+  int64_t height = self.size(0);
+  int64_t width = self.size(1);

  if (nDims > 2) {
+    int64_t dim1 = height;
    for (const auto i : c10::irange(1, nDims)) {
-      if (self.sym_size(i) != height) {
+      if (self.size(i) != dim1) {
        TORCH_CHECK(false, "all dimensions of input must be of equal length");
      }
    }
  }

-  auto storage_offset = self.sym_storage_offset();
-  auto size = std::min(height, width);
+  int64_t storage_offset = self.storage_offset();
+  std::vector<int64_t> sizes;
+  std::vector<int64_t> strides;
+  int64_t size = std::min(height, width);

  int64_t stride = 0;
  for (const auto i : c10::irange(nDims)) {
    stride += self.stride(i);
  }
-  std::vector<SymInt> strides{stride};
-  std::vector<SymInt> sizes{size};
+  strides.push_back(stride);
+  sizes.push_back(size);

-  auto main_diag = self.as_strided_symint(sizes, strides, storage_offset);
+  auto main_diag = self.as_strided(sizes, strides, storage_offset);
  main_diag.fill_(fill_value);

  if (wrap && nDims == 2 && height > width + 1) {
-    auto step = width + 1;
-    auto wrap_size = ((self.numel() + step - 1) / step) - size;
-    std::vector<SymInt> wrap_sizes{wrap_size};
+    std::vector<int64_t> wrap_sizes;

-    auto offset = self.stride(0) * (width + 1);
+    int64_t step = width + 1;
+    int64_t wrap_size = ((self.numel() + step - 1) / step) - size;
+    wrap_sizes.push_back(wrap_size);

-    auto wrap_diag = self.as_strided_symint(wrap_sizes, strides, storage_offset + offset);
+    int64_t offset = self.stride(0) * (width + 1);
+
+    auto wrap_diag = self.as_strided(wrap_sizes, strides, storage_offset + offset);
    wrap_diag.fill_(fill_value);
  }

--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@ -1,5 +1,3 @@
-#include <ATen/core/ATen_fwd.h>
-#include <c10/core/ScalarType.h>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
@ -1880,18 +1878,19 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {

  Tensor xtensor = self.expand(padded_size);

-  Tensor urtensor;
+  Tensor result;
  if (self.is_quantized()) {
-    urtensor = at::empty_quantized(target_size, self);
+    result = at::empty_quantized(target_size, self);
  } else {
-    urtensor = at::empty(target_size, self.options());
+    result = at::empty(target_size, self.options());
  }

  // return an empty tensor if one of the repeat dimensions is zero
  if (zero_tensor) {
-    return urtensor;
+    return result;
  }

+  Tensor urtensor = at::alias(result);
  for (const auto i : c10::irange(xtensor.dim())) {
    // can't unfold with step 0, so make sure step is at least 1
    // (it doesn't matter what it is in that case, because the size is 0).
@ -1901,22 +1900,7 @@ Tensor repeat(const Tensor& self, IntArrayRef repeats) {

  urtensor.copy_(xtensor.expand_as(urtensor));

-  // Combine the dimensions to produce the target_size.
-  // xtensor dims: [a0, ..., ad-1]
-  // urtensor dims: [a0, ..., ad-1, b0, ..., bd-1]
-  // b dims are produced by unfold.
-  // Transform urtensor to [a0 * b0, ..., ad-1 * bd-1]
-  const int64_t n_dims = xtensor.dim();
-  auto range_a = at::arange(xtensor.dim(), at::TensorOptions(at::kLong));
-  auto range_b = range_a + n_dims;
-  auto stacked = stack({std::move(range_a), std::move(range_b)}, 1).flatten();
-  auto permutation = IntArrayRef(stacked.data_ptr<int64_t>(), n_dims * 2);
-  // Permute from [a0, ..., ad-1, b0, ..., bd-1] to [a0, b0, ..., ad-1, bd-1]
-  urtensor = urtensor.permute(permutation);
-  // Reshape from [a0, b0, ..., ad-1, bd-1] to [a0 * b0, ..., ad-1 * bd-1]
-  urtensor = urtensor.reshape(target_size);
-
-  return urtensor;
+  return result;
 }

 Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {
--- a/aten/src/ATen/native/cpu/Reduce.h
+++ b/aten/src/ATen/native/cpu/Reduce.h
@ -2,7 +2,7 @@

 #include <ATen/native/cpu/Loops.h>
 #include <ATen/Parallel.h>
-#include <c10/util/TypeList.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <c10/core/Scalar.h>
 #include <c10/util/irange.h>

--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@ -999,41 +999,12 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
      dtypes[i] = iter.dtype(i);
    }
    auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
-#ifdef USE_ROCM
-    constexpr int grp_sz = 128;
-    launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
-      if (unrl) {
-        auto offsets0 = offset_calc.get(idx);
-        auto offsets1 = offset_calc.get(idx + grp_sz);
-        auto offsets2 = offset_calc.get(idx + grp_sz * 2);
-        auto offsets3 = offset_calc.get(idx + grp_sz * 3);
-        void* out0 = data[0] + offsets0[0];
-        void* out1 = data[0] + offsets1[0];
-        void* out2 = data[0] + offsets2[0];
-        void* out3 = data[0] + offsets3[0];
-        arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
-        arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
-        arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
-        arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
-        c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
-        c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
-        c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
-        c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
-      } else {
-        auto offsets = offset_calc.get(idx);
-        void* out = data[0] + offsets[0];
-        arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
-        c10::cast_and_store<arg0_t>(dtypes[0], out, result);
-      }
-    });
-#else
    launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
      auto offsets = offset_calc.get(idx);
      void* out = data[0] + offsets[0];
      arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
      c10::cast_and_store<arg0_t>(dtypes[0], out, result);
    });
-#endif
  }
 }

--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.h
@ -14,7 +14,7 @@ struct EmbeddingBagParams {
  ::c10::metal::array<idx_type_t, 2> output_strides;
  ::c10::metal::array<idx_type_t, 2> max_indices_strides;

-  idx_type_t per_sample_weights_stride;
+  idx_type_t per_sample_weights_strides;

  idx_type_t num_indices;
  idx_type_t num_bags;
--- a/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
+++ b/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal
@ -23,72 +23,54 @@ struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
 template <EmbeddingBagMode M, typename T>
 struct ReductionOp {
  inline opmath_t<T> operator()(
-      opmath_t<T> weight_val,
+      T weight_val,
      opmath_t<T> out_val,
-      bool is_first) {
-    return weight_val + out_val;
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_strides);
+};
+
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::SUM, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t per_sample_weights_index,
+      constant T* per_sample_weights,
+      uint32_t per_sample_weights_strides) {
+    if (per_sample_weights_strides) {
+      T per_sample_weight = per_sample_weights
+          [per_sample_weights_strides * per_sample_weights_index];
+      return static_cast<opmath_t<T>>(per_sample_weight) *
+          static_cast<opmath_t<T>>(weight_val) +
+          out_val;
+    } else {
+      return static_cast<opmath_t<T>>(weight_val) + out_val;
+    }
+  }
+};
+
+template <typename T>
+struct ReductionOp<EmbeddingBagMode::MEAN, T> {
+  inline opmath_t<T> operator()(
+      T weight_val,
+      opmath_t<T> out_val,
+      uint32_t,
+      constant T*,
+      uint32_t) {
+    return static_cast<opmath_t<T>>(weight_val) + out_val;
  }
 };

 template <typename T>
 struct ReductionOp<EmbeddingBagMode::MAX, T> {
  inline opmath_t<T> operator()(
-      opmath_t<T> weight_val,
+      T weight_val,
      opmath_t<T> out_val,
-      bool is_first) {
-    return (is_first || weight_val > out_val) ? weight_val : out_val;
-  }
-};
-
-template <EmbeddingBagMode M, typename T>
-struct MaybeApplyPerSampleWeight {
-  inline opmath_t<T> operator()(
-      opmath_t<T> weight_val,
-      uint32_t per_sample_weights_index,
-      constant T* per_sample_weights,
-      uint32_t per_sample_weights_stride) {
-    return weight_val;
-  }
-};
-
-template <typename T>
-struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
-  inline opmath_t<T> operator()(
-      opmath_t<T> weight_val,
-      uint32_t per_sample_weights_index,
-      constant T* per_sample_weights,
-      uint32_t per_sample_weights_stride) {
-    if (per_sample_weights_stride) {
-      T per_sample_weight = per_sample_weights
-          [per_sample_weights_stride * per_sample_weights_index];
-      return static_cast<opmath_t<T>>(per_sample_weight) * weight_val;
-    } else {
-      return weight_val;
-    }
-  }
-};
-
-template <EmbeddingBagMode M, typename T, typename I>
-struct MaybeCalcMaxIndex {
-  inline void operator()(
-      opmath_t<T> weight_val,
-      opmath_t<T> out_val,
-      bool is_first,
-      thread I& max_idx,
-      I weight_idx,
-      bool pad) {}
-};
-
-template <typename T, typename I>
-struct MaybeCalcMaxIndex<EmbeddingBagMode::MAX, T, I> {
-  inline void operator()(
-      opmath_t<T> weight_val,
-      opmath_t<T> out_val,
-      bool is_first,
-      thread I& max_idx,
-      I weight_idx,
-      bool pad) {
-    max_idx = !pad && (is_first || weight_val > out_val) ? weight_idx : max_idx;
+      uint32_t,
+      constant T*,
+      uint32_t) {
+    return max(static_cast<opmath_t<T>>(weight_val), out_val);
  }
 };

@ -114,30 +96,6 @@ struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
  }
 };

-template <EmbeddingBagMode M, typename I>
-struct MaybeWriteMaxIndex {
-  inline void operator()(
-      device I*,
-      const constant ::c10::metal::array<uint32_t, 2>&,
-      uint32_t,
-      uint32_t,
-      I) {}
-};
-
-template <typename I>
-struct MaybeWriteMaxIndex<EmbeddingBagMode::MAX, I> {
-  inline void operator()(
-      device I* max_indices,
-      const constant ::c10::metal::array<uint32_t, 2>& max_indices_strides,
-      uint32_t bag_idx,
-      uint32_t feature_idx,
-      I max_idx) {
-    max_indices
-        [bag_idx * max_indices_strides[0] +
-         feature_idx * max_indices_strides[1]] = max_idx;
-  }
-};
-
 template <EmbeddingBagMode M, typename T, typename I>
 void embedding_bag_impl(
    constant T* weight,
@ -154,7 +112,7 @@ void embedding_bag_impl(
  auto num_bags = params.num_bags;
  auto feature_size = params.feature_size;
  auto padding_idx = params.padding_idx;
-  auto per_sample_weights_stride = params.per_sample_weights_stride;
+  auto per_sample_weights_strides = params.per_sample_weights_strides;
  constant auto& output_strides = params.output_strides;
  constant auto& weight_strides = params.weight_strides;
  constant auto& max_indices_strides = params.max_indices_strides;
@ -162,6 +120,8 @@ void embedding_bag_impl(
  auto bag_idx = tid / feature_size;
  auto feature_idx = tid % feature_size;

+  output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
+
  uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
  bool is_last_bag = bag_idx + 1 == num_bags;
  uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
@ -171,37 +131,28 @@ void embedding_bag_impl(
  auto out_val = ReductionOpInit<M, T>()();

  uint32_t bag_size_ = 0;
-  I max_idx = 0;

  for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
       indices_idx++) {
    I weight_idx = indices[indices_idx];
    bool pad = (weight_idx == padding_idx);
-    auto weight_val = static_cast<opmath_t<T>>(
-        weight
-            [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
-             feature_idx * weight_strides[1]]);
+    T weight_val = weight
+        [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
+         feature_idx * weight_strides[1]];

-    weight_val = MaybeApplyPerSampleWeight<M, T>()(
-        weight_val, indices_idx, per_sample_weights, per_sample_weights_stride);
-
-    auto new_out_val = ReductionOp<M, T>()(weight_val, out_val, bag_size_ == 0);
-
-    MaybeCalcMaxIndex<M, T, I>()(
-        weight_val, out_val, bag_size_ == 0, max_idx, weight_idx, pad);
-
-    out_val = pad ? out_val : new_out_val;
-    offset2bag[indices_idx] = bag_idx;
    bag_size_ += static_cast<uint32_t>(!pad);
+
+    auto tmp_val = ReductionOp<M, T>()(
+        weight_val,
+        out_val,
+        indices_idx,
+        per_sample_weights,
+        per_sample_weights_strides);
+
+    out_val = pad ? out_val : tmp_val;
  }

-  output[bag_idx * output_strides[0] + feature_idx * output_strides[1]] =
-      ReductionOpFinal<M, T>()(out_val, bag_size_);
-
-  bag_size[bag_idx] = bag_size_;
-
-  MaybeWriteMaxIndex<M, I>()(
-      max_indices, max_indices_strides, bag_idx, feature_idx, max_idx);
+  *output = ReductionOpFinal<M, T>()(out_val, bag_size_);
 }

 #define DISPATCH_IMPL(MODE)        \
--- a/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
+++ b/aten/src/ATen/native/mps/operations/EmbeddingBag.mm
@ -66,12 +66,11 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
  int64_t num_indices = indices.size(0);
  int64_t num_bags = offsets.size(0);
  if (include_last_offset) {
-    TORCH_CHECK(num_bags >= 1, "include_last_offset: number of offsets should be at least 1");
    num_bags -= 1;
  }
  int64_t feature_size = weight.size(1);

-  auto bag_size = at::empty({num_bags}, indices.options());
+  auto bag_size = at::empty(offsets.sizes(), indices.options());
  auto offset2bag = at::empty({indices.size(0)}, indices.options());
  auto output = at::empty({num_bags, feature_size}, weight.options());

@ -95,7 +94,7 @@ static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
  }

  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
-  params.per_sample_weights_stride = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
+  params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;

  params.num_indices = num_indices;
  params.num_bags = num_bags;
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -3858,7 +3858,7 @@
  device_check: NoCheck   # TensorIterator
  structured: True
  dispatch:
-    CPU, CUDA, MTIA: aminmax_out
+    CPU, CUDA: aminmax_out
    MPS: aminmax_out_mps

 - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
@ -3909,7 +3909,7 @@
 - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
-    CPU, CUDA, MTIA: amax_out
+    CPU, CUDA: amax_out
    MPS: amax_out_mps

 # Return: (Tensor output, Tensor indices)
@ -4090,7 +4090,7 @@
 - func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  dispatch:
-    CPU, CUDA, MTIA: amin_out
+    CPU, CUDA: amin_out
    MPS: amin_out_mps

 # TODO: Add this function to MPS dispatch key so that we avoid declaring it in
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
@ -158,46 +158,12 @@ c10::intrusive_ptr<EmbeddingPackedParamsBase> PackedEmbeddingBagWeight::prepack(
  return packed_ptr;
 }

-#ifdef USE_FBGEMM
-namespace {
-/// Number of columns in the rowwise min/max buffer passed to the quantization function(s)
-constexpr int kRowwiseMinMaxNumCols = 2;
-
-bool _validate_rowwise_min_max(
-  const at::Tensor& weight,
-  const std::optional<at::Tensor>& rowwise_min_max_opt) {
-  const auto is_valid_rowwise_min_max = rowwise_min_max_opt.has_value();
-
-  if (is_valid_rowwise_min_max) {
-      TORCH_CHECK(
-        (rowwise_min_max_opt->dim() == 2 &&
-        rowwise_min_max_opt->size(0) == weight.size(0) &&
-        rowwise_min_max_opt->size(1) == kRowwiseMinMaxNumCols),
-        "'rowwise_min_max' must be a 2D tensor with shape [num_rows(weight), 2].");
-  }
-
-  return is_valid_rowwise_min_max;
-}
-
-auto _get_rowwise_min_max_contig(
-  const std::optional<at::Tensor>& rowwise_min_max_opt) {
-    return rowwise_min_max_opt.has_value()
-      ? rowwise_min_max_opt->expect_contiguous(rowwise_min_max_opt->suggest_memory_format())
-      : at::borrow_from_optional_tensor(rowwise_min_max_opt);
-}
-}
-#endif // USE_FBGEMM
-
 namespace at::native {

 // Note - This is a temporary pack function for embedding bag which quantizes
 // and packs the float weight tensor. In the next step it will be replaced by a
 // quantize and pack function once we support FP scale and FP zero_point
 //
-// The optional rowwise_min_max argument is to support callers to pass in the min/max
-// values of the weight tensor. If the rowwise_min_max is not provided, the min/max
-// values will be computed from the weight tensor.
-//
 // Python example examining a packed 8bit zero_point and scale:
 //
 // >> x = torch.from_numpy(np.array([[[10, 20], [30, 40]],[[50, 60], [70, 80]]],
@ -255,10 +221,7 @@ namespace at::native {
 //
 //        [[50.        , 60.00000035],
 //         [70.        , 80.00000035]]])
-Tensor& qembeddingbag_byte_prepack_out(
-    Tensor& output,
-    const Tensor& weight,
-    const std::optional<Tensor>& rowwise_min_max_opt) {
+Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
  // The "last" dimension of an N-Dimensioned batch of embedding bags is
  // quantization channel. E.g. for a 2D embedding bag, this has
  // [ row, col ] dimensions, for batched of embedding bags, dimensions might be
@ -293,16 +256,9 @@ Tensor& qembeddingbag_byte_prepack_out(
  auto* output_data = output.data_ptr<uint8_t>();

 #ifdef USE_FBGEMM
-  // Move these outside of the ifdef when we support non-FBGEMM flow.
-  const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
-  const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
-
  if (weight_contig->scalar_type() == at::ScalarType::Half) {
    const auto weight_data =
        static_cast<fbgemm::float16*>(weight_contig->data_ptr());
-    const auto rowwise_min_max_data = is_valid_rowwise_min_max
-        ? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
-        : nullptr;
    at::parallel_for(
        0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
          fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<
@ -310,21 +266,17 @@ Tensor& qembeddingbag_byte_prepack_out(
              weight_data + start_idx * embedding_cols,
              end_idx - start_idx,
              embedding_cols,
-              output_data + start_idx * output_columns,
-              (is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
+              output_data + start_idx * output_columns);
        });
  } else {
    const auto weight_data = weight_contig->data_ptr<float>();
-    const auto rowwise_min_max_data =
-        is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
    at::parallel_for(
        0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
          fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat<float>(
              weight_data + start_idx * embedding_cols,
              end_idx - start_idx,
              embedding_cols,
-              output_data + start_idx * output_columns,
-              (is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
+              output_data + start_idx * output_columns);
        });
  }

@ -374,22 +326,6 @@ Tensor qembeddingbag_byte_prepack(const Tensor& weight) {
  return output;
 }

-static Tensor qembeddingbag_byte_prepack_with_rowwise_min_max(
-    const Tensor& weight,
-    const Tensor& rowwise_min_max) {
-  const auto weight_contig =
-      weight.expect_contiguous(weight.suggest_memory_format());
-  Tensor output = at::detail::empty_cpu(
-      {0},
-      at::kByte,
-      weight_contig->layout(),
-      weight_contig->device(),
-      std::nullopt,
-      std::nullopt);
-  qembeddingbag_byte_prepack_out(output, weight, rowwise_min_max);
-  return output;
-}
-
 Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
  const auto weight_contig =
      weight.expect_contiguous(weight.suggest_memory_format());
@ -399,7 +335,7 @@ Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight) {
      "'embedding_bag_byte_prepack' only support float32 or float16.");
  const auto weight_sizes = weight.sym_sizes();
  const auto cols_dim = weight.ndimension() - 1;
-  const auto& embedding_cols = weight_sizes[cols_dim];
+  const auto embedding_cols = weight_sizes[cols_dim];
  // Add 8 bytes per column to store FP32 scale and zero_point per row.
  const auto output_columns = embedding_cols + 2 * sizeof(float);

@ -423,8 +359,7 @@ Tensor _qembeddingbag_nbit_prepack_helper(
    int bit_width,
    const bool optimized_qparams,
    const int64_t nbins,
-    const double ratio,
-    const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt) {
+    const double ratio) {
  TORCH_CHECK(
      weight.scalar_type() == at::ScalarType::Float ||
          weight.scalar_type() == at::ScalarType::Half,
@ -466,17 +401,10 @@ Tensor _qembeddingbag_nbit_prepack_helper(
  auto* output_data = output.data_ptr<uint8_t>();

 #ifdef USE_FBGEMM
-  // Move these outside of the ifdef when we support non-FBGEMM flow.
-  const auto is_valid_rowwise_min_max = _validate_rowwise_min_max(weight, rowwise_min_max_opt);
-  const auto rowwise_min_max_contig = _get_rowwise_min_max_contig(rowwise_min_max_opt);
-
  if (!optimized_qparams) {
    if (weight_contig.scalar_type() == at::ScalarType::Half) {
      const auto weight_data =
          static_cast<fbgemm::float16*>(weight_contig.data_ptr());
-      const auto rowwise_min_max_data = is_valid_rowwise_min_max
-          ? static_cast<fbgemm::float16*>(rowwise_min_max_contig->data_ptr())
-          : nullptr;
      at::parallel_for(
          0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
            fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<
@ -485,13 +413,10 @@ Tensor _qembeddingbag_nbit_prepack_helper(
                weight_data + start_idx * embedding_cols,
                end_idx - start_idx,
                static_cast<int>(embedding_cols),
-                output_data + start_idx * output_shape[1],
-                (is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
+                output_data + start_idx * output_shape[1]);
          });
    } else {
      const auto weight_data = weight_contig.data_ptr<float>();
-      const auto rowwise_min_max_data =
-          is_valid_rowwise_min_max ? rowwise_min_max_contig->data_ptr<float>() : nullptr;
      at::parallel_for(
          0, embedding_rows, 1, [&](int64_t start_idx, int64_t end_idx) {
            fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf<float>(
@ -499,8 +424,7 @@ Tensor _qembeddingbag_nbit_prepack_helper(
                weight_data + start_idx * embedding_cols,
                end_idx - start_idx,
                static_cast<int>(embedding_cols),
-                output_data + start_idx * output_shape[1],
-                (is_valid_rowwise_min_max ? (rowwise_min_max_data + start_idx * kRowwiseMinMaxNumCols) : nullptr));
+                output_data + start_idx * output_shape[1]);
          });
    }
  } else {
@ -590,16 +514,6 @@ Tensor qembeddingbag_4bit_prepack(
      weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio);
 }

-Tensor qembeddingbag_4bit_prepack_with_rowwise_min_max(
-    const Tensor& weight,
-    const Tensor& rowwise_min_max,
-    const bool optimized_qparams,
-    const int64_t nbins,
-    const double ratio) {
-  return _qembeddingbag_nbit_prepack_helper(
-      weight, 4 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
-}
-
 // Applies 2-bit row-wise quantization by determining the range
 // (maximum - minimum) and bias (minimum value) of each row in the input
 // matrix, and then scaling each element to an 2-bit number between 0 and
@ -617,16 +531,6 @@ Tensor qembeddingbag_2bit_prepack(
      weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio);
 }

-Tensor qembeddingbag_2bit_prepack_with_rowwise_min_max(
-    const Tensor& weight,
-    const Tensor& rowwise_min_max,
-    const bool optimized_qparams,
-    const int64_t nbins,
-    const double ratio) {
-  return _qembeddingbag_nbit_prepack_helper(
-      weight, 2 /*bit_width*/, optimized_qparams, nbins, ratio, rowwise_min_max);
-}
-
 class QEmbeddingPackWeights final {
 public:
  static c10::intrusive_ptr<EmbeddingPackedParamsBase> run(const at::Tensor& weight) {
@ -638,21 +542,12 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) {
  m.impl(
      TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack"),
      TORCH_FN(qembeddingbag_byte_prepack));
-  m.impl(
-      TORCH_SELECTIVE_NAME("quantized::embedding_bag_byte_prepack_with_rowwise_min_max"),
-      TORCH_FN(qembeddingbag_byte_prepack_with_rowwise_min_max));
  m.impl(
      TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack"),
      TORCH_FN(qembeddingbag_4bit_prepack));
-  m.impl(
-      TORCH_SELECTIVE_NAME("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max"),
-      TORCH_FN(qembeddingbag_4bit_prepack_with_rowwise_min_max));
  m.impl(
      TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack"),
      TORCH_FN(qembeddingbag_2bit_prepack));
-  m.impl(
-      TORCH_SELECTIVE_NAME("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max"),
-      TORCH_FN(qembeddingbag_2bit_prepack_with_rowwise_min_max));
 }

 TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.h
@ -3,10 +3,7 @@

 namespace at::native {

-Tensor& qembeddingbag_byte_prepack_out(
-    Tensor& output,
-    const Tensor& weight,
-    const std::optional<Tensor>& rowwise_min_max_opt = std::nullopt);
+Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);

 Tensor qembeddingbag_byte_prepack(const Tensor& weight);

--- a/aten/src/ATen/native/quantized/library.cpp
+++ b/aten/src/ATen/native/quantized/library.cpp
@ -121,12 +121,9 @@ TORCH_LIBRARY(quantized, m) {
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_unpack(__torch__.torch.classes.quantized.EmbeddingPackedParamsBase W_prepack) -> Tensor W_origin"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
-  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
-  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack(Tensor weight, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
-  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_prepack_with_rowwise_min_max(Tensor weight, Tensor rowwise_min_max, bool optimized_qparams=False, int nbins=200, float ratio=0.16) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_2bit_unpack(Tensor weight) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_byte_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
  m.def(TORCH_SELECTIVE_SCHEMA("quantized::embedding_bag_4bit_rowwise_offsets(Tensor weight, Tensor indices, Tensor? offsets=None, bool scale_grad_by_freq=False, int mode=0, bool pruned_weights=False, Tensor? per_sample_weights=None, Tensor? compressed_indices_mapping=None, bool include_last_offset=False) -> Tensor"), {at::Tag::pt2_compliant_tag});
--- a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
+++ b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@ -120,7 +120,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
  // buffer (in bytes)
  size_t orig_m = sparse_input.size(0);
  size_t div = orig_m * sparse_input.itemsize();
-  size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
+  size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
  auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});

  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
@ -155,7 +155,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
    TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
    handle_initialized = true;
  }
-  // cuSPARSELt constructs
+  // cuSPARSELt constructs
  cusparseLtMatmulDescriptor_t matmul;
  cusparseLtMatmulPlan_t plan;
  cusparseLtMatmulAlgSelection_t alg_sel;
--- a/aten/src/ATen/templates/FunctionalInverses.h
+++ b/aten/src/ATen/templates/FunctionalInverses.h
@ -2,12 +2,22 @@

 // ${generated_comment}

-#include <ATen/FunctionalStorageImpl.h>
 #include <ATen/Tensor.h>

 namespace at {
 namespace functionalization {

+enum class InverseReturnMode {
+  /// Specifies that functional inverses should always return a view.
+  AlwaysView,
+  /// Specifies that functional inverses should always return a non-view / copy.
+  NeverView,
+  /// Specifies that functional inverses should return a view unless a (copying) scatter
+  /// inverse exists, in which case that will be used instead.
+  /// This avoids as_strided() calls that can be difficult for subclasses to handle.
+  ViewOrScatterInverse,
+};
+
 struct FunctionalInverses {

 ${view_inverse_declarations}
--- a/aten/src/ATen/templates/RegisterFunctionalization.cpp
+++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp
@ -4,7 +4,7 @@
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/EmptyTensor.h>
 #include <ATen/FunctionalTensorWrapper.h>
-#include <ATen/ViewMetaClasses.h>
+#include <ATen/FunctionalInverses.h>
 #include <ATen/MemoryOverlap.h>
 #include <torch/library.h>

--- a/aten/src/ATen/templates/ViewMetaClasses.cpp
+++ b/aten/src/ATen/templates/ViewMetaClasses.cpp
@ -1,19 +0,0 @@
-// ${generated_comment}
-
-#include <ATen/FunctionalInverses.h>
-#include <ATen/ViewMetaClasses.h>
-
-#ifndef AT_PER_OPERATOR_HEADERS
-#include <ATen/Operators.h>
-#include <ATen/NativeFunctions.h>
-#else
-${op_headers}
-#endif
-
-namespace at {
-namespace functionalization {
-
-${view_meta_implementations}
-
-} // namespace functionalization
-} // namespace at
--- a/aten/src/ATen/templates/ViewMetaClasses.h
+++ b/aten/src/ATen/templates/ViewMetaClasses.h
@ -1,12 +0,0 @@
-#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-// ${generated_comment}
-
-#include <ATen/FunctionalStorageImpl.h>
-
-namespace at {
-namespace functionalization {
-
-${view_meta_declarations}
-
-} // namespace functionalization
-} // namespace at
--- a/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp
+++ b/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp
@ -1,11 +0,0 @@
-#include <ATen/ViewMetaClasses.h>
-#include <torch/csrc/functionalization/Module.h>
-
-namespace torch::functionalization {
-
-void initGenerated(PyObject* module) {
-  auto functionalization = py::handle(module).cast<py::module>();
-  $view_meta_bindings
-}
-
-} // namespace torch::functionalization
--- a/benchmarks/dynamo/check_accuracy.py
+++ b/benchmarks/dynamo/check_accuracy.py
@ -78,8 +78,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
                "google/gemma-3-4b-it",
                "openai/whisper-tiny",
                "Qwen/Qwen3-0.6B",
-                "mistralai/Mistral-7B-Instruct-v0.3",
-                "openai/gpt-oss-20b",
            }
        )

--- a/benchmarks/dynamo/check_graph_breaks.py
+++ b/benchmarks/dynamo/check_graph_breaks.py
@ -61,8 +61,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
                "google/gemma-3-4b-it",
                "openai/whisper-tiny",
                "Qwen/Qwen3-0.6B",
-                "mistralai/Mistral-7B-Instruct-v0.3",
-                "openai/gpt-oss-20b",
            }
        )

--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass,0
-
-
-
-openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_huggingface_inference.csv
@ -187,11 +187,3 @@ openai/whisper-tiny,fail_to_run,0


 Qwen/Qwen3-0.6B,fail_to_run,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,fail_to_run,0
-
-
-
-openai/gpt-oss-20b,fail_to_run,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass_due_to_skip,0


 Qwen/Qwen3-0.6B,pass_due_to_skip,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
-
-
-
-openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass_due_to_skip,0


 Qwen/Qwen3-0.6B,pass_due_to_skip,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
-
-
-
-openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass_due_to_skip,0


 Qwen/Qwen3-0.6B,pass_due_to_skip,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass_due_to_skip,0
-
-
-
-openai/gpt-oss-20b,pass_due_to_skip,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass,0
-
-
-
-openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass,0
-
-
-
-openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass,0
-
-
-
-openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass,0
-
-
-
-openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_inference.csv
@ -191,11 +191,3 @@ openai/whisper-tiny,pass,0


 Qwen/Qwen3-0.6B,pass,0
-
-
-
-mistralai/Mistral-7B-Instruct-v0.3,pass,0
-
-
-
-openai/gpt-oss-20b,pass,0
--- a/benchmarks/dynamo/huggingface.yaml
+++ b/benchmarks/dynamo/huggingface.yaml
@ -11,8 +11,6 @@ skip:
    - GPTJForQuestionAnswering
    # Model too big
    - google/gemma-3-4b-it
-    - openai/gpt-oss-20b
-    - mistralai/Mistral-7B-Instruct-v0.3

  device:
    cpu:
@ -21,8 +19,6 @@ skip:
      - google/gemma-3-4b-it
      - openai/whisper-tiny
      - Qwen/Qwen3-0.6B
-      - mistralai/Mistral-7B-Instruct-v0.3
-      - openai/gpt-oss-20b

  control_flow:
    - AllenaiLongformerBase
@ -83,8 +79,6 @@ batch_size:
    google/gemma-3-4b-it: 8
    openai/whisper-tiny: 8
    Qwen/Qwen3-0.6B: 8
-    mistralai/Mistral-7B-Instruct-v0.3: 8
-    openai/gpt-oss-20b: 8


 tolerance:
--- a/benchmarks/dynamo/huggingface_llm_models.py
+++ b/benchmarks/dynamo/huggingface_llm_models.py
@ -99,6 +99,4 @@ HF_LLM_MODELS: dict[str, Benchmark] = {
    "google/gemma-3-4b-it": TextGenerationBenchmark,
    "openai/whisper-tiny": WhisperBenchmark,
    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
-    "mistralai/Mistral-7B-Instruct-v0.3": TextGenerationBenchmark,
-    "openai/gpt-oss-20b": TextGenerationBenchmark,
 }
--- a/benchmarks/dynamo/huggingface_models_list.txt
+++ b/benchmarks/dynamo/huggingface_models_list.txt
@ -51,5 +51,3 @@ google/gemma-2-2b,8
 google/gemma-3-4b-it,8
 openai/whisper-tiny,8
 Qwen/Qwen3-0.6B,8
-mistralai/Mistral-7B-Instruct-v0.3, 8
-openai/gpt-oss-20b, 8
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
 # for targets in subfolders
 ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"

-C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
+C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")

 # a dictionary maps third party library name to fbsource and oss target
 THIRD_PARTY_LIBS = {
@ -391,8 +391,6 @@ def get_aten_generated_files(enabled_backends):
        "CompositeExplicitAutogradFunctions_inl.h",
        "CompositeExplicitAutogradNonFunctionalFunctions.h",
        "CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
-        "ViewMetaClasses.h",
-        "ViewMetaClasses.cpp",
        "VmapGeneratedPlumbing.h",
        "core/ATenOpList.cpp",
        "core/TensorBody.h",
@ -950,6 +948,7 @@ def define_buck_targets(
            [
                ("torch/csrc/api/include", "torch/**/*.h"),
                ("", "torch/csrc/**/*.h"),
+                ("", "torch/csrc/**/*.hpp"),
                ("", "torch/nativert/**/*.h"),
                ("", "torch/headeronly/**/*.h"),
                ("", "torch/script.h"),
@ -1194,7 +1193,6 @@ def define_buck_targets(
            "NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
            "Operators.h": ":gen_aten[Operators.h]",
            "RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
-            "ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
            "core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
            "core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
            "core/enum_tag.h": ":gen_aten[core/enum_tag.h]",
@ -2050,6 +2048,7 @@ def define_buck_targets(
                ("", "caffe2/utils/*.h"),
                ("", "caffe2/core/*.h"),
                ("", "torch/csrc/*.h"),
+                ("", "torch/csrc/*.hpp"),
                ("", "torch/csrc/api/include/torch/*.h"),
                ("", "torch/csrc/autograd/*.h"),
                ("", "torch/csrc/autograd/*/*.h"),
--- a/build.bzl
+++ b/build.bzl
@ -118,9 +118,6 @@ def define_targets(rules):
            ":LazyNonNativeIr.h",
            ":RegisterDispatchDefinitions.ini",
            ":RegisterDispatchKey.cpp",
-            ":ViewMetaClassesPythonBinding.cpp",
-            ":ViewMetaClasses.cpp",
-            ":ViewMetaClasses.h",
            ":native_functions.yaml",
            ":shape_inference.h",
            ":tags.yaml",
@ -173,7 +170,6 @@ GENERATED_H = [
    "FunctionalInverses.h",
    "RedispatchFunctions.h",
    "RegistrationDeclarations.h",
-    "ViewMetaClasses.h",
    "VmapGeneratedPlumbing.h",
 ]

@ -250,7 +246,6 @@ GENERATED_CPP = [
    "RegisterFunctionalization_1.cpp",
    "RegisterFunctionalization_2.cpp",
    "RegisterFunctionalization_3.cpp",
-    "ViewMetaClasses.cpp",
 ]

 GENERATED_CPP_CORE = [
@ -312,7 +307,6 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
    "torch/csrc/autograd/generated/python_torch_functions_1.cpp",
    "torch/csrc/autograd/generated/python_torch_functions_2.cpp",
    "torch/csrc/autograd/generated/python_variable_methods.cpp",
-    "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
 ]

 GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -1010,7 +1010,6 @@ libtorch_python_core_sources = [
    "torch/csrc/utils/disable_torch_function.cpp",
    "torch/csrc/utils/verbose.cpp",
    "torch/csrc/cpu/Module.cpp",
-    "torch/csrc/functionalization/Module.cpp",
    "torch/csrc/instruction_counter/Module.cpp",
    "torch/nativert/python/Bindings.cpp",
 ] + lazy_tensor_core_python_sources
@ -1053,7 +1052,6 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
        "torch/csrc/autograd/generated/python_torch_functions_1.cpp",
        "torch/csrc/autograd/generated/python_torch_functions_2.cpp",
        "torch/csrc/autograd/generated/python_variable_methods.cpp",
-        "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp",
    ]]

    _libtorch_python_sources.extend(libtorch_python_core_sources)
--- a/c10/core/DispatchKeySet.h
+++ b/c10/core/DispatchKeySet.h
@ -3,9 +3,9 @@
 #include <c10/macros/Export.h>
 #include <c10/macros/Macros.h>
 #include <c10/util/Exception.h>
-#include <c10/util/Metaprogramming.h>
-#include <c10/util/TypeList.h>
 #include <c10/util/llvmMathExtras.h>
+#include <torch/headeronly/util/Metaprogramming.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <array>
 #include <cstddef>
 #include <cstdint>
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@ -3244,7 +3244,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
    are_equal<sizeof(autograd_meta_),      4,  FieldNameEnum::autograd_meta_>();
    are_equal<sizeof(extra_meta_),         4,  FieldNameEnum::extra_meta_>();
    are_equal<sizeof(version_counter_),    4,  FieldNameEnum::version_counter_>();
-    are_equal<sizeof(pyobj_slot_),         4,  FieldNameEnum::pyobj_slot_>();
+    are_equal<sizeof(pyobj_slot_),    8,  FieldNameEnum::pyobj_slot_>();
    is_le<sizeof(sizes_and_strides_),     88, FieldNameEnum::sizes_and_strides_>();
    are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
    are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
    is_le<sizeof(autograd_meta_),         16,  FieldNameEnum::autograd_meta_>();
    is_le<sizeof(extra_meta_),            16,  FieldNameEnum::extra_meta_>();
    are_equal<sizeof(version_counter_),    8,  FieldNameEnum::version_counter_>();
-    are_equal<sizeof(pyobj_slot_),         8,  FieldNameEnum::pyobj_slot_>();
+    are_equal<sizeof(pyobj_slot_),   16,  FieldNameEnum::pyobj_slot_>();
    are_equal<sizeof(sizes_and_strides_), 88,  FieldNameEnum::sizes_and_strides_>();
    are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
    are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
--- a/c10/core/impl/PyInterpreterHooks.h
+++ b/c10/core/impl/PyInterpreterHooks.h
@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {

  // Get the PyInterpreter instance
  // Stub implementation throws error when Python is not available
-  // We return nullptr rather than throwing an error since there are bits of c10
-  // that expect an empty PyObjectSlot when python is not available.
  virtual PyInterpreter* getPyInterpreter() const {
-    return nullptr;
+    TORCH_CHECK(
+        false,
+        "PyTorch was compiled without Python support. "
+        "Cannot access Python interpreter from C++.");
  }
 };

--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@ -2,7 +2,7 @@

 namespace c10::impl {

-PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
+PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}

 PyObjectSlot::~PyObjectSlot() {
  maybe_destroy_pyobj();
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {

 void PyObjectSlot::maybe_destroy_pyobj() {
  if (owns_pyobj()) {
-    TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
+    TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
    TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
-    (*getGlobalPyInterpreter())
+    (*pyobj_interpreter_.load(std::memory_order_acquire))
        ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
    // NB: this destructor can only be entered when there are no
    // references to this C++ object (obviously), NOR any references
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
 }

 PyInterpreter* PyObjectSlot::pyobj_interpreter() {
-  return getGlobalPyInterpreter();
+  return pyobj_interpreter_.load(std::memory_order_acquire);
 }

 PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
 }

 PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
-  auto interpreter = getGlobalPyInterpreter();
+  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
  if (interpreter) {
    return *interpreter;
  }
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@ -6,17 +6,10 @@
 #include <c10/util/python_stub.h>
 #include <optional>

+#include <atomic>
+
 namespace c10::impl {

-// Function pointer type for getting the global interpreter
-using GetPyInterpreterFn = PyInterpreter* (*)();
-
-// Global function pointer (set by csrc initialization)
-C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
-
-// Helper function to get the global interpreter
-C10_API PyInterpreter* getGlobalPyInterpreter();
-
 struct C10_API PyObjectSlot {
 public:
  PyObjectSlot();
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION.  Make sure to clean up after
  // PyObject if necessary!
  void init_pyobj(PyObject* pyobj) {
+    pyobj_interpreter_.store(
+        getGlobalPyInterpreter(), std::memory_order_relaxed);
    pyobj_ = pyobj;
  }

@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {

  // @todo alban: I'm not too sure what's going on here, we can probably delete
  // it but it's worthwhile making sure
-  std::optional<PyObject*> check_pyobj() const {
-    impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
-    if (interpreter == nullptr || pyobj_ == nullptr) {
+  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
+    impl::PyInterpreter* interpreter =
+        pyobj_interpreter_.load(std::memory_order_acquire);
+    if (interpreter == nullptr) {
      return std::nullopt;
    }
-    if (c10::impl::HermeticPyObjectTLS::get_state()) {
+
+    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
      return std::nullopt;
+    } else {
+      return _unchecked_untagged_pyobj();
    }
-    return _unchecked_untagged_pyobj();
  }

  PyInterpreter& load_pyobj_interpreter() const;
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
  void set_owns_pyobj(bool b);

 private:
+  // This field contains the interpreter tag for this object.  See
+  // Note [Python interpreter tag] for general context
+  //
+  // Note [Memory ordering on Python interpreter tag]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // What memory_order do we need when accessing this atomic?  We don't
+  // need a single total modification order (as provided by
+  // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
+  // transition from nullptr to a non-null pointer and never changes afterwards.
+  // Because there is only one modification, it trivially already has a total
+  // modification order (e.g., we don't need fences or locked instructions on
+  // x86)
+  //
+  // In fact, one could make a reasonable argument that relaxed reads are OK,
+  // due to the presence of external locking (GIL) to ensure that interactions
+  // with other data structures are still correctly synchronized, so that
+  // we fall in the "Single-Location Data Structures" case as described in
+  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
+  // However, on x86, it doesn't matter if I use acquire or relaxed on the load
+  // as I get the same assembly in both cases.  So I just use the more
+  // conservative acquire (which will impede compiler optimizations but I don't
+  // care)
+  std::atomic<PyInterpreter*> pyobj_interpreter_;
+
  // This field contains a reference to a PyObject representing this Tensor.
  // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
  // PyObject for it and set this field.  This field does not have to be
--- a/c10/ovrsource_defs.bzl
+++ b/c10/ovrsource_defs.bzl
@ -18,9 +18,9 @@ cuda_supported_platforms = [

 def define_c10_ovrsource(name, is_mobile):
    if is_mobile:
-        pp_flags = ["-DC10_MOBILE=1"]
+        pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
    else:
-        pp_flags = []
+        pp_flags = ["-DC10_USE_GLOG"]

    oxx_static_library(
        name = name,
--- a/c10/test/util/Metaprogramming_test.cpp
+++ b/c10/test/util/Metaprogramming_test.cpp
@ -1,6 +1,6 @@
 #include <c10/test/util/Macros.h>
-#include <c10/util/Metaprogramming.h>
 #include <gtest/gtest.h>
+#include <torch/headeronly/util/Metaprogramming.h>
 #include <cstdlib>

 using namespace c10::guts;
--- a/c10/test/util/TypeIndex_test.cpp
+++ b/c10/test/util/TypeIndex_test.cpp
@ -1,6 +1,6 @@
-#include <c10/util/Metaprogramming.h>
 #include <c10/util/TypeIndex.h>
 #include <gtest/gtest.h>
+#include <torch/headeronly/util/Metaprogramming.h>

 using c10::util::get_fully_qualified_type_name;
 using c10::util::get_type_index;
--- a/c10/test/util/TypeList_test.cpp
+++ b/c10/test/util/TypeList_test.cpp
@ -1,5 +1,5 @@
-#include <c10/util/TypeList.h>
 #include <gtest/gtest.h>
+#include <torch/headeronly/util/TypeList.h>
 #include <memory>

 using namespace c10::guts::typelist;
--- a/c10/test/util/TypeTraits_test.cpp
+++ b/c10/test/util/TypeTraits_test.cpp
@ -1,5 +1,5 @@
-#include <c10/util/TypeTraits.h>
 #include <gtest/gtest.h>
+#include <torch/headeronly/util/TypeTraits.h>

 using namespace c10::guts;

--- a/c10/util/Metaprogramming.cpp
+++ b/c10/util/Metaprogramming.cpp
@ -1 +1 @@
-#include <c10/util/Metaprogramming.h>
+#include <torch/headeronly/util/Metaprogramming.h>
--- a/c10/util/Metaprogramming.h
+++ b/c10/util/Metaprogramming.h
@ -1,224 +1 @@
-#pragma once
-
-#include <c10/util/TypeList.h>
-#include <type_traits>
-
-namespace c10::guts {
-
-/**
- * Access information about result type or arguments from a function type.
- * Example:
- * using A = function_traits<int (float, double)>::return_type // A == int
- * using A = function_traits<int (float, double)>::parameter_types::tuple_type
- * // A == tuple<float, double>
- */
-template <class Func>
-struct function_traits {
-  static_assert(
-      !std::is_same_v<Func, Func>,
-      "In function_traits<Func>, Func must be a plain function type.");
-};
-template <class Result, class... Args>
-struct function_traits<Result(Args...)> {
-  using func_type = Result(Args...);
-  using return_type = Result;
-  using parameter_types = typelist::typelist<Args...>;
-  static constexpr auto number_of_parameters = sizeof...(Args);
-};
-
-/**
- * infer_function_traits: creates a `function_traits` type for a simple
- * function (pointer) or functor (lambda/struct). Currently does not support
- * class methods.
- */
-
-template <typename Functor>
-struct infer_function_traits {
-  using type = function_traits<
-      c10::guts::detail::strip_class_t<decltype(&Functor::operator())>>;
-};
-
-template <typename Result, typename... Args>
-struct infer_function_traits<Result (*)(Args...)> {
-  using type = function_traits<Result(Args...)>;
-};
-
-template <typename Result, typename... Args>
-struct infer_function_traits<Result(Args...)> {
-  using type = function_traits<Result(Args...)>;
-};
-
-template <typename T>
-using infer_function_traits_t = typename infer_function_traits<T>::type;
-
-/**
- * make_function_traits: creates a `function_traits` type given a Return type
- * and a typelist of Argument types
- *
- * Example:
- * bool f(int, int);
- *
- * infer_function_traits_t<f> == make_function_traits_t<bool,
- * typelist::typelist<int, int>>
- */
-template <typename Result, typename ArgList>
-struct make_function_traits {
-  static_assert(
-      false_t<ArgList>::value,
-      "In guts::make_function_traits<Result, TypeList>, the ArgList argument must be typelist<...>.");
-};
-
-template <typename Result, typename... Args>
-struct make_function_traits<Result, typelist::typelist<Args...>> {
-  using type = function_traits<Result(Args...)>;
-};
-
-template <typename Result, typename ArgList>
-using make_function_traits_t =
-    typename make_function_traits<Result, ArgList>::type;
-
-/**
- * make_offset_index_sequence<Start, N>
- * Like make_index_sequence<N>, but starting from Start instead of 0.
- *
- * Example:
- *  make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12>
- */
-template <size_t Start, size_t N, size_t... Is>
-struct make_offset_index_sequence_impl
-    : make_offset_index_sequence_impl<Start, N - 1, Start + N - 1, Is...> {
-  static_assert(
-      static_cast<int>(Start) >= 0,
-      "make_offset_index_sequence: Start < 0");
-  static_assert(static_cast<int>(N) >= 0, "make_offset_index_sequence: N < 0");
-};
-
-template <size_t Start, size_t... Is>
-struct make_offset_index_sequence_impl<Start, 0, Is...> {
-  typedef std::index_sequence<Is...> type;
-};
-
-template <size_t Start, size_t N>
-using make_offset_index_sequence =
-    typename make_offset_index_sequence_impl<Start, N>::type;
-
-/**
- * Use tuple_elements to extract a position-indexed subset of elements
- * from the argument tuple into a result tuple.
- *
- * Example:
- *  std::tuple<int, const char*, double> t = std::make_tuple(0, "HEY", 2.0);
- *  std::tuple<int, double> result = tuple_elements(t, std::index_sequence<0,
- * 2>());
- */
-template <class Tuple, size_t... Is>
-constexpr auto tuple_elements(Tuple t, std::index_sequence<Is...>) {
-  return std::tuple<std::tuple_element_t<Is, Tuple>...>(std::get<Is>(t)...);
-}
-
-/**
- * Use tuple_take to extract the first or last n elements from the argument
- * tuple into a result tuple.
- *
- * Example:
- *  std::tuple<int, const char*, double> t = std::make_tuple(0, "HEY", 2.0);
- *  std::tuple<int, const char*> first_two = tuple_take<decltype(t), 2>(t);
- *  std::tuple<const char*, double> last_two = tuple_take<decltype(t), -2>(t);
- */
-template <class Tuple, int N, class Enable = void>
-struct TupleTake {};
-
-template <class Tuple, int N>
-struct TupleTake<Tuple, N, std::enable_if_t<N >= 0, void>> {
-  static auto call(Tuple t) {
-    constexpr size_t size = std::tuple_size<Tuple>();
-    static_assert(N <= size, "tuple_take: N > size");
-    return tuple_elements(t, std::make_index_sequence<N>{});
-  }
-};
-
-template <class Tuple, int N>
-    struct TupleTake < Tuple,
-    N, std::enable_if_t<N<0, void>> {
-  static auto call(Tuple t) {
-    constexpr size_t size = std::tuple_size<Tuple>();
-    static_assert(-N <= size, "tuple_take: -N > size");
-    return tuple_elements(t, make_offset_index_sequence<size + N, -N>{});
-  }
-};
-
-template <class Tuple, int N>
-auto tuple_take(Tuple t) {
-  return TupleTake<Tuple, N>::call(t);
-}
-
-/**
- * Use tuple_slice to extract a contiguous subtuple from the argument.
- *
- * Example:
- *  std::tuple<int, const char*, double, bool> t = std::make_tuple(0,
- * "HEY", 2.0, false); std::tuple<int, const char*> middle_two =
- * tuple_slice<decltype(t), 1, 2>(t);
- */
-template <class Tuple, size_t Start, size_t N>
-constexpr auto tuple_slice(Tuple t) {
-  constexpr size_t size = std::tuple_size<Tuple>();
-  static_assert(Start + N <= size, "tuple_slice: Start + N > size");
-  return tuple_elements(t, make_offset_index_sequence<Start, N>{});
-}
-
-/**
- * Use tuple_map to run a mapping function over a tuple to get a new tuple.
- *
- * Example 1:
- *   auto result = tuple_map(std::tuple<int32_t, int32_t, int32_t>(3, 4, 5), []
- * (int32_t a) -> int16_t {return a+1;});
- *   // result == std::tuple<int16_t, int16_t, int16_t>(4, 5, 6)
- *
- * Example 2:
- *   struct Mapper {
- *     std::string operator()(int32_t a) const {
- *       return std::to_string(a);
- *     }
- *     int64_t operator()(const std::string& a) const {
- *        return atoi(a.c_str());
- *     }
- *   };
- *   auto result = tuple_map(std::tuple<int32_t, std::string>(3, "4"),
- * Mapper());
- *   // result == std::tuple<std::string, int64_t>("3", 4)
- *
- * Example 3:
- *   struct A final {
- *    int32_t func() {
- *      return 5;
- *    }
- *  };
- *  struct B final {
- *    std::string func() {
- *      return "5";
- *    }
- *  };
- *  auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return
- * a.func(); });
- *  // result == std::tuple<int32_t, std::string>(5, "5");
- */
-namespace detail {
-template <class Mapper, class... Args, size_t... Indices>
-auto tuple_map(
-    // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
-    std::tuple<Args...>&& tuple,
-    const Mapper& mapper,
-    std::index_sequence<Indices...>) {
-  return std::tuple<decltype(mapper(std::forward<Args>(std::get<Indices>(
-      tuple))))...>(mapper(std::forward<Args>(std::get<Indices>(tuple)))...);
-}
-} // namespace detail
-
-template <class Mapper, class... Args>
-auto tuple_map(std::tuple<Args...>&& tuple, const Mapper& mapper) {
-  return detail::tuple_map(
-      std::move(tuple), mapper, std::index_sequence_for<Args...>());
-}
-
-} // namespace c10::guts
+#include <torch/headeronly/util/Metaprogramming.h>
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Luca Wehrstedt	9072e92197	Update (base update) [ghstack-poisoned]	2025-09-23 12:20:53 +00:00
Luca Wehrstedt	ecb064b1dd	Update (base update) [ghstack-poisoned]	2025-09-23 12:08:34 +00:00
Luca Wehrstedt	d9f0ae2f43	Update (base update) [ghstack-poisoned]	2025-09-22 11:58:17 +00:00