Fixes #114844
In the linked issue we have
```
compiled_module = torch.compile(module)
compiled_module.x = ...
compiled_module(...) # Mutates self.x
```
Since the module mutates `self.x`, you would expect `compiled_module.x`
to be updated, but in reality `compiled_module.x = ...` sets an attribute "x"
on the `OptimizedModule` object, while the forward method of the module mutates
`module.x`.
This gives the expected behavior by forwarding `compiled_module.__setattr__`
down to `module.__setattr__`. There is already a corresponding `__getattr__`
so now `compiled_module.x` becomes an alias for `module.x`.
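A minimal sketch of the forwarding idea, using an illustrative class rather than the real `OptimizedModule` (the `_orig_mod` attribute name mirrors what dynamo uses for the wrapped module):
```python
class WrapperSketch:
    """Illustrative stand-in for OptimizedModule's attribute forwarding."""

    def __init__(self, mod):
        # bypass our own __setattr__ so the wrapped module lives on the wrapper itself
        object.__setattr__(self, "_orig_mod", mod)

    def __getattr__(self, name):
        # only invoked when the attribute is not found on the wrapper itself
        return getattr(self._orig_mod, name)

    def __setattr__(self, name, value):
        # forward writes to the wrapped module so reads and writes hit the same object
        setattr(self._orig_mod, name, value)
```
With both hooks forwarding, `compiled_module.x = ...` updates `module.x`, and mutations of `self.x` inside `forward` are visible through `compiled_module.x`.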
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122098
Approved by: https://github.com/ezyang, https://github.com/lezcano
Summary:
During tracing, some constants (tensor_constant{idx}) are being generated internally.
Those constants are neither parameters nor buffers, and users have zero control over them.
To accommodate this, we should allow users not to pass in those internally generated constants, while still being able to use the constants in the model.
Test Plan:
Included in commit.
```
build/bin/test_aot_inductor
```
Differential Revision: D55286634
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122562
Approved by: https://github.com/chenyang78, https://github.com/khabinov
Before this PR we were not precompiling triton templates in parallel. Compilation would occur during benchmarking.
Triton benchmarking templates were emitted as:
```
@triton.jit
def triton_mm(arg_A, arg_B, out_ptr0):
```
In order to precompile we need to give the full kernel specification, as we do when we emit the template in the final output code generation.
```
@triton_heuristics.template(
num_stages=3,
num_warps=8,
triton_meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2), equal_to_1=(), ids_of_folded_args=(), divisible_by_8=())]},
inductor_meta={'kernel_name': 'Placeholder.DESCRIPTIVE_NAME', 'backend_hash': 'cdeecfeccd31ad7810f96b5752194b1c2406d0a81e39a6ca09c8ee150baae183'},
)
@triton.jit
def triton_mm(arg_A, arg_B, out_ptr0):
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121998
Approved by: https://github.com/jansel
Previously, all jvp tests under dynamo/test_dynamic_shapes would fail because symbolic execution wasn't supported in some autograd functions.
List of changes:
- Update `_has_same_storage_numel` to use `sym_nbytes`
- Symintify `_efficientzerotensor_meta`
- Introduce `empty_generic_symint` with the first argument `size` as symbolic integer
- Update gen_variable_type.py script to call the symint version of zeros_fn function (zeros_symint / _efficientzerotensor_symint)
- Update `has_same_meta` to call `sym_*` functions
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120338
Approved by: https://github.com/soulitzer
When working on testing all-reduce with an alternative rccl replacement backend, my test script crashed. After debugging, I found that `ncclGetLastError(NULL)` returned null; the code then used the return value to construct a `std::string`, which seg-faulted with the exception `basic_string::_M_construct null not valid`.
This pull request is to fix this edge condition so that it will exit the program gracefully with useful information.
**Test:**
Before the fix, my test script exits like below:
```
File "/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 2051, in all_reduce
work = group.allreduce([tensor], opts)
RuntimeError: basic_string::_M_construct null not valid
```
After this fix, my test script exits with a useful message like:
```
[rank0]: File "/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 2219, in all_reduce
[rank0]: work = group.allreduce([tensor], opts)
[rank0]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/NCCLUtils.hpp:272, internal error - please report this issue to the NCCL developers, NCCL version 0.4.2
[rank0]: ncclInternalError: Internal check failed.
[rank0]: Last error: Unknown NCCL Error
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121905
Approved by: https://github.com/wconstab
At a high level, the goal of this refactor was to make it so that `MetaConverter.__call__` has a straightforward code structure in three steps: (1) check if we support doing meta conversion, (2) describe the tensor into MetaTensorDesc, (3) call `meta_tensor` on MetaTensorDesc. However, this is not so easy to do, because there is a big pile of special cases for functional tensor inside `__call__`.
The primary complication is handling the ambient functionalization state: specifically, the functorch dynamic layer stack and the Python functionalization dispatch. The old code demands that meta tensor conversion happen with this state disabled. But I discovered that when I reconstruct functorch tensors it demands that the functorch layers be active; in fact a batch tensor will have a pointer to the internal functorch layer.
I had some discussion with Richard Zou about what code structure here makes sense. In particular, one of the goals of the refactor here is that I can inflate MetaTensorDesc from an entirely different process, which may not have all of the functorch layers activated at the time we do reconstruction. So it seems to me that we should make it explicit in MetaTensorDesc that there was some functorch layer active at the time the functorch tensor was serialized, so that we could potentially know we need to reconstruct these layers on the other side. This is NOT implemented yet, but there's some notes about how potentially it could proceed. But the important thing here is we SHOULD disable everything when we run `meta_tensor`, and internally be responsible for restoring the stack. Actually, the necessary infra bits in functorch don't exist to do this, so I added some simple implementations in pyfunctorch.py.
The rest is splitting up the manipulations on tensor (we do things like sync the real tensor before describing it; Describer is responsible for this now) and I also tried to simplify the not supported condition, based on my best understanding of what the old thicket of conditions was doing. You may notice that the internal meta_tensor handling of functional tensor is inconsistent with surrounding code: this is because I *exactly* replicated the old reconstruction behavior; a further refactor would be to rationalize this.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122202
Approved by: https://github.com/zou3519
If users want to run some CUDA test cases on other devices by setting an environment variable, in order to test performance on custom devices, this PR shows how that can be done.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122182
Approved by: https://github.com/ezyang
Summary:
Make sure the order of submodules is the same as the original eager module.
bypass-github-export-checks
Test Plan: buck test mode/opt caffe2/test:test_export -- -r test_unflatten_submodule_ordering
Differential Revision: D55251277
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122507
Approved by: https://github.com/tugsbayasgalan
Fixes https://github.com/pytorch/pytorch/issues/121085
This PR is pretty involved, so pay attention to this description. At a high
level, the refactor is intended to be mechanical: anywhere in
MetaConverter where previously we took a Tensor as argument, we now take
a MetaTensorDesc, which contains all of the information that we would
have queried off of the Tensor, but placed into a separate data
structure which we can serialize or use to recreate a fake tensor in
a separate fake tensor mode in exact fidelity to the original.
However, this transformation is not always entirely mechanical. Here
is what you need to pay attention to:
- The memo table from real Tensor -> meta/fake Tensor is now broken
into two memo tables: real Tensor -> stable int id -> meta/fake
Tensor. The stable int id is needed so that when we do serialization,
we know when tensors/storages alias each other and can ensure we preserve
this aliasing upon deserialization.
The way I have implemented this changes the weak reference behavior.
Previously, when either the real Tensor OR the meta/fake Tensor went
dead, we would remove the entry from the memo table. Now, this only
removes entries from one of the two memo tables. This semantically
makes sense, because the user may have held on to the stable int id
out of band, and may expect a real Tensor to continue to be numbered
consistently / expect to be able to lookup a meta/fake tensor from
this id. If this is unacceptable, it may be possible to rejigger
the memo tables so that we have real Tensor -> stable int id
and real Tensor -> meta/fake Tensor, but TBH I find the new
implementation a lot simpler, and arranging the memo tables in this
way means that I have to muck around with the real tensor to save
to the memo table; in the current implementation, I never pass the
Tensor to meta_tensor function AT ALL, which means it is impossible
to accidentally depend on it.
- When I fill in the fields of MetaTensorDesc in describe_tensor, I need
to be careful not to poke fields when they are not valid. Previously,
preconditions were implicitly checked via the conditional structure
("is this sparse? is this nested?") that is tested before we start
reading attributes. This structure has to be replicated in
describe_tensor, and I have almost assuredly gotten it wrong on my
first try (I'll be grinding through it on CI; a careful audit will
help too, by auditing that I've tested all the same conditionals that
the original access was guarded by.)
- I originally submitted https://github.com/pytorch/pytorch/pull/121821
for the symbolic shapes change, but it turned out the way I did it
there didn't actually work so well for this PR. I ended up just
inlining the symbolic shapes allocation logic into MetaConverter
(look for calls to maybe_specialize_sym_int_with_hint), maybe there
is a better way to structure it, but what I really want is to
just read sizes/strides/offset directly off of MetaTensorDesc; I
don't want another intermediate data structure.
- Some fields aren't serializable. These are documented as "NOT
serializable". ctx/type should morally be serializable and I just
need to setup a contract with subclasses to let them be serialized.
The fake_mode is used solely to test if we are refakefying with
a pre-existing ShapeEnv and we want to reuse the SymInt
directly--serializing this case is hopeless but I am kind of hoping
after this refactor we do not need this at all. view_func is not
serializable because it's a bound C implemented method. Joel has
promised me that this is not too difficult to actually expose as a
true data structure, but this is the edgiest of edge cases and there
is no reason to deal with it right now.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122044
Approved by: https://github.com/eellison
Not sure what the idea was behind the `{self.tiling_factor}*sizeof(float)/sizeof({DTYPE_TO_CPP[dtype]})` size calculation (perhaps a copy-n-paste error during the refactor made by https://github.com/pytorch/pytorch/pull/97626), but `Vectorized::store(ptr, tiling_factor)` needs at least `tiling_factor` elements, not `tiling_factor/2` (which would be the case with the original calculation if the data type is a 64-bit value such as int64).
Discovered while trying to enable aarch64 vectorized inductor.
Minimal reproducer (reproducible on ARMv8 or any x86_64 machine that does not support AVX512):
```python
import torch
def do_ds(x, y):
return torch.diagonal_scatter(x, y)
x=torch.ones(10, 10, dtype=torch.int64)
y=torch.tensor([ 1, 2, -8, 8, 5, 5, -7, -8, 7, 0])
dsc = torch.compile(do_ds)
assert torch.allclose(torch.diagonal_scatter(x, y), dsc(x, y))
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122580
Approved by: https://github.com/Skylion007, https://github.com/jansel
The test may fail because it either uses target flags newer than the GPU, resulting in failures loading the compiled binary, or targets a GPU for which CUDA has no support yet/anymore.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122400
Approved by: https://github.com/ezyang
The test uses the CUDA compute capabilities of the current device to
compile an extension. If nvcc is older than the device, it will fail
with a message like "Unsupported gpu architecture 'compute_80'"
resulting in a `RuntimeError: Error building extension 'cudaext_archflags'`
ultimately failing the test.
This checks for this case and allows execution to continue.
Fixes https://github.com/pytorch/pytorch/issues/51950
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122402
Approved by: https://github.com/ezyang
Previously, when we applied a replacement, a SymInt that was
previously an unbacked SymInt would then transmute into whatever
we replaced it into (e.g., a constant).
This has a major downside: we often look at SymInts associated with
FX nodes (e.g., the meta of x.item() return) to find out where the
unbacked SymInt was allocated. If we replace it, we no longer can
find out where, e.g., u1 was allocated! But we need to know this
so we can generate deferred runtime asserts like u1 == s0.
To solve this problem, I have a special mode for replace, resolve_unbacked=False, which lets you disable substitutions on unbacked SymInts. When reporting node.expr, we preferentially avoid applying unbacked SymInt substitutions. To understand if we might accidentally reapply the substitution later, before we have reached the deferred runtime assert, we must study the calls to simplify() in ShapeEnv. My audit turns up these sites:
* `produce_guards`: this is fine, deferred runtime asserts never show up here, we must NOT have unbacked SymInts show up here. Similarly `get_nontrivial_guards`.
* `_maybe_evaluate_static`: this is fine, we are using this to determine if it is necessary to produce a guard/runtime assert. We don't want to reissue a runtime assert if we've already asserted on it, and replacements can help us understand if this has occurred.
* `_simplify_floor_div`: this is a legitimate bug, it needs to be `resolve_unbacked=False`
* `_refine_ranges`: this is fine, a refined range doesn't affect what runtime asserts we issue
* `_update_divisible`: this updates the `self.divisible` set, which specifies when we can simplify away divisibility constraints. Since this affects replacements only, it won't cause us to oversimplify a user provided expression.
There are some situations where we DO want to always apply the substitution, specifically when we have the duplicate symbol problem (we retrace an item call and get u0 and u1 which refer to the same thing.) I don't want two symbols in this case, so a special `rename_unbacked_to` is provided which sets up the unconditional renaming.
Along the way, I make a refinement to `_update_var_to_range`: if you update a var range for a size-like unbacked SymInt, you are now no longer allowed to set its lower bound below 2. This is because if you could, then our size oblivious tests for it would be inconsistent. Actually, I think there is still some inconsistency, because if you assert `u0 == 0` we will still end up with this in deferred runtime asserts, and we will then use this to simplify these statements to be True everywhere else. Maybe we should forbid this kind of refinement; not done in this PR.
Fixes https://github.com/pytorch/pytorch/issues/119689
Fixes https://github.com/pytorch/pytorch/issues/118385
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120816
Approved by: https://github.com/lezcano
This PR proposes to use std::optional<Generator>& for underlying functions to avoid unnecessary copy and move operations. The torchgen code was changed to generate the new type.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120076
Approved by: https://github.com/malfet
Summary: Currently there is a test for adding a backend in test/inductor/test_extension_backend.py for a cpp backend with a new device. However, there is no such test for the Triton backend; it should be possible for a user to create and register their own ExtensionWrapperCodegen and ExtensionScheduling for another non-CUDA device and be able to generate Triton code. For simplicity I have chosen to use a CPU device, as I think it's plausible someone might want to create a CPU Triton backend.
Unfortunately the generation and running of the code is quite tightly coupled so I've had to use a mocked function to extract the code before running. Suggestions are welcome for better ways to do this.
This is a stepping off point for some additional PRs to make the Triton code path less CUDA specific, as currently there would be no way to test this avenue.
Test plan:
```
frames [('total', 1), ('ok', 1)]
stats [('calls_captured', 3), ('unique_graphs', 1)]
inductor [('intermediate_hooks', 1)]
aot_autograd [('total', 1), ('ok', 1)]
.
----------------------------------------------------------------------
Ran 1 test in 0.394s
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122396
Approved by: https://github.com/jansel
Summary:
Add a shape inference tool that helps to infer each node shape of a given graph module.
1. Given an fx graph and an example input (it doesn't need to be an accurate input that can be run through forward, but it should have valid dims and data structures), `infer shape` creates an input of symbolic shape
2. Shape propagation with this symbolic input can catch runtime or value exceptions.
3. These errors are constraints for symbol values, and the constraint solver `infer symbolic values` helps us figure out specific values for each symbol.
4. Finally, we run the shape propagation based on input tensor to get tensor shapes for all nodes in the FX traced module.
Test Plan:
### 1. Test `infer symbol values`
Command:
```
buck2 test mode/opt //caffe2/test:fx_experimental -- test_infer_symbol_values
```
### 2. Test `infer shape`
Command:
```
buck2 test mode/opt //caffe2/test:fx_experimental -- test_infer_symbol_values
```
Inferred shape result like: P897560514
Differential Revision: D53593702
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120097
Approved by: https://github.com/yf225
At least the following tests fail when there is no supported vector ISA:
test_lowp_fp_neg_abs
test_non_contiguous_index_with_constant_stride
test_scalar_mul_bfloat16
test_transpose_non_contiguous
test_transpose_sum2d_cpu_only
test_transpose_sum_outer
test_transpose_vertical_sum_cpu_only
test_vertical_sum_cpu_only
Those tests assert `metrics.generated_cpp_vec_kernel_count` is nonzero,
which is never the case without a supported vector ISA, e.g. on PPC and
maybe on AArch64.
Skip those tests with a new decorator, and use the simpler one where an equivalent is already used.
Some usages of `metrics.generated_cpp_vec_kernel_count` were guarded by a check instead of skipping the test. I tried to apply that instead of a skip where the test looked similar enough to tests where that was previously done.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/117262
Approved by: https://github.com/jgong5, https://github.com/jansel
This PR is enough to fix https://github.com/pytorch/pytorch/issues/118600.
More description of the problem is in the issue, but the high-level problem is similar to the "tangents might be non-contiguous" problem that we handle today, via forcing all tangents to be contiguous. There, the problem was something like:
"We guessed the tangent strides incorrectly, because strides on the runtime tangents were different from strides on the forward outputs, which we used to generate tangents"
Here, the problem is similar:
"We guessed the tangent tensor subclass's metadata incorrectly, because the runtime tangent was a subclass with different metadata than the forward output subclass".
This happened in an internal DTensor issue, where the metadata in question was the `placements` (shard vs. replicate vs. Partial).
One option is to solve this problem via backward guards. This is needed to unblock internal though, so I figured handling this similarly to how we handle non-contiguous tangents would be reasonable. I did this by:
(1) Assert that the metadata on subclass tangents is the same as what we guessed, and if not raise a loud error
(2) In the error message, provide the name of an optional method that the subclass must implement to handle this case:
`def __force_same_metadata__(self, metadata_tensor):`: If the forward output had a `Replicate()` placement, but the runtime tangent had a `Shard(1)` placement, this method allows a subclass to take the tangent and "convert" it to one with a `Replicate()` placement.
`__force_standard_metadata__(self)`: One issue is that there is another placement called `_Partial`, and its semantics are such that DTensor is **unable** to convert a DTensor with some placement type into another DTensor with a `_Partial` placement.
`__force_standard_metadata__` is now called on all (fake) subclass forward outs at trace-time to generate tangents, and gives subclasses a chance to "fix" any outputs with metadata that they cannot convert to later. Morally, this is similar to the fact that we force a `contiguous()` call on all tangents at trace-time.
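A toy illustration of the proposed contract; the method names come from this description, but the class below is a plain stand-in rather than DTensor or a real traceable tensor subclass:
```python
class ToySubclass:
    def __init__(self, data, placement):
        self.data = data
        self.placement = placement  # e.g. "Replicate", "Shard(1)", "_Partial"

    def __force_same_metadata__(self, metadata_tensor):
        # convert this runtime tangent so its metadata matches the trace-time guess
        return ToySubclass(self.data, metadata_tensor.placement)

    def __force_standard_metadata__(self):
        # return a version whose metadata any runtime tangent can be converted to,
        # avoiding metadata (like "_Partial") that cannot be converted to later
        return ToySubclass(self.data, "Replicate")
```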
I'm interested in thoughts/feedback! Two new dunder methods on traceable subclasses is definitely a contentious change.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118670
Approved by: https://github.com/ezyang
We can use Sapling (hg) with the pytorch repo, but there are a couple of minor issues to teach our scripting to be happier with having either a git or hg repo.
This change fixes some issues in:
- setup.py
- lintrunner
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122072
Approved by: https://github.com/ezyang
List of changes:
- Replace JVP_NESTING by torch._C._functorch.maybe_current_level()
- Remove all increment nesting functions from wrap_fx_proxy_cls
- fwAD.make_dual receives the dual_level as keyword argument
- Add jvp_increment_nesting, set_fwd_grad_enabled and dual_level context managers to dynamo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119926
Approved by: https://github.com/zou3519
**Motivation**: https://github.com/pytorch/pytorch/issues/112771
**Summary**: Inductor generates Triton code that assumes inputs are 16-byte aligned. If the inputs aren't aligned, Inductor clones the inputs. This PR introduces a config option to not do this: when assume_aligned_inputs=False, Inductor will _not_ mark inputs as divisible_by_16, and Inductor will not make clones. This can generate code that might be a bit slower, but the tradeoff can be worth it in scenarios where you would otherwise make a lot of clones.
Ideally, we could do this on a per-tensor basis. But this would be a lot of work, and attempts to add guards on storage offsets to do this automatically have run into issues: recompilations and excessive time to generate/check guards.
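A hedged usage sketch, assuming the option lands at `torch._inductor.config.assume_aligned_inputs` as described above:
```python
import torch
import torch._inductor.config as inductor_config

# Opt out of the 16-byte-alignment assumption so Inductor does not clone
# unaligned inputs (possibly slower kernels, but no extra copies).
inductor_config.assume_aligned_inputs = False

@torch.compile
def f(x):
    return x * 2 + 1

base = torch.randn(1025)
out = f(base[1:])  # a storage-offset view that is typically not 16-byte aligned
```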
**Tests**: https://github.com/pytorch/pytorch/pull/122159 flips this to False. It didn't get through all the errors, but the ones we see are all expected failures: divisible_by_16 changes; triton kernel caching fails if we call the same triton kernel multiple times (this makes sense because the first call will have unaligned inputs, but subsequent calls have aligned inputs); and some xfailed tests start passing.
**Alternatives/RFC**:
* Is this the right thing to do with cudagraphs?
* Elias and Jason mentioned that we probably still want to make clones if we're dealing with unaligned inputs to matmuls. Is this something we should add in this config option? (In the use case I'm targeting, it seems like we don't need this optimization right now)
Differential Revision: [D55079094](https://our.internmc.facebook.com/intern/diff/D55079094)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122158
Approved by: https://github.com/ezyang
`unimplemented` is a function that raises an error, so
`raise unimplemented(...)` never reaches the `raise`.
Another related issue is that `raise unimplemented(...) from e`
doesn't attach the exception cause correctly. I fix this by adding
a `from_exc` argument to `unimplemented`.
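A self-contained sketch of the pattern; this toy `unimplemented` only mirrors the behavior described above (raising internally, with the new `from_exc` hook for chaining the cause):
```python
class Unsupported(RuntimeError):
    pass

def unimplemented(msg, *, from_exc=None):
    # the helper itself raises, so wrapping the call in `raise ...` is dead code
    if from_exc is not None:
        raise Unsupported(msg) from from_exc
    raise Unsupported(msg)

try:
    {}["missing"]
except KeyError as e:
    # before: `raise unimplemented("...") from e` -- the outer raise/from never
    # executed, so the cause was silently dropped
    unimplemented("indexing pattern not supported", from_exc=e)
```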
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122136
Approved by: https://github.com/lezcano
This issue popped up when enabling predispatch IR on the benchmarks (https://github.com/pytorch/pytorch/pull/122225)
On the following model:
```
class M(torch.nn.Module):
def __init__(self, device):
super().__init__()
self.device = device
def forward(self, x):
t = torch.tensor(x.size(-1), device=self.device, dtype=torch.float)
t = torch.sqrt(t * 3)
return x * t
```
We get the following error:
```
======================================================================
ERROR: test_constant_abi_compatible_cuda (__main__.AOTInductorTestABICompatibleCuda)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/users/angelayi/pytorch/torch/testing/_internal/common_utils.py", line 2741, in wrapper
method(*args, **kwargs)
File "/data/users/angelayi/pytorch/test/inductor/test_torchinductor.py", line 9232, in new_test
return value(self)
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/angelayi/pytorch/test/inductor/test_aot_inductor.py", line 922, in test_constant
self.check_model(M(self.device), (torch.randn(5, 5, device=self.device),))
File "/data/users/angelayi/pytorch/test/inductor/test_aot_inductor.py", line 91, in check_model
actual = AOTIRunnerUtil.run(
File "/data/users/angelayi/pytorch/test/inductor/test_aot_inductor_utils.py", line 102, in run
so_path = AOTIRunnerUtil.compile(
File "/data/users/angelayi/pytorch/test/inductor/test_aot_inductor_utils.py", line 40, in compile
so_path = torch._inductor.aot_compile_ep(
File "/data/users/angelayi/pytorch/torch/_inductor/__init__.py", line 150, in aot_compile_ep
return compile_fx_aot(
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 1005, in compile_fx_aot
compiled_lib_path = compile_fx(
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 1111, in compile_fx
return compile_fx(
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 1145, in compile_fx
return compile_fx(
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 1336, in compile_fx
return inference_compiler(unlifted_gm, example_inputs_)
File "/data/users/angelayi/pytorch/torch/_dynamo/utils.py", line 265, in time_wrapper
r = func(*args, **kwargs)
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 1266, in fw_compiler_base
return inner_compile(
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/angelayi/pytorch/torch/_dynamo/repro/after_aot.py", line 83, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/data/users/angelayi/pytorch/torch/_inductor/debug.py", line 304, in inner
return fn(*args, **kwargs)
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/home/angelayi/.conda/envs/pytorch10/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/data/users/angelayi/pytorch/torch/_dynamo/utils.py", line 265, in time_wrapper
r = func(*args, **kwargs)
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 447, in compile_fx_inner
compiled_graph = fx_codegen_and_compile(
File "/data/users/angelayi/pytorch/torch/_inductor/compile_fx.py", line 707, in fx_codegen_and_compile
graph.run(*example_inputs)
File "/data/users/angelayi/pytorch/torch/_dynamo/utils.py", line 265, in time_wrapper
r = func(*args, **kwargs)
File "/data/users/angelayi/pytorch/torch/_inductor/graph.py", line 612, in run
return super().run(*args)
File "/data/users/angelayi/pytorch/torch/fx/interpreter.py", line 145, in run
self.env[node] = self.run_node(node)
File "/data/users/angelayi/pytorch/torch/_inductor/graph.py", line 957, in run_node
result = super().run_node(n)
File "/data/users/angelayi/pytorch/torch/fx/interpreter.py", line 202, in run_node
return getattr(self, n.op)(n.target, args, kwargs)
File "/data/users/angelayi/pytorch/torch/_inductor/graph.py", line 819, in call_function
raise LoweringException(e, target, args, kwargs).with_traceback(
File "/data/users/angelayi/pytorch/torch/_inductor/graph.py", line 816, in call_function
out = lowerings[target](*args, **kwargs)
File "/data/users/angelayi/pytorch/torch/_inductor/lowering.py", line 298, in wrapped
out = decomp_fn(*args, **kwargs)
File "/data/users/angelayi/pytorch/torch/_inductor/lowering.py", line 5340, in mul
return make_pointwise(fn)(a, b)
File "/data/users/angelayi/pytorch/torch/_inductor/lowering.py", line 409, in inner
inputs = promote_constants(inputs, override_return_dtype)
File "/data/users/angelayi/pytorch/torch/_inductor/lowering.py", line 373, in promote_constants
ex = next(x for x in inputs if isinstance(x, (TensorBox, ExpandView)))
torch._inductor.exc.LoweringException: StopIteration:
target: aten.mul.Tensor
args[0]: Constant(value=5.0, dtype=torch.float32, device=device(type='cuda', index=0))
args[1]: 3
```
So I added an additional case in `promote_constants` to handle `ir.Constant`s, and now it works! Although please let me know if this is the wrong approach. Here's a paste of the full run with the inductor logs: P1198927007
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122419
Approved by: https://github.com/eellison, https://github.com/desertfire, https://github.com/chenyang78
Internal users reported that they get failures for max-autotune if tensors are not on device 0. It turns out that we may have tensors on, say, device 6 but run the benchmarking kernel on them on device 0.
This PR enforces that we do benchmarking for max-autotune on the correct device.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122479
Approved by: https://github.com/xintwfb, https://github.com/Chillee
Differential Revision: D54993977
### Summary
The initial purpose of ncclCommDevIdxMap is to support NCCL zero copy algorithms. Therefore, it is only enabled (with its values filled) if useTensorRegisterAllocatorHook_ is set to true. However, now we rely on it to support dumping NCCL information in a single PG. So we need it to be always available, regardless of whether we enabled useTensorRegisterAllocatorHook_.
Move the code that fills ncclCommDevIdxMap out of the `if (useTensorRegisterAllocatorHook_)` statement.
### Test Plan
See diff
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122049
Approved by: https://github.com/shuqiangzhang
Summary:
Handle cases where the worker process is terminated without releasing the timer request; this scenario causes the process to be reaped at expiry.
Also remove non-existent processes during clear timer.
Test Plan: unit tests
Differential Revision: D55099773
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122324
Approved by: https://github.com/d4l3k
Fix the following warning while compilation:
```
/home/pytorch/aten/src/ATen/native/cuda/int4mm.cu: In function ‘at::Tensor at::native::_weight_int4pack_mm_cuda(const at::Tensor&, const at::Tensor&, int64_t, const at::Tensor&)’:
/home/pytorch/aten/src/ATen/native/cuda/int4mm.cu:871:6: warning: variable ‘stream’ set but not used [-Wunused-but-set-variable]
871 | auto stream = at::cuda::getCurrentCUDAStream();
| ^~~~~~
/home/pytorch/aten/src/ATen/native/cuda/int4mm.cu: In function ‘at::Tensor at::native::_convert_weight_to_int4pack_cuda(const at::Tensor&, int64_t)’:
/home/pytorch/aten/src/ATen/native/cuda/int4mm.cu:1044:6: warning: variable ‘stream’ set but not used [-Wunused-but-set-variable]
1044 | auto stream = at::cuda::getCurrentCUDAStream();
| ^~~~~~
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122286
Approved by: https://github.com/soulitzer
This PR introduces `torch.nested.nested_tensor_from_jagged(values, offsets=None, lengths=None, jagged_dim=1)` (bikeshedding welcome). This is intended to be the main entrypoint for getting an NJT from the `(values, offsets, lengths)` components. The returned NJT is a view of the `values` component.
Note that `torch.nested.nested_tensor()` / `torch.nested.as_nested_tensor()` already exist for constructing an NJT from a list of tensors.
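A usage sketch based on the signature above (the shapes/reprs shown in comments are indicative):
```python
import torch

values = torch.randn(9, 3)            # packed values for 3 jagged rows
offsets = torch.tensor([0, 2, 5, 9])  # row i spans values[offsets[i]:offsets[i+1]]

nt = torch.nested.nested_tensor_from_jagged(values, offsets)
print(nt.shape)     # e.g. torch.Size([3, j1, 3]) -- ragged dim in the middle
print(nt.values())  # the returned NJT is a view over the original `values` buffer
```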
TODO:
* Some doc formatting; suggestions welcome there
* Tests / examples using `jagged_dim != 1`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121518
Approved by: https://github.com/cpuhrsch
ghstack dependencies: #113279, #113280
Fixes https://github.com/pytorch/pytorch/issues/118596.
The issue was as follows:
(1) Whenever AOTAutograd sees an output that is non-contiguous, that it needs a tangent for, it forces the tangent that it generates to be contiguous during tracing
(2) However: if this tangent is a subclass, we need to generate code to flatten/unflatten the subclass at runtime.
(3) To do so, we use the metadata stashed here: https://github.com/pytorch/pytorch/blob/main/torch/_functorch/_aot_autograd/schemas.py#L231
(4) However, this metadata was **wrong** - it was generated by inspecting the tangent, **before** we made the tangent contiguous.
The fix in this PR basically moves the logic that makes `traced_tangents` contiguous earlier, to the time that we first generate `ViewAndMutationMetadata`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118669
Approved by: https://github.com/zou3519
ghstack dependencies: #118803, #119947
* Adds a configurable GEMM size threshold for the usage of Cutlass GEMM Kernels **_inductor.config.cutlass_backend_min_gemm_size**
* During GEMM algorithm choice generation: **if no viable choices can be generated using the configured backends, the ATen backend will be used as a fallback backend**, even if it is not enabled in **_inductor.config.max_autotune_gemm_backends**
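A hedged configuration sketch using the knob names from the bullets above (the threshold value is purely illustrative):
```python
import torch._inductor.config as inductor_config

inductor_config.max_autotune = True

# Only consider Cutlass kernels for sufficiently large GEMMs (illustrative value).
inductor_config.cutlass_backend_min_gemm_size = 64 * 64 * 64

# If none of these backends yields a viable choice, ATen is used as a fallback
# even though it is not listed here.
inductor_config.max_autotune_gemm_backends = "CUTLASS,Triton"
```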
Test plan:
CI
Additional unit test in test_cutlass_backend.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121491
Approved by: https://github.com/jansel
ghstack dependencies: #121490
Fixes https://github.com/pytorch/pytorch/issues/121085
This PR is pretty involved, so pay attention to this description. At a high
level, the refactor is intended to be mechanical: anywhere in
MetaConverter where previously we took a Tensor as argument, we now take
a MetaTensorDesc, which contains all of the information that we would
have queried off of the Tensor, but placed into a separate data
structure which we can serialize or use to recreate a fake tensor in
a separate fake tensor mode in exact fidelity to the original.
However, this transformation is not always entirely mechanical. Here
is what you need to pay attention to:
- The memo table from real Tensor -> meta/fake Tensor is now broken
into two memo tables: real Tensor -> stable int id -> meta/fake
Tensor. The stable int id is needed so that when we do serialization,
we know when tensors/storages alias each other and can ensure we preserve
this aliasing upon deserialization.
The way I have implemented this changes the weak reference behavior.
Previously, when either the real Tensor OR the meta/fake Tensor went
dead, we would remove the entry from the memo table. Now, this only
removes entries from one of the two memo tables. This semantically
makes sense, because the user may have held on to the stable int id
out of band, and may expect a real Tensor to continue to be numbered
consistently / expect to be able to lookup a meta/fake tensor from
this id. If this is unacceptable, it may be possible to rejigger
the memo tables so that we have real Tensor -> stable int id
and real Tensor -> meta/fake Tensor, but TBH I find the new
implementation a lot simpler, and arranging the memo tables in this
way means that I have to muck around with the real tensor to save
to the memo table; in the current implementation, I never pass the
Tensor to meta_tensor function AT ALL, which means it is impossible
to accidentally depend on it.
- When I fill in the fields of MetaTensorDesc in describe_tensor, I need
to be careful not to poke fields when they are not valid. Previously,
preconditions were implicitly checked via the conditional structure
("is this sparse? is this nested?") that is tested before we start
reading attributes. This structure has to be replicated in
describe_tensor, and I have almost assuredly gotten it wrong on my
first try (I'll be grinding through it on CI; a careful audit will
help too, by auditing that I've tested all the same conditionals that
the original access was guarded by.)
- I originally submitted https://github.com/pytorch/pytorch/pull/121821
for the symbolic shapes change, but it turned out the way I did it
there didn't actually work so well for this PR. I ended up just
inlining the symbolic shapes allocation logic into MetaConverter
(look for calls to maybe_specialize_sym_int_with_hint), maybe there
is a better way to structure it, but what I really want is to
just read sizes/strides/offset directly off of MetaTensorDesc; I
don't want another intermediate data structure.
- Some fields aren't serializable. These are documented as "NOT
serializable". ctx/type should morally be serializable and I just
need to setup a contract with subclasses to let them be serialized.
The fake_mode is used solely to test if we are refakefying with
a pre-existing ShapeEnv and we want to reuse the SymInt
directly--serializing this case is hopeless but I am kind of hoping
after this refactor we do not need this at all. view_func is not
serializable because it's a bound C implemented method. Joel has
promised me that this is not too difficult to actually expose as a
true data structure, but this is the edgiest of edge cases and there
is no reason to deal with it right now.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122044
Approved by: https://github.com/eellison
ghstack dependencies: #122018
As a follow-up to #114835 and #119682, we add a limited set of ATen operator implementations for XPU. With this PR, the blocking issue for oneDNN operations and the Inductor XPU backend will be resolved, as the two components depend on these operations to support their basic features.
The added ATen operators include:
- `copy_`, `_to_copy`, `_copy_from_and_resize`, `clone`
- `view`, `view_as_real`, `view_as_complex`,
- `as_strided`, `_reshape_alias`, `resize_`, `resize_as_`,
- `add`/`add_`, `sub`/`sub_`, `mul`/`mul_`, `div`/`div_`, `abs`,
- `empty`, `empty_strided`,
- `fill_`, `zeros_`.
Co-authored-by: Wang, Eikan <eikan.wang@intel.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120891
Approved by: https://github.com/EikanWang, https://github.com/jgong5, https://github.com/gujinghui, https://github.com/atalman
Summary: `torch.while_loop` HOP support is added to JIT Inductor. The test coverage is limited due to the functionality constraints of the upstream `torch.while_loop` op in Dynamo / Export. When those are lifted, we'll add more tests (see TODO-s in the test file).
AOT Inductor support will be added in a follow-up PR.
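A hedged usage sketch of the HOP being lowered; the import path and the `(cond_fn, body_fn, carried_inputs)` calling convention are assumptions based on the current higher-order-op API:
```python
import torch
from torch._higher_order_ops.while_loop import while_loop

def cond_fn(i, x):
    return i < 3                      # 0-dim bool tensor predicate

def body_fn(i, x):
    return i + 1, x * 2               # returns the new carried values

@torch.compile(backend="inductor")
def f(x):
    i0 = torch.zeros((), dtype=torch.int64)
    return while_loop(cond_fn, body_fn, (i0, x))

print(f(torch.ones(4)))               # iterates 3 times -> x * 8
```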
Test Plan:
```
$ python test/inductor/test_control_flow.py
...
----------------------------------------------------------------------
Ran 38 tests in 159.387s
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122069
Approved by: https://github.com/jansel, https://github.com/eellison
This PR:
* Introduces an ATen op for creating true jagged views from a dense values buffer
* `_nested_view_from_jagged(values, offsets, lengths, ragged_idx, dummy)`
* This op is implemented on the Python side using torch.library so we can return a subclass instance
* `jagged_from_list()` now uses this instead of the old autograd.Function `NestedViewFromBuffer`
* The latter op is used for non-contiguous JTs returned via `torch.nested.narrow()`
* `dummy` is an awful hack to ensure that `NestedTensor.__torch_dispatch__()` is invoked for our view
* Introduces an ATen op for accessing the `values` component of an NT via a view
* `_nested_get_values(nt)`
* **Removes** the autograd.Functions `ViewNestedFromBuffer` and `ViewBufferFromNested` in favor of `nested_from_values_offsets()` / `nested_from_values_offsets_lengths()` and `nt.values()`, respectively.
* Changes test code to prefer `as_nested_tensor()` over `jagged_from_list()` directly
* Similarly, avoid `buffer_from_jagged()`, preferring `values()`
* Depends on general subclass view fake-ification on the PT2 side (handled solely in previous PRs in the stack)
With these changes, the semantics of jagged layout NTs are such that they are considered a true view of the underlying `values` buffer. This means views of jagged NTs are views of the underlying buffer as well, simplifying some handling.
Differential Revision: [D54269922](https://our.internmc.facebook.com/intern/diff/D54269922)
Co-authored-by: voznesenskym <voznesenskym@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/113279
Approved by: https://github.com/ezyang
I have some minor fixes in the scripts to
1. Fix the bug where the empty test matrix was confusingly printed as unstable https://github.com/pytorch/pytorch/pull/121381#issuecomment-2004558588
1. Replace `print` with `logging.info`
1. Remove the hardcoded `VALID_TEST_CONFIG_LABELS` list. It's out of date and not many people use this feature besides `test-config/default`, so why bother. The behavior here is simpler now:
1. If the PR has some `test-config/*` labels, they will be applied
1. If the PR has none of them, all test configs are applied
1. Add log for the previous 2 cases to avoid confusion
### Testing
```
python filter_test_configs.py --workflow "Mac MPS" --job-name "macos-12-py3-arm64 / build" --event-name "push" --schedule "" --branch "" --tag "ciflow/mps/121381" \
--pr-number 121065 \
--test-matrix "{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" },
]}
```
Also running on this PR
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122155
Approved by: https://github.com/clee2000
In order for a fake model to run through the ONNXProgram.__call__
interface, we need to save the model to disk along with its external data
before executing the model. This is what this PR implements.
An alternative would be for ONNXProgram.__call__ to detect that the model
was exported with fake mode and explicitly raise an exception when
ONNXProgram.__call__ is executed. The exception message would instruct
the user to call ONNXProgram.save and manually execute the model using
the ONNX runtime of choice.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122230
Approved by: https://github.com/BowenBao
ghstack dependencies: #122196
In the next PR I have the IR `ops.neg(ops.constant(0.0, torch.float32))`
which should be folded to `ops.constant(-0.0, torch.float32)` but it seems that
`sympy.Float(-0.0)` doesn't respect the sign of the zero and so we instead
get a positive zero constant.
Here, I work around this by doing the constant folding with python arithmetic
which does respect signed zeros.
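A small demonstration of the discrepancy; per the description above, `sympy` drops the sign of the zero while plain Python floats keep it:
```python
import math
import sympy

neg_zero = -0.0
print(math.copysign(1.0, neg_zero))                   # -1.0: Python keeps the sign
print(math.copysign(1.0, float(sympy.Float(-0.0))))   # 1.0: sympy loses it
print(-1.0 * 0.0)                                     # -0.0 via plain Python arithmetic
```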
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122031
Approved by: https://github.com/lezcano
The comment suggests that we need to replace all FakeTensors with real
tensors. `torch.empty` doesn't actually return a real Tensor because
FakeTensorMode is active!
We disable torch dispatch so that torch.empty actually returns a real Tensor.
The motivation for this PR is that we're trying to ban
FakeTensor.data_ptr (or at least warn on it) in torch.compile. See the
next PR up in the stack
Test Plan:
- Existing tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122418
Approved by: https://github.com/oulgen
Fixes breaking changes for ONNX Runtime Training.
PR https://github.com/pytorch/pytorch/pull/121102 introduced an incompatibility with ORT training because of a change in parameter type. This PR adds back the previous parameter types, and it has been verified to work with ORT training.
Error with current scenario:
```
site-packages/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/aten_op_executor/aten_op_executor.cc:60:40: error: invalid conversion from ‘const DLManagedTensor*’ to ‘DLManagedTensor*’ [-fpermissive]
at::Tensor tensor = at::fromDLPack(dlpack);
site-packages/torch/include/ATen/DLConvertor.h:15:46: note: initializing argument 1 of ‘at::Tensor at::fromDLPack(DLManagedTensor*)’
TORCH_API Tensor fromDLPack(DLManagedTensor* src);
```
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122000
Approved by: https://github.com/malfet
Before this PR we were not precompiling triton templates in parallel. Compilation would occur during benchmarking.
Triton benchmarking templates were emitted as:
```
@triton.jit
def triton_mm(arg_A, arg_B, out_ptr0):
```
In order to precompile we need to give the full kernel specification, as we do when we emit the template in the final output code generation.
```
@triton_heuristics.template(
num_stages=3,
num_warps=8,
triton_meta={'signature': {0: '*fp32', 1: '*fp32', 2: '*fp32'}, 'device': 0, 'device_type': 'cuda', 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2), equal_to_1=(), ids_of_folded_args=(), divisible_by_8=())]},
inductor_meta={'kernel_name': 'Placeholder.DESCRIPTIVE_NAME', 'backend_hash': 'cdeecfeccd31ad7810f96b5752194b1c2406d0a81e39a6ca09c8ee150baae183'},
)
@triton.jit
def triton_mm(arg_A, arg_B, out_ptr0):
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121998
Approved by: https://github.com/jansel
ghstack dependencies: #121996, #120275, #121997
This PR allows users to specify int values for dimensions in dynamic_shapes as well as None, for example:
```
import torch
from torch.export import export

class Foo(torch.nn.Module):
    def forward(self, x, y, z):
        ...

foo = Foo()
inputs = (torch.randn(4, 6), torch.randn(5, 4), torch.randn(3, 3))
for dynamic_shapes in [
    None,
    ((4, 6), (5, 4), (3, 3)),
    ((None, 6), None, {0: 3, 1: 3}),
]:
    _ = export(foo, inputs, dynamic_shapes=dynamic_shapes)
```
All of the above should produce the same ExportedProgram.
This is done by temporarily creating a static dim constraint during analysis, where vr.lower == vr.upper. These constraints are then deleted during _process_constraints(), and do not show up in the final ExportedProgram's range_constraints.
Additionally, export() will also fail if the shapes are mis-specified, for example:
```
_ = export(foo, inputs, dynamic_shapes=((5, None), None, None))
```
leads to `torch._dynamo.exc.UserError: Static shape constraint of 5 does not match input size of 4, for L['x'].size()[0]`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121860
Approved by: https://github.com/avikchaudhuri
Currently, the in-memory ONNX program model proto does
not contain the initializers that are saved into the on-disk version.
This PR changes this behavior so that both versions are
identical. This is important for running models with fake
tensors from ONNXProgram.model_proto directly, without a file.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122196
Approved by: https://github.com/BowenBao
Summary: Previously, we only supported a torch.Tensor boolean scalar predicate for `torch.cond` in Inductor. This PR adds support for SymBool and Python bool predicates, to match the `torch.cond` [semantics](https://pytorch.org/docs/stable/generated/torch.cond.html) in Dynamo / Export.
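A hedged sketch of the newly supported predicate kinds flowing into `torch.cond`; the dynamic-shape setup below is illustrative, not taken from the test suite:
```python
import torch

def true_fn(x):
    return x.sin()

def false_fn(x):
    return x.cos()

@torch.compile(backend="inductor", dynamic=True)
def f(x):
    # Under dynamic shapes, x.shape[0] > 4 is a SymBool rather than a tensor;
    # a plain Python bool predicate is also accepted now.
    return torch.cond(x.shape[0] > 4, true_fn, false_fn, (x,))

print(f(torch.randn(8)))
```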
Test Plan:
```
$ python test/inductor/test_control_flow.py
...
----------------------------------------------------------------------
Ran 34 tests in 56.980s
OK
$ python test/inductor/test_aot_inductor.py -k test_cond
...
----------------------------------------------------------------------
Ran 54 tests in 460.093s
OK (skipped=4)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122378
Approved by: https://github.com/jansel, https://github.com/chenyang78
Summary: It would be useful to log the destination of the trace dump, whether in Manifold or a local file, so users can quickly locate the dump.
Test Plan: Modified unit tests
Differential Revision: D54972069
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122345
Approved by: https://github.com/wconstab
Summary:
Computing duration incurs additional CUDA overhead and can possibly
increase GPU memory usage or even hang, so we want to disable it by default and enable it only
when needed, or at least only when timing is enabled.
Test Plan:
Test with existing unit test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122138
Approved by: https://github.com/wconstab
Summary:
We're getting errors that Python.h is not found because we didn't have
the proper include path set up for it.
bypass-github-export-checks
Test Plan: I can only get this to show up in Bento: N5106134
Reviewed By: hl475, chenyang78
Differential Revision: D55133110
Co-authored-by: Bert Maher <bertrand@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122363
Approved by: https://github.com/bertmaher
This PR introduces `torch.nested.nested_tensor_from_jagged(values, offsets=None, lengths=None, jagged_dim=1)` (bikeshedding welcome). This is intended to be the main entrypoint for getting an NJT from the `(values, offsets, lengths)` components. The returned NJT is a view of the `values` component.
Note that `torch.nested.nested_tensor()` / `torch.nested.as_nested_tensor()` already exist for constructing an NJT from a list of tensors.
TODO:
* Some doc formatting; suggestions welcome there
* Tests / examples using `jagged_dim != 1`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121518
Approved by: https://github.com/cpuhrsch
ghstack dependencies: #113280
Summary: Special kwargs like `num_warps`, `num_stages`, and `num_ctas` can be passed to the Triton kernel call as kwargs. These kwargs are handled in a special way, not being passed to the underlying kernel function directly. In this PR, we move those special kwargs from `kwargs` of the `TritonKernelVariable` in dynamo to `Autotuner`'s `Config` instances (either already existing or newly created for this purpose). As a result, the special kwargs can be codegened correctly as a part of `Config`, not as direct arguments to the kernel `.run`.
Test Plan:
```
python test/inductor/test_triton_kernels.py -k test_triton_kernel_special_kwargs
...
----------------------------------------------------------------------
Ran 6 tests in 6.783s
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122280
Approved by: https://github.com/oulgen
See #113541
The PR allows for registering and controlling multiple RNG states using indices, ensuring cudagraph-safe operations, and includes both C++ and Python API changes to support this functionality.
cc @eellison @anijain2305 @jansel @ezyang @ptrblck @csarofeen @mcarilli
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114068
Approved by: https://github.com/ezyang
Summary: We provide an `is_in_torch_dispatch_mode` API returning a `bool` to determine whether the program is running in torch dispatch mode or not.
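A usage sketch; the import location (`torch.utils._python_dispatch`) is an assumption:
```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode, is_in_torch_dispatch_mode

class PassthroughMode(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        return func(*args, **(kwargs or {}))

print(is_in_torch_dispatch_mode())      # False outside any mode
with PassthroughMode():
    print(is_in_torch_dispatch_mode())  # True while the mode is active
```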
Test Plan:
- OSS CI
- Tested with publish of hstu models with this diff and the following diffs D54964288, D54964702, D54969677, D55025489; runtime errors are no longer raised in publish
Differential Revision: D55091453
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122339
Approved by: https://github.com/jiayisuse
`torchxla_trace_once` and `aot_torchxla_trivial` should be removed.
In our internal (hopefully the dashboard can be open sourced soon) torchbench daily runs, the `openxla` backend has a much higher passing rate and similar performance to the `openxla_eval` (non-aot-autograd) backend. We still use `openxla_eval` in the llama2 example, but I think we should move users to the `openxla` backend going forward.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122128
Approved by: https://github.com/alanwaketan, https://github.com/jansel
Summary:
When building our iOS app, we get a compile error about the deprecated `volatile` keyword.
This diff attempts to fix it by replacing the usage of the deprecated `volatile` keyword with `atomic` as suggested by malfet
Test Plan: Successfully built the iOS app that previously had a compile error
Differential Revision: D55090518
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122231
Approved by: https://github.com/malfet
Enable VEC on Windows OS.
1. Fix some type definition gaps between Windows and Linux.
2. Fix some operators not supported on Windows, such as `[]` and `/`.
3. Enable static sleef library build on Windows.
4. Disable unsupported function overloading on MSVC.
5. Upgrade submodule sleef lib, which fixed build issue on Windows.
6. Fixed bazel build issues.
7. Fix the test app not linking to sleef on Windows.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
`get_device_states` doesn't recursively look into nested lists/dicts to find tensors. As a result, activation checkpointing for such inputs results in silent incorrect results as `get_device_states` returns an empty result and no rng is saved as a result here: https://github.com/pytorch/pytorch/blob/main/torch/utils/checkpoint.py#L188 since `fwd_device_states` is empty.
Fixed this by using `tree_map` for both `get_device_states` and `_infer_device_type`. Also added appropriate unit tests.
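A hypothetical repro sketch of the nested-input case (the dict keys and function are made up for illustration); before the fix, no device/RNG state was captured for inputs like this:
```python
import torch
from torch.utils.checkpoint import checkpoint

def fn(inputs):
    # The checkpointed function receives a dict of tensors, so device/RNG state
    # detection has to look inside the container to find them.
    return inputs["a"] * inputs["b"].sum()

x = {
    "a": torch.randn(4, requires_grad=True),
    "b": torch.randn(4, requires_grad=True),
}
out = checkpoint(fn, x, use_reentrant=False)
out.sum().backward()
```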
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121462
Approved by: https://github.com/soulitzer
Summary: Instead of logging under the generic "batch_fusion" and "group_fusion" names, we log the specific pass name, which better summarizes how often each pattern is hit and makes debugging easier.
Test Plan:
```
buck2 test mode/dev-nosan //caffe2/test/inductor:group_batch_fusion
```
Differential Revision: D55103303
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122245
Approved by: https://github.com/jackiexu1992
# PR
Vectors allocated inside `get_chunk_cat_metadata()` go out of scope before they are used in `_chunk_cat_out_cuda_contiguous()`. This PR fixes the issue by returning the vectors from `get_chunk_cat_metadata`.
This PR also adds a few unit tests to cover more edge cases.
# Tests
This PR is tested with the following commands and no errors show, so the flaky test failure should be resolved.
- `PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer python test/test_ops.py -v -k test_out__chunk_cat_cuda_float32`
- `PYTORCH_NO_CUDA_MEMORY_CACHING=1 python test/test_ops.py -v -k test_out__chunk_cat_cuda_float32 --repeat 1500`
Fixes #122026 Fixes #121950
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122076
Approved by: https://github.com/yifuwang
Summary: Adds a pass that blindly removes the functionalize HOP without considering whether it is safe. Useful for ExecuTorch today and other use cases that have additional logic to reason about when this pass is safe to use.
Test Plan: added unit test
Differential Revision: D55103867
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122246
Approved by: https://github.com/angelayi
Summary:
We can also log the module hierarchy in the following format:
```
:ToplevelModule
sparse:SparseArch
dense:DenseArch
```
So that we can have more information recorded about the model's identity.
Test Plan: CI
Differential Revision: D54921097
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121970
Approved by: https://github.com/angelayi
Precompile benchmarking choices in parallel, and then wait on those choices prior to benchmarking. In the case of deferred templates, we only wait on those choices in the scheduler, to allow multiple separate lowerings to compile in parallel.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121997
Approved by: https://github.com/jansel
ghstack dependencies: #121996, #120275
Summary: In `torch.inference_mode()`, fake tensors don't have `_version`s. This breaks unbacked SymInt memoization in `torch.nonzero` tracing. Here we disable the latter in inference mode.
Fixes https://github.com/pytorch/pytorch/issues/122127
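A minimal sketch of the affected pattern, assuming a repro along these lines (the new unit test presumably exercises something similar):
```python
import torch

def fn(x):
    # torch.nonzero produces a data-dependent (unbacked SymInt) output shape.
    return torch.nonzero(x)

with torch.inference_mode():
    # Under inference_mode, fake tensors have no _version to memoize on.
    out = torch.compile(fn)(torch.tensor([0.0, 1.0, 0.0, 2.0]))
print(out)
```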
Test Plan:
```
$ python test/inductor/test_unbacked_symints.py -k test_nonzero_in_inference_mode
...
----------------------------------------------------------------------
Ran 2 tests in 14.060s
OK
```
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122147
Approved by: https://github.com/ezyang
Previously, all jvp tests under dynamo/test_dynamic_shapes would fail because symbolic execution wasn't supported in some autograd functions.
List of changes:
- Update `_has_same_storage_numel` to use `sym_nbytes`
- Symintify `_efficientzerotensor_meta`
- Introduce `empty_generic_symint` with the first argument `size` as symbolic integer
- Update gen_variable_type.py script to call the symint version of zeros_fn function (zeros_symint / _efficientzerotensor_symint)
- Update `has_same_meta` to call `sym_*` functions
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120338
Approved by: https://github.com/soulitzer
ghstack dependencies: #119926
List of changes:
- Replace JVP_NESTING by torch._C._functorch.maybe_current_level()
- Remove all increment nesting functions from wrap_fx_proxy_cls
- fwAD.make_dual receives the dual_level as keyword argument
- Add jvp_increment_nesting, set_fwd_grad_enabled and dual_level context managers to dynamo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119926
Approved by: https://github.com/zou3519
Summary: We have rewritten `conv1d` as `create_conv1d_context` and `run_conv1d_context` to enable prepacking of `weight` and `bias`. We had registered `create_conv1d_context` but not `run_conv1d_context`; we add the missing registration in this diff.
Test Plan:
```
[luwei@devbig439.ftw3 /data/users/luwei/fbsource (f89a7de33)]$ LD_LIBRARY_PATH=third-party/swiftshader/lib/linux-x64/ buck run fbcode/mode/dev-nosan //xplat/caffe2:pt_vulkan_api_test_bin -- --gtest_filter="*conv1d*"
Using additional configuration options from /home/luwei/.buckconfig.d/experiments_from_buck_start
Recommended: For faster builds try buck2: replace 'buck' with 'buck2'
NOTE: buck-out/ has changed: look for files in fbsource/buck-out/v2/
'buck2 build --show-output //xplat/caffe2:pt_vulkan_api_test_bin' will print the new output paths.
If you are building in fbsource//xplat and have questions, post in 'Cross Platform Dev Discussions': https://fb.workplace.com/groups/xplat.qa
Targets matching .buckconfig buck2.supported_projects:
{'//xplat/caffe2:pt_vulkan_api_test_bin': '//xplat'}
To suppress this warning: touch ~/.config/.dont_hint_buck2
Building: finished in 0.1 sec (100%) 394/394 jobs, 0/394 updated
Total time: 0.2 sec
BUILD SUCCEEDED
Running main() from third-party/googletest/1.14.0/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = *conv1d*
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from VulkanAPITest
[ RUN ] VulkanAPITest.conv1d_simple
[ OK ] VulkanAPITest.conv1d_simple (208 ms)
[ RUN ] VulkanAPITest.conv1d
[ OK ] VulkanAPITest.conv1d (81 ms)
[----------] 2 tests from VulkanAPITest (289 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (289 ms total)
[ PASSED ] 2 tests.
```
full test result
```
...
[----------] 427 tests from VulkanAPITest (22583 ms total)
[----------] Global test environment tear-down
[==========] 427 tests from 1 test suite ran. (22583 ms total)
[ PASSED ] 426 tests.
[ SKIPPED ] 1 test, listed below:
[ SKIPPED ] VulkanAPITest.querypool_flushed_shader_log
YOU HAVE 11 DISABLED TESTS
```
Differential Revision: D55052816
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122172
Approved by: https://github.com/nathanaelsee
Check that the `classname` attribute actually exists.
#122017
I expect this code path to be hit very rarely.
At a certain point, we should just remove this parsing altogether since everything uses pytest now...
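A hypothetical sketch of the guarded lookup (the XML snippet and names are illustrative only):
```python
import xml.etree.ElementTree as ET

# Some report formats omit the "classname" attribute on <testcase>,
# so fall back to an empty string instead of raising.
xml = '<testsuite><testcase name="t0" time="0.1"/></testsuite>'
for case in ET.fromstring(xml).iter("testcase"):
    classname = case.attrib.get("classname", "")
    print(case.attrib["name"], classname or "<no classname>")
```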
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122103
Approved by: https://github.com/huydhn
1) add the operand and get_dim_names APIs;
2) set will_resize to true when the output tensor is undefined;
3) add abs_stub for the dummy device and compute on the CPU device;
4) support dummy-device copy with strides;
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120792
Approved by: https://github.com/ezyang
Our prior approach to epilogue fusion was to select a choice from a set of triton templates and extern calls based on benchmarking inputs, then unconditionally fuse epilogues. This can be sub-optimal in the following ways:
- We select an extern kernel; however, an epilogue like relu() exists such that choosing a triton template + relu would have been faster.
- We select a triton template and fuse the epilogue, but register spilling occurs, making it slower than not fusing.
In this PR we wait to select either the Triton template or extern kernel based on benchmarking results from the kernel itself and its epilogue. As soon as a successful fusion occurs where a fused Triton template + epilogue is faster than the unfused choice, we finalize the MultiTemplateBuffer as that specific template. If no fusion occurs, we finalize the MultiTemplateBuffer after the fusion pass.
Note: if there are multiple epilogue fusions (not super likely), even though we select a template after the first fusion, we will still benchmark to see whether subsequent epilogues are worth fusing. We could potentially defer choosing a template in this case in a follow-up, at the expense of compile time.
Gives a 4% HF training win and a 10% TIMM inference win. It increases compilation time, which I will try to address further in follow-up PRs.
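A minimal sketch of the kind of pattern this targets, assuming `mode="max-autotune"` is what enables the template benchmarking: a matmul followed by a pointwise epilogue that is a fusion candidate.
```python
import torch

def mm_relu(a, b):
    # matmul with a relu() epilogue: either an extern mm + eager relu,
    # or a Triton template with the relu fused in, whichever benchmarks faster.
    return torch.relu(a @ b)

compiled = torch.compile(mm_relu, mode="max-autotune")
if torch.cuda.is_available():
    a = torch.randn(256, 256, device="cuda")
    b = torch.randn(256, 256, device="cuda")
    print(compiled(a, b).shape)
```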
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120275
Approved by: https://github.com/jansel
ghstack dependencies: #121996
Summary:
`-Wunused-exception-parameter` has identified an unused exception parameter. This diff removes it.
This:
```
try {
...
} catch (exception& e) {
// no use of e
}
```
should instead be written as
```
} catch (exception&) {
```
If the code compiles, this is safe to land.
Test Plan: Sandcastle
Reviewed By: kimishpatel, palmje
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116875
Approved by: https://github.com/Skylion007
Summary:
`-Wunused-exception-parameter` has identified an unused exception parameter. This diff removes it.
This:
```
try {
...
} catch (exception& e) {
// no use of e
}
```
should instead be written as
```
} catch (exception&) {
```
If the code compiles, this is safe to land.
Test Plan: Sandcastle
Reviewed By: palmje
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116876
Approved by: https://github.com/Skylion007
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: palmje
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120177
Approved by: https://github.com/Skylion007
Documentation states that the margin parameter of torch.nn.TripletMarginLoss must be greater than 0; however, any value was being accepted. Also fixed torch.nn.TripletMarginWithDistanceLoss, which had the same problem. Added an error test input for the new ValueError.
Fixes#83241
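A minimal sketch of the new validation, assuming a `ValueError` is raised for non-positive margins:
```python
import torch

torch.nn.TripletMarginLoss(margin=1.0)        # ok: margin > 0
try:
    torch.nn.TripletMarginLoss(margin=-0.5)   # now rejected instead of silently accepted
except ValueError as e:
    print("rejected:", e)
```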
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121978
Approved by: https://github.com/mikaylagawarecki
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: malfet, dmm-fb
Differential Revision: D52981072
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118116
Approved by: https://github.com/Skylion007
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: palmje
Differential Revision: D53779579
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120176
Approved by: https://github.com/Skylion007
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: palmje
Differential Revision: D53779549
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120178
Approved by: https://github.com/Skylion007
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: palmje
Differential Revision: D54931224
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121995
Approved by: https://github.com/Skylion007
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Differential Revision: D54378401
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122151
Approved by: https://github.com/Skylion007
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122161
Approved by: https://github.com/Skylion007
Summary:
# Diff Specific
The signature of `copyFrom` is
```
void Tensor::CopyFrom(const Tensor& src, bool async) {
```
so the `&context` always evaluates to true.
I could dig around to see if anyone cares about what the flag should actually be, but this is old code in caffe2, so I've just used `true` and we'll keep using whatever behaviour we've been using since 2019 or so when this was written.
# General
A bug in this code was identified by `-Waddress`, which we are working to enable globally.
This diff fixes the bug. There are a few types of fixes it might employ:
The bug could be `const_char_array == "hello"` which compares two addresses and therefore is almost always false. This is fixed with `const_char_array == std::string_view("hello")` because `string_view` has an `==` operator that makes an appropriate comparison.
The bug could be `if(name_of_func)` which always returns true because the function always has an address. Likely you meant to call the function here!
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121856
Approved by: https://github.com/Skylion007
torchrec_dlrm training fails the accuracy check when max-autotune is enabled.
I found there is no real issue in PT2. We fail to get fp64 reference results for the accuracy check. In max-autotune mode, numerics may change a bit and cause the cosine-similarity check to fail. Using an fp64 baseline is more reliable and makes the test pass.
The reason we were not using an fp64 baseline earlier is that torchrec uses a dataclass [Batch](99e6e669b5/torchrec/datasets/utils.py (L28)) to represent the input. We use pytree to cast the model and inputs to fp64, but pytree cannot look into a dataclass. My fix is to convert the dataclass to a namedtuple to be more pytree-friendly.
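A small sketch of the pytree behavior motivating the fix; `BatchDC` and `BatchNT` are hypothetical stand-ins for torchrec's `Batch`:
```python
import collections
import dataclasses
import torch
from torch.utils._pytree import tree_map

@dataclasses.dataclass
class BatchDC:  # dataclass: pytree treats the whole instance as a leaf
    x: torch.Tensor

BatchNT = collections.namedtuple("BatchNT", ["x"])  # namedtuple: pytree traverses it

def to_fp64(t):
    return t.double() if isinstance(t, torch.Tensor) else t

print(tree_map(to_fp64, BatchDC(torch.ones(2))).x.dtype)  # torch.float32, unchanged
print(tree_map(to_fp64, BatchNT(torch.ones(2))).x.dtype)  # torch.float64, cast as intended
```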
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122012
Approved by: https://github.com/jansel, https://github.com/eellison
Summary:
also added some utils in xnnpack_quantizer_utils.py
* annotate_conv_transpose_bn_relu and annotate_conv_transpose_bn -> these are for QAT
* annotate_conv_transpose_relu
conv_transpose + bn weight fusion is performed automatically and cannot be disabled currently;
we can add support to allow disabling this fusion later if needed
Test Plan:
python test/test_quantization.py -k test_conv_transpose_bn_fusion
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122046
Approved by: https://github.com/andrewor14
If TORCH_DISABLE_ADDR2LINE is set, the symbolizer will instead give the
filename of the shared library as the filename, the offset in that library as the line number,
and use dladdr to get the function name if possible. This is much faster than using addr2line,
and the symbols can be later resolved offline using addr2line if desired.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121359
Approved by: https://github.com/aaronenyeshi
@ezyang mentioned that we should not put constant args on the graph, especially when there are args that would be trickier to put on the graph; e.g., the next PR needs `triton.language.dtype` as an argument on the graph.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122140
Approved by: https://github.com/jansel
Creating this after [PR](https://github.com/pytorch/pytorch/pull/121642) got reverted.
The current dynamic shapes implementation fixes the lower bound of Dims to 2 for analysis, but allows 0/1 shapes during runtime. This leads to failures when initializing Dim(1, 2). This PR sets the lower bound to 0, and avoids erroring out when it conflicts with the generated (2, maxsize) constraint during analysis.
Also resolves a derived dim constraints issue with the following code:
```
class Bar(torch.nn.Module):
def forward(self, x, y):
return x + y[1:]
dx = Dim("dx", min=1, max=3)
ep = export(
Bar(),
(torch.randn(2, 2), torch.randn(3, 2)),
dynamic_shapes=({0: dx, 1: None}, {0: dx+1, 1: None})
)
print(ep.range_constraints)
```
In main:
```
{s0: ValueRanges(lower=2, upper=3, is_bool=False), s0 + 1: ValueRanges(lower=3, upper=4, is_bool=False)}
```
This PR:
```
{s0: ValueRanges(lower=1, upper=3, is_bool=False), s0 + 1: ValueRanges(lower=2, upper=4, is_bool=False)}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121910
Approved by: https://github.com/avikchaudhuri, https://github.com/zhxchen17
Summary:
LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance.
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code.
- If you approve of this diff, please use the "Accept & Ship" button :-)
Test Plan: Sandcastle
Reviewed By: dmm-fb
Differential Revision: D54380402
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122165
Approved by: https://github.com/Skylion007
Summary:
- we want the fx nodes' stack trace format to be backward compatible and the same as before in the program we export
- however, in the serialized format we want a more compact stack_trace format, otherwise the node attributes are dominated by stack traces
- this diff implements the minimal change in the serialization process to dedupe node stack traces by introducing a fileinfo_list and a filename_to_abbrev map, so we can use an index to represent filenames and a lineno to represent lines.
Test Plan:
# llm
base on D54497918
```
buck2 run @//mode/dev-nosan fbcode//executorch/examples/models/llama2:export_llama -- -c ~/stories110M.pt -p ~/params.json
```
set up breakpoint after serialization/deserialization
- serialize
```
(Pdb) v_meta = [n.meta for n in exported_program.graph_module.graph.nodes]
(Pdb) paste_client.create_phabricator_paste_object(paste_creation_client_id=1093956601162697, content=str(v_meta)).number
1193647450
(Pdb) json_program = json.dumps(_dataclass_to_dict(serialized_graph.co_fileinfo_ordered_list),cls=EnumEncoder)
(Pdb) json_bytes = json_program.encode('utf-8')
(Pdb) paste_client.create_phabricator_paste_object(paste_creation_client_id=1093956601162697, content=str(json_bytes)).number
1193604333
(Pdb) sys.getsizeof(json_bytes)
3846
(Pdb) compressed_bytes = zstd.ZstdCompressor().compress(json_bytes)
(Pdb) sys.getsizeof(compressed_bytes)
1139
```
in P1193647450 (before serialization), search for `stack_trace`
in P1193604333 (after serialization), search for `stack_trace` and `co_fileinfo_ordered_list`
[note: didn't do compression in this diff since the size is pretty small and it adds complexity if we do compression]
- deserialize
```
(Pdb) v_meta = [n.meta for n in deserialized_exported_program.graph_module.graph.nodes]
(Pdb) paste_client.create_phabricator_paste_object(paste_creation_client_id=1093956601162697, content=str(v_meta)).number
1193629435
```
in P1193629435, search for `stack_trace`
# ads
Differential Revision: D54654443
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121675
Approved by: https://github.com/angelayi
Fixes#114844
In the linked issue we have
```
compiled_module = torch.compile(module)
compiled_module.x = ...
compiled_module(...) # Mutates self.x
```
Where since the module mutates `self.x` you would expect `compiled_module.x`
to be updated but actually `compiled_module.x = ...` sets an attribute "x"
on the `OptimizedModule` object while the forward method of the module mutates
`module.x`.
This gives the expected behavior by forwarding `compiled_module.__setattr__`
down to `module.__setattr__`. There is already a corresponding `__getattr__`
so now `compiled_module.x` becomes an alias for `module.x`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122098
Approved by: https://github.com/ezyang, https://github.com/lezcano
Previously, typing an autograd.Function like the following would lead to
a mypy error (which expects the first arg to forward to be named `ctx`).
This PR fixes that by deleting the ctx arg.
```py
class MySin(torch.autograd.Function):
@staticmethod
def forward(x: torch.Tensor) -> torch.Tensor:
return x.sin()
@staticmethod
def setup_context(*args, **kwargs):
pass
@staticmethod
def backward(ctx, grad):
if grad.stride(0) > 1:
return grad.sin()
return grad.cos()
```
Test Plan:
- tested locally (I don't know how to put up a test in CI for this).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122167
Approved by: https://github.com/soulitzer
Summary:
## Context
Some small fixes to the ATen-Vulkan backend.
The first is that GPU sizes for a 4 dimensional tensor with width packing had a small bug:
```
case 4:
switch (memory_layout) {
case api::GPUMemoryLayout::TENSOR_WIDTH_PACKED:
gpu_sizes.at(0) = sizes.at(0);
gpu_sizes.at(1) = sizes.at(1);
// should be gpu_sizes.at(2) == sizes.at(2)
gpu_sizes.at(2) = sizes.at(3);
gpu_sizes.at(3) = api::utils::align_up(sizes.at(3), INT64_C(4));
break;
```
This was fixed by simplifying the logic of GPU size calculation for texture storage.
The second was to modify the ctype mapping of the `api::kHalf` scalar type to be `float` instead of `unsigned short`. This is because GLSL does not natively support `float16`, so even with an FP16 texture type, CPU/GPU transfer shaders will have to read from and write to `float` buffers.
In the future, we will look into integrating [VK_KHR_shader_float16_int8](https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VK_KHR_shader_float16_int8.html) into ATen-Vulkan to allow for 16 bit and 8 bit types to be referenced explicitly.
Test Plan: CI
Differential Revision: D55018171
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122096
Approved by: https://github.com/jorgep31415
Previously, all jvp tests under dynamo/test_dynamic_shapes would fail because symbolic execution wasn't supported in some autograd functions.
List of changes:
- Update `_has_same_storage_numel` to use `sym_nbytes`
- Symintify `_efficientzerotensor_meta`
- Introduce `empty_generic_symint` with the first argument `size` as symbolic integer
- Update gen_variable_type.py script to call the symint version of zeros_fn function (zeros_symint / _efficientzerotensor_symint)
- Update `has_same_meta` to call `sym_*` functions
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120338
Approved by: https://github.com/soulitzer
ghstack dependencies: #119926
List of changes:
- Replace JVP_NESTING by torch._C._functorch.maybe_current_level()
- Remove all increment nesting functions from wrap_fx_proxy_cls
- fwAD.make_dual receives the dual_level as keyword argument
- Add jvp_increment_nesting, set_fwd_grad_enabled and dual_level context managers to dynamo
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119926
Approved by: https://github.com/zou3519
Differential Revision: D54964130
When we re-export, the auto_functionalize HOP will be in the graph. Therefore, we need to implement a proper functionalization rule for it. Since the content inside auto_functionalize is guaranteed to be functional, it is OK to just fall through it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121990
Approved by: https://github.com/ydwu4, https://github.com/zou3519
This removes the duplicate handling of comparison ops between symbolic_convert and builtin and refactors the handling to use the binop infrastructure. This change regresses overheads a bit, but this is fixed in the next PR.
New test skips are variants of `type(e) is np.ndarray` previously falling back to eager.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122043
Approved by: https://github.com/anijain2305
ghstack dependencies: #122039
Summary: In `torch.inference_mode()`, fake tensors don't have `_version`s. This breaks unbacked SymInt memoization in `torch.nonzero` tracing. Here we disable the latter in inference mode.
Test Plan:
```
$ python test/inductor/test_unbacked_symints.py -k test_nonzero_in_inference_mode
...
----------------------------------------------------------------------
Ran 2 tests in 14.060s
OK
```
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122147
Approved by: https://github.com/ezyang
Enable VEC on Windows OS.
1. Fix some type definition gaps between Windows and Linux.
2. Fix some operators not supported on Windows, such as `[]` and `/`.
3. Enable static sleef library build on Windows.
4. Disable unsupported function overloading on MSVC.
5. Upgrade the sleef submodule, which fixes a build issue on Windows.
6. Fix bazel build issues.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/118980
Approved by: https://github.com/jgong5, https://github.com/ezyang, https://github.com/malfet
Summary:
The previous implementation seems to introduce a key-value pair of {"node": None}. This causes an error in logging later on because we extract the name from the "node", but it is a string instead of a torch.fx.Node.
This change seems to make the tests pass.
Test Plan:
CI
ExecuTorch CI:
buck test mode/dev-nosan //executorch/backends/xnnpack/test:test_xnnpack_models
Reviewed By: larryliu0820
Differential Revision: D55026133
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122111
Approved by: https://github.com/mikekgfb
Forward fix for regressions introduced by https://github.com/pytorch/pytorch/pull/121381 as we failed to run MPS CI twice on it
- Do not call `minimumWithNaNPropagationWithPrimaryTensor` for integral tensors as it will crash with
```
/AppleInternal/Library/BuildRoots/ce725a5f-c761-11ee-a4ec-b6ef2fd8d87b/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805: failed assertion `Error getting visible function: (null) Function isNaN_i16_i8 was not found in the library'
```
- Change the order of the max and min calls, as it's apparently important for consistency: `min(max(a, b), c)` might not equal `max(min(a, c), b)` when `c` can be smaller than `b`.
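A small numeric illustration of why the order matters, assuming clamp-style semantics that pick `min(max(a, b), c)`:
```python
# b is the lower bound, c is the upper bound; when c < b the two orders disagree.
a, b, c = 0.0, 2.0, 1.0
print(min(max(a, b), c))  # 1.0
print(max(min(a, c), b))  # 2.0
```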
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122148
Approved by: https://github.com/huydhn
**Summary:**
This commit simplifies the existing decomposition hierarchy
of batch norm ops by adding a single, backend agnostic op:
`batch_norm_with_update`. The existing hierarchy looks like:
```
aten.batch_norm ->
aten._batch_norm_impl_index ->
[
aten.native_batch_norm ->
aten._native_batch_norm_legit (export only) ->
_batch_norm_legit_cpu/cuda (kernels, export only) ->
_batch_norm_cpu/cuda (kernels)
] OR
[ aten.cudnn_batch_norm ] OR
[ aten.miopen_batch_norm ]
```
Aside from complexity, an important problem with the
above decomposition hierarchy is cuda numerics in
export flows. We observed significantly worse convergence
when training a mobilenetv2-like model when using the
`_batch_norm_cuda` kernel instead of the `cudnn_batch_norm`
kernel. This means users who export their models on CPU
first then move the models to cuda later may silently
see worse accuracies even when cudnn is installed,
because they are using the worse kernel. This issue is
summarized in https://github.com/pytorch/pytorch/issues/111384.
Instead, the new hierarchy proposed by consolidating
existing batch norm ops will look like:
```
aten.batch_norm ->
aten.batch_norm_with_update ->
[ _batch_norm_cpu (kernel) ] OR
[ _batch_norm_cuda (kernel) ] OR
[ cudnn_batch_norm (kernel) ] OR
[ miopen_batch_norm (kernel) ]
```
The new op `batch_norm_with_update` hides backend
implementation details and automatically picks the right
kernel based on what is installed. This commit also adds
the following variants to this op:
```
batch_norm_with_update_functional
batch_norm_with_update.out
batch_norm_no_update
batch_norm_no_update.out
batch_norm_backward
```
Note that this commit only adds this op and its variants,
but does not actually change the decomps to produce these
ops in the graph. This will be done after the 2 week FC
window, and the ops used in the old stack is planned to
be removed after the 6 month BC window.
Test Plan: `OpInfo` tests for `batch_norm_with_update`.
Reviewers: albanD, bdhirsh
Subscribers: albanD, bdhirsh, supriyar
Tasks: https://github.com/pytorch/pytorch/issues/111384
Differential Revision: [D54805279](https://our.internmc.facebook.com/intern/diff/D54805279)
Co-authored-by: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116092
Approved by: https://github.com/bdhirsh, https://github.com/albanD
Fixes https://github.com/pytorch/pytorch/issues/120873.
Fixes the output stride of Conv in the case of dynamic shapes. The previous logic in inductor assumed that the output stride of Conv is always channels last while it is actually contiguous if `dynamic_shapes and is_contiguous_storage_and_layout(x)`.
### Static shape
In static shape cases, since weight is prepacked (`weight_t.is_mkldnn()` will be `true`), we'll always force output to be channels last in the Conv kernel, thus it's fine to have the assumption in Inductor that the output stride of Conv is always channels last.
96ed37ac13/aten/src/ATen/native/mkldnn/Conv.cpp (L357-L358)
### Dynamic shape
In dynamic shape cases, we don't do weight prepacking for Conv; in this case, the Conv kernel decides the output layout based on the input and weight layouts.
96ed37ac13/torch/_inductor/fx_passes/mkldnn_fusion.py (L1024-L1025)
For an input with `channels = 1`, like a tensor of size `(s0, 1, 28, 28)` and stride `(784, 784, 28, 1)`, in Inductor, with `req_stride_order` in channels last order, calling `require_stride_order` on `x` of such size and stride won't change the stride of the tensor, since the stride for dimensions of size 1 is ignored:
96ed37ac13/torch/_inductor/ir.py (L5451)
Meanwhile, the Conv kernel considers such a tensor **contiguous** instead of channels last, thus the output of the Conv kernel will be in contiguous format.
96ed37ac13/aten/src/ATen/native/ConvUtils.h (L396-L404)
To align with the behavior of the Conv kernel, we set the output_stride in such cases to be contiguous instead of channels last.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121400
Approved by: https://github.com/jgong5, https://github.com/jansel
Summary: `has_triton` causes some import-time cycles. Let's use `has_triton_package`, which is enough.
Test Plan:
```
buck2 test 'fbcode//mode/opt' fbcode//fblearner/flow/projects/model_processing/pytorch_model_export_utils/logical_transformations/tests:filter_inference_feature_metadata_test -- --exact 'fblearner/flow/projects/model_processing/pytorch_model_export_utils/logical_transformations/tests:filter_inference_feature_metadata_test - test_collect_features_from_graph_module_nodes (fblearner.flow.projects.model_processing.pytorch_model_export_utils.logical_transformations.tests.filter_inference_feature_metadata_test.FilterInferenceFromFeatureMetadataTest)'
```
now passes
Differential Revision: D55001430
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122059
Approved by: https://github.com/aakhundov
Summary: Unless we register triton as a special import, the FX graph import mechanism imports it as `from fx-generated._0 import triton as triton`, which is obviously broken.
Test Plan:
I could not figure out how to write a test for this but
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//tgif/lib/tests/gpu_tests:lowering_pass_test -- -r test_default_ait_lowering_multi_hardwares
```
now passes
Differential Revision: D54990782
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122041
Approved by: https://github.com/aakhundov
We would like to improve consistency for nn_module_stack metadata in torch.export.
This PR ensures that all tests in test/export/test_export.py has the following constraints:
- Remove nn_module_stack for all placeholder & output nodes, for all modules and submodules
- Ensure nn_module_stack is present for all other node types for the top-level module (there is still an issue with torch.cond submodules having empty fields)
- Add these checks to _export() in _trace.py (we would add this in the Verifier, but downstream apps construct ExportedPrograms separate from _export(), and metadata may not be maintained there)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120661
Approved by: https://github.com/avikchaudhuri
Copying a scalar 0 tensor on CPU to GPU or constructing a scalar 0 tensor on GPU requires a CPU sync with the GPU. Let us avoid doing ops that involve it.
`FSDP.clip_grad_norm_` already first checks if all parameters are not sharded and calls into `nn.utils.clip_grad_norm_`, so at the point of the code changes, there is guaranteed to be some sharded parameters.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122001
Approved by: https://github.com/wanchaol
This PR addresses the issue identified in #121920. The existing problem is that all tests are deemed mandatory if none are selected as required. This behavior is particularly noticeable during a force merge operation.
In the context of a force merge, it may not be necessary to execute any tests which are not required (imo). However, this proposed change could be seen as controversial, hence it has been separated from the main update for further discussion and review.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121921
Approved by: https://github.com/huydhn
ghstack dependencies: #121920
Summary: During key calculation for FX graph caching: Rather than specialize on "small" vs. "large" tensor constants (i.e., inlined vs. not inlined), always hash on the tensor value. Doing so avoids the complication of trying to later attach the constant values as attributes to an already-compiled module. Instead, different constants will cause an FX graph cache miss and we'll just compile.
Test Plan: New unit test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121925
Approved by: https://github.com/eellison
Enable ASGD foreach optimizer and add DTensor optimizer unit test for ASGD.
Note that we need to investigate why, when using ASGD, we need higher atol and rtol when comparing model parameters. Listing it as a TODO for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121942
Approved by: https://github.com/wanchaol
This PR adds support for 2D `clip_grad_norm_` (`foreach=True`).
- This PR changes `OpSchema.args_spec` to use pytree if the runtime schema info specifies it.
- This PR includes a unit test for 2D FSDP2 + SP with `clip_grad_norm_` enabled, which serves as a complete numerics test for 2D.
Note: With this PR patched, 2-way SP + 4-way FSDP matches 8-way FSDP numerics on Llama-7B (doubling local batch size for the 2-way SP run).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121945
Approved by: https://github.com/wanchaol
ghstack dependencies: #121747, #121869
Summary: The handling of the current_callable and compiled_artifact fields in the CompiledFxGraph object is unnecessarily complicated and confusing. We can simplify by storing only the callable. That field is not serializable, so the caching approach is to store a path to the generated artifact and reload from disk on a cache hit. We can just reload inline in the FX cache hit path. This change has the added benefit that it makes it easier to fallback to a "cache miss" if the path somehow doesn't exist.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121903
Approved by: https://github.com/eellison
The inductor lowering code for viewing a tensor as a type with a different bitwidth currently doesn't generate valid triton code. This change checks the source and destination dtypes and, if their sizes differ, falls back to the eager-mode ATen implementation. Prior to this change, this condition would throw an exception.
Fixes#120998.
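A minimal sketch of the pattern in question, assuming a bitwidth-changing dtype view such as fp32 -> fp16:
```python
import torch

def fn(x):
    # 32-bit -> 16-bit view doubles the last dimension; this now falls back to
    # the eager-mode ATen implementation instead of emitting invalid Triton code.
    return x.view(torch.float16)

compiled = torch.compile(fn)
print(compiled(torch.randn(4, 8)).shape)  # torch.Size([4, 16])
```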
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121786
Approved by: https://github.com/peterbell10, https://github.com/bertmaher
The error message shown when input aliasing is detected in `while_loop_func` may not have the correct `fn_name`, as it is set only in the previous for loop. This change merges the two loops so that `fn_name` has the correct value.
No Issue Number for this minor change.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121601
Approved by: https://github.com/albanD
This adds some basic comm tests to test_tp_examples. This validates that the expected distributed calls are being made for `test_transformer_training`.
Fixes#121649
Test plan:
```
pytest test/distributed/tensor/parallel/test_tp_examples.py -k test_transformer_training
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121669
Approved by: https://github.com/wanchaol
This PR addresses an issue with the trymerge function for executorch, which currently uses Facebook CLA instead of Easy CLA. This bug has been patched in #121921. However, the patch is potentially controversial, and we still want to verify Facebook CLA if it exists. Therefore, this PR includes Facebook CLA in our set of mandatory checks.
Additionally, this PR removes Facebook CLA from one of the mocks. This change is necessary because the specific PR used for testing fails due to the presence of Facebook CLA in the mock.
## Testing:
We run `find_matching_merge_rule(pr = GitHubPR("pytorch", "executorch", 2326), skip_mandatory_checks=True, skip_internal_checks=True)` to check if things work
https://pastebin.com/HHSFp2Gw
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121920
Approved by: https://github.com/huydhn
`python benchmarks/dynamo/microbenchmarks/dynamo_microbenchmarks.py`
- Before: `symbolic_convert_overhead_stress_test: 10.7s`
- After: `symbolic_convert_overhead_stress_test: 8.6s`
`tx.step()` is a small part of that benchmark, so likely the speedup in that isolated function is larger than the top line.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121790
Approved by: https://github.com/oulgen
This PR rewrites the stack strategy to be more generalized. Basically, the follow
pattern for stack/cat-like strategies needs to be smarter, i.e. it
should be able to identify:
1. PR, PP, RP -> follow PP
2. RR, SR, RS -> follow SS
So this PR refactors how the follow strategy works, and makes sure
we start following the strategy that incurs the lowest cost, i.e. for
multiple PR, RP placements, we should be able to further delay the
pending sum reductions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121869
Approved by: https://github.com/awgu
Summary: Refer to OSS PR for details
Test Plan: CI
Differential Revision: D54812833
In pre-dispatch export, we have a special proxy torch mode where we intercept the torch._C._set_grad_enabled op to correctly capture the user's intention on train/eval. However, this is a bit problematic when we are tracing torch.cond during export, as it calls torch.compile internally. As a result, we end up capturing unwanted autograd context manager calls that happen inside dynamo framework code because the top-level tracer is still active. We fix this by turning off the proxy torch mode. We can still capture autograd ops inside cond branches because dynamo will translate them into a HOP for us, so we don't have to intercept them with the special proxy mode.
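A minimal sketch of the scenario, assuming the public `torch.cond` and `torch.export` APIs: exporting a module whose forward contains `torch.cond`.
```python
import torch
from torch.export import export

class M(torch.nn.Module):
    def forward(self, x):
        # torch.cond is traced via torch.compile internally while the
        # pre-dispatch export tracer is still active.
        return torch.cond(x.sum() > 0, lambda t: t.sin(), lambda t: t.cos(), (x,))

ep = export(M(), (torch.randn(3),))
print(ep.graph)
```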
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121736
Approved by: https://github.com/anijain2305, https://github.com/ydwu4
I realized there's a bug when unlifting buffer mutations in AOTI.
However, there seems to be a bug during tracing where AOTI mutates the buffer. I didn't take the time to investigate, so I left it as a TODO for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121688
Approved by: https://github.com/chenyang78
Summary:
tldr: User calls to `torch.autograd.profiler.record_function` fail when tracing with non-strict pre-dispatch export due to an effect token failure, so the solution is to skip over these operators 😅
Some user code contains calls to a `torch.autograd.profiler.record_function` context, like https://fburl.com/code/uesgknbq and https://fburl.com/code/iogbnsfw, which is used for adding user-defined events into the profiler.
Currently these function calls will be skipped/removed in dynamo (https://fburl.com/code/fkf7qmai) but **non-strict pre-dispatch export** will hit these operators during tracing. However, it seems that although these operators get hit by the dispatcher, they don't actually show up in the final graph (maybe they get DCE-d).
However, an issue comes up with a recent effect tokens change (D54639390), which creates tokens if it sees a ScriptObject during tracing. The operator `torch.ops.profiler.record_function_exit` takes in a ScriptObject, so the effect tokens framework now tries to add an effect token to this operator, which results in the following error: (https://www.internalfb.com/intern/everpaste/?handle=GI-hvBknzj2ZxYkBABNzdztDxJVAbsIXAAAB, P1195258619)
The reason is that this operator only gets hit during pre-dispatch, not post-dispatch tracing. During pre-dispatch tracing, we first trace using post-dispatch to collect metadata needed for functionalization, and then we do pre-dispatch tracing to construct the graph. The metadata collection phase is also when we determine which operators need effect tokens and create those tokens. However, since the operator only shows up in pre-dispatch tracing, we do not create any tokens. During the actual pre-dispatch tracing to create the graph, we then run into this operator and try to get a token, but none exists, causing an error :(
This PR just blocks the record_function operator from being looked at by the effect tokens framework. But a proper fix might be to have functionalization run on the pre-dispatch graph or have the operator also show up in the post-dispatch graph. But since in the PT2 stack dynamo just gets rid of this operator so that it won't show up anywhere downstream, I think we can also just ignore this operator.
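A minimal sketch of the user pattern described above, a `record_function` context wrapped around part of a forward:
```python
import torch

def forward(x):
    # Adds a user-defined event to the profiler; with this PR the operator pair
    # it emits is simply skipped by the effect tokens framework during export.
    with torch.autograd.profiler.record_function("my_block"):
        return x.sin().cos()

print(forward(torch.randn(4)))
```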
Test Plan: Fixed test for P1195258619
Differential Revision: D54857444
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121829
Approved by: https://github.com/BoyuanFeng, https://github.com/tugsbayasgalan
Namely, it adds the `s3-bucket` argument to the following workflows, with the default value set to `gha-artifacts`:
- _docs
- _linux-test workflows
- download-build-artifacts
- pytest-cache-download
- upload-test-artifacts
This prerequisite is required in order to start migrating to other S3 buckets for asset storage; it is one of the required steps to migrate to ARC and move our assets away from our S3 to the Linux Foundation S3.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121907
Approved by: https://github.com/malfet
To prepare for FSDP2 + TP/SP in torchtrain, we should verify that we can resume training correctly with DCP save/load. For loading into a new model/optimizer instance, torchtrain uses lightweight `ModelWrapper` and `OptimizerWrapper`. In the added unit test, we use `get_optimizer_state_dict` directly to show the minimal requirement for correctness.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121747
Approved by: https://github.com/wz337
Summary:
This diff tries to periodically (e.g., every 30s) log critical collective
progress status to a Scuba table, starting with a few metrics such as the last
enqueued seq id.
With the Scuba table, it is our hope that we can easily detect the straggler of a PG,
e.g., the rank that has not progressed its seq_ for X seconds while other ranks in the same PG have a larger seq_.
The implementation needs to make sure that Scuba will be used only for FB internal use
cases.
For OSS, we still provide a generic logger data struct and logger that can be
easily extended. If users do not register the logger, nothing will be logged.
Test Plan:
Re-use the existing unit test for the fb side of operations, such as
test_register_and_dump in test_c10d_manifold, and change the dump period to a
very small number, e.g., 1ms; verified that the logs are correctly shown in the Scuba table:
https://fburl.com/scuba/c10d_work_update/9trhwnmy
Reviewed By: wconstab
Differential Revision: D54556219
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121859
Approved by: https://github.com/wconstab
TestReductionsCUDA.test_nansum_out_dtype_cuda_float32 would fail or pass depending on the random inputs. Observed by ROCm internal QA testing. But the same problematic random inputs break the test for CUDA, verified on V100.
There is precedent in another test within the same file to relax tolerance.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121550
Approved by: https://github.com/albanD
The current dynamic shapes implementation fixes the lower bound of Dims to 2 for analysis, but allows 0/1 shapes during runtime. This leads to failures when initializing Dim(1, 2). This PR sets the lower bound to 0, and avoids erroring out when it conflicts with the generated (2, maxsize) constraint during analysis.
Also resolves a derived dim constraints issue with the following code:
```
class Bar(torch.nn.Module):
def forward(self, x, y):
return x + y[1:]
dx = Dim("dx", min=1, max=3)
ep = export(
Bar(),
(torch.randn(2, 2), torch.randn(3, 2)),
dynamic_shapes=({0: dx, 1: None}, {0: dx+1, 1: None})
)
print(ep.range_constraints)
```
In main:
```
{s0: ValueRanges(lower=2, upper=3, is_bool=False), s0 + 1: ValueRanges(lower=3, upper=4, is_bool=False)}
```
This PR:
```
{s0: ValueRanges(lower=1, upper=3, is_bool=False), s0 + 1: ValueRanges(lower=2, upper=4, is_bool=False)}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121642
Approved by: https://github.com/avikchaudhuri
This PR enables DDP + TP using a TP internal API. This should not be the final implementation. A more sound implementation is to inline the TP internal API in DDP. In other words, DDP needs to be aware of DTensor so that we can support 2D state_dict.
This PR adds a compiled DDP + TP test to ensure the new compiled DDP fusion doesn't break TP all_reduce.
**TODOs**
- [x] Implement DDP allreduce fusion algorithm for Inductor post_grad pass.
- [x] Add unit tests to ensure the fusion doesn't break DDP + TP.
- [ ] Group different PG and data type of all_reduces.
- [ ] Mixed precision supports and tests
- [ ] Implement the fusions with Inductor IR.
- [ ] Add auto bucketing based on Inductor profiling.
Differential Revision: [D54105050](https://our.internmc.facebook.com/intern/diff/D54105050/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120479
Approved by: https://github.com/wz337
ghstack dependencies: #113209
Summary:
Deserialization didn't populate ShapeEnv's `var_to_val` field properly, and AOTInductor relies on this field to compile dynamic shapes properly.
As a result, AOTI failed at compiling a deserialized ExportedProgram.
Test Plan: buck2 test mode/dev-nosan caffe2/test/inductor/fb:test_aot_inductor_pt2_inference
Differential Revision: D54559494
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121759
Approved by: https://github.com/avikchaudhuri
- Adds support for custom ops backed by c++ custom autograd functions, e.g. fbgemm
- Include files more granularly to avoid namespace pollution and circular imports
limitations:
- Requires the user to audit their code and opt in their custom autograd::Function via autograd::Function::is_traceable, and possibly add compiled_args + apply_with_saved implementations. This was the only way I could think of for soundness.
- Will throw if we can't hash the saved_data, i.e. for any non-implemented type other than list and dict in at::IValue::hash b0cfa96e82/aten/src/ATen/core/ivalue.cpp (L364)
- Can technically silently fail if both the typeid hash and the typeid string name of the custom autograd::Function collide at the same time, and an identical autograd graph containing a different custom autograd::Function with an identical implementation is called. This case seems extremely unlikely, and the only alternative to hash collision I can think of is compiling with reflection.
- Tensors not saved via save_variables are not lifted, and are specialized on TensorImpl*'s hash (treated as a memory address). If needed, we can lift them.
Differential Revision: [D54818488](https://our.internmc.facebook.com/intern/diff/D54818488)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120681
Approved by: https://github.com/jansel
Differential Revision: [D49858057](https://our.internmc.facebook.com/intern/diff/D49858057/)
**TL;DR**
This PR implements 2 different DDP all_reduce fusions in Inductor post_grad fx passes. The two fusions are 1) fusion with concat op and 2) fusion with all_reduce_coalesced. When DDP detects that Python reducer is being used, DDP will automatically turn on the fusion.
This PR does not invent any algorithm and simply reflects the bucket size users set to DDP.
**Implementation Details**
*Fusion with concat op*
The idea of this fusion is to use a concat op to concatenate all the gradients into one tensor and perform one `all_reduce`. After the `wait` op of the `all_reduce`, splitting and reshaping will also be performed to get the individual gradients.
Because DDP needs to perform gradient scaling, the benefit of using this fusion is that we can perform the gradient scaling over the concatenated buffer.
*Fusion with `all_reduce_coalesced`*
The idea of this fusion is to use the `all_reduce_coalesced` op to directly perform the `all_reduce` over multiple buffers. This avoids the copy overhead but may not achieve the best NCCL performance. In addition, because there are multiple buffers, we cannot do one simple gradient scaling but have to rely on `foreach_div` to help with the gradient scaling.
**Limitations**
Current fusions do not distinguish `all_reduce` generated by different DDP modules. This is okay if all DDP instances use the same PG and data type. The support of multiple DDP instances with different PG and data type will come in the later PRs.
**TODOs**
- [x] Implement DDP allreduce fusion algorithm for Inductor post_grad pass.
- [ ] Add unit tests to ensure the fusion doesn't break DDP + TP.
- [ ] Group different PG and data type of `all_reduce`s.
- [ ] Mixed precision supports and tests
- [ ] Implement the fusions with Inductor IR.
- [ ] Add auto bucketing based on Inductor profiling.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/113209
Approved by: https://github.com/yf225
This PR fixes Issue #111279.
While #111279 reported the issue with `MultiheadAttention`, a minimal reproduction would be:
```python
class ToyModel(nn.Module):
def __init__(self,):
super().__init__()
self.linear = nn.Linear(128, 10)
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.linear.forward(x) # Error
# return self.linear(x) # OK
```
Dynamo treats `self.linear(x)` as `call_module` while treating `self.linear.forward(x)` as a [`get_attr` and a `call_method`](https://github.com/pytorch/pytorch/blob/main/torch/_dynamo/variables/nn_module.py#L358-L378). However, the existing DDPOptimizer assumes that, for a `get_attr` node, `getattr(gm, node.target)` gives a tensor with the `requires_grad` attribute. The existing DDPOptimizer also does not support `call_method` nodes.
This PR adds support for `call_method` and check on `get_attr`. It also checks if a module's parameters have been added to a bucket to support multiple method calls from the same module.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121771
Approved by: https://github.com/yf225
This is very confusing when checking memory usage while allocations are only happening via the C API. We should change it to a warning/error, or just initialize CUDA. Code paths that run in non-CUDA environments shouldn't call into these functions in the first place.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121698
Approved by: https://github.com/jansel
Summary: instantiate_device_type_tests() creates dynamic test case classes that derive from a "template class". By default, the test harness will call the setUpClass() and tearDownClass() methods defined by the template class (if the template class defines them). We can explicitly create these methods in the dynamic class and arrange to call those methods on both base classes. That allows setUpClass & tearDownClass to be supported in test classes used with instantiate_device_type_tests().
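A hypothetical sketch of the now-supported pattern (plain `unittest` here; the commented-out call stands in for the real `instantiate_device_type_tests` usage):
```python
import unittest

class MyTemplate(unittest.TestCase):
    shared = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()            # chain to the base class's hook
        cls.shared = "expensive one-time setup"

    @classmethod
    def tearDownClass(cls):
        cls.shared = None
        super().tearDownClass()

    def test_uses_shared(self):
        self.assertIsNotNone(self.shared)

# instantiate_device_type_tests(MyTemplate, globals())  # would generate per-device classes
```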
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121686
Approved by: https://github.com/ezyang, https://github.com/eellison
Two main changes:
- Don't rethrow the exception when we fail in TV; just throw the entire thing and trust the user will inspect the logs / backtrace to see that we failed in TV.
- Don't add an event to the TV logs until we've confirmed that the event actually runs without erroring. This prevents us from recording events that, e.g., fail because of a guard on a data-dependent size, and then failing in TV.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120880
Approved by: https://github.com/lezcano, https://github.com/ysiraichi
Summary:
Original commit changeset: e52b8809c8d8
Original Phabricator Diff: D54778906
We have to back out this diff.
D54778906 seems to be causing test failures for APF, blocking trunk health and hence the release. Just starting to look at the issue. T182209248
Test Plan: Sandcastle
Reviewed By: satgera
Differential Revision: D54825114
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121763
Approved by: https://github.com/osalpekar
This PR moves `operator<<` of `BFloat16` to `BFloat16.h`.
Previously, this function was in `TensorDataContainer.h`. If you need to `std::cout` a `BFloat16` variable when debugging, `TensorDataContainer.h` has to be included. This is inconvenient and counterintuitive.
Other dtypes, such as `Half`, define their `operator<<` in the headers where they are defined, such as `Half.h`. Therefore, I think it makes more sense to move `operator<<` of `BFloat16` to `BFloat16.h`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121302
Approved by: https://github.com/ezyang
This PR changes the upload-artifact step of the wheels and conda builds to write each matrix entry to a different file, because updating the same file from multiple jobs can be flaky, as warned in the docs for upload-artifact:
> Warning: Be careful when uploading to the same artifact via multiple jobs as artifacts may become corrupted. When uploading a file with an identical name and path in multiple jobs, uploads may fail with 503 errors due to conflicting uploads happening at the same time. Ensure uploads to identical locations to not interfere with each other.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121733
Approved by: https://github.com/huydhn
ghstack dependencies: #121268
Right now the logic is mostly duplicated between `test_output_match` and `test_output_gradient_match`.
So move the tolerance-definition logic into a shared `_compute_tolerances` function and
only keep the differences (for example, grad checks are completely skipped for `torch.unique`) in the respective test functions.
Also, increase the tolerance for `pow` and `__rpow__` only on macOS 13.3 or older, and remove the GRAD xfail entries for those.
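A rough sketch of the intended shape of such a shared helper (the op names and tolerance values below are placeholders, not the real ones):
```python
import torch

def _compute_tolerances(op_name: str, dtype: torch.dtype):
    # Return (rtol, atol) shared by the output and gradient checks.
    if dtype == torch.float16:
        return 1e-2, 1e-2
    if op_name in ("pow", "__rpow__"):
        # Looser tolerance only where the backend is known to drift.
        return 1e-4, 1e-4
    return 1.3e-6, 1e-5

rtol, atol = _compute_tolerances("pow", torch.float32)
torch.testing.assert_close(
    torch.tensor(1.0), torch.tensor(1.0 + 1e-7), rtol=rtol, atol=atol
)
```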
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121754
Approved by: https://github.com/albanD
Summary: The goal is to make `test_argmax` and `test_reduce_sum` work both before and after https://github.com/openai/triton/pull/3191 is included in the Triton pin. This is important so that those tests keep working during the Triton pin update process, both in OSS and internally.
Test Plan:
```
$ python test/inductor/test_triton_kernels.py -k test_reduce_sum -k test_argmax
..
----------------------------------------------------------------------
Ran 2 tests in 1.906s
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121753
Approved by: https://github.com/Skylion007
Summary:
with a simple bench in TestDeserializer.test_basic function:
```
time_start = time.time()
for i in range(1000):
    self.check_graph(MyModule(), inputs)
warnings.warn(f"time_taken: {time.time() - time_start}")
```
and forcing FakeTensorConfig.debug to True, record_stack_traces to True, and the logging level to debug, it shows that the changed code is consistently around 20 seconds faster (~90s vs. ~110s originally).
Test Plan:
test passed, see summary
compared debug trace before and after:
- exactly the same for fake tensor and proxy callsite https://www.internalfb.com/intern/diffing/?paste_number=1189883685
- slightly different for the user frame in proxy node https://www.internalfb.com/intern/diffing/?paste_number=1189884347
Differential Revision: D54237017
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121449
Approved by: https://github.com/angelayi
This PR corrects the AOTInductor example, which currently fails with:
```
/home/ubuntu/test/inference.cpp:21:62: error: cannot bind non-const lvalue reference of type ‘std::vector<at::Tensor>&’ to an rvalue of type ‘std::vector<at::Tensor>’
21 | std::cout << runner.run({torch::randn({2, 10}, at::kCPU)})[0] << std::endl;
|
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121672
Approved by: https://github.com/desertfire
Putting this PR as an RFC since I have resorted to some horrible hacks in order to make this work.
```
(Pdb) p triton.language.float32
triton.language.fp32
(Pdb) p str(triton.language.float32)
'fp32'
(Pdb) p repr(triton.language.float32)
'triton.language.fp32'
```
This means that we need to "rewrite" these Triton dtype arguments for FX graph and Inductor execution.
This PR allows Mamba2 to work with `torch.compile`.
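For illustration, one way such a rewrite could look, as a sketch under the assumption that mapping the short `str()` form back to a qualified attribute name is sufficient (this is not the actual implementation):
```python
import triton.language as tl

# str(tl.float32) is 'fp32', which is not a valid Python expression for
# codegen; map each short form back to a qualified attribute name.
SHORT_TO_QUALIFIED = {
    str(value): f"tl.{name}"
    for name, value in vars(tl).items()
    if isinstance(value, tl.dtype)
}

print(SHORT_TO_QUALIFIED["fp32"])  # e.g. 'tl.float32'
```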
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121690
Approved by: https://github.com/Skylion007
Add these logs to debug the accuracy-test regression for the dm_nfnet_f0 model in training.
With this extra logging, when the accuracy check fails we can verify whether it was close to succeeding. If so, that indicates there is no real issue, just flakiness, and we can probably tune the tolerance to fix it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121656
Approved by: https://github.com/jansel, https://github.com/Skylion007
Summary:
`DECLARE_DISPATCH` is shadowing the variable data with the data type:
`extern TORCH_API struct name name` -> `extern TORCH_API struct gemm_stub gemm_stub` for instance.
This is probably dangerous behavior to rely on, as the compiler always needs to resolve to the type and/or the data based on context. The previous macro fails with VS2022.
Test Plan: `buck2 build arvr/mode/win/vs2022/cpp20/opt //xplat/caffe2:aten_pow_ovrsource`
Differential Revision: D54699849
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121659
Approved by: https://github.com/albanD
Summary:
# Why?
Right now I'm running into a case where `itype` is `torch.fx.immutable_collections.immutable_list`, which is a subclass of `list`. However, we currently check the concrete type (i.e. `list`), and `immutable_list` isn't explicitly supported here.
Thus, we use a runtime check that looks at the subclass relationship, so that subclasses -- such as `immutable_list` -- are supported as well.
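A minimal illustration of the difference (standalone, for demonstration only):
```python
from torch.fx.immutable_collections import immutable_list

args = immutable_list([1, 2, 3])

print(type(args) is list)      # False: exact-type check misses subclasses
print(isinstance(args, list))  # True: subclass-aware check accepts immutable_list
```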
Test Plan: ci
Differential Revision: D54764829
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121662
Approved by: https://github.com/aakhundov
This PR fixes a redistribute bug by moving the early-return check into the
redistribute autograd function: even when we redistribute to the same
placement, the grad_placements from the `to_local` call might be different,
so the redistribute backward still needs to happen.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121653
Approved by: https://github.com/awgu
Refactor the release-only changes into a two-step execution:
1. Step ``tag-docker-images.sh``: tags the latest docker images for the current release. This step takes about 30 min to complete and may fail due to space issues on the local host or HTTP connection problems when pulling images, so it should be rerun if it fails.
2. Apply release-only changes: ``apply-release-changes.sh`` prepares a PR with the release-only changes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121728
Approved by: https://github.com/jeanschmidt
It looks like it was commented out because the original implementation was not sufficiently portable. I had to do some rewrites to the innards to make it more portable. No Windows nanoseconds support because I'm lazy.
I tested by running `build/bin/TCPStoreTest` and observing the log messages there. I am actually not sure how to look at the log messages from Python though.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121384
Approved by: https://github.com/Skylion007, https://github.com/malfet
Summary:
`np.asscalar` was deprecated and removed in a recent NumPy release. It used to be implemented as follows, and the recommended alternative is to call `item()` directly:
```python
def asscalar(a):
    return a.item()
```
This fixes all of the references.
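A quick illustration of the migration (values are arbitrary):
```python
import numpy as np

a = np.float64(2.5)

# Before (removed in recent NumPy releases):
#   value = np.asscalar(a)
# After: call .item() directly.
value = a.item()
print(value, type(value))  # 2.5 <class 'float'>
```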
Test Plan: visual inspection and automated tests
Differential Revision: D54697760
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121545
Approved by: https://github.com/malfet
Summary:
## Context
This changeset lays the foundations for supporting dynamic shapes in the ExecuTorch Vulkan delegate by allowing tensors to be resized in one of two ways:
1. Discarding the underlying `vkImage` or `vkBuffer` and reallocating a new `vkImage` or `vkBuffer` with updated sizes. This method is intended to be used when the current `vkImage` or `vkBuffer` is not large enough to contain the new sizes.
2. Updating the tensor's size metadata without reallocating any new resources. This allows shaders to interpret the underlying `vkImage` or `vkBuffer` as if it were smaller than it actually is, and allows command buffers to be preserved when sizes change.
Test Plan: Check CI. Tests have also been added to `vulkan_compute_api_test` that test the two methods of tensor resizing.
Differential Revision: D54728401
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121598
Approved by: https://github.com/jorgep31415
Summary:
We don't want people to move to NCCL exp without explicit opt in. It seems that sparse allreduce was accidentally called and people were confused whether they should use NCCL exp instead.
Update the error message to explicitly say that sparse_allreduce is not supported.
Test Plan: sandcastle
Differential Revision: D54759307
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121644
Approved by: https://github.com/awgu
Summary:
X-link: https://github.com/pytorch/executorch/pull/2308
Note: The initial purpose of this PR is to draw suggestions and feedback regarding better alternatives, if any.
At present, dequantize ops for the decomposed quantized tensor representation, e.g. dequantize_per_tensor(), assume the output dtype is torch.float and hence do not take an output dtype in their operator argument list. However, this op signature becomes unusable when that assumption breaks: if the output dtype is different from torch.float, there is no way to specify it during dequantization.
This change is aimed at generalizing the signature of dequantize ops like dequantize_per_tensor() for wider use cases where the output dtype can be different from torch.float and needs to be passed during dequantization. The proposal is to use an additional argument named 'output_dtype' to solve the problem. However, we would also like suggestions and feedback regarding any better alternative that could be used instead.
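Purely as an illustration of the proposal (a Python sketch, not the actual operator definition), a decomposed per-tensor dequantize with an explicit `output_dtype` argument could look like:
```python
import torch

def dequantize_per_tensor_sketch(
    qx: torch.Tensor,
    scale: float,
    zero_point: int,
    output_dtype: torch.dtype = torch.float,
) -> torch.Tensor:
    # Decomposed dequantization: (q - zero_point) * scale, cast to the
    # requested output dtype instead of always producing torch.float.
    return ((qx.to(torch.int32) - zero_point) * scale).to(output_dtype)

qx = torch.tensor([0, 128, 255], dtype=torch.uint8)
print(dequantize_per_tensor_sketch(qx, scale=0.1, zero_point=128, output_dtype=torch.bfloat16))
```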
cc jerryzh168 jianyuh raghuramank100 jamesr66a vkuzo jgong5 Xia-Weiwen leslie-fang-intel
Reviewed By: digantdesai
Differential Revision: D53590486
Pulled By: manuelcandales
Co-authored-by: kausik <kmaiti@habana.ai>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121450
Approved by: https://github.com/jerryzh168
Summary: Previously, we bailed out of the Triton kernel analysis pass when seeing a `tt.reduce` op. In this PR, we support the op and don't bail out anymore.
Test Plan: This is a bit tricky, as the extension is added to the MLIR walk-based analysis code path, which is active only when the MLIR bindings added in https://github.com/openai/triton/pull/3191 are available. So for now I've run `test_argmax` and `test_reduce_sum` manually with a newer Triton version than the current pin. When the pin updates, we'll make those tests official (left a TODO comment).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121706
Approved by: https://github.com/jansel
Reduces the `torch.compile(backend="eager")` compilation time for this code
~~~
def fn(x):
    for _ in range(10000):
        # x = torch.sin(x)
        x = torch.ops.aten.sin(x)
        # x = sin(x)
    return x
~~~
From 18 seconds to 12 seconds.
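A minimal way to reproduce such a measurement (purely illustrative; absolute numbers depend on the machine):
```python
import time
import torch

def fn(x):
    for _ in range(10000):
        x = torch.ops.aten.sin(x)
    return x

compiled = torch.compile(fn, backend="eager")
start = time.time()
compiled(torch.randn(8))
print(f"compile + first call took {time.time() - start:.1f}s")
```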
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121031
Approved by: https://github.com/jansel
In this PR, we create another dynamic test class for the TestExport tests that basically serializes/deserializes the pre-dispatch IR. I encountered 4 additional failures, but 3 of them are due to a different operator showing up in the graph; the only legitimate failure is tracked by another task internally.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121678
Approved by: https://github.com/angelayi
ghstack dependencies: #121652
This PR enables `test_addmm_sizes_all_sparse_csr_k_*_n_*_m_*_cuda_complex128` for ROCm for trivial cases (m or n or k = 0).
CUSPARSE_SPMM_COMPLEX128_SUPPORTED is also used for `test_addmm_all_sparse_csr` and `test_sparse_matmul`, both of which are skipped for ROCm by `@skipIfRocm` or `@skipCUDAIf(not _check_cusparse_spgemm_available())`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/120504
Approved by: https://github.com/jithunnair-amd, https://github.com/ezyang