Update on "[2/N] Dynamic Shape: Enable dynamic shape support for aoti_eager"

cc voznesenskym penguinwu jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
This commit is contained in:
Wang, Eikan
2024-06-16 02:16:42 +00:00
195 changed files with 5437 additions and 1125 deletions

View File

@ -373,6 +373,13 @@ case "$image" in
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -490,6 +497,7 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \

View File

@ -0,0 +1 @@
340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d

View File

@ -0,0 +1,46 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
COMMIT=$(get_pinned_commit halide)
test -n "$COMMIT"
# activate conda to populate CONDA_PREFIX
test -n "$ANACONDA_PYTHON_VERSION"
eval "$(conda shell.bash hook)"
conda activate py_$ANACONDA_PYTHON_VERSION
if [ -n "${UBUNTU_VERSION}" ]; then
apt update
apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi
conda_install numpy scipy imageio cmake ninja
# Build a local LLVM 16 toolchain (X86 + NVPTX backends) for Halide to link against
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-S llvm-project/llvm -B llvm-build -G Ninja
cmake --build llvm-build
cmake --install llvm-build --prefix llvm-install
export LLVM_ROOT=`pwd`/llvm-install
export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
# Build Halide at the pinned commit against the local LLVM and install it into the conda env
git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}
chown -R jenkins ${CONDA_PREFIX}
popd
rm -rf Halide llvm-build llvm-project llvm-install
python -c "import halide" # check for errors
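As an aside, a slightly richer smoke test than the bare import above could exercise the freshly installed bindings. This is an illustrative sketch only (not part of the script) and assumes the standard Halide Python API:

import halide as hl

x = hl.Var("x")
f = hl.Func("f")
f[x] = x + 1          # trivial 1-D pipeline: f(x) = x + 1
out = f.realize([8])  # JIT-compile and evaluate over 8 elements
assert out[3] == 4    # spot-check one element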

View File

@ -33,7 +33,9 @@ pip_install coloredlogs packaging
pip_install onnxruntime==1.18
pip_install onnx==1.16.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
pip_install onnxscript==0.1.0.dev20240523 --no-deps
pip_install onnxscript==0.1.0.dev20240613 --no-deps
# required by onnxscript
pip_install ml_dtypes
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -103,6 +103,14 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -155,6 +155,14 @@ COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./

View File

@ -188,28 +188,6 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchdeploy() {
local commit
commit=$(get_pinned_commit multipy)
pushd ..
git clone --recurse-submodules https://github.com/pytorch/multipy.git
pushd multipy
git checkout "${commit}"
python multipy/runtime/example/generate_examples.py
BUILD_CUDA_TESTS=1 pip install -e .
popd
popd
}
function test_torch_deploy(){
pushd ..
pushd multipy
./multipy/runtime/build/test_deploy
./multipy/runtime/build/test_deploy_gpu
popd
popd
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)

View File

@ -550,6 +550,11 @@ test_inductor_micro_benchmark() {
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}
test_inductor_halide() {
python test/run_test.py --include inductor/test_halide.py --verbose
assert_git_not_dirty
}
test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -1237,11 +1242,10 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_rpc
fi
elif [[ "$TEST_CONFIG" == deploy ]]; then
checkout_install_torchdeploy
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then

View File

@ -180,6 +180,9 @@ def mock_gh_get_info() -> Any:
return {
"closed": False,
"isCrossRepository": False,
"headRefName": "foo",
"baseRefName": "bar",
"baseRepository": {"defaultBranchRef": {"name": "bar"}},
"files": {"nodes": [], "pageInfo": {"hasNextPage": False}},
"changedFiles": 0,
}

View File

@ -2330,6 +2330,15 @@ def main() -> None:
dry_run=args.dry_run,
)
return
if not pr.is_ghstack_pr() and pr.base_ref() != pr.default_branch():
gh_post_pr_comment(
org,
project,
args.pr_num,
f"PR targets {pr.base_ref()} rather than {pr.default_branch()}, refusing merge request",
dry_run=args.dry_run,
)
return
if args.check_mergeability:
if pr.is_ghstack_pr():

View File

@ -54,6 +54,7 @@ jobs:
pytorch-linux-focal-py3-clang9-android-ndk-r21e,
pytorch-linux-jammy-py3.8-gcc11,
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-2024.0-py3,
pytorch-linux-jammy-py3-clang15-asan,
pytorch-linux-focal-py3-clang10-onnx,

View File

@ -102,6 +102,26 @@ jobs:
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image-name: pytorch-linux-jammy-py3.12-halide
test-matrix: |
{ include: [
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
name: cuda12.4-py3.10-gcc9-sm86

View File

@ -270,7 +270,6 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:

View File

@ -68,6 +68,7 @@ include_patterns = [
'aten/src/ATen/native/cudnn/*.cpp',
'c10/**/*.h',
'c10/**/*.cpp',
'distributed/c10d/*SymmetricMemory.*',
'torch/csrc/**/*.h',
'torch/csrc/**/*.hpp',
'torch/csrc/**/*.cpp',
@ -216,7 +217,6 @@ exclude_patterns = [
'c10/util/complex_math.h',
'c10/util/complex_utils.h',
'c10/util/flat_hash_map.h',
'c10/util/Float8*.h',
'c10/util/logging*.h',
'c10/util/hash.h',
'c10/util/strong_type.h',
@ -1756,9 +1756,7 @@ exclude_patterns = [
'torch/testing/_internal/codegen/__init__.py',
'torch/testing/_internal/codegen/random_topo_test.py',
'torch/testing/_internal/common_cuda.py',
'torch/testing/_internal/common_device_type.py',
'torch/testing/_internal/common_distributed.py',
'torch/testing/_internal/common_dtype.py',
'torch/testing/_internal/common_jit.py',
'torch/testing/_internal/common_methods_invocations.py',
'torch/testing/_internal/common_modules.py',

View File

@ -744,6 +744,7 @@ cc_library(
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],

View File

@ -6,7 +6,7 @@
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Using distributed features](#using-distributed-features)
- [**CI/CD security principles**](#cicd-security-principles)
## Reporting Security Issues
Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
@ -61,3 +61,27 @@ If applicable, prepare your model against bad inputs and prompt injections. Some
PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.
For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
## CI/CD security principles
_Audience_: Contributors and reviewers, especially if modifying the workflow files/build system.
The PyTorch CI/CD security philosophy is based on finding a balance between keeping CI pipelines open and transparent and keeping the environment efficient and safe.
PyTorch testing requirements are complex, and a large part of the code base can only be tested on specialized, powerful hardware such as GPUs, making it a lucrative target for resource misuse. To prevent this, we require workflow run approval for PRs from non-member contributors. To keep the volume of those approvals relatively low, we readily extend repository write permissions to regular contributors.
More widespread write access to the repo presents challenges when it comes to reviewing changes, merging code into trunk, and creating releases. [Protected branches](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/managing-protected-branches/about-protected-branches) are used to restrict the ability to merge into the trunk/release branches to the repository administrators and the merge bot. The merge bot is responsible for mechanistically merging the change and validating reviews against the path-based rules defined in [merge_rules.yaml](https://github.com/pytorch/pytorch/blob/main/.github/merge_rules.yaml). Once a PR has been reviewed by the person(s) mentioned in these rules, leaving a `@pytorchbot merge` comment on the PR will initiate the merge process. To protect merge bot credentials from leaking, merge actions must be executed only on ephemeral runners (see definition below) using a specialized deployment environment.
To speed up the CI system, build steps of the workflow rely on a distributed caching mechanism backed by [sccache](https://github.com/mozilla/sccache), making them susceptible to cache corruption compromises. For that reason, binary artifacts generated during CI should not be executed in an environment that has access to any sensitive/non-public information and should not be published for use by a general audience. One should not have any expectation about the lifetime of those artifacts, although in practice they likely remain accessible for about two weeks after the PR has been closed.
To speed up CI system setup, PyTorch relies heavily on Docker to pre-build and pre-install the dependencies. To prevent a potentially malicious PR from altering images that were published in the past, ECR has been configured to use immutable tags.
To improve runner availability and resource utilization, some of the CI runners are non-ephemeral, i.e., workflow steps from completely unrelated PRs could be scheduled sequentially on the same runner, making them susceptible to reverse shell attacks. For that reason, PyTorch does not rely on the repository secrets mechanism, as these secrets can easily be compromised in such attacks.
### Release pipelines security
To ensure safe binary releases, PyTorch release pipelines are built on the following principles:
- All binary builds/upload jobs must be run on ephemeral runners, i.e., on a machine that is allocated from the cloud to do the build and released back to the cloud after the build is finished. This protects those builds from interference from external actors, who potentially can get reverse shell access to a non-ephemeral runner and wait there for a binary build.
- All binary builds are cold-start builds, i.e., distributed caching/incremental builds are not permitted. This renders builds much slower than incremental CI builds but isolates them from potential compromises of the intermediate artifacts caching systems.
- All upload jobs are executed in [deployment environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) that are restricted to protected branches.
- Security credentials needed to upload binaries to PyPI/conda or the stable indexes at `download.pytorch.org/whl` are never uploaded to repo secrets storage/environment. This requires an extra manual step to publish the release, but ensures that those credentials cannot be compromised by deliberate or accidental leaks of secrets stored in the cloud.
- No binary artifacts should be published to GitHub releases pages, as these are overwritable by anyone with write permission to the repo.

View File

@ -35,6 +35,12 @@ void SavedTensorDefaultHooks::enable() {
tls.disabled_error_message = c10::nullopt;
}
/* static */ bool SavedTensorDefaultHooks::set_tracing(bool is_tracing) {
bool prior = tls.is_tracing;
tls.is_tracing = is_tracing;
return prior;
}
const std::optional<std::string>& SavedTensorDefaultHooks::get_disabled_error_message() {
return tls.disabled_error_message;
}
@ -59,25 +65,20 @@ void SavedTensorDefaultHooks::push_hooks(PyObject* pack_hook, PyObject* unpack_h
tls.stack.emplace(pack_hook, unpack_hook);
}
void SavedTensorDefaultHooks::pop_hooks() {
std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::pop_hooks() {
// Reference counting is handled by the caller of `pop_hooks`
TORCH_INTERNAL_ASSERT(is_initialized && !tls.stack.empty());
std::pair<PyObject*, PyObject*> hooks = tls.stack.top();
tls.stack.pop();
return hooks;
}
std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::get_hooks() {
if (!is_initialized || tls.stack.empty()) {
// For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime]
if (!is_initialized || tls.stack.empty() || tls.is_tracing) {
return std::make_pair(nullptr, nullptr);
}
return tls.stack.top();
}
std::stack<std::pair<PyObject*, PyObject*>> SavedTensorDefaultHooks::get_stack() {
return tls.stack;
}
void SavedTensorDefaultHooks::set_stack(std::stack<std::pair<PyObject*, PyObject*>> stack_) {
tls.stack = std::move(stack_);
}
}

View File

@ -22,17 +22,18 @@ struct TORCH_API SavedTensorDefaultHooksTLS {
// We did this for efficiency (so we didn't have to keep a separate bool
// around)
std::optional<std::string> disabled_error_message;
// See NOTE: [Deferring tensor pack/unpack hooks until runtime]
bool is_tracing = false;
};
} // namespace impl
struct TORCH_API SavedTensorDefaultHooks {
static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook);
static void pop_hooks();
static std::pair<PyObject*, PyObject*> pop_hooks();
static std::pair<PyObject*, PyObject*> get_hooks();
static void lazy_initialize();
static std::stack<std::pair<PyObject*, PyObject*>> get_stack();
static void set_stack(std::stack<std::pair<PyObject*, PyObject*>>);
static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls);
@ -42,11 +43,20 @@ struct TORCH_API SavedTensorDefaultHooks {
// hooks, especially if their feature does not work with it. If they are
// disabled, then the following will raise an error:
// - Attempting to push_hooks
// - calling disable(message) with a non-zero stack (from get_stack) size
// - calling disable(message) with a non-zero stack (hooks) size
static void disable(const std::string& error_message);
static void enable();
static bool is_enabled();
static const std::optional<std::string>& get_disabled_error_message();
// NOTE: [Deferring tensor pack/unpack hooks until runtime]
// To preserve eager semantics of pack/unpack hooks firing only once per saved
// variable, Dynamo/AOTAutograd need to defer hook firing until runtime. Using
// disable() would raise a loud error at trace time, and pushing a no-op hook would
// fail when the traced code is wrapped in a disable_saved_tensors_hooks ctx.
// To do so, we disable these hooks during tracing. See
// https://github.com/pytorch/pytorch/issues/113263.
static bool set_tracing(bool is_tracing);
};
} // namespace at
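For readers less familiar with this machinery, below is a minimal eager-mode sketch (not part of this diff) of the semantics the note above aims to preserve: pack fires once per saved tensor during the forward pass, and unpack fires once when that tensor is consumed in backward. The deferral described in the note is what keeps this behavior intact when the code is traced by Dynamo/AOTAutograd instead of run eagerly.

import torch

counts = {"pack": 0, "unpack": 0}

def pack(t):
    counts["pack"] += 1
    return t  # a real hook could offload or compress the saved tensor here

def unpack(t):
    counts["unpack"] += 1
    return t

x = torch.randn(4, requires_grad=True)
with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
    y = x.exp().sum()  # exp() saves its result for backward, firing pack once
y.backward()           # backward consumes the saved tensor, firing unpack once
print(counts)          # e.g. {'pack': 1, 'unpack': 1}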

View File

@ -794,12 +794,16 @@ Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vecto
template <>
inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
@ -992,12 +996,16 @@ Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Ha
template <>
inline void convert(const Half* src, Half* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -416,11 +416,15 @@ inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other
template <>
inline void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
_mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -512,11 +512,15 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
_mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -823,12 +823,16 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, int32_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i)));
vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<int32_t>(src[i]);
}
@ -837,12 +841,16 @@ inline void convert(const float* src, int32_t* dst, int64_t n) {
template <>
inline void convert(const int32_t* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i)));
vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float>(src[i]);
}

View File

@ -765,13 +765,17 @@ inline Vectorized<c10::Half> Vectorized<c10::Half>::le(
template <>
inline void convert(const float16_t* src, int16_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<c10::Half>::size());
i += Vectorized<c10::Half>::size()) {
vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i)));
vst1q_s16(dst + i + 8, vcvtq_s16_f16(vld1q_f16(src + i + 8)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<int16_t>(src[i]);
}
@ -780,13 +784,17 @@ inline void convert(const float16_t* src, int16_t* dst, int64_t n) {
template <>
inline void convert(const int16_t* src, float16_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<c10::Half>::size());
i += Vectorized<c10::Half>::size()) {
vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i)));
vst1q_f16(dst + i + 8, vcvtq_f16_s16(vld1q_s16(src + i + 8)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float16_t>(src[i]);
}

View File

@ -914,12 +914,16 @@ Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vecto
template <>
inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
_mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
@ -986,7 +990,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// j0-j15 n0-n15
// k0-k15 o0-o15
// l0-l15 p0-p15
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; i++) {
r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01);
r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01);
@ -998,7 +1004,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15
// i j m n
// k l o p
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 8; i += 2) {
u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]);
u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]);
@ -1061,7 +1069,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// 12-- 13--
// 6-- 7--
// 14-- 15--
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; i++) {
u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]);
u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]);
@ -1095,7 +1105,9 @@ inline void transpose_mxn<BFloat16, 16, 16>(
// n: n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15
// o: o0 o1 o2 o3 o4 o5 o6 o7 o8 o9 o10 o11 o12 o13 o14 o15
// p: p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; i++) {
t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
}
@ -1103,7 +1115,9 @@ inline void transpose_mxn<BFloat16, 16, 16>(
__m512i u[8];
_transpose_mxn_half_16_16(t, u);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; i++) {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
@ -1125,7 +1139,9 @@ inline void transpose_mxn<Half, 16, 16>(
__m256i t[16];
// load from src to registers
// Same matrix indices as above transpose_mxn<BFloat16, 16, 16>
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; i++) {
t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
}
@ -1133,7 +1149,9 @@ inline void transpose_mxn<Half, 16, 16>(
__m512i u[8];
_transpose_mxn_half_16_16(t, u);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; i++) {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
@ -1164,7 +1182,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571
// ...
// t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; ++i) {
d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]);
d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]);
@ -1189,7 +1209,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633
// ...
// t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; ++i) {
r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]);
r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]);
@ -1216,7 +1238,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760
// ...
// t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; ++i) {
d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]);
d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]);
@ -1265,7 +1289,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
0x000000000000000a,
0x0000000000000003,
0x0000000000000002);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; ++i) {
r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]);
r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]);
@ -1310,7 +1336,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
0x0000000000000006,
0x0000000000000005,
0x0000000000000004);
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; ++i) {
d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]);
d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]);
@ -1327,7 +1355,9 @@ inline void transpose_mxn<BFloat16, 32, 32>(
int64_t ld_dst) {
// Load from memory
__m512i r[32];
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
}
@ -1336,7 +1366,9 @@ inline void transpose_mxn<BFloat16, 32, 32>(
_transpose_mxn_half_32_32(r, d);
// Store to dst
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
_mm512_storeu_si512(dst + i* ld_dst, d[i]);
}
@ -1350,7 +1382,9 @@ inline void transpose_mxn<Half, 32, 32>(
int64_t ld_dst) {
// Load from memory
__m512i r[32];
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
}
@ -1359,7 +1393,9 @@ inline void transpose_mxn<Half, 32, 32>(
_transpose_mxn_half_32_32(r, d);
// Store to dst
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
_mm512_storeu_si512(dst + i* ld_dst, d[i]);
}
@ -1514,12 +1550,16 @@ Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Ha
template <>
inline void convert(const Half* src, Half* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
_mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -443,11 +443,15 @@ inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other
template <>
inline void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
_mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -552,11 +552,15 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
_mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -42,6 +42,15 @@
#define __FORCE_INLINE __forceinline
#endif
#if defined(_MSC_FULL_VER)
/*
https://learn.microsoft.com/en-us/cpp/overview/compiler-versions?view=msvc-170
Use _MSC_FULL_VER to identify whether the current compiler is MSVC;
LLVM on Windows will not have this definition.
*/
#define __msvc_cl__
#endif
// These macros helped us unify vec_base.h
#ifdef CPU_CAPABILITY_AVX512
#if defined(__GNUC__)

View File

@ -127,7 +127,9 @@ class VecMask {
static VecMask<T, N> from(U* b) {
using int_t = int_same_size_t<T>;
__at_align__ T mask[size()];
#ifndef __msvc_cl__
#pragma unroll
#endif
for (int i = 0; i < size(); i++) {
*(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0;
}

View File

@ -103,10 +103,10 @@ inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tens
// with max value if it is integer type
inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
if (tensor.is_floating_point() || tensor.is_complex()) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
kBFloat16, kHalf, tensor.scalar_type(), "fill_empty_deterministic_", [&]() {
AT_DISPATCH_V2(
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
});
}), AT_EXPAND(AT_FLOATING_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), AT_EXPAND(AT_FLOAT8_TYPES), kBFloat16, kHalf);
} else {
AT_DISPATCH_V2(
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {

View File

@ -43,6 +43,14 @@ void fill_kernel(TensorIterator& iter, const Scalar& value_scalar) {
fill_non_native_type<at::BFloat16>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::ComplexHalf) {
fill_non_native_type<c10::complex<at::Half>>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e4m3fn) {
fill_non_native_type<at::Float8_e4m3fn>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e5m2) {
fill_non_native_type<at::Float8_e5m2>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e4m3fnuz) {
fill_non_native_type<at::Float8_e4m3fnuz>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e5m2fnuz) {
fill_non_native_type<at::Float8_e5m2fnuz>(iter, value_scalar);
} else {
AT_DISPATCH_V2(
iter.dtype(), "fill_cpu", AT_WRAP([&]() {

View File

@ -685,7 +685,6 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
switch (backend) {
case SDPBackend::cudnn_attention:
if (sdp::can_use_cudnn_attention(kernel_params, print_debug)) {
TORCH_WARN("USING CUDNN SDPA");
return SDPBackend::cudnn_attention;
}
break;

View File

@ -6,6 +6,14 @@ import textwrap
import pandas as pd
# Hack to have something similar to DISABLED_TEST. These models are flaky.
flaky_models = {
"yolov3",
"gluon_inception_v3",
}
def get_field(csv, model_name: str, field: str):
try:
return csv.loc[csv["name"] == model_name][field].item()
@ -25,6 +33,13 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
status = "PASS" if expected_accuracy == "pass" else "XFAIL"
print(f"{model:34} {status}")
continue
elif model in flaky_models:
if accuracy == "pass":
# model passed but marked xfailed
status = "PASS_BUT_FLAKY:"
else:
# model failed but marked pass
status = "FAIL_BUT_FLAKY:"
elif accuracy != "pass":
status = "FAIL:"
failed.append(model)

View File

@ -378,4 +378,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -286,4 +286,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -350,4 +350,4 @@ vision_maskrcnn,fail_to_run,0
yolov3,fail_to_run,0
yolov3,pass,0


View File

@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
DALLE2_pytorch,model_fail_to_load,0
DALLE2_pytorch,eager_fail_to_run,0
@ -338,4 +338,4 @@ vision_maskrcnn,pass,28
yolov3,pass,2
yolov3,pass,0


View File

@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
DALLE2_pytorch,model_fail_to_load,0
DALLE2_pytorch,eager_fail_to_run,0
@ -338,4 +338,4 @@ vision_maskrcnn,pass,28
yolov3,pass,2
yolov3,pass,0


View File

@ -350,4 +350,4 @@ vision_maskrcnn,fail_to_run,0
yolov3,fail_to_run,0
yolov3,pass,0


View File

@ -98,7 +98,7 @@ hf_Bert_large,pass,6
hf_BigBird,pass,52
hf_BigBird,pass,49


View File

@ -374,4 +374,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -282,4 +282,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
DALLE2_pytorch,model_fail_to_load,0
DALLE2_pytorch,eager_fail_to_run,0
@ -298,4 +298,4 @@ vision_maskrcnn,pass,28
yolov3,pass,2
yolov3,pass,0


View File

@ -374,4 +374,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -282,4 +282,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -378,4 +378,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -286,4 +286,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -378,4 +378,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -286,4 +286,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -2,12 +2,17 @@ import argparse
import csv
import dataclasses
import os
import time
from generate import run_llama2_7b_bf16, run_llama2_7b_int8, run_mixtral_8x7b_int8
from triton.testing import do_bench
import torch
import torch.nn as nn
from torch.utils.flop_counter import FlopCounterMode
WARMUP_ITER = 5
A100_80G_BF16_TFLOPS = 312
@dataclasses.dataclass
@ -16,59 +21,191 @@ class Experiment:
metric: str
target: float
actual: float
dtype: str
device: str
def do_inference(mod, x, num_samples: int = 5):
total_time = 0
start = -1
for i in range(start, num_samples):
torch.cuda.synchronize("cuda")
t0 = time.perf_counter()
mod(x)
if i == -1:
print(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
continue
torch.cuda.synchronize("cuda")
total_time += time.perf_counter() - t0
total_time = total_time / num_samples
return total_time
def run_multi_layer_norm():
class MultiLayerNorm(nn.Module):
def __init__(self, num_layers, normalized_shape, eps=1e-5, bias=True):
super().__init__()
self.num_layers = num_layers
self.norm_layers = nn.ModuleList(
[
nn.LayerNorm(normalized_shape, eps=eps, bias=bias)
for _ in range(num_layers)
]
)
def forward(self, x):
for layer_norm in self.norm_layers:
x = layer_norm(x)
return x
mod = MultiLayerNorm(num_layers=8, normalized_shape=4096).to("cuda")
mod = torch.compile(mod)
input = torch.randn([512, 1024, 4096], dtype=torch.bfloat16, device="cuda")
inference_time = do_inference(mod, input)
memory_bandwidth = input.numel() * input.dtype.itemsize / inference_time / 1e9
return [
Experiment(
"multi_layer_norm", "memory_bandwidth(GB/s)", 92, f"{memory_bandwidth:.02f}"
class SimpleMLP(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim, dtype):
super().__init__()
self.layers = nn.ModuleList(
[
nn.Linear(input_dim, hidden_dim, dtype=dtype),
nn.LayerNorm(hidden_dim, dtype=dtype),
nn.Linear(hidden_dim, output_dim, dtype=dtype),
nn.LayerNorm(output_dim, dtype=dtype),
]
)
]
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x
def run_mlp_layer_norm_gelu(device: str = "cuda"):
dtype_flops_utilization_map = {
torch.bfloat16: "0.71",
}
input_shapes = [1024, 4096, 8192, 16384]
intermediate_size = 14336
results = []
for dtype, expected_flops_utilization in dtype_flops_utilization_map.items():
flops_utilization = 0
for D in input_shapes:
mod = SimpleMLP(
input_dim=D, hidden_dim=intermediate_size, output_dim=D, dtype=dtype
).to(device)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
with FlopCounterMode(display=False) as mode:
mod(x)
flops = mode.get_total_flops()
compiled_mod = torch.compile(mod, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_mod(x)
us_per_iter = do_bench(lambda: compiled_mod(x)) * 1000
flops_utilization += us_per_iter * flops / 1e9 / A100_80G_BF16_TFLOPS
flops_utilization = flops_utilization / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"mlp_layer_norm_gelu_{dtype_str}",
"flops_utilization",
expected_flops_utilization,
f"{flops_utilization:.02f}",
dtype_str,
device,
)
)
return results
def run_layer_norm(device: str = "cuda"):
dtype_memory_bandwidth_map = {
torch.bfloat16: "1017",
}
input_shapes = [1024, 4096, 8192, 16384]
BS = 4096
results = []
for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
memory_bandwidth = 0
for D in input_shapes:
mod = nn.LayerNorm(D).to(device)
x = torch.randn(BS, D, device=device, dtype=dtype)
compiled_mod = torch.compile(mod, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_mod(x)
us_per_iter = do_bench(lambda: compiled_mod(x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * BS * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"layer_norm_{dtype_str}",
"memory_bandwidth(GB/s)",
expected_memory_bandwidth,
f"{memory_bandwidth:.02f}",
dtype_str,
device,
)
)
return results
@torch._inductor.config.patch(coordinate_descent_tuning=True)
def run_gather_gemv(device: str = "cuda"):
E = 8
dtype_memory_bandwidth_map = {
torch.int8: "1113",
torch.bfloat16: "1249",
}
input_shapes = [1024, 4096, 8192, 16384]
results = []
for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
memory_bandwidth = 0
for D in input_shapes:
def gather_gemv(W, score_idxs, x):
return W[score_idxs].to(x.dtype) @ x
W = torch.randn(E, D, D, device=device).to(dtype=dtype)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
score_idxs = torch.tensor([3, 5], device=device)
compiled_fn = torch.compile(gather_gemv, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_fn(W, score_idxs, x)
us_per_iter = do_bench(lambda: compiled_fn(W, score_idxs, x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * D * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"gather_gemv_{dtype_str}",
"memory_bandwidth(GB/s)",
expected_memory_bandwidth,
f"{memory_bandwidth:.02f}",
dtype_str,
device,
)
)
return results
@torch._inductor.config.patch(coordinate_descent_tuning=True)
def run_gemv(device: str = "cuda"):
dtype_memory_bandwidth_map = {
torch.int8: "990",
torch.bfloat16: "1137",
}
input_shapes = [1024, 4096, 8192, 16384]
results = []
for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
memory_bandwidth = 0
for D in input_shapes:
def gemv(W, x):
return W.to(x.dtype) @ x
W = torch.randn(D, D, device=device).to(dtype=dtype)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
compiled_fn = torch.compile(gemv, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_fn(W, x)
us_per_iter = do_bench(lambda: compiled_fn(W, x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * D * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"gemv_{dtype_str}",
"memory_bandwidth(GB/s)",
expected_memory_bandwidth,
f"{memory_bandwidth:.02f}",
dtype_str,
device,
)
)
return results
def output_csv(output_file, headers, row):
@ -100,7 +237,10 @@ all_experiments = {
run_llama2_7b_int8,
run_mixtral_8x7b_int8,
# A list of micro-benchmarks.
run_multi_layer_norm,
run_mlp_layer_norm_gelu,
run_layer_norm,
run_gather_gemv,
run_gemv,
}

View File

@ -172,8 +172,8 @@ def run_experiment(
max_new_tokens: int = 200,
top_k: int = 200,
temperature: float = 0.8,
device: str = "cuda",
) -> None:
device = "cuda"
print(f"Loading model {x.name}")
t0 = time.time()
model = _load_model(x)
@ -221,7 +221,7 @@ def run_experiment(
# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
def run_llama2_7b_bf16():
def run_llama2_7b_bf16(device: str = "cuda"):
from benchmark import Experiment
model = GPTModelConfig(
@ -235,22 +235,26 @@ def run_llama2_7b_bf16():
token_per_sec, memory_bandwidth = run_experiment(model)
return [
Experiment(
"llama2_7b_bf16",
model.name,
"token_per_sec",
model.token_per_sec,
f"{token_per_sec:.02f}",
model.mode,
device,
),
Experiment(
"llama2_7b_bf16",
model.name,
"memory_bandwidth(GB/s)",
model.memory_bandwidth,
f"{memory_bandwidth:.02f}",
model.mode,
device,
),
]
# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
def run_llama2_7b_int8():
def run_llama2_7b_int8(device: str = "cuda"):
from benchmark import Experiment
model = GPTModelConfig(
@ -264,22 +268,26 @@ def run_llama2_7b_int8():
token_per_sec, memory_bandwidth = run_experiment(model)
return [
Experiment(
"llama2_7b_int8",
model.name,
"token_per_sec",
model.token_per_sec,
f"{token_per_sec:.02f}",
model.mode,
device,
),
Experiment(
"llama2_7b_int8",
model.name,
"memory_bandwidth(GB/s)",
model.memory_bandwidth,
f"{memory_bandwidth:.02f}",
model.mode,
device,
),
]
# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
def run_mixtral_8x7b_int8():
def run_mixtral_8x7b_int8(device: str = "cuda"):
from benchmark import Experiment
# We reduced the original number of layers from 32 to 16 to fit within the CI memory limitation.
@ -294,15 +302,19 @@ def run_mixtral_8x7b_int8():
token_per_sec, memory_bandwidth = run_experiment(model)
return [
Experiment(
"mixtral_8x7b_int8",
model.name,
"token_per_sec",
model.token_per_sec,
f"{token_per_sec:.02f}",
model.mode,
device,
),
Experiment(
"mixtral_8x7b_int8",
model.name,
"memory_bandwidth(GB/s)",
model.memory_bandwidth,
f"{memory_bandwidth:.02f}",
model.mode,
device,
),
]

View File

@ -501,6 +501,7 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
"torch/csrc/distributed/c10d/Store.cpp",
"torch/csrc/distributed/c10d/SymmetricMemory.cpp",
"torch/csrc/distributed/c10d/TCPStore.cpp",
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
@ -684,6 +685,7 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/c10d/UCCUtils.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",

View File

@ -18,14 +18,17 @@
} \
} while (0)
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
_(cuMemGetAllocationGranularity) \
_(cuMemExportToShareableHandle) \
_(cuMemImportFromShareableHandle) \
_(cuGetErrorString)
#define C10_NVML_DRIVER_API(_) \

View File

@ -64,6 +64,25 @@
#define C10_ASAN_ENABLED 0
#endif
// Detect undefined-behavior sanitizer (UBSAN)
#undef C10_UBSAN_ENABLED
// for clang or gcc >= 14
// NB: gcc 14 adds support for Clang's __has_feature
// https://gcc.gnu.org/gcc-14/changes.html
// gcc < 14 doesn't have a macro for UBSAN
// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc)
// https://github.com/google/sanitizers/issues/765
#if defined(__has_feature)
#if ((__has_feature(undefined_behavior_sanitizer)))
#define C10_UBSAN_ENABLED 1
#endif
#endif
#if !defined(C10_UBSAN_ENABLED)
#define C10_UBSAN_ENABLED 0
#endif
// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
#define C10_DISABLE_COPY_AND_ASSIGN(classname) \

View File

@ -2,6 +2,7 @@
#include <c10/core/SymInt.h>
#include <c10/core/SymNodeImpl.h>
#include <c10/macros/Macros.h>
using namespace c10;
#ifndef C10_MOBILE
@ -22,6 +23,8 @@ TEST(SymIntTest, CheckRange) {
EXPECT_FALSE(SymInt::check_range(INT64_MIN));
}
#if !C10_UBSAN_ENABLED
// This test fails signed-integer-overflow UBSAN check
TEST(SymIntTest, Overflows) {
const auto x = SymInt(INT64_MAX);
EXPECT_NE(-(x + 1), 0);
@ -30,5 +33,6 @@ TEST(SymIntTest, Overflows) {
EXPECT_NE(-y, 0);
EXPECT_NE(0 - y, 0);
}
#endif
#endif

View File

@ -1,7 +1,7 @@
#pragma once
#include <c10/macros/Macros.h>
#include <cstring>
#include <cstdint>
#include <limits>
C10_CLANG_DIAGNOSTIC_PUSH()

View File

@ -15,9 +15,7 @@
/// and inspired by Half implementation from pytorch/c10/util/Half.h
#include <c10/macros/Macros.h>
#include <c10/util/TypeSafeSignMath.h>
#include <c10/util/floating_point_utils.h>
#include <type_traits>
#if defined(__cplusplus)
#include <cmath>
@ -32,16 +30,7 @@
#endif
#include <climits>
#include <cstdint>
#include <cstring>
#include <iosfwd>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <typeinfo> // operator typeid
#include <iostream>
namespace c10 {

View File

@ -1,3 +1,4 @@
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e4m3fnuz.h>
namespace c10 {

View File

@ -17,8 +17,8 @@
/// Implementation based on the paper https://arxiv.org/pdf/2206.02915.pdf and
/// the existing Float8_e4m3fn implementation.
#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <c10/util/TypeSafeSignMath.h>
#include <c10/util/floating_point_utils.h>
#include <type_traits>

View File

@ -235,7 +235,7 @@ class numeric_limits<c10::Float8_e5m2> {
static constexpr bool is_specialized = true;
static constexpr bool is_exact = false;
static constexpr bool has_infinity = true;
static constexpr bool has_quiet_NaN = false;
static constexpr bool has_quiet_NaN = true;
static constexpr bool has_signaling_NaN = false;
static constexpr auto has_denorm = true;
static constexpr auto has_denorm_loss = true;
@ -273,6 +273,9 @@ class numeric_limits<c10::Float8_e5m2> {
static constexpr c10::Float8_e5m2 infinity() {
return c10::Float8_e5m2(0x7C, c10::Float8_e5m2::from_bits());
}
static constexpr c10::Float8_e5m2 quiet_NaN() {
return c10::Float8_e5m2(0x7F, c10::Float8_e5m2::from_bits());
}
static constexpr c10::Float8_e5m2 denorm_min() {
return c10::Float8_e5m2(0x01, c10::Float8_e5m2::from_bits());
}

View File

@ -3,7 +3,7 @@
namespace c10 {
static_assert(
std::is_standard_layout<Float8_e5m2>::value,
std::is_standard_layout_v<Float8_e5m2>,
"c10::Float8_e5m2 must be standard layout.");
} // namespace c10

View File

@ -270,6 +270,11 @@ class numeric_limits<c10::Float8_e5m2fnuz> {
static constexpr c10::Float8_e5m2fnuz infinity() {
return c10::Float8_e5m2fnuz(0x80, c10::Float8_e5m2fnuz::from_bits());
}
// TODO(future): we are mapping neg_zero to both inf and NaN; this is
// surprising, and we should figure out what to do about it.
static constexpr c10::Float8_e5m2fnuz quiet_NaN() {
return c10::Float8_e5m2fnuz(0x80, c10::Float8_e5m2fnuz::from_bits());
}
static constexpr c10::Float8_e5m2fnuz denorm_min() {
return c10::Float8_e5m2fnuz(0x01, c10::Float8_e5m2fnuz::from_bits());
}

View File

@ -1,3 +1,4 @@
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e5m2fnuz.h>
namespace c10 {

View File

@ -560,6 +560,7 @@ if(USE_CUDA)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()

View File

@ -28,8 +28,6 @@
#include <c10/macros/Macros.h>
#include "c10/util/string_utils.h"
namespace caffe2 {
// Using statements for common classes that we refer to in caffe2 very often.

View File

@ -213,9 +213,9 @@ void PyTorchStreamReader::init() {
if (version_ < static_cast<decltype(version_)>(kMinSupportedFileFormatVersion)) {
CAFFE_THROW(
"Attempted to read a PyTorch file with version ",
c10::to_string(version_),
std::to_string(version_),
", but the minimum supported version for reading is ",
c10::to_string(kMinSupportedFileFormatVersion),
std::to_string(kMinSupportedFileFormatVersion),
". Your PyTorch script module file is too old. Please regenerate it",
" with latest version of PyTorch to mitigate this issue.");
}
@ -733,7 +733,7 @@ void PyTorchStreamWriter::writeEndOfFile() {
auto allRecords = getAllWrittenRecords();
// If no ".data/version" or "version" record in the output model, rewrites version info
if(allRecords.find(".data/version") == allRecords.end() && allRecords.find("version") == allRecords.end()) {
std::string version = c10::to_string(version_);
std::string version = std::to_string(version_);
version.push_back('\n');
if (version_ >= 0x6L) {
writeRecord(".data/version", version.c_str(), version.size());

View File

@ -154,6 +154,19 @@ should now merrily print the tensor (exact output subject to randomness):
Also, make sure you specify the correct configuration in the ``cmake --build .``
line above.
System Requirements
-------------------
To ensure smooth installation and usage of LibTorch, please make sure your system meets the following requirements (a quick way to check them is shown after the list):
1. **GLIBC Version**:
- GLIBC 2.29 or newer for cxx11 ABI version
- GLIBC 2.17 or newer for pre-cxx11 ABI version
2. **GCC Version**:
- GCC 9 or newer for cxx11 and pre-cxx11 ABI versions
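A quick way to check these versions on a Linux host; this is an optional, illustrative snippet rather than part of the official instructions:

.. code-block:: python

import platform
import subprocess

# GLIBC version of the running interpreter's C library, e.g. ('glibc', '2.31')
print(platform.libc_ver())

# First line of `gcc --version`, e.g. "gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0"
print(subprocess.run(["gcc", "--version"], capture_output=True, text=True).stdout.splitlines()[0])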
Visual Studio Extension
-----------------------

View File

@ -0,0 +1,339 @@
PyTorch 2.4: Getting Started on Intel GPU
=========================================
Support for Intel GPUs is released alongside PyTorch v2.4.
This release only supports building PyTorch from source for Intel GPUs.
Hardware Prerequisites
----------------------
.. list-table::
:header-rows: 1
* - Supported Hardware
- Intel® Data Center GPU Max Series
* - Supported OS
- Linux
PyTorch for Intel GPUs is compatible with the Intel® Data Center GPU Max Series and, in release 2.4, supports Linux only.
Software Prerequisites
----------------------
As a prerequisite, install the driver and required packages by following the `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_.
Set up Environment
------------------
Before you begin, you need to set up the environment. This can be done by sourcing the ``setvars.sh`` script provided by the ``intel-for-pytorch-gpu-dev`` and ``intel-pti-dev`` packages.
.. code-block::
source ${ONEAPI_ROOT}/setvars.sh
.. note::
``ONEAPI_ROOT`` is the folder where you installed the ``intel-for-pytorch-gpu-dev`` and ``intel-pti-dev`` packages. Typically, it is located at ``/opt/intel/oneapi/`` or ``~/intel/oneapi/``.
Build from source
-----------------
Now that all the required packages are installed and the environment is activated, use the following commands to install ``pytorch``, ``torchvision``, and ``torchaudio`` by building from source. For more details, refer to the official guides in `PyTorch from source <https://github.com/pytorch/pytorch?tab=readme-ov-file#intel-gpu-support>`_, `Vision from source <https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation>`_ and `Audio from source <https://pytorch.org/audio/main/build.linux.html>`_.
.. code-block::
# Get PyTorch Source Code
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
git checkout main # or checkout the specific release version >= v2.4
git submodule sync
git submodule update --init --recursive
# Get required packages for compilation
conda install cmake ninja
pip install -r requirements.txt
# PyTorch for Intel GPUs only supports the Linux platform for now.
# Install the required packages for pytorch compilation.
conda install intel::mkl-static intel::mkl-include
# (optional) If using torch.compile with inductor/triton, install the matching version of triton
# Run from the pytorch directory after cloning
# For Intel GPU support, please explicitly `export USE_XPU=1` before running the command.
USE_XPU=1 make triton
# If you would like to compile PyTorch with new C++ ABI enabled, then first run this command:
export _GLIBCXX_USE_CXX11_ABI=1
# pytorch build from source
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py develop
cd ..
# (optional) If using torchvision.
# Get torchvision Code
git clone https://github.com/pytorch/vision.git
cd vision
git checkout main # or specific version
python setup.py develop
cd ..
# (optional) If using torchaudio.
# Get torchaudio Code
git clone https://github.com/pytorch/audio.git
cd audio
pip install -r requirements.txt
git checkout main # or specific version
git submodule sync
git submodule update --init --recursive
python setup.py develop
cd ..
Check availability for Intel GPU
--------------------------------
.. note::
Make sure the environment is properly set up by following `Environment Set up <#set-up-environment>`_ before running the code.
To check if your Intel GPU is available, you would typically use the following code:
.. code-block::
import torch
torch.xpu.is_available() # torch.xpu is the API for Intel GPU support
If the output is ``False``, ensure that you have an Intel GPU in your system and have correctly followed the `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_. Then, check that the PyTorch build completed correctly.
Minimum Code Change
-------------------
If you are migrating code from ``cuda``, you would change references from ``cuda`` to ``xpu``. For example:
.. code-block::
# CUDA CODE
tensor = torch.tensor([1.0, 2.0]).to("cuda")
# CODE for Intel GPU
tensor = torch.tensor([1.0, 2.0]).to("xpu")
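For code that needs to run on either backend, a common pattern is to select the device at runtime instead of hard-coding the string. Below is a minimal sketch; the ``get_device`` helper is illustrative, not a PyTorch API:
.. code-block::
import torch
def get_device() -> torch.device:
    # Prefer an Intel GPU, then CUDA, then fall back to CPU.
    if torch.xpu.is_available():
        return torch.device("xpu")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
device = get_device()
tensor = torch.tensor([1.0, 2.0]).to(device)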
The following points outline the support and limitations for PyTorch with Intel GPU:
#. Both training and inference workflows are supported.
#. Both eager mode and ``torch.compile`` are supported.
#. The FP32, BF16, and FP16 data types are supported, as is Automatic Mixed Precision (AMP).
#. Models that depend on third-party components will not be supported until PyTorch v2.5 or later.
Examples
--------
This section contains usage examples for both inference and training workflows.
Inference Examples
^^^^^^^^^^^^^^^^^^
Here are a few inference workflow examples.
Inference with FP32
"""""""""""""""""""
.. code-block::
import torch
import torchvision.models as models
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
data = torch.rand(1, 3, 224, 224)
######## code changes #######
model = model.to("xpu")
data = data.to("xpu")
######## code changes #######
with torch.no_grad():
model(data)
print("Execution finished")
Inference with AMP
""""""""""""""""""
.. code-block::
import torch
import torchvision.models as models
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
data = torch.rand(1, 3, 224, 224)
#################### code changes #################
model = model.to("xpu")
data = data.to("xpu")
#################### code changes #################
with torch.no_grad():
############################# code changes #####################
# set dtype=torch.bfloat16 for BF16
with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=True):
############################# code changes #####################
model(data)
print("Execution finished")
Inference with ``torch.compile``
""""""""""""""""""""""""""""""""
.. code-block::
import torch
import torchvision.models as models
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
data = torch.rand(1, 3, 224, 224)
ITERS = 10
######## code changes #######
model = model.to("xpu")
data = data.to("xpu")
######## code changes #######
model = torch.compile(model)
for i in range(ITERS):
with torch.no_grad():
model(data)
print("Execution finished")
Training Examples
^^^^^^^^^^^^^^^^^
Here are a few training workflow examples.
Train with FP32
"""""""""""""""
.. code-block::
import torch
import torchvision
LR = 0.001
DOWNLOAD = True
DATA = "datasets/cifar10/"
transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize((224, 224)),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_dataset = torchvision.datasets.CIFAR10(
root=DATA,
train=True,
transform=transform,
download=DOWNLOAD,
)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
model.train()
######################## code changes #######################
model = model.to("xpu")
criterion = criterion.to("xpu")
######################## code changes #######################
for batch_idx, (data, target) in enumerate(train_loader):
########## code changes ##########
data = data.to("xpu")
target = target.to("xpu")
########## code changes ##########
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
print(batch_idx)
torch.save(
{
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
"checkpoint.pth",
)
print("Execution finished")
Train with AMP
""""""""""""""
.. code-block::
import torch
import torchvision
LR = 0.001
DOWNLOAD = True
DATA = "datasets/cifar10/"
use_amp=True
transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize((224, 224)),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_dataset = torchvision.datasets.CIFAR10(
root=DATA,
train=True,
transform=transform,
download=DOWNLOAD,
)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
scaler = torch.amp.GradScaler(enabled=use_amp)
model.train()
######################## code changes #######################
model = model.to("xpu")
criterion = criterion.to("xpu")
######################## code changes #######################
for batch_idx, (data, target) in enumerate(train_loader):
########## code changes ##########
data = data.to("xpu")
target = target.to("xpu")
########## code changes ##########
# set dtype=torch.bfloat16 for BF16
with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=use_amp):
output = model(data)
loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
print(batch_idx)
torch.save(
{
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
"checkpoint.pth",
)
print("Execution finished")

View File

@ -140,7 +140,7 @@ struct TensorQueue : torch::CustomClassHolder {
for (const auto index : c10::irange(queue_size)) {
at::Tensor val;
queue_[index] = dict.at(key + "/" + c10::to_string(index));
queue_[index] = dict.at(key + "/" + std::to_string(index));
queue_.push_back(val);
}
}
@ -152,7 +152,7 @@ struct TensorQueue : torch::CustomClassHolder {
dict.insert(
key + "/size", torch::tensor(static_cast<int64_t>(queue_.size())));
for (const auto index : c10::irange(queue_.size())) {
dict.insert(key + "/" + c10::to_string(index), queue_[index]);
dict.insert(key + "/" + std::to_string(index), queue_[index]);
}
return dict;
}

View File

@ -6,6 +6,7 @@
#include <ATen/core/interned_strings.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/jit_type_base.h>
#include <c10/macros/Macros.h>
#include <test/cpp/jit/test_utils.h>
#include <torch/csrc/jit/passes/remove_mutation.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
@ -491,13 +492,7 @@ TEST(ControlFlowTest, Basic) {
ASSERT_EQ(256, run_binary("while_test", 2, 0));
}
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define HAS_ASANUBSAN 1
#endif
#endif
#ifndef HAS_ASANUBSAN
#if !(C10_ASAN_ENABLED || C10_UBSAN_ENABLED)
// This test fails vptr UBSAN checks
TEST(ProtoTest, Basic) {

View File

@ -2,8 +2,9 @@
import unittest
from collections import deque, OrderedDict
from contextlib import ContextDecorator
from contextlib import ContextDecorator, contextmanager, nullcontext
from copy import deepcopy
from functools import partial
from typing import Tuple
import torch
@ -11,6 +12,7 @@ import torch.nn as nn
from torch.distributed._composable import checkpoint
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.utils.checkpoint import CheckpointError
class MemoryDelta(ContextDecorator):
@ -68,7 +70,7 @@ class MultiOutputModel(nn.Module):
self.w1 = nn.Parameter(torch.randn((100, 100), device=device))
self.w2 = nn.Parameter(torch.randn((100, 100), device=device))
def forward(self, x: torch.Tensor) -> torch.Tensor:
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
z = x @ self.w1
z = nn.functional.relu(z)
z = z @ self.w2
@ -219,6 +221,116 @@ class TestCheckpoint(TestCase):
self.assertEqual(None, checkpoint.state(m)._ac_generator)
def test_checkpoint_kwargs(self):
class MyModel(torch.nn.Module):
def __init__(self, raise_exp: bool, change_shape_in_recomp: bool):
super().__init__()
self.fwd_count = 0
self.raise_exp = raise_exp
self.change_shape_in_recomp = change_shape_in_recomp
self.a = torch.nn.Linear(2, 2)
def forward(self, x):
if self.raise_exp and self.fwd_count == 0:
raise RuntimeError("foo")
if self.raise_exp and self.fwd_count == 1:
raise RuntimeError("bar")
if self.change_shape_in_recomp and self.fwd_count == 1:
x.relu_()
random_tensor = torch.randn(1, 2)
x = self.a(x + random_tensor)
self.fwd_count += 1
return x
m = MyModel(True, False)
m0, m1, m2, m3 = (deepcopy(m) for _ in range(4))
# composable checkpoint does not support use_reentrant=True
with self.assertRaisesRegex(
NotImplementedError,
"use_reentrant=True is not supported in composable checkpoint. "
"Please use torch.utils.checkpoint.checkpoint instead.",
):
checkpoint(m, use_reentrant=True)
# check giving an unsupported kwarg
with self.assertRaisesRegex(ValueError, "Unexpected keyword arguments: foo"):
checkpoint(m0, foo="bar")
handled_fwd_exp = False
handled_recomp_exp = False
@contextmanager
def fwd_ctx(mod: MyModel):
try:
mod.raise_exp = False
yield
finally:
nonlocal handled_fwd_exp
handled_fwd_exp = True
mod.raise_exp = True
@contextmanager
def recomp_ctx(mod: MyModel):
try:
mod.raise_exp = False
yield
finally:
nonlocal handled_recomp_exp
handled_recomp_exp = True
mod.raise_exp = True
# Test different context functions
x = torch.randn(1, 2, requires_grad=True)
checkpoint(
m1, context_fn=lambda: (partial(fwd_ctx, m1)(), partial(recomp_ctx, m1)())
)
m1(x.clone()).sum().backward()
self.assertEqual((handled_fwd_exp, handled_recomp_exp), (True, True))
checkpoint(m2, context_fn=lambda: (nullcontext(), partial(recomp_ctx, m2)()))
with self.assertRaisesRegex(RuntimeError, "foo"):
m2(x.clone())
handled_fwd_exp = False # Reset flag
checkpoint(m3, context_fn=lambda: (partial(fwd_ctx, m3)(), nullcontext()))
with self.assertRaisesRegex(RuntimeError, "bar"):
m3(x.clone()).sum().backward()
self.assertEqual(handled_fwd_exp, True)
# Test determinism check failure
m4 = MyModel(False, True)
m5 = deepcopy(m4)
# Determinism check should not throw an error,
# but autograd should throw a RuntimeError
checkpoint(m4, determinism_check="none")
with self.assertRaises(RuntimeError):
m4(x.clone()).sum().backward()
# Determinism check should throw a CheckpointError
checkpoint(m5, determinism_check="default")
with self.assertRaises(CheckpointError):
m5(x.clone()).sum().backward()
# Test preserving random state
m6 = MyModel(False, False)
m7, m8 = (deepcopy(m6) for _ in range(2))
checkpoint(m7, preserve_rng_state=False)
checkpoint(m8, preserve_rng_state=True)
for mi in (m6, m7, m8):
torch.manual_seed(42)
loss = mi(x.clone()).sum()
torch.manual_seed(41)
loss.backward()
# check that m6 and m7 have at least one different grad
self.assertNotEqual(
(p1.grad for p1 in m6.parameters()), (p2.grad for p2 in m7.parameters())
)
# check that m6 and m8 have identical grads
for p1, p2 in zip(m6.parameters(), m8.parameters()):
self.assertEqual(p1.grad, p2.grad)
if __name__ == "__main__":
run_tests()

View File

@ -0,0 +1,140 @@
# Owner(s): ["module: unknown"]
from copy import copy
import torch
from torch.distributed._tools.mod_tracker import ModTracker
from torch.testing._internal.common_utils import run_tests, TestCase, xfailIfTorchDynamo
class TestModTracker(TestCase):
# "https://github.com/pytorch/pytorch/issues/127112
@xfailIfTorchDynamo
def test_module_hierarchy(self):
seen_fw = []
seen_bw = []
class Foo(torch.nn.Module):
def forward(self, x):
x = x["a"].relu_()
seen_fw.append((copy(tracker.parents), tracker.is_bw))
x.register_hook(
lambda grad: seen_bw.append((copy(tracker.parents), tracker.is_bw))
)
return {"a": torch.mm(x, x)}
class Mod(torch.nn.Module):
def __init__(self):
super().__init__()
self.a = Foo()
self.b = torch.nn.ModuleDict({"nest": Foo()})
self.c = torch.nn.ModuleList([Foo()])
def forward(self, x):
x = self.c[0](x)
return self.b["nest"](self.a(x))
mod = Mod()
with ModTracker() as tracker:
mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[
"a"
].sum().backward()
mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[
"a"
].sum().backward()
self.assertEqual(
seen_fw,
[
({"Global", "Mod", "Mod.c.0"}, False),
({"Global", "Mod", "Mod.a"}, False),
({"Global", "Mod", "Mod.b.nest"}, False),
({"Global", "Mod", "Mod.c.0"}, False),
({"Global", "Mod", "Mod.a"}, False),
({"Global", "Mod", "Mod.b.nest"}, False),
],
)
self.assertEqual(
seen_bw,
[
({"Global", "Mod", "Mod.b.nest"}, True),
({"Global", "Mod", "Mod.a"}, True),
({"Global", "Mod", "Mod.c.0"}, True),
({"Global", "Mod", "Mod.b.nest"}, True),
({"Global", "Mod", "Mod.a"}, True),
({"Global", "Mod", "Mod.c.0"}, True),
],
)
def test_bw_detection(self):
mod = torch.nn.Linear(2, 2)
with ModTracker() as tracker:
mod(torch.rand(2, requires_grad=True)).sum().backward()
self.assertFalse(tracker.is_bw)
self.assertEqual(tracker.parents, {"Global"})
@xfailIfTorchDynamo
def test_user_hooks(self):
class Bar(torch.nn.Module):
def __init__(self):
super().__init__()
self.foo = torch.nn.Linear(10, 10)
def forward(self, x):
return self.foo(x).relu_()
mt = ModTracker()
test_op = []
def hook(mod, hook_name):
mfqn = mt.get_known_fqn(mod) if mod is not None else None
test_op.append((hook_name, mfqn, mfqn in mt.parents, mt.is_bw))
mod = Bar()
mt.register_user_hooks(
lambda m, inp: hook(m, "pre_fw"),
lambda m, inp, op: hook(m, "post_fw"),
lambda m, gop: hook(m, "pre_bw"),
lambda m, ginp: hook(m, "post_bw"),
)
with mt:
mod(torch.rand(10, 10, requires_grad=True)).sum().backward()
expected_op = [
("pre_fw", "Bar", True, False),
("pre_fw", "Bar.foo", True, False),
("post_fw", "Bar.foo", True, False),
("post_fw", "Bar", True, False),
("pre_bw", "Bar", True, True),
("pre_bw", "Bar.foo", True, True),
("post_bw", "Bar", True, True),
("post_bw", "Bar.foo", True, True),
]
self.assertEqual(test_op, expected_op)
with self.assertRaises(AssertionError):
mt.register_user_hooks(lambda x, y: x, None, None, None)
test_op.clear()
with mt:
loss = mod(torch.rand(10, 10, requires_grad=True)).sum()
del mod
loss.backward()
expected_op = [
("pre_fw", "Bar", True, False),
("pre_fw", "Bar.foo", True, False),
("post_fw", "Bar.foo", True, False),
("post_fw", "Bar", True, False),
("pre_bw", None, False, True),
("pre_bw", None, False, True),
("post_bw", None, False, True),
("post_bw", None, False, True),
]
self.assertEqual(test_op, expected_op)
if __name__ == "__main__":
run_tests()

View File

@ -16,10 +16,7 @@ import torch.distributed.distributed_c10d as c10d
import torch.distributed.rpc as rpc
from torch.distributed import DistError, DistNetworkError, DistStoreError
from torch.testing._internal.common_distributed import MultiThreadedTestCase
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.common_utils import instantiate_parametrized_tests
if not dist.is_available():
print("torch.distributed not available, skipping tests", file=sys.stderr)
@ -841,19 +838,11 @@ class TestPythonStore(TestCase):
class TestMultiThreadedWait(MultiThreadedTestCase):
# TODO (xilunwu): Use less hacky means of instantiating stores.
# Note, stores accumulate values per test.
stores = [
dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1),
dist.HashStore(),
dist.PrefixStore(
"pre", dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1)
),
create_tcp_store(use_libuv=False),
create_tcp_store(use_libuv=True),
dist.PrefixStore("pre", create_tcp_store(use_libuv=False)),
dist.PrefixStore("pre", create_tcp_store(use_libuv=True)),
]
file_store = dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1)
hash_store = dist.HashStore()
tcp_store = create_tcp_store(use_libuv=False)
tcp_store_uv = create_tcp_store(use_libuv=True)
@property
def world_size(self):
@ -863,10 +852,7 @@ class TestMultiThreadedWait(MultiThreadedTestCase):
super().setUp()
self._spawn_threads()
# Iterates over self.stores, keep 7 in sync with len(self.stores).
@parametrize("i", range(7))
def test_wait(self, i):
store = self.stores[i]
def _test_wait(self, store):
store.set_timeout(timedelta(seconds=2))
if dist.get_rank() == 0:
store.wait(["key1"])
@ -874,6 +860,39 @@ class TestMultiThreadedWait(MultiThreadedTestCase):
if dist.get_rank() == 1:
store.set("key1", "value1")
def test_wait_hash_store(self):
self._test_wait(self.hash_store)
def test_wait_file_store(self):
self._test_wait(self.file_store)
def test_wait_prefix_file_store(self):
store = dist.PrefixStore("pre", self.file_store)
self._test_wait(store)
def _test_wait_tcp_store(self, master_store):
store = (
master_store
if dist.get_rank() == 0
else dist.TCPStore(
host_name=master_store.host,
port=master_store.port,
is_master=False,
wait_for_workers=False,
use_libuv=False,
)
)
self._test_wait(store)
prefix_store = dist.PrefixStore("pre", store)
self._test_wait(prefix_store)
def test_wait_tcp_store(self):
self._test_wait_tcp_store(self.tcp_store)
def test_wait_tcp_store_uv(self):
self._test_wait_tcp_store(self.tcp_store_uv)
instantiate_parametrized_tests(TestMultiThreadedWait)

View File

@ -0,0 +1,156 @@
# Owner(s): ["module: c10d"]
import torch
import torch.distributed as dist
from torch._C._distributed_c10d import _SymmetricMemory
from torch.distributed.distributed_c10d import _get_process_group_store
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
run_tests,
skip_but_pass_in_sandcastle_if,
skipIfRocm,
)
def requires_cuda_p2p_access():
cuda_p2p_access_available = (
torch.cuda.is_available() and torch.cuda.device_count() >= 2
)
num_devices = torch.cuda.device_count()
for i in range(num_devices - 1):
for j in range(i + 1, num_devices):
if not torch.cuda.can_device_access_peer(i, j):
cuda_p2p_access_available = False
break
if not cuda_p2p_access_available:
break
return skip_but_pass_in_sandcastle_if(
not cuda_p2p_access_available,
"cuda p2p access is not available",
)
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class SymmetricMemoryTest(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
self._spawn_processes()
@property
def world_size(self) -> int:
return 2
@property
def device(self) -> torch.device:
return torch.device(f"cuda:{self.rank}")
def _init_process(self):
torch.cuda.set_device(self.device)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend="nccl",
world_size=self.world_size,
rank=self.rank,
store=store,
)
_SymmetricMemory.set_group_info(
"0",
self.rank,
self.world_size,
_get_process_group_store(dist.GroupMember.WORLD),
)
def _verify_symmetric_memory(self, symm_mem):
self.assertEqual(symm_mem.world_size, 2)
buf = symm_mem.get_buffer(0, (64, 64), torch.float32)
if symm_mem.rank == 0:
symm_mem.wait_signal(src_rank=1)
self.assertTrue(buf.eq(42).all())
else:
buf.fill_(42)
symm_mem.put_signal(dst_rank=0)
symm_mem.barrier()
if symm_mem.rank == 0:
symm_mem.barrier()
self.assertTrue(buf.eq(43).all())
else:
buf.fill_(43)
symm_mem.barrier()
symm_mem.barrier()
@skipIfRocm
@skip_if_lt_x_gpu(2)
def test_empty_strided_p2p(self) -> None:
self._init_process()
shape = (64, 64)
stride = (64, 1)
dtype = torch.float32
device = self.device
group_name = "0"
alloc_args = (shape, stride, dtype, device, group_name)
t = torch.empty(shape, dtype=dtype, device=device)
with self.assertRaises(RuntimeError):
_SymmetricMemory.rendezvous(t)
t = _SymmetricMemory.empty_strided_p2p(*alloc_args)
symm_mem = _SymmetricMemory.rendezvous(t)
del t
self._verify_symmetric_memory(symm_mem)
@skipIfRocm
@skip_if_lt_x_gpu(2)
def test_empty_strided_p2p_persistent(self) -> None:
self._init_process()
shape = (64, 64)
stride = (64, 1)
dtype = torch.float32
device = self.device
alloc_id = 42 # Persistent allocation
group_name = "0"
alloc_args = (shape, stride, dtype, device, group_name, alloc_id)
t = _SymmetricMemory.empty_strided_p2p(*alloc_args)
data_ptr = t.data_ptr()
# Verify that persistent allocation would fail if there's an active
# allocation with the same alloc_id.
with self.assertRaises(RuntimeError):
_SymmetricMemory.empty_strided_p2p(*alloc_args)
# Verify that the persistent allocation succeeds once the previously active
# allocation with the same alloc_id is freed, and that the returned tensor
# has the same data pointer.
del t
t = _SymmetricMemory.empty_strided_p2p(*alloc_args)
self.assertEqual(t.data_ptr(), data_ptr)
# Verify that get_symmetric_memory would fail if called before
# rendezvous.
with self.assertRaises(RuntimeError):
_SymmetricMemory.get_symmetric_memory(t)
symm_mem_0 = _SymmetricMemory.rendezvous(t)
symm_mem_1 = _SymmetricMemory.get_symmetric_memory(t)
self.assertEqual(id(symm_mem_0), id(symm_mem_1))
self._verify_symmetric_memory(symm_mem_0)
if __name__ == "__main__":
run_tests()

View File

@ -27,6 +27,8 @@ from torch._dynamo.testing import (
normalize_gm,
)
from torch._dynamo.utils import ifdynstaticdefault, same
from torch._dynamo.variables import ConstantVariable
from torch._dynamo.variables.lists import RangeVariable
from torch.nn import functional as F
from torch.testing._internal.common_utils import (
@ -2369,6 +2371,157 @@ class GraphModule(torch.nn.Module):
opt_fn = torch._dynamo.optimize(nopython=True)(fn)
self.assertEqual(opt_fn(), fn())
def gen_random_range_args(self):
args_count = random.randint(1, 3)
args = [random.randint(-10, 10) for _ in range(args_count)]
if args_count == 3 and args[2] == 0:
args[2] = 1
return args
def test_range_length(self):
def test(*args, expected=None):
r = range(*args)
range_variable = RangeVariable([ConstantVariable.create(v) for v in args])
self.assertEqual(len(r), range_variable.range_length())
if expected is not None:
self.assertEqual(len(r), expected)
test(1, 1, 1, expected=0)
test(1, 0, expected=0)
test(-10, expected=0)
test(4, expected=4)
test(10, expected=10)
# step >1
test(1, 10, 2, expected=5)
# negative step
test(10, 1, -1, expected=9)
test(10, 1, -3)
# Fuzz testing
for i in range(100):
args = self.gen_random_range_args()
print("testing :", args)
test(*args)
def test_indexed_range(self):
def test(range, index, expected=None):
range_variable = RangeVariable(
[
ConstantVariable.create(v)
for v in [range.start, range.stop, range.step]
]
)
self.assertEqual(
range[index],
range_variable.apply_index(index).as_python_constant(),
)
if expected is not None:
self.assertEqual(range[index], expected)
test(range(10), 1, expected=1)
test(range(10, 20, 2), 1, expected=12)
# Fuzz testing
for i in range(100):
range_args = self.gen_random_range_args()
r = range(*range_args)
if len(r) == 0:
continue
index = random.randint(0, len(r) - 1)
print("testing:", r, index)
test(r, index)
def test_sliced_range(self):
def test(range, slice, expected=None):
range_variable = RangeVariable(
[
ConstantVariable.create(v)
for v in [range.start, range.stop, range.step]
]
)
self.assertEqual(
range[slice],
range_variable.apply_slice(slice).as_python_constant(),
)
if expected is not None:
self.assertEqual(
range[slice],
expected,
)
test(range(10), slice(1, 10, 2), expected=range(1, 10, 2))
test(range(10), slice(None, 10, None), expected=range(0, 10))
test(range(10), slice(-1, 7, None), expected=range(9, 7))
test(range(10), slice(-1, 7, 2), expected=range(9, 7, 2))
test(range(1, 10, 2), slice(3, 7, 2), expected=range(7, 11, 4))
test(range(1, 10, 2), slice(-3, 7, 2), expected=range(5, 11, 4))
test(range(-1, -5, -3), slice(5, None, -3), expected=range(-4, 2, 9))
def rand_slice():
def flip_coin():
# 1 out of 10
return random.randint(1, 10) == 5
def r_item(allow_zero=True):
i = random.randint(-10, 10)
if not allow_zero and i == 0:
i = 1
if flip_coin():
i = None
return i
arg_count = random.randint(1, 3)
if arg_count == 1:
return slice(r_item())
elif arg_count == 2:
return slice(r_item(), r_item())
else:
return slice(r_item(), r_item(), r_item(False))
# Fuzz testing
for i in range(100):
range_args = self.gen_random_range_args()
r = range(*range_args)
# generate random slice
s = rand_slice()
print("testing:", r, s)
test(r, s)
def test_range_with_slice_index(self):
def fn(x):
acc = 1
for k in range(2)[1::2]:
acc *= acc * k
return x * acc
opt_fn = torch.compile(fullgraph=True)(fn)
x = torch.ones(1)
self.assertEqual(opt_fn(x), fn(x))
def test_range_with_index(self):
def fn(x):
acc = 1
acc *= acc * range(10, 20, 2)[2]
return x * acc
opt_fn = torch.compile(fullgraph=True)(fn)
x = torch.ones(1)
self.assertEqual(opt_fn(x), fn(x))
def test_rand_inlined(self):
@torch.compile(backend="eager", dynamic=True)
def fn():

View File

@ -45,7 +45,8 @@ def check_dynamic_shape_capture():
def count_ops(gm, args, freq, op):
assert [node.target for node in gm.graph.nodes].count(op) == freq
actual = [node.target for node in gm.graph.nodes].count(op)
assert actual == freq, f"expected={freq}, actual={actual}"
return gm
@ -6049,9 +6050,7 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
y = torch.randn(4, 4, requires_grad=True)
fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
bw_compiler = functools.partial(
count_ops, freq=3, op=torch.ops.aten.mm.default
) # mm recomputed in the bwd
bw_compiler = functools.partial(count_ops, freq=2, op=torch.ops.aten.mm.default)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(fn, backend, x, y)
@ -6074,9 +6073,7 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
y = torch.randn(4, 4, requires_grad=True)
fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
bw_compiler = functools.partial(
count_ops, freq=3, op=torch.ops.aten.mm.default
) # mm recomputed in the bwd
bw_compiler = functools.partial(count_ops, freq=2, op=torch.ops.aten.mm.default)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(fn, backend, x, y)
@ -6097,8 +6094,9 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
fw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.rngprims.philox_rand.default
)
# philox_rand is passed from fwd
bw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.rngprims.philox_rand.default
count_ops, freq=0, op=torch.ops.rngprims.philox_rand.default
)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(
@ -6178,8 +6176,9 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
fw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.aten.sigmoid.default
)
# sigmoid passed from fwd
bw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.aten.sigmoid.default
count_ops, freq=0, op=torch.ops.aten.sigmoid.default
)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(fn, backend, x)

View File

@ -574,6 +574,23 @@ class MiscTests(torch._inductor.test_case.TestCase):
cleanup_op("mylib::foo")
del lib
def test_auto_functionalize_can_with_none_return(self):
with torch.library._scoped_library("mylib", "FRAGMENT") as lib:
lib.define("foo(Tensor x, Tensor(a!) out) -> None")
def foo_impl(x, out):
out.copy_(x)
lib.impl("foo", foo_impl, "CompositeExplicitAutograd")
x = torch.randn(3)
out = torch.zeros(3)
@torch.compile
def f(x, out):
torch.ops.mylib.foo(x, out)
f(x, out)
def test_user_defined_setattr1(self):
@torch.compile(backend="eager", fullgraph=True)
def fn(obj):
@ -10446,6 +10463,14 @@ fn
res = opt_fn(x)
self.assertEqual(ref, res)
def test_assert_size_stride(self):
x = torch.randn(2, 3, 4)
with self.assertRaisesRegex(
AssertionError,
"expected size 2==5, stride 12==9 at dim=0; expected size 3==6, stride 4==9 at dim=1; expected size 4==7, stride 1==10 at dim=2",
):
torch._C._dynamo.guards.assert_size_stride(x, (5, 6, 7), (9, 9, 10))
def test_module_dunder_dict(self):
class MyModule(torch.nn.Module):
def __init__(self):

View File

@ -2512,6 +2512,19 @@ class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
self.assertEqual(eager_res, optim_res)
self.assertEqual(cnt.frame_count, 1)
def test_module_setattr(self):
models = torch.nn.Sequential(torch.nn.Linear(3, 3))
models[0].abc = False
def run():
models[0].abc = True
x = torch.randn(1, 3)
return models(x)
run = torch.compile(run, fullgraph=True)
run()
self.assertTrue(models[0].abc)
def test_assign_does_not_exist(self):
class MyModule(torch.nn.Module):
def forward(self, x):

View File

@ -8,6 +8,7 @@ import collections
import contextlib
import copy
import functools
import gc
import inspect
import itertools
import random
@ -1079,6 +1080,67 @@ class ReproTests(torch._dynamo.test_case.TestCase):
out_test.sum().backward()
self.assertEqual(leaf.grad, leaf_test.grad)
# https://github.com/pytorch/pytorch/issues/113263
def test_unpack_hooks_dont_run_during_tracing(self):
def f(x, y):
return x * y
f_compiled = torch.compile(f, backend="aot_eager")
pack_count = 0
unpack_count = 0
def pack_hook(x):
nonlocal pack_count
pack_count += 1
return x
# unpack hook shouldn't run during compilation, while we trace the forward
def unpack_hook(x):
nonlocal unpack_count
unpack_count += 1
return x
x = torch.ones(4, requires_grad=True)
y = torch.ones(4, requires_grad=False)
with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
out_test = f_compiled(x, y)
self.assertEqual(pack_count, 1)
self.assertEqual(unpack_count, 0)
out_test.sum().backward()
self.assertEqual(pack_count, 1)
self.assertEqual(unpack_count, 1)
# https://github.com/pytorch/pytorch/issues/113263
def test_unpack_hooks_can_be_disabled(self):
def f(x, y):
return x * y
f_compiled = torch.compile(f, backend="aot_eager")
x = torch.ones(4, requires_grad=True)
y = torch.ones(4, requires_grad=False)
with torch.autograd.graph.disable_saved_tensors_hooks("hooks are disabled"):
out_test = f_compiled(x, y)
out_test.sum().backward()
# https://github.com/pytorch/pytorch/issues/113263
def test_disabling_unpack_hooks_within_compiled_region(self):
def g(z):
with torch.autograd.graph.disable_saved_tensors_hooks("hooks are disabled"):
return z + 5
def f(x, y):
z = x * y
return g(z)
f_compiled = torch.compile(f, backend="aot_eager")
x = torch.ones(4, requires_grad=True)
y = torch.ones(4, requires_grad=False)
out_test = f_compiled(x, y)
out_test.sum().backward()
# See https://github.com/pytorch/pytorch/issues/97745
def test_gan_repro_trying_to_backward_through_the_graph_a_second_time(self):
def f(a, b):
@ -4659,6 +4721,66 @@ def forward(self, s0 : torch.SymInt, s1 : torch.SymInt, L_x_ : torch.Tensor):
self.assertEqual(type(actual), type(expected))
self.assertEqual(actual.__dict__, expected.__dict__)
def test_weakref(self):
def fn(x_weak, weight, y):
if x_weak is not None and x_weak() is not weight:
return torch.sin(y)
return torch.cos(y)
weight = torch.randn(4)
y = torch.randn(4)
x_weak = weakref.ref(weight)
ref = fn(x_weak, weight, y)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
res = opt_fn(x_weak, weight, y)
self.assertEqual(ref, res)
def test_weakref_reconstruct(self):
def fn(x_weak, weight, y):
y = torch.sin(y)
referent = x_weak()
torch._dynamo.graph_break()
if referent is not weight:
return torch.sin(y)
return torch.cos(y)
weight = torch.randn(4)
y = torch.randn(4)
x_weak = weakref.ref(weight)
ref = fn(x_weak, weight, y)
cnt = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(fn, backend=cnt)
res = opt_fn(x_weak, weight, y)
self.assertEqual(ref, res)
self.assertEqual(cnt.frame_count, 2)
def test_weakref_del(self):
def fn(x_weak, y):
x = x_weak()
if x is not None:
return torch.sin(y)
return torch.cos(y)
weight = torch.randn(4)
x_weak = weakref.ref(weight)
y = torch.randn(4)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
ref = fn(x_weak, y)
res = opt_fn(x_weak, y)
self.assertEqual(ref, res)
del weight
gc.collect()
ref = fn(x_weak, y)
res = opt_fn(x_weak, y)
self.assertEqual(ref, res)
def test_storage_resize_forward_full_graph(self):
class TestModule(torch.nn.Module):
def __init__(self):

View File

@ -1392,6 +1392,7 @@ class GraphModule(torch.nn.Module):
yield t.select(-1, 6), False
# https://github.com/pytorch/pytorch/issues/128649
yield t[2:3, 5:9], dynamic
yield t.view(-1, 15), False
def f(x):
return x * 2

View File

@ -182,10 +182,12 @@ def forward(self, x, n):
self.assertExpectedInline(
ep.graph_module.code.strip(),
"""\
def forward(self, obj_attr, x, n):
call_torchbind = torch.ops.higher_order.call_torchbind(obj_attr, 'add_tensor', x); obj_attr = None
add = torch.ops.aten.add.Tensor(x, call_torchbind); x = call_torchbind = None
return (add,)""",
def forward(self, token, obj_attr, x, n):
with_effects = torch._higher_order_ops.effects.with_effects(token, torch.ops.higher_order.call_torchbind, obj_attr, 'add_tensor', x); token = obj_attr = None
getitem = with_effects[0]
getitem_1 = with_effects[1]; with_effects = None
add = torch.ops.aten.add.Tensor(x, getitem_1); x = getitem_1 = None
return (getitem, add)""", # noqa: B950
)
def test_method_schema(self):
@ -227,10 +229,12 @@ def forward(self, x):
self.assertExpectedInline(
ep.graph_module.code.strip(),
"""\
def forward(self, obj_attr, x):
call_torchbind = torch.ops.higher_order.call_torchbind(obj_attr, 'add_tensor', x); obj_attr = None
add = torch.ops.aten.add.Tensor(x, call_torchbind); x = call_torchbind = None
return (add,)""",
def forward(self, token, obj_attr, x):
with_effects = torch._higher_order_ops.effects.with_effects(token, torch.ops.higher_order.call_torchbind, obj_attr, 'add_tensor', x); token = obj_attr = None
getitem = with_effects[0]
getitem_1 = with_effects[1]; with_effects = None
add = torch.ops.aten.add.Tensor(x, getitem_1); x = getitem_1 = None
return (getitem, add)""", # noqa: B950
)
@parametrize("pre_dispatch", [True, False])
@ -293,10 +297,12 @@ def forward(self, x, cc):
self.assertExpectedInline(
ep.graph_module.code.strip(),
"""\
def forward(self, x, cc):
call_torchbind = torch.ops.higher_order.call_torchbind(cc, 'add_tensor', x); cc = None
add = torch.ops.aten.add.Tensor(x, call_torchbind); x = call_torchbind = None
return (add,)""",
def forward(self, token, x, cc):
with_effects = torch._higher_order_ops.effects.with_effects(token, torch.ops.higher_order.call_torchbind, cc, 'add_tensor', x); token = cc = None
getitem = with_effects[0]
getitem_1 = with_effects[1]; with_effects = None
add = torch.ops.aten.add.Tensor(x, getitem_1); x = getitem_1 = None
return (getitem, add)""", # noqa: B950
)
# aot_export_function runs the program twice
# in run_functionalized_fw_and_collect_metadata and create_aot_dispatcher_function

View File

@ -198,6 +198,33 @@ def forward(self, arg0_1, arg1_1, arg2_1):
res = torch.compile(f, backend="inductor")(*inputs)
self.assertTrue(torch.allclose(res, f(*inputs)))
@unittest.skipIf(IS_WINDOWS, "Skipped on Windows!")
@skipIfNoDynamoSupport
def test_compile_inductor_external_op_return_none(self):
with torch.library._scoped_library("mylib", "FRAGMENT") as lib:
torch.library.define(
"mylib::inplace_add",
"(Tensor input, Tensor(a!) output) -> ()",
lib=lib,
)
def inplace_add(input: torch.Tensor, output: torch.Tensor) -> None:
assert input.device == output.device
output.add_(input)
lib.impl("inplace_add", inplace_add, "CompositeExplicitAutograd")
def f(x):
out = torch.empty(3)
out = torch.zeros_like(out)
torch.ops.mylib.inplace_add(x, out)
return out
inputs = (torch.randn(3),)
res = torch.compile(f, backend="inductor")(*inputs)
self.assertTrue(torch.allclose(res, f(*inputs)))
def test_compile_aot_eager_requires_grad(self):
def f(x):
torch.ops.aten._print("moo")

View File

@ -109,6 +109,7 @@ class TestFxGraphCache(TestCase):
@requires_triton()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
@parametrize("dtype", (torch.float32, torch.bfloat16))
@parametrize("dynamic", (False, True))
@ -216,6 +217,7 @@ class TestFxGraphCache(TestCase):
@requires_triton()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
@parametrize("dtype", (torch.float32, torch.float64))
@parametrize("dynamic", (False, True))
@ -255,6 +257,7 @@ class TestFxGraphCache(TestCase):
@largeTensorTest("64GB", device=GPU_TYPE)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE,))
@parametrize("dtype", (torch.float16, torch.bfloat16))
def test_cache_load_with_guards_int32_bounds(self, device, dtype):
@ -303,6 +306,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(res1, res2)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
@parametrize("dtype", (torch.float32, torch.bfloat16))
def test_cache_load_with_guards_static_bounds(self, device, dtype):
@ -346,6 +350,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(res1, res2)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
def test_constant_handling(self, device):
"""
@ -378,6 +383,7 @@ class TestFxGraphCache(TestCase):
@requires_gpu()
@requires_triton()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_higher_order_op_bypass(self):
"""
Verify that we bypass the cache when we have higher order ops.
@ -403,6 +409,7 @@ class TestFxGraphCache(TestCase):
self.assertGreater(counters["inductor"]["fxgraph_cache_bypass"], 0)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_generated_kernel_count(self):
"""
Test that we bump the generated_kernel_count metric on a cache hit.
@ -431,6 +438,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(metrics.generated_kernel_count, 2)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_clear(self):
"""
Test clearing the cache.
@ -465,6 +473,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_with_nt(self):
def gen_nt(r):
values = torch.randn(r, 16)
@ -493,6 +502,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_with_symint_non_arg_guard(self):
def fn(x, ref_id):
self_id = 22
@ -516,6 +526,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_guard(self):
def f(x, val):
if val > 5:
@ -740,6 +751,7 @@ class TestFxGraphCacheHashing(TestCase):
class TestUtils(TestCase):
@config.patch({"fx_graph_remote_cache": False})
def test_fresh_inductor_cache(self):
def fn(x, y):
return x + y

View File

@ -1,5 +1,6 @@
# Owner(s): ["module: inductor"]
import operator
import os
from torch._inductor.compile_worker.subproc_pool import (
raise_testexc,
@ -31,6 +32,21 @@ class TestCompileWorker(TestCase):
finally:
pool.shutdown()
def test_crash(self):
pool = SubprocPool(2)
try:
with self.assertRaises(Exception):
a = pool.submit(os._exit, 1)
a.result()
# Pool should still be usable after a crash
b = pool.submit(operator.add, 100, 1)
c = pool.submit(operator.sub, 100, 1)
self.assertEqual(b.result(), 101)
self.assertEqual(c.result(), 99)
finally:
pool.shutdown()
if __name__ == "__main__":
from torch._inductor.test_case import run_tests

View File

@ -2223,6 +2223,7 @@ known_failing_tests = {
"test_save_for_backward_inputs_are_namedtuple", # torch._dynamo.exc.Unsupported: 'skip function
"test_setitem", # AssertionError: Tensor-likes are not close!
"test_grad_nonleaf_register_hook", # IndexError: list index out of range (NB: x.grad = y where both x and y are input tensors)
"test_unpack_hooks_exec_count", # pack/unpack saved tensor hooks firing more than once
"test_scalar_grad_mixed_device", # Fake Tensors aren't propagating device properly for 0-dim grads
}

View File

@ -18,7 +18,10 @@ from torch._inductor.runtime.hints import DeviceProperties
from torch._inductor.utils import run_and_get_code
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_FLASH_ATTENTION,
SM80OrLater,
)
from torch.testing._internal.common_utils import (
DeterministicGuard,
freeze_rng_state,
@ -27,6 +30,8 @@ from torch.testing._internal.common_utils import (
TEST_WITH_ASAN,
)
from torch.testing._internal.inductor_utils import skipCUDAIf
try:
try:
import triton
@ -1239,6 +1244,47 @@ def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
tl.store(out_ptr0 + (x3), tmp2, xmask)""", # noqa: B950
)
@skipCUDAIf(not SM80OrLater, "uses bfloat16 which requires SM >= 80")
def test_int64_index_intermediate(self):
def foo(inp):
view_23 = torch.ops.aten.view.default(inp, [-1, 8192, 8192])
split_1 = torch.ops.aten.split.Tensor(view_23, 1024, 1)
view_23 = None
getitem_17 = split_1[0]
getitem_18 = split_1[1]
getitem_19 = split_1[2]
getitem_20 = split_1[3]
getitem_21 = split_1[4]
getitem_22 = split_1[5]
getitem_23 = split_1[6]
getitem_24 = split_1[7]
split_1 = None
cat_1 = torch.ops.aten.cat.default(
[
getitem_17,
getitem_18,
getitem_19,
getitem_20,
getitem_21,
getitem_22,
getitem_23,
getitem_24,
]
)
getitem_17 = (
getitem_18
) = (
getitem_19
) = getitem_20 = getitem_21 = getitem_22 = getitem_23 = getitem_24 = None
return cat_1
for mark_dynamic in [False, True]:
inp = torch.rand((65536, 8192), dtype=torch.bfloat16, device="cuda")
if mark_dynamic:
torch._dynamo.mark_dynamic(inp, 0)
foo_c = torch.compile(foo)
torch.testing.assert_allclose(foo(inp), foo_c(inp))
if __name__ == "__main__":
from torch._inductor.test_case import run_tests

View File

@ -2526,6 +2526,7 @@ class TestPatternMatcher(TestPatternMatcherBase):
om(*example_inputs)
om(*example_inputs)
@torch._dynamo.config.patch("inline_inbuilt_nn_modules", True)
def test_reproduce_121253_issue(self):
class Mod(torch.nn.Module):
def __init__(self, weight, bias, beta, alpha):
@ -2550,8 +2551,8 @@ class TestPatternMatcher(TestPatternMatcherBase):
else "mkldnn._linear_pointwise"
)
for beta, alpha in zip([1.0, 0.1, 0.0], [1.0, 0.1, 1.0]):
weight = torch.randn(64, 64, dtype=dtype)
bias = torch.randn(64, dtype=dtype)
weight = torch.nn.Parameter(torch.randn(64, 64, dtype=dtype))
bias = torch.nn.Parameter(torch.randn(64, dtype=dtype))
mod = Mod(weight, bias, beta, alpha).to(dtype).eval()
with torch.no_grad():
x = torch.randn(1, 64, dtype=dtype)

View File

@ -1459,6 +1459,26 @@ class CommonTemplate:
actual = _run_and_assert_no_indirect_indexing(self, copy_opt, x)
self.assertEqual(expect, actual)
@dynamo_config.patch({"capture_dynamic_output_shape_ops": True})
@config.patch(implicit_fallbacks=True)
def test_index_propagation_nested_indirect_indexing(self):
def nested(x, repeats):
rank = torch.arange(repeats.numel(), device=x.device)
index = rank.repeat_interleave(repeats, dim=0)
return torch.index_select(x, index=index, dim=0)
example_inputs = (
torch.randn((32, 64), device=self.device),
repeats := torch.tensor([5, 10, 15], device=self.device),
)
torch._dynamo.mark_dynamic(repeats, 0) # create backed symint
nested_opt = torch._dynamo.optimize("inductor")(nested)
expect = nested(*example_inputs)
actual = nested_opt(*example_inputs)
self.assertEqual(expect, actual)
def test_index_propagation_flip(self):
def flip(x):
i = torch.arange(x.size(0) - 1, -1, -1, device=x.device)
@ -10843,6 +10863,7 @@ if HAS_GPU and not TEST_WITH_ASAN:
self.assertEqual(fn_opt(*inps), fn(*inps))
@config.patch({"fx_graph_remote_cache": False})
def test_optimize_indexing_dtype_with_constraint(self):
def fn1(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
x = torch.arange(0, b.shape[0], device=GPU_TYPE)

View File

@ -406,44 +406,132 @@ inductor_override_kwargs = {
}
# Always test with all sample for following ops
inductor_all_samples = {
"arange",
"diagonal",
"diagonal_copy",
"diagonal_scatter",
"softmax.with_dtype",
"index_add",
"index_copy",
"index_reduce.prod",
"index_reduce.mean",
"index_reduce.amax",
"index_reduce.amin",
"scatter_reduce.sum",
"select_scatter",
"squeeze",
"unfold",
"unsqueeze",
"sum",
"amax",
"amin",
"all",
"T",
"H",
"isinf",
"isposinf",
"isneginf",
"nan_to_num",
"mT",
"mH",
"rsub",
"triu",
"cummax",
"cummin",
"nextafter",
"gather",
"_chunk_cat",
"constant_pad_nd",
# Test with one sample only for following ops
inductor_one_sample = {
"_segment_reduce.lengths": {f16},
"_segment_reduce.offsets": {f16},
"addmv": {f16},
"argsort": {b8, f16, f32, f64, i32, i64},
"as_strided.partial_views": {f16},
"clamp_max": {b8},
"clamp_min": {b8},
"corrcoef": {f16},
"diff": {f16},
"einsum": {f16, i32},
"gradient": {f16},
"histogram": {f32, f64},
"histogramdd": {f32, f64},
"index_put": {f16, f32, f64},
"linalg.eig": {f32, f64},
"linspace": {f16, i32, i64},
"linspace.tensor_overload": {f16, f32, f64, i32, i64},
"logspace": {f16},
"logspace.tensor_overload": {f16, f32, f64, i32, i64},
"masked_logsumexp": {i64},
"max.binary": {b8},
"max_pool2d_with_indices_backward": {f16, f32, f64},
"maximum": {b8},
"min.binary": {b8},
"minimum": {b8},
"ne": {b8},
"new_empty_strided": {f16},
"nn.functional.adaptive_avg_pool3d": {f16},
"nn.functional.adaptive_max_pool1d": {f16, f32},
"nn.functional.adaptive_max_pool2d": {f16, f32},
"nn.functional.bilinear": {f16},
"nn.functional.conv_transpose1d": {f16},
"nn.functional.conv_transpose2d": {f16},
"nn.functional.conv_transpose3d": {f16},
"nn.functional.cosine_similarity": {f16},
"nn.functional.cross_entropy": {f16, f32, f64},
"nn.functional.gaussian_nll_loss": {f16},
"nn.functional.grid_sample": {f32, f64},
"nn.functional.interpolate.area": {f16},
"nn.functional.max_pool2d": {f16, f32, f64, i32, i64},
"nn.functional.nll_loss": {f16, f32, f64},
"normal": {f16, f32, f64},
"put": {f16, f32, f64},
"rot90": {b8, f16, f32, f64, i32, i64},
"scatter": {b8, i64},
"take": {b8, f16, f32, f64, i32, i64},
("__rdiv__", "cuda"): {f16},
("__rmod__", "cuda"): {f16, i64},
("__rmul__", "cuda"): {f16},
("__rpow__", "cuda"): {f16},
("addcdiv", "cuda"): {f16},
("addcmul", "cuda"): {f16},
("atan2", "cuda"): {f16},
("cumsum", "cuda"): {f16},
("cumulative_trapezoid", "cuda"): {f16},
("dist", "cuda"): {f16},
("div.no_rounding_mode", "cuda"): {f16},
("fmod", "cuda"): {f16},
("grid_sampler_2d", "cuda"): {f16},
("index_fill", "cuda"): {f16, f32, f64},
("ldexp", "cuda"): {f16},
("lerp", "cuda"): {f16},
("linalg.householder_product", "cuda"): {f32},
("linalg.matrix_norm", "cuda"): {f16},
("linalg.vector_norm", "cuda"): {f16},
("logspace", "cuda"): {i32, i64},
("masked.cumsum", "cuda"): {f16},
("masked.logsumexp", "cuda"): {f16},
("masked.mean", "cuda"): {b8},
("masked.normalize", "cuda"): {f16},
("masked.prod", "cuda"): {f16},
("masked.std", "cuda"): {f16},
("masked.var", "cuda"): {f16},
("mul", "cuda"): {f16},
("nn.functional.alpha_dropout", "cuda"): {f16, f32, f64},
("nn.functional.avg_pool1d", "cuda"): {f16, f32, f64},
("nn.functional.avg_pool2d", "cuda"): {f16, f32, f64},
("nn.functional.avg_pool3d", "cuda"): {f16, f32, f64},
("nn.functional.binary_cross_entropy", "cuda"): {f16},
("nn.functional.binary_cross_entropy_with_logits", "cuda"): {f16},
("nn.functional.conv2d", "cuda"): {f16},
("nn.functional.cosine_embedding_loss", "cuda"): {f16},
("nn.functional.dropout2d", "cuda"): {f16, f32, f64},
("nn.functional.dropout3d", "cuda"): {f16, f32, f64},
("nn.functional.dropout", "cuda"): {f16, f32, f64},
("nn.functional.feature_alpha_dropout.with_train", "cuda"): {f16, f32, f64},
("nn.functional.fractional_max_pool2d", "cuda"): {f16, f32, f64},
("nn.functional.fractional_max_pool3d", "cuda"): {f16, f32, f64},
("nn.functional.grid_sample", "cuda"): {f16},
("nn.functional.group_norm", "cuda"): {f16},
("nn.functional.hinge_embedding_loss", "cuda"): {f16},
("nn.functional.interpolate.bicubic", "cuda"): {f16},
("nn.functional.interpolate.bilinear", "cuda"): {f16},
("nn.functional.interpolate.trilinear", "cuda"): {f16},
("nn.functional.kl_div", "cuda"): {f16},
("nn.functional.margin_ranking_loss", "cuda"): {f16},
("nn.functional.max_pool1d", "cuda"): {f16, f32, f64},
("nn.functional.max_pool3d", "cuda"): {f16},
("nn.functional.mse_loss", "cuda"): {f16},
("nn.functional.multi_margin_loss", "cuda"): {f16},
("nn.functional.multilabel_margin_loss", "cuda"): {f16},
("nn.functional.multilabel_soft_margin_loss", "cuda"): {f16},
("nn.functional.normalize", "cuda"): {f16},
("nn.functional.pad.replicate", "cuda"): {f16, f32, f64},
("nn.functional.pad.reflect", "cuda"): {f16},
("nn.functional.pairwise_distance", "cuda"): {f16},
("nn.functional.poisson_nll_loss", "cuda"): {f16},
("nn.functional.rms_norm", "cuda"): {f16},
("norm", "cuda"): {f16},
("pow", "cuda"): {f16},
("prod", "cuda"): {f16},
("scatter_reduce.amax", "cuda"): {f16, f32, f64},
("scatter_reduce.amin", "cuda"): {f16, f32, f64},
("scatter_reduce.mean", "cuda"): {f16, f32, f64},
("special.xlog1py", "cuda"): {f16},
("std", "cuda"): {f16},
("std_mean", "cuda"): {f16},
("svd_lowrank", "cuda"): {f32, f64},
("trapezoid", "cuda"): {f16},
("trapz", "cuda"): {f16},
("true_divide", "cuda"): {f16},
("var", "cuda"): {f16},
("var_mean", "cuda"): {f16},
("xlogy", "cuda"): {f16},
}
@ -489,10 +577,14 @@ class TestInductorOpInfo(TestCase):
)
@collection_decorator
def test_comprehensive(self, device, dtype, op):
device_type = torch.device(device).type
assert device_type in (GPU_TYPE, "cpu")
torch._dynamo.reset()
with torch.no_grad():
# TODO: should we move empty_cache to the common device interface
if device == "cuda":
if device_type == "cuda":
torch.cuda.empty_cache()
op_name = op.name
if op.variant_test_name:
@ -509,10 +601,6 @@ class TestInductorOpInfo(TestCase):
if dtype not in allowed_dtypes:
raise unittest.SkipTest("Skipped!")
device_type = torch.device(device).type
assert device_type in (GPU_TYPE, "cpu")
# with open("test_output.txt", "a") as f:
# print(f"CONSIDERING OP {op_name} on {device_type} with {dtype} |
# {inductor_skips[device_type].get(op_name, set())}", flush=True, file=f)
@ -557,7 +645,10 @@ class TestInductorOpInfo(TestCase):
)
samples = op.sample_inputs(device, dtype, requires_grad=requires_grad)
if op_name not in inductor_all_samples and not ALL_SAMPLES:
if (
dtype in inductor_one_sample.get(op_name, {})
or dtype in inductor_one_sample.get((op_name, device_type), {})
) and not ALL_SAMPLES:
if isinstance(samples, (list, tuple)):
samples = [samples[0]]
else:

View File

@ -1,16 +1,16 @@
# Owner(s): ["module: inductor"]
import functools
import unittest
import torch
from torch._dynamo import config as dynamo_config
from torch._inductor import config as inductor_config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch._inductor.utils import is_big_gpu
from torch.testing import make_tensor
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import IS_LINUX
from torch.testing._internal.common_utils import IS_LINUX, parametrize
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CUDA, skipCUDAIf
@ -214,6 +214,44 @@ class TestUnbackedSymints(InductorTestCase):
torch.testing.assert_close(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@dynamo_config.patch({"capture_scalar_outputs": True})
@parametrize(
"torch_fn", [torch.mm, torch.bmm, torch.addmm], name_fn=lambda fn: fn.__name__
)
@parametrize("coordinate_descent_tuning", [True, False], name_fn=str)
def test_mm_and_friends(self, device, torch_fn, coordinate_descent_tuning):
if torch_fn == torch.addmm:
torch_fn = functools.partial(torch_fn, torch.ones(1, device=device))
def fn(x, w, repeats, is_bmm):
u0 = repeats.item()
torch._check_is_size(u0)
x_unbacked = x.expand(u0, 32)
w_unbacked = w.expand(32, u0)
if is_bmm:
# Make sure inputs are batched.
x_unbacked = x_unbacked.expand(10, *x_unbacked.shape)
w_unbacked = w_unbacked.expand(10, *w_unbacked.shape)
return torch_fn(x_unbacked, w_unbacked)
example_inputs = (
torch.randn(1, 32, device=device),
torch.randn(32, 1, device=device),
torch.tensor(100, device=device),
torch_fn == torch.bmm,
)
with inductor_config.patch(
{
# coordinate_descent_tuning has its own path during decomp
"coordinate_descent_tuning": coordinate_descent_tuning,
}
):
actual = torch.compile(fn, fullgraph=True)(*example_inputs)
expected = fn(*example_inputs)
torch.testing.assert_close(actual, expected)
instantiate_device_type_tests(
TestUnbackedSymints, globals(), only_for=(GPU_TYPE, "cpu")

View File

@ -471,7 +471,12 @@ class TestDynamoWithONNXRuntime(onnx_test_common._TestONNXRuntime):
if test_local_backend:
assert local_ort is not None
number_of_captured_graphs = 2 if test_backward else 1
if torch._dynamo.config.inline_inbuilt_nn_modules:
# with inlining and dynamic=True, we have more graph captures
number_of_captured_graphs = 3 if test_backward else 2
else:
number_of_captured_graphs = 2 if test_backward else 1
execution_count = len(example_args_collection) * number_of_captured_graphs
self._assert_counting_information(
local_ort,
@ -564,8 +569,14 @@ class TestDynamoWithONNXRuntime(onnx_test_common._TestONNXRuntime):
if test_local_backend:
assert local_ort is not None
number_of_captured_graphs = 2 if test_backward else 1
if torch._dynamo.config.inline_inbuilt_nn_modules:
# with inlining and dynamic=True, we have more graph captures
number_of_captured_graphs = 3 if test_backward else 2
else:
number_of_captured_graphs = 2 if test_backward else 1
execution_count = len(example_args_collection) * number_of_captured_graphs
self._assert_counting_information(
local_ort,
expected_execution_count=execution_count,
@ -649,7 +660,11 @@ class TestDynamoWithONNXRuntime(onnx_test_common._TestONNXRuntime):
if test_local_backend:
assert local_ort is not None
number_of_captured_graphs = 2 if test_backward else 1
if torch._dynamo.config.inline_inbuilt_nn_modules:
# with inlining and dynamic=True, we have more graph captures
number_of_captured_graphs = 3 if test_backward else 2
else:
number_of_captured_graphs = 2 if test_backward else 1
execution_count = len(example_args_collection) * number_of_captured_graphs
self._assert_counting_information(
local_ort,

View File

@ -33,6 +33,11 @@ class SampleModelTwoInputs(torch.nn.Module):
return (y, z)
class SampleModelForDynamicShapes(torch.nn.Module):
def forward(self, x, b):
return x.relu(), b.sigmoid()
class _LargeModel(torch.nn.Module):
def __init__(self):
super().__init__()
@ -230,8 +235,15 @@ class TestLargeProtobufONNXProgramSerializerAPI(common_utils.TestCase):
class TestONNXExportWithDynamo(common_utils.TestCase):
def test_args_normalization_with_no_kwargs(self):
exported_program = torch.export.export(
SampleModelTwoInputs(),
(
torch.randn(1, 1, 2),
torch.randn(1, 1, 2),
),
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(), torch.randn(1, 1, 2), torch.randn(1, 1, 2)
exported_program, torch.randn(1, 1, 2), torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
@ -243,9 +255,25 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
onnx_program_from_old_exporter.model_proto,
)
def test_args_normalization_with_kwargs(self):
def test_args_is_tensor_not_tuple(self):
exported_program = torch.export.export(SampleModel(), (torch.randn(1, 1, 2),))
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(), torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
exported_program, torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModel(), torch.randn(1, 1, 2), dynamo=True
)
self.assertEqual(
onnx_program_from_new_exporter.model_proto,
onnx_program_from_old_exporter.model_proto,
)
def test_args_normalization_with_kwargs(self):
exported_program = torch.export.export(
SampleModelTwoInputs(), (torch.randn(1, 1, 2),), {"b": torch.randn(1, 1, 2)}
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
exported_program, torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
@ -258,8 +286,11 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
)
def test_args_normalization_with_empty_dict_at_the_tail(self):
exported_program = torch.export.export(
SampleModelTwoInputs(), (torch.randn(1, 1, 2),), {"b": torch.randn(1, 1, 2)}
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(), torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
exported_program, torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
@ -271,17 +302,111 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
onnx_program_from_old_exporter.model_proto,
)
def test_dynamic_axes_enable_dynamic_shape(self):
def test_dynamic_axes_enable_dynamic_shapes_with_fully_specified_axes(self):
exported_program = torch.export.export(
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
dynamic_shapes={
"x": {
0: torch.export.Dim("customx_dim_0"),
1: torch.export.Dim("customx_dim_1"),
2: torch.export.Dim("customx_dim_2"),
},
"b": {
0: torch.export.Dim("customb_dim_0"),
1: torch.export.Dim("customb_dim_1"),
2: torch.export.Dim("customb_dim_2"),
},
},
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(),
torch.randn(1, 1, 2),
b=torch.randn(1, 1, 2),
export_options=ExportOptions(dynamic_shapes=True),
exported_program,
torch.randn(2, 2, 3),
b=torch.randn(2, 2, 3),
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
(torch.randn(1, 1, 2), {"b": torch.randn(1, 1, 2)}, {}),
dynamic_axes={"b": [0, 1, 2]},
SampleModelForDynamicShapes(),
(torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}, {}),
dynamic_axes={
"x": {0: "customx_dim_0", 1: "customx_dim_1", 2: "customx_dim_2"},
"b": {0: "customb_dim_0", 1: "customb_dim_1", 2: "customb_dim_2"},
},
dynamo=True,
)
self.assertEqual(
onnx_program_from_new_exporter.model_proto,
onnx_program_from_old_exporter.model_proto,
)
def test_dynamic_axes_enable_dynamic_shapes_with_default_axe_names(self):
exported_program = torch.export.export(
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
dynamic_shapes={
"x": {
0: torch.export.Dim("customx_dim_0"),
1: torch.export.Dim("customx_dim_1"),
2: torch.export.Dim("customx_dim_2"),
},
"b": {
0: torch.export.Dim("customb_dim_0"),
1: torch.export.Dim("customb_dim_1"),
2: torch.export.Dim("customb_dim_2"),
},
},
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
exported_program,
torch.randn(2, 2, 3),
b=torch.randn(2, 2, 3),
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelForDynamicShapes(),
(torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}, {}),
dynamic_axes={
"x": [0, 1, 2],
"b": [0, 1, 2],
},
dynamo=True,
)
self.assertEqual(
onnx_program_from_new_exporter.model_proto,
onnx_program_from_old_exporter.model_proto,
)
def test_dynamic_axes_supports_partial_dynamic_shapes(self):
exported_program = torch.export.export(
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
dynamic_shapes={
"x": None,
"b": {
0: torch.export.Dim("customb_dim_0"),
1: torch.export.Dim("customb_dim_1"),
2: torch.export.Dim("customb_dim_2"),
},
},
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
exported_program,
torch.randn(2, 2, 3),
b=torch.randn(2, 2, 3),
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelForDynamicShapes(),
(torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}, {}),
dynamic_axes={
"b": [0, 1, 2],
},
dynamo=True,
)
self.assertEqual(
@ -303,16 +428,37 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
dynamo=True,
)
def test_raises_unsupported_specific_dynamic_axes_warning(self):
message = (
"Specified dynamic axes is not supported for dynamo export at the moment."
)
with self.assertWarnsOnceRegex(UserWarning, message):
def test_input_names_are_not_yet_supported_in_dynamic_axes(self):
with self.assertRaisesRegex(
ValueError,
"Assinging new input names is not supported yet. Please use model forward signature "
"to specify input names in dynamix_axes.",
):
_ = torch.onnx.export(
SampleModel(),
(torch.randn(1, 1, 2),),
dynamic_axes={"input": [0, 1, 2]},
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
input_names=["input"],
dynamic_axes={"input": [0, 1]},
dynamo=True,
)
def test_dynamic_shapes_hit_constraints_in_dynamo(self):
# SampleModelTwoInputs has constraints because it adds its two inputs,
# so the two input shapes are related.
with self.assertRaisesRegex(
torch._dynamo.exc.UserError,
"Constraints violated",
):
_ = torch.onnx.export(
SampleModelTwoInputs(),
(torch.randn(2, 2, 3), torch.randn(2, 2, 3)),
dynamic_axes={
"x": {0: "x_dim_0", 1: "x_dim_1", 2: "x_dim_2"},
"b": {0: "b_dim_0", 1: "b_dim_1", 2: "b_dim_2"},
},
dynamo=True,
)
@ -323,6 +469,17 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
)
self.assertTrue(os.path.exists(path))
def test_raises_error_when_input_is_script_module(self):
class ScriptModule(torch.jit.ScriptModule):
def forward(self, x):
return x
with self.assertRaisesRegex(
TypeError,
"Dynamo export does not support ScriptModule or ScriptFunction.",
):
_ = torch.onnx.export(ScriptModule(), torch.randn(1, 1, 2), dynamo=True)
if __name__ == "__main__":
common_utils.run_tests()
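The equivalence tests above compare two routes to the same ONNX graph: exporting an ExportedProgram produced by torch.export.export (with dynamic_shapes) through torch.onnx.dynamo_export, versus calling torch.onnx.export with dynamic_axes and dynamo=True. A minimal sketch of the two paths, using a placeholder TwoInputModel rather than the test fixtures:

import torch


class TwoInputModel(torch.nn.Module):  # placeholder model, not a test fixture
    def forward(self, x, b):
        return x.relu(), b.sigmoid()


x, b = torch.randn(2, 2, 3), torch.randn(2, 2, 3)

# Path 1: torch.export with dynamic_shapes, then the dynamo-based ONNX exporter.
exported = torch.export.export(
    TwoInputModel(),
    (x, b),
    dynamic_shapes={
        "x": {0: torch.export.Dim("x_dim_0")},
        "b": {0: torch.export.Dim("b_dim_0")},
    },
)
onnx_from_exported_program = torch.onnx.dynamo_export(exported, x, b=b)

# Path 2: the classic torch.onnx.export signature with dynamic_axes, routed
# through the same exporter via dynamo=True.
onnx_from_export = torch.onnx.export(
    TwoInputModel(),
    (x, {"b": b}, {}),
    dynamic_axes={"x": [0], "b": [0]},
    dynamo=True,
)
# The tests compare the resulting model_proto graphs for equality.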

View File

@ -527,8 +527,7 @@ EXPECTED_SKIPS_OR_FAILS_WITH_DTYPES: Tuple[onnx_test_common.DecorateMeta, ...] =
),
xfail(
"gather",
reason="HandleNegativeAxis(int64_t, int64_t) IsAxisInRange(axis, tensor_rank) was \
false. axis 0 is not in valid range [-0,-1]"
reason="GatherElements op: Rank of input 'data' needs to be equal to rank of input 'indices'"
),
xfail(
"geometric",
@ -1517,7 +1516,6 @@ SKIP_XFAIL_SUBTESTS_WITH_MATCHER_AND_MODEL_TYPE: tuple[
"nn.functional.batch_norm",
matcher=lambda sample: sample.kwargs.get("training") is True
and any(arg is not None for arg in sample.args[2:4]),
model_type=pytorch_test_common.TorchModelType.TORCH_EXPORT_EXPORTEDPROGRAM,
reason="Flaky failure: https://github.com/pytorch/pytorch/issues/115106",
),
xfail(
@ -1998,7 +1996,7 @@ class TestOnnxModelOutputConsistency(onnx_test_common._TestONNXRuntime):
"nn.functional.hardsigmoid": [1e-3, 5e-3],
"nn.functional.hardswish": [1e-3, 5e-3],
"nn.functional.hinge_embedding_loss": [4e-1, 3e-3],
"nn.functional.huber_loss": [1e-3, 1e-2],
"nn.functional.huber_loss": [1e-2, 1e-1],
"nn.functional.instance_norm": [1e-2, 1e-3],
"nn.functional.interpolate": [1e-2, 1e-3],
"nn.functional.kl_div": [2e-3, 2e-4],

View File

@ -171,13 +171,9 @@ class TestFxToOnnx(pytorch_test_common.ExportTestCase):
torch.argmax(input, dim=1, keepdim=True),
)
# NOTE: KeyError: dim raised in optimizer
with self.assertWarnsOnceRegex(
UserWarning, "ONNXScript optimizer failed. Skipping optimization."
):
_ = dynamo_export(
ArgminArgmaxModel(), model_input, export_options=self.export_options
)
_ = dynamo_export(
ArgminArgmaxModel(), model_input, export_options=self.export_options
)
def test_multiple_outputs_op_with_evaluator(self):
class TopKModel(torch.nn.Module):

Some files were not shown because too many files have changed in this diff.