We don't care about the Dynamo x TorchScript composition, so I'm
disabling these tests (so they don't get reported as flaky). Not
disabling all of the TorchScript tests yet because they have been useful
to catch random bugs.
Test Plan:
- CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128731
Approved by: https://github.com/williamwen42
FIXES #113263. Same idea as in https://github.com/pytorch/pytorch/pull/113417, but we need a more intrusive C API to silently no-op default saved tensor hooks, in order to support user code that uses torch.autograd.disable_saved_tensors_hooks (see test_unpack_hooks_can_be_disabled). We mock the output of get_hooks while leaving push/pop untouched.
For compiled autograd, we're firing pack hooks once and unpack hooks twice right now; I'll look into this separately from this issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123196
Approved by: https://github.com/soulitzer
These models are really flaky. I went into the CI machine and ran the model many times; sometimes it fails, sometimes it passes. Even PyTorch eager results change from run to run, so the accuracy comparison is fundamentally broken/non-deterministic. I am hitting these issues more frequently in the inlining work. There is nothing wrong with inlining; I think these models are on the edge of an already-broken accuracy measurement, and inlining just pushes them further in that broken direction.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128715
Approved by: https://github.com/eellison
This is an extension of [ModuleTracker](https://github.com/pytorch/pytorch/blob/main/torch/utils/module_tracker.py) with added features and bug fixes.
1. Allows installing user-defined hooks to be called at the pre-fw, post-fw, pre-bw and post-bw points of the ``ModTracker`` (see the sketch after this list).
2. Adds a function ``get_known_fqn`` that retrieves the FQN of the module as tracked by the ``ModTracker``.
3. Only registers the multi-grad hooks if we are in the forward pass. This is important because a module's pre-fw and post-fw hooks get called during the backward pass under AC, and we do not want to register multi-grad hooks in that case.
4. Sets the kwarg ``always_call=True`` for post-fw hooks, so that they are called post AC.
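A minimal sketch of installing such hooks, assuming ``ModTracker`` lives under ``torch.distributed._tools.mod_tracker`` and exposes the ``register_user_hooks`` helper described in point 1 (the keyword names and hook signatures are assumptions, so the hooks below accept ``*args`` loosely):
```python
import torch
from torch.distributed._tools.mod_tracker import ModTracker  # assumed location

tracker = ModTracker()

def pre_fw(module, *args):
    print("pre-forward:", tracker.get_known_fqn(module))

def post_fw(module, *args):
    print("post-forward:", tracker.get_known_fqn(module))

tracker.register_user_hooks(pre_fw_hook=pre_fw, post_fw_hook=post_fw)

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
with tracker:
    model(torch.randn(2, 4)).sum().backward()
```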
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128508
Approved by: https://github.com/wanchaol
Adds a `C10_UBSAN_ENABLED` macro and uses it to disable `SymIntTest::Overflows` (which fails under the `signed-integer-overflow` UBSAN check).
Also cleans up UBSAN guard in `jit/test_misc.cpp` to use `C10_UBSAN_ENABLED` and the existing `C10_ASAN_ENABLED` instead of locally defining `HAS_ASANUBSAN`.
> NOTE: This should fix `SymIntTest::Overflows` failing under ubsan in fbcode too...
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127967
Approved by: https://github.com/atalman, https://github.com/d4l3k, https://github.com/malfet
This PR renames the implementation details of register_fake to align
more with the new name. It is in its own PR because this is risky
(torch.package sometimes depends on private library functions and
implementation details).
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123938
Approved by: https://github.com/williamwen42
Tries to fix #127677.
# Context
Just as @peterbell10 pointed out, we have the following scenario:
```
a = ops.indirect_indexing(...)
b = ops.index_expr(a, ...)
c = ops.indirect_indexing(b, ...)
```
We can repro this as:
```
def forward(self, arg0_1, arg1_1, arg2_1):
iota = torch.ops.prims.iota.default(arg0_1, start = 0, step = 1, index=0);
repeat_interleave = torch.ops.aten.repeat_interleave.Tensor(arg1_1);
index = torch.ops.aten.index.Tensor(iota, [repeat_interleave]);
index_1 = torch.ops.aten.index.Tensor(arg2_1, [index]);
return (index_1,)
```
which should generate a JIT py file like this:
```
def triton_poi_fused_index_select_0(in_ptr0, in_ptr1, out_ptr0, ks0, xnumel, XBLOCK : tl.constexpr):
...
tmp0 = tl.load(in_ptr0 + (x1), xmask, eviction_policy='evict_last')
tmp1 = ks0
tmp2 = tmp0 + tmp1
tmp3 = tmp0 < 0
tmp4 = tl.where(tmp3, tmp2, tmp0)
# check_bounds()
tl.device_assert(((0 <= tmp4) & (tmp4 < ks0)) | ~(xmask), "index out of bounds: 0 <= tmp4 < ks0")
def call():
arg0_1, arg1_1, arg2_1 = args
buf1 = aten.repeat_interleave.Tensor(arg1_1)
buf4 = empty_strided_cuda((u0, 64), (64, 1))
triton_poi_fused_index_select_0.run(
buf1, arg2_1, buf4, s0,
triton_poi_fused_index_select_0_xnumel,
grid=grid(triton_poi_fused_index_select_0_xnumel),
stream=stream0)
```
# Issue
In our `IndexPropagation.indirect_indexing()` call we have `expr=indirect0` which is spawned in `LoopBodyBlock.indirect_indexing()`.
3b555ba477/torch/_inductor/ir.py (L8154-L8160)
When we try to see if we can prove its bounds, we fail because `indirect0` isn't in `var_ranges`.
# Approach
When creating `indirect` symbols from the fallback, specify their range to be `[-size, size - 1]` to avoid a lookup error with `indirectX`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128378
Approved by: https://github.com/lezcano, https://github.com/peterbell10
**Summary**
Currently, the comm_mode_feature_examples suite does not have an example of printing sharding information for a model with nested modules. While adding the new example to the suite, I recognized a way to refactor the existing examples to make them more readable for users. The expected output can be found below:
<img width="354" alt="Screenshot 2024-06-11 at 5 41 14 PM" src="https://github.com/pytorch/pytorch/assets/50644008/68cef7c7-cb1b-4e51-8b60-85123d96ca92">
**Test Plan**
torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128461
Approved by: https://github.com/XilunWu
ghstack dependencies: #128369, #128451
**Summary**
I have added comments to address previous readability concerns in comm_mode.py and comm_mode_features_example.py. I also renamed files and test cases to better reflect what they are about. I removed a non-distributed test case and other lines of code that do not contribute to the example of how comm_mode can be used. Finally, I've added the expected output for each example function so users are not forced to run the code.
**Test Plan**
torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128451
Approved by: https://github.com/XilunWu
ghstack dependencies: #128369
**Summary**
Currently, CommDebugMode only allows displaying collective tracing at the model level, whereas a user may require a more detailed breakdown. In order to make this possible, I have changed ModuleParamaterShardingTracker by adding a string variable to track the current sub-module as well as a dictionary keeping track of the depths of the submodules in the model tree. The CommDebugMode class was changed by adding a new dictionary keeping track of the per-module collective counts as well as a function that displays the counts in a way that is easy for the user to read. Two examples using MLPModule and Transformer have been added to showcase the new changes. The expected output of the simpler MLPModule example is:
<img width="255" alt="Screenshot 2024-06-10 at 4 58 50 PM" src="https://github.com/pytorch/pytorch/assets/50644008/cf2161ef-2663-49c1-a8d5-9f97e96a1791">
**Test Plan**
torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/display_sharding_example.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128369
Approved by: https://github.com/XilunWu
Summary:
Added `set_module_name_qconfig` support to allow users to set configurations based on module name in `X86InductorQuantizer`.
For example, only quantize the `sub`:
```python
# Imports and the Sub submodule are filled in for completeness (the original snippet assumed them).
import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
class Sub(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 5)
    def forward(self, x):
        return self.linear(x)
class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 5)
        self.sub = Sub()
    def forward(self, x):
        x = self.linear(x)
        x = self.sub(x)
        return x
m = M().eval()
example_inputs = (torch.randn(3, 5),)
# Set config for a specific submodule.
quantizer = X86InductorQuantizer()
quantizer.set_module_name_qconfig("sub", xiq.get_default_x86_inductor_quantization_config())
```
- Added `set_module_name_qconfig` to allow users to set the configuration at the `module_name` level.
- Unified the annotation process to follow this order: `module_name_qconfig`, `operator_type_qconfig`, and `global_config`.
- Added `config_checker` to validate all user configurations and prevent mixing of static/dynamic or QAT/non-QAT configs.
- Moved `_get_module_name_filter` from `xnnpack_quantizer.py` into `utils.py` as it is common to all quantizers.
Test Plan:
```bash
python -m pytest quantization/pt2e/test_x86inductor_quantizer.py -k test_set_module_name
```
@Xia-Weiwen @leslie-fang-intel @jgong5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126044
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jerryzh168
When we don't call dynamo.reset(), we don't recompile on different dynamic shapes.
Also, some of the returned views were tuples, so when we `* 2`, we actually just copy all the inputs twice in the tuple. I changed it so that it returns just one of the values from the return tuple.
Additionally, this exposes a bug that fails with the slice operation, so I skipped it when we're testing with dynamic shapes:
```
File "/home/dberard/local/pytorch/torch/fx/experimental/symbolic_shapes.py", line 3996, in produce_guards
sexpr = ShapeGuardPrinter(symbol_to_source, source_ref, self.var_to_sources).doprint(expr)
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/printer.py", line 292, in doprint
return self._str(self._print(expr))
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/printer.py", line 331, in _print
return printmethod(expr, **kwargs)
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/str.py", line 56, in _print_Add
t = self._print(term)
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/printer.py", line 331, in _print
return printmethod(expr, **kwargs)
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/str.py", line 366, in _print_Mul
a_str = [self.parenthesize(x, prec, strict=False) for x in a]
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/str.py", line 366, in <listcomp>
a_str = [self.parenthesize(x, prec, strict=False) for x in a]
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/str.py", line 37, in parenthesize
return self._print(item)
File "/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/sympy/printing/printer.py", line 331, in _print
return printmethod(expr, **kwargs)
File "/home/dberard/local/pytorch/torch/fx/experimental/symbolic_shapes.py", line 1494, in _print_Symbol
assert self.symbol_to_source.get(expr), (
AssertionError: s3 (could be from ['<ephemeral: symint_visitor_fn>', '<ephemeral: symint_visitor_fn>']) not in {s0: ["L['x'].a.size()[1]", "L['x'].b.size()[1]", "L['x'].size()[1]", "L['x'].a.size()[1]", "L['x'].b.size()[1]", "L['x'].a.size()[1]", "L['x'].b.size()[1]"], s1: ["L['x'].a.stride()[0]", "L['x'].b.stride()[0]", "L['x'].stride()[0]", "L['x'].a.stride()[0]", "L['x'].b.stride()[0]", "L['x'].a.stride()[0]", "L['x'].b.stride()[0]"], s2: ["L['x'].a.storage_offset()", "L['x'].b.storage_offset()", "L['x'].a.storage_offset()", "L['x'].b.storage_offset()"]}. If this assert is failing, it could be due to the issue described in https://github.com/pytorch/pytorch/pull/90665
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128659
Approved by: https://github.com/YuqingJ
Summary:
GraphTransformObserver saves the SVG file of the input/output graph in each inductor pass. In my test with the CMF model, when the graph is large, Graphviz took forever to convert DOT to SVG. That is NOT acceptable.
This diff saves a DOT file instead of an SVG file to speed it up. A DOT file is also an order of magnitude smaller than an SVG.
To view these graphs, users can run `dot -Txxx input.dot` to convert the DOT file to any other format they want. Users can control how many iterations are used to lay out the graph properly. Refer to https://web.archive.org/web/20170507095019/http://graphviz.org/content/attrs#dnslimit for details.
Test Plan: buck2 test mode/dev-sand caffe2/test:fx -- fx.test_fx_xform_observer.TestGraphTransformObserver
Differential Revision: D58539182
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128634
Approved by: https://github.com/mengluy0125
It has come to my attention that some of our licenses are incorrect, so I attempted to rectify a few of them based on given recommendations for:
clog - BSD-3
eigen - MPL-2.0
ffnvcodec - LGPL-2.1
-> **hungarian - Permissive (free to use)**
irrlicht - The Irrlicht Engine License (zlib/libpng)
-> **pdcurses - Public Domain for core**
-> **sigslot - Public Domain**
test - BSD-3
Vulkan - Apache-2.0 or MIT
fb-only: more context is here https://fb.workplace.com/groups/osssupport/posts/26333256012962998/?comment_id=26333622989592967
This PR addresses the manual licensing mismatches mentioned above (the two bolded; one is being addressed in #128085), but as everything else is generated by pulling from other files, I did not address those. It is unclear what needs to be updated for the remaining entries to be accurate, or whether they are inaccurate today.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128630
Approved by: https://github.com/malfet
This matches our autograd logic for PyTorch native operators. There's no
need to invoke an autograd.Function if we're under torch.no_grad() or
if none of the inputs have requires_grad=True (invoking an
autograd.Function results in noticeable overhead).
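A minimal sketch of the dispatch condition being described, under the assumption that it boils down to a grad-mode plus requires_grad check (names here are illustrative, not the actual custom-op internals):
```python
import torch

def call_op(op_fn, autograd_fn, *tensors):
    # Only pay the autograd.Function overhead when gradients can actually flow.
    needs_autograd = torch.is_grad_enabled() and any(
        isinstance(t, torch.Tensor) and t.requires_grad for t in tensors
    )
    if needs_autograd:
        return autograd_fn.apply(*tensors)
    return op_fn(*tensors)
```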
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127976
Approved by: https://github.com/williamwen42
Fixes #127896
### Description
Add docstring to `torch/jit/frontend.py:get_default_args` function
### Checklist
- [x] The issue that is being fixed is referred in the description
- [x] Only one issue is addressed in this pull request
- [x] Labels from the issue that this PR is fixing are added to this pull request
- [x] No unnecessary issues are included into this pull request
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128408
Approved by: https://github.com/malfet
In fused_all_gather_matmul, each rank copies their shard into their
local p2p buffer, performs a barrier, then performs (copy -> matmul) for
each remote shard. The (copy -> matmul)s for remote shards run on two
streams without synchronization. This not only allows for
computation/communication overlapping, but also computation/computation
overlapping which alleviates the wave quantization effect caused by
computation decomposition.
However, the synchronization-free approach doesn't work well with
fused_matmul_reduce_scatter, in which there's a barrier in every step.
Without synchronization between the two streams, a matmul in one stream
can delay a barrier in the other stream, further delaying the copy
waiting for the barrier.
This PR addresses the issue by adding synchronization between the two
streams such that the matmul of step i can only start after the barrier
of step i-1 completes. With this approach, we lose the
computation/computation overlapping, but avoid slowdown due to delayed
barrier.
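A toy sketch of the added inter-stream dependency (the real logic lives in the fused kernels; the barrier below is a stubbed placeholder and a CUDA device is assumed): the matmul of step i on the compute stream waits on an event recorded after the barrier of step i-1 on the communication stream.
```python
import torch

def barrier(step: int) -> None:
    pass  # stand-in for the per-step p2p barrier

a_shards = [torch.randn(128, 256, device="cuda") for _ in range(4)]
b = torch.randn(256, 64, device="cuda")
outs = [None] * len(a_shards)

comm_stream = torch.cuda.Stream()
compute_stream = torch.cuda.Stream()
prev_barrier_done = None

for i in range(len(a_shards)):
    with torch.cuda.stream(comm_stream):
        barrier(i)
        barrier_done = torch.cuda.Event()
        barrier_done.record(comm_stream)
    with torch.cuda.stream(compute_stream):
        if prev_barrier_done is not None:
            # matmul of step i waits for the barrier of step i-1
            compute_stream.wait_event(prev_barrier_done)
        outs[i] = torch.mm(a_shards[i], b)
    prev_barrier_done = barrier_done

torch.cuda.synchronize()
```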
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127455
Approved by: https://github.com/Chillee
ghstack dependencies: #127454
This PR changes the traced_tangents field of ViewAndMutationMeta to be cache-safe. Specifically, at runtime, the only time we need the fw_metadata's traced_tangents field is for Tensor subclass metadata from __tensor_flatten__. So instead of storing an entire FakeTensor, which has many fields that can be unserializable, we only store the result of __tensor_flatten__() on any FakeTensors representing subclasses.
That said, there's no guarantee that the result of `__tensor_flatten__` is actually serializable: if we fail to pickle it, we won't save to the cache.
To do this, we also make a small change to `__coerce_same_metadata_as_tangent__`, so that it takes in the return value of __tensor_flatten__() instead of an entire FakeTensor. Let me know if we should change the name of the function.
By doing this, we can now run the dynamic shapes cache test with autograd turned on.
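For context, here is a toy subclass showing the ``__tensor_flatten__`` protocol whose (attribute names, metadata) result is what gets cached instead of the FakeTensor itself (the class below is purely illustrative):
```python
import torch

class TwoTensor(torch.Tensor):  # illustrative wrapper subclass
    @staticmethod
    def __new__(cls, a, b):
        return torch.Tensor._make_wrapper_subclass(cls, a.shape, dtype=a.dtype, device=a.device)

    def __init__(self, a, b):
        self.a, self.b = a, b

    def __tensor_flatten__(self):
        # Only this pair (inner tensor attr names, constant metadata) needs caching.
        return ["a", "b"], {"meta": "constant metadata"}

    @staticmethod
    def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
        return TwoTensor(inner_tensors["a"], inner_tensors["b"])
```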
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127618
Approved by: https://github.com/bdhirsh
Summary: I've seen this issue once in the wild and oulgen was able to repro in a unit test. The problem is this:
- We're using pickle to turn everything related to the FX graph cache key into a byte stream, then hashing the bytes to compute the cache key.
- Pickle is optimized to avoid serializing the same ID more than once; it instead drops a reference to a previously-pickled object if it encounters the same ID.
- That pickle behavior means that we can see different cache keys depending on whether an object id appears more than once in the hashed objects vs. the objects being functionally equivalent but distinct.
The cases I've investigated only involve the torch.device objects in the tensor graph args. That is, we may compile a graph with two tensor args, each referencing `torch.device('cpu')`. In one run, those devices may reference the same object; in another, they may reference distinct (but equivalent) objects. In practice, my observation is that the compiler is largely deterministic and this situation is rare. I've seen cache misses on a real benchmark only when enabling/disabling FakeTensor caching in order to introduce different code paths that otherwise produce the same fx graph. But the failing unit test seems to be enough motivation for a remediation?
I don't really love this solution, but I've failed to find another way to make the pickling phase robust to these kinds of changes, e.g., by changing the protocol version or by overriding internal methods (which would also be gross). But I'm definitely open to other creative ideas.
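For concreteness, a small standalone illustration of the pickle memoization behavior described above, which is what perturbs the hashed bytes (this assumes `torch.device` objects are not interned, so each call creates a distinct object):
```python
import hashlib
import pickle

import torch

d = torch.device("cpu")
shared = pickle.dumps([d, d])  # second occurrence becomes a memo reference
distinct = pickle.dumps([torch.device("cpu"), torch.device("cpu")])  # pickled in full twice

print(shared == distinct)  # False: same "value", different bytes
print(hashlib.sha256(shared).hexdigest() == hashlib.sha256(distinct).hexdigest())  # False
```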
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128366
Approved by: https://github.com/oulgen, https://github.com/eellison
Summary: The feature was previously disabled in fbcode because it broke the deterministic NE unit tests. It has now been on in OSS for quite a while and we verified that it has no NE impact on CMF, so we want to update the unit test and enable the feature.
Test Plan:
```
time buck2 test 'fbcode//mode/opt' fbcode//aps_models/ads/icvr/tests/ne/e2e_deterministic_tests:fm_tests -- --exact 'aps_models/ads/icvr/tests/ne/e2e_deterministic_tests:fm_tests - aps_models.ads.icvr.tests.ne.e2e_deterministic_tests.icvr_fm_test.ICVR_FM_DeterministicTest: test_icvr_fm_pt2_fsdp_multi_gpus'
```
Differential Revision: D58425432
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128555
Approved by: https://github.com/eellison
Summary: When calling a fallback op in the minimal_arrayref_interface mode with an optional tensor, a temporary RAIIAtenTensorHandle needs to be explicitly created in order to pass a pointer to the tensor as the optional tensor parameter.
Test Plan: CI
Differential Revision: D58528575
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128613
Approved by: https://github.com/hl475
When performing fused_all_gather_matmul/fused_matmul_reduce_scatter and gather_dim/scatter_dim != 0, a copy of the lhs operand (A_shard/A) is needed for layout transformation.
This copy can be avoided if the lhs operand already has the following stride order:
lhs.movedim(gather_dim, 0).contiguous().movedim(0, gather_dim).stride()
In `micro_pipeline_tp` passes, we enforce the lhs operand to have such stride order via `inductor_prims.force_stride_order`. This way if the lhs operand has a flexible layout, the copy is avoided.
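For illustration, the favorable stride order can be computed eagerly like this (the shapes and `gather_dim` are arbitrary):
```python
import torch

A = torch.randn(16, 32, 64)
gather_dim = 1
# Layout for which the extra copy can be skipped:
wanted = A.movedim(gather_dim, 0).contiguous().movedim(0, gather_dim)
print(A.stride())       # (2048, 64, 1) for a contiguous A
print(wanted.stride())  # (64, 1024, 1): the enforced stride order
```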
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127454
Approved by: https://github.com/Chillee
This PR introduces naive CPU impls for:
* `_jagged_to_padded_dense_forward()`
* `_padded_dense_to_jagged_forward()`
On the CUDA side, these are backed by lifted FBGEMM kernels. We may want to revisit the CPU versions with higher-performance implementations at a later time.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127007
Approved by: https://github.com/davidberard98
We should be able to remove this as, with the new canonicalisation, we
have that `a < b` and `-a > -b` should be canonicalised to the same
expression (if SymPy does not interfere too much).
nb. I thought this would further cut compilation time, but I was running
the benchmarks wrong (not removing Triton's cache, oops). It turns out that
after the first PR in this stack, https://github.com/pytorch/pytorch/issues/128398 is fully fixed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128500
Approved by: https://github.com/ezyang
ghstack dependencies: #128410, #128411
https://github.com/pytorch/pytorch/pull/126717 will skip the tests in both ABI compatible and non-ABI compatible mode.
It's not expected to skip them in non-ABI-compatible mode since they can actually run successfully in that mode and only have issues in ABI-compatible mode.
We leverage the existing `xfail_list` for those that will only fail in ABI compatible mode.
- `test_qlinear_add` is already in the `xfail_list`.
- `test_linear_packed` doesn't fail either in my local run (running with `TORCHINDUCTOR_ABI_COMPATIBLE=1`) or in the CI of this PR so I didn't add it into `xfail_list`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128506
Approved by: https://github.com/jgong5, https://github.com/desertfire
Fixes https://github.com/pytorch/pytorch/issues/128544
Fixes https://github.com/pytorch/pytorch/issues/128535
We had a problem with multithreading where the nonlocals were being
clobbered. In the first place, we stored these nonlocals because we
wanted to ferry information from an autograd.Function.apply to
autograd.Function.forward.
Our new approach is:
- pass the information directly as an input to the
autograd.Function.apply. This means that the autograd.Function.forward
will receive the information too.
- this messes up ctx.needs_input_grad, which has an element per input to
forward. The user should not see the additional information we passed.
We fix this by temporarily overriding ctx.needs_input_grad to the
right thing.
- this exposed a bug in that ctx.needs_input_grad wasn't correct for
TensorList inputs. This PR fixes that too.
Test Plan:
- existing and new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128547
Approved by: https://github.com/williamwen42, https://github.com/soulitzer
https://github.com/pytorch/pytorch/issues/127572
Allow mutations in backward on forward inputs, if:
1/ not mutating metadata (enforced at compilation time);
2/ if create_graph=True: the mutated input does not require_grad (enforced at runtime, where create_graph mode can be detected by checking torch.is_grad_enabled()).
Adding input_joint_info to track mutations of inputs during joint.
Created a separate field in ViewAndMutationMeta as it is filled only after joint fn tracing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128409
Approved by: https://github.com/bdhirsh
As part of #125683, this PR adds epilogue fusion support for bf16/fp16 gemms. The key changes are as follows:
1. bf16 linear w/ epilogue fusion of some ops was originally supported via ATen oneDNN linear pointwise ops. In order to match the ATen op semantics, in-template epilogue support is added to the cpp gemm template so that we would have: "gemm + in-template epilogues -> template buffer". If the template is chosen for codegen, the in-template epilogues will be concatenated with the out-of-template epilogues that are appended during the scheduling.
2. Support bf16/fp16 legalization for `codegen_loop_bodies` which is used to generate the epilogue loops.
3. We used to leverage the in-place buffer mechanism to handle the in-place buffers in the epilogue codegen, in particular, for the reuses for output buffers of GEMM, template and epilogues. This is not correct since the output buffer is an "output" not an "in-place" buffer of the template kernel itself. Now, we use a dedicated "aliases" dict to manage such buffer reuses and the intermediate aliasing buffers are removed after codegen.
4. Add `localize_buffer` method to `LocalBufferScope` to allow the replacement of a global buffer with a local one in the given inductor IR nodes. This helps the fused loops to work on smaller-sized local buffers for better data locality.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126545
Approved by: https://github.com/jansel
Minor tweak of the comparison, as using `assert` on `torch.allclose` prevents the mismatches from being logged. Also bumps a few tolerances that seem to be causing failures on sm86/sm90.
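For reference, the kind of change being described (the tolerances here are illustrative):
```python
import torch

a = torch.randn(4)
b = a + 1e-5
# Before: a failure only reports "AssertionError", hiding where the tensors differ.
assert torch.allclose(a, b, rtol=1e-3, atol=1e-3)
# After: failures report mismatched elements and max abs/rel differences.
torch.testing.assert_close(a, b, rtol=1e-3, atol=1e-3)
```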
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128553
Approved by: https://github.com/jcaip
## Context
This PR ports the GGML int8 per-channel matrix multiplication and matrix-vector multiplication Metal shaders into the ATen library.
llama.cpp LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
## Key Changes
Made the following changes to the original code:
* Memory layout of weight and scales is different than llama.cpp.
* Weight dequantization (scales multiplication) is done after MM is finished.
* Following PyTorch naming convention (M, K, N and assuming row major).
## Benchmark
When M = 1, the mv shader improves on the existing ATen int8mm by 40%.
When M > 4, the mm shader outperforms the existing ATen int8mm by up to 10x for large M, as shown below.

Hence the kernel chooses different shaders based on M.
## Test Plan
Tests are passing:
```
❯ python test/test_mps.py -v -k _int8_
/Users/larryliu/CLionProjects/pytorch/venv/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'dlopen(/Users/larryliu/CLionProjects/pytorch/venv/lib/python3.8/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c1017RegisterOperatorsD1Ev
Referenced from: <A770339A-37C9-36B2-84FE-4125FBE26FD6> /Users/larryliu/CLionProjects/pytorch/venv/lib/python3.8/site-packages/torchvision/image.so
Expected in: <5749F98A-0A0C-3F89-9CBF-277B3C8EA00A> /Users/larryliu/CLionProjects/pytorch/torch/lib/libtorch_cpu.dylib'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
warn(
test__int8_mm_m_1_k_32_n_32_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_1_k_32_n_64_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_1_k_64_n_32_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_1_k_64_n_64_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_32_k_32_n_32_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_32_k_32_n_64_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_32_k_64_n_32_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_32_k_64_n_64_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_64_k_32_n_32_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_64_k_32_n_64_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_64_k_64_n_32_mps (__main__.TestLinalgMPSMPS) ... ok
test__int8_mm_m_64_k_64_n_64_mps (__main__.TestLinalgMPSMPS) ... ok
----------------------------------------------------------------------
Ran 12 tests in 1.180s
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127646
Approved by: https://github.com/malfet
In a previous life, we used sympy.oo to represent the lower/upper bounds of integer ranges. Later, we changed this to be sys.maxsize - 1 for a few reasons: (1) sometimes we do tests on a value being exactly sys.maxsize, and we wanted to avoid a data dependent guard in this case, (2) sympy.oo corresponds to floating point infinity, so you get incorrect types for value ranges with oo, and (3) you can do slightly better reasoning if you assume that input sizes fall within representable 64-bit integer range.
After working in the sys.maxsize regime for a bit, I've concluded that this was actually a bad idea. Specifically, the problem is that you end up with sys.maxsize in your upper bound, and then whenever you do any sort of size-increasing computation like size * 2, you end up with 2 * sys.maxsize, and you end up doing a ton of arbitrary precision int computation that is totally unnecessary. A symbolic bound is better.
But especially after #126905, we can't go back to using sympy.oo, because that advertises that it's not an integer, and now your ValueRanges is typed incorrectly. So what do we do? We define a new numeric constant `int_oo`, which is like `sympy.oo` but it advertises `is_integer`. **test/test_sympy_utils.py** describes some basic properties of the number, and **torch/utils/_sympy/numbers.py** has the actual implementation.
The rest of the changes of the PR are working out the implications of this change. I'll give more commentary as inline comments.
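A few expected properties of `int_oo`, sketched from this description (see torch/utils/_sympy/numbers.py for the actual implementation):
```python
import sys

from torch.utils._sympy.numbers import int_oo

assert int_oo.is_integer           # unlike sympy.oo, it advertises integrality
assert int_oo > sys.maxsize        # strictly above any finite bound
assert int_oo + 1 == int_oo        # arithmetic saturates instead of blowing up
assert -int_oo < -sys.maxsize      # and symmetrically for the lower bound
```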
Fixes https://github.com/pytorch/pytorch/issues/127396
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127693
Approved by: https://github.com/lezcano
ghstack dependencies: #126905
Summary: There are clang errors in profiler_kineto. It would probably be a good idea to fix them as the file is already quite dense.
Test Plan: Make sure all tests under static_tests/lint_root on Phabricator pass
Differential Revision: D58431005
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128464
Approved by: https://github.com/aaronenyeshi
Related doc: https://docs.google.com/document/d/1BKyizkZPdri9mHqdDOLAUpkI7SbbKfLHRFVVpK9ZWqo/edit
Memory considerations:
- As with the existing SAC, cached values are cleared upon first use.
- We error if the user wishes to backward a second time on a region forwarded with SAC enabled.
In-place:
- We use version counting to detect whether any cached tensor has been mutated, and error if so. In-place operations that do not mutate cached tensors are allowed.
- `allow_cache_entry_mutation=True` can be passed to disable this check (useful in the case of auto AC where the user cleverly also saves the output of the in-place op).
Randomness, views
- Currently in this PR, we don't do anything special for randomness or views; the author of the policy function is expected to handle them properly. (Would it be beneficial to error? We either want to save all or recompute all random tensors.)
Tensor object preservation
- We guarantee that if a tensor does not require grad and it is saved, then what you get out is the same tensor object. If the tensor does require grad, we must detach to avoid creating a reference cycle. This is a nice guarantee for nested tensors, which care about the object identity of the offsets tensor.
Policy function
- Enum values are `{MUST,PREFER}_{SAVE,RECOMPUTE}` (bikeshed welcome). Alternatively there was `{SAVE,RECOMPUTE}_{NON_,}OVERRIDABLE`. The former was preferred because it seemed clearer that two clashing `MUST`s should error, whereas it is ambiguous whether two stacked `NON_OVERRIDABLE`s should silently ignore or error.
- On the usage of the Enum today: there is actually NO API to stack SAC policies today. The only thing the Enum should matter for in the near term is the compiler. Stacking SAC policies would be useful if someone wants to implement something like simple FSDP, but it is not perfect because with a policy of `PREFER_SAVE` you are actually saving more than autograd would save normally (this would be fixed with AC v3).
- The number of times we call the policy_fn is documented as part of the public API. We call the policy function for all ops except detach, because detach is itself called a different number of times by AC between forward and recompute.
- The policy function can be a stateful object (we do NOT make separate copies of this object for forward/recompute, the user is expected to handle that via is_recompute see below).
Tensors guaranteed to be the same tensor as-is
- Policy function signature takes ctx object as its first argument. The ctx function is an object encapsulating info that may be useful to the user, it currently only holds "is_recompute". Adding this indirection gives us flexibility to add more attrs later if necessary.
"bc-breaking" for existing users of the private API:
- Existing policy functions must now change their return value to use the Enum.
- Existing calls to `_pt2_selective_checkpoint_context_fn_gen` must be renamed to `gen_selective_checkpoint_context_fn`. The way you use the API remains the same. It would've been nice to do something different (not make the user have to use functools.partial?), but this was the easiest to compile (idk if this should actually be a constraint).
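As a rough sketch, a policy object under the API described above might look like the following (the enum class name and import location are assumptions; only the `MUST_`/`PREFER_` values are taken from this description):
```python
import torch
from torch.utils.checkpoint import CheckpointPolicy  # assumed class name and location

class SaveMatmulsPolicy:
    """Stateful policy object: save matmuls, let everything else be recomputed."""

    def __call__(self, ctx, op, *args, **kwargs):
        # ctx currently exposes is_recompute; op is the ATen overload being run.
        if op is torch.ops.aten.mm.default:
            return CheckpointPolicy.MUST_SAVE
        return CheckpointPolicy.PREFER_RECOMPUTE
```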
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125795
Approved by: https://github.com/Chillee, https://github.com/fmassa
Summary: prim::dtype has the signature `(Tensor a) -> int`, where it gets the dtype of the tensor and returns the integer corresponding to this dtype based on the enum in ScalarType.h. Previously we were converting prim::dtype by returning the actual dtype of the tensor (ex. torch.float32). This causes some incorrect control flow behavior, specifically where it checks if `prim::dtype(tensor) in [3, 5, 7]`, where [3, 5, 7] correspond to torch.int32, torch.float16, torch.float64. This control flow would always return False because we would be comparing torch.float32 against the integers [3, 5, 7], which is a type mismatch.
Test Plan: 7/22 internal models are now convertible and runnable in eager and sigmoid! P1410243909
Reviewed By: jiashenC
Differential Revision: D58469232
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128517
Approved by: https://github.com/jiashenC
`at::detail::computeStorageNbytesContiguous` does fewer data-dependent tests than `at::detail::computeStorageNbytes`. Therefore, use of the former is more likely to succeed with dynamic shapes. This PR detects is_contiguous and dispatches to the appropriate function. This should be helpful in unblocking aot_eager for torchrec. As an aside, this is an alternative to the unsound solution I had first proposed in another [PR](#128141).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128515
Approved by: https://github.com/ezyang
This PR lifts internal lowerings written for FBGEMM kernels that do jagged <-> padded dense conversions. In particular, this PR provides lowerings and meta registrations for the following ATen ops:
* `_jagged_to_padded_dense_forward()`
* `_padded_dense_to_jagged_forward()`
* NB: if `total_L` is not provided, the output shape is data-dependent. An unbacked SymInt is used for this case.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125968
Approved by: https://github.com/davidberard98
This PR intends to support the aten operations with the `out` tensor.
Currently, the AOT compile always does **NOT** keep input tensor mutations. According to the comments, this is because it has not encountered such a use case.
> For now there's no use case involving keeping input mutations in the graph (which we can only do in the inference case anyway). We can add this later if we need to.
However, for aten operations it is common for the `out` tensor to be an input parameter that needs to be mutated. This PR intends to support this by adding a `keep_inference_input_mutations` flag (`aot_inductor.keep_inference_input_mutations`). This flag gives the callee the flexibility to decide whether the AOT compile needs to keep input tensor mutations in the graph.
Take `clamp` as an example as follows.
```python
out_tensor = torch.randn(128, dtype=torch.float, device=device).fill_(-2.0)
inp_tensor = torch.randn(128, dtype=torch.float, device=device).fill_(1.0)
min_tensor = inp_tensor - 0.05
max_tensor = inp_tensor + 0.05
torch.clamp(input=inp_tensor, min=min_tensor, max=max_tensor, out=out_tensor)
```
W/O this PR
```python
def forward(self):
    arg0_1: "f32[128]"; arg1_1: "f32[128]"; arg2_1: "f32[128]"; arg3_1: "f32[128]";
    arg0_1, arg1_1, arg2_1, arg3_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
    clamp_min: "f32[128]" = torch.ops.aten.clamp_min.Tensor(arg0_1, arg1_1); arg0_1 = arg1_1 = None
    clamp_max: "f32[128]" = torch.ops.aten.clamp_max.Tensor(clamp_min, arg2_1); clamp_min = arg2_1 = None
    return (clamp_max, clamp_max)
```
W/ this PR
```python
def forward(self):
    arg0_1: "f32[128]"; arg1_1: "f32[128]"; arg2_1: "f32[128]"; arg3_1: "f32[128]";
    arg0_1, arg1_1, arg2_1, arg3_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
    clamp_min: "f32[128]" = torch.ops.aten.clamp_min.Tensor(arg0_1, arg1_1); arg0_1 = arg1_1 = None
    clamp_max: "f32[128]" = torch.ops.aten.clamp_max.Tensor(clamp_min, arg2_1); clamp_min = arg2_1 = None
    copy_: "f32[128]" = torch.ops.aten.copy_.default(arg3_1, clamp_max); arg3_1 = clamp_max = None
    return (copy_,)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124926
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/angelayi
Summary:
D56907877 modified the OSS commSplit. However, commSplit requires every rank to make the call even if it is no-color. ncclCommSplit will not create a communicator for no-color ranks, hence this line of code can potentially throw an error like `NCCL WARN CommUserRank : comm argument is NULL`.
Revert this change from D56907877
Test Plan: CI
Differential Revision: D58436088
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128459
Approved by: https://github.com/shuqiangzhang
Summary:
The previous side effect pruning algorithm would keep many dead cell
variables alive. For example, in
https://github.com/pytorch/pytorch/issues/125078, the compiled function
has one return but there were three in the Dynamo graph due to two
dead cell variables not being pruned away.
This PR adds a corrected algorithm. "new cell variables" are alive if
they can be reached from one of the following:
1. any of the tx.symbolic_locals or tx.stack (that is, if they are
involved in a return from the function or intermediate variable
during a graph break). Example: an alive NestedUserFunctionVariable
2. "mutations to pre-existing objects". Example: appending a
NestedUserFunctionVariable to a global list
The new algorithm reflects this, but please let me know if there are
more cases to handle.
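A hypothetical repro shape for the dead-cell situation (simplified and illustrative, not the actual code from issue #125078):
```python
import torch

@torch.compile(backend="eager", fullgraph=True)
def f(x):
    y = x + 1

    def unused_closure():  # dead: never returned or stored, so its cell should be pruned
        return y

    return y.sin()  # the Dynamo graph should end up with a single output

f(torch.randn(4))
```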
Test Plan:
- existing tests (afaict, test/dynamo/test_python_autograd is the best
SideEffects test case we have)
- see in test/dynamo/test_higher_order_ops that the expecttests changed
-- the functorch dynamo graphs no longer return dead cellvars.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128028
Approved by: https://github.com/jansel
This PR makes it so that we lazily save to the cache when backward is called, instead of always saving ahead of time. We have to pass a closure to post_compile to prevent cyclic dependencies.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126999
Approved by: https://github.com/bdhirsh
ghstack dependencies: #126791
This is a short-term fix (for 2.4). In the longer term we should
fix https://github.com/pytorch/pytorch/issues/128430
The problem is that warnings.warn calls inside Dynamo print
every time. Python warnings are supposed to print once, unless their
cache is reset: Dynamo ends up resetting that cache every time it runs.
As a workaround we provide our own warn_once cache that is keyed on the
warning msg. I am not worried about this increasing memory usage because
that's effectively what python's warnings.warn cache does.
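A minimal sketch of the workaround (illustrative, not the actual Dynamo code): a process-lifetime cache keyed on the message, so repeated warnings are dropped.
```python
import warnings

_warned_msgs = set()

def warn_once(msg, category=UserWarning):
    if msg in _warned_msgs:
        return
    _warned_msgs.add(msg)
    warnings.warn(msg, category)
```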
Test Plan:
- fix tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128456
Approved by: https://github.com/anijain2305
Fix https://github.com/pytorch/pytorch/issues/128287.
Previously, the assertions in `linear_add_bias` were pretty fragile:
```
assert packed_weight_node.name == "_reorder_linear_weight"
assert transpose_weight_node.name == "permute_default"
```
because the `name` can change to `_reorder_linear_weight_<id>`/`permute_default_<id>` if we have more than one reorder/permute.
Checking `target` instead of `name` solves this issue.
The UT is also updated to match more than one `linear_add_bias` pattern to cover this case.
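A runnable toy showing why matching on `name` is fragile while `target` is stable (a plain FX trace, not the actual Inductor pattern):
```python
import torch
from torch.fx import symbolic_trace

def f(w):
    # Two permutes: FX de-duplicates node names by appending a numeric suffix.
    return w.permute(1, 0) + w.permute(1, 0).relu()

gm = symbolic_trace(f)
permutes = [n for n in gm.graph.nodes if n.op == "call_method" and n.target == "permute"]
print([n.name for n in permutes])    # e.g. ['permute', 'permute_1'] -- names shift
print({n.target for n in permutes})  # {'permute'} -- the target stays stable
```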
Co-authored-by: Jiong Gong <jiong.gong@intel.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128473
Approved by: https://github.com/jgong5
Summary:
A number of features rely on TCPStore as a control plane. By default the TCPStore server is started on the rank0 trainer, and this can create a race condition where rank0 may exit (error or graceful exit) and any other ranks reading/writing will fail.
Solution: the TCPStore server should outlive all the trainer processes. Moving the ownership of the TCPStore to the torchelastic agent naturally fixes the lifecycle of the server.
Static rendezvous in torchelastic already supports sharing of the TCPStore server. We are extending this to the more commonly used c10d rendezvous handler.
Any handler that would like to manage the TCP store has to:
- Return true on `use_agent_store` property
- `RendezvousInfo`.`RendezvousStoreInfo`#[`master_addr/master_port`] values refer to managed TCPStore (those are returned on `next_rendezvous` call)
Note: in some instances users may want to use non-TCPStore based stores for the torchelastic rendezvous process, so the handler will need to create and hold a reference to TCPStore (as done in this change)
Test Plan:
`cat ~/workspace/dist-demo/stores.py`
~~~
import torch
import logging
import sys
import torch.distributed as dist
import torch
import os
import time
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler(sys.stderr))
logger.setLevel(logging.INFO)
def _run_test(store):
    if dist.get_rank() == 1:
        logger.info("Rank %s is sleeping", dist.get_rank())
        time.sleep(5)
        key = "lookup_key"
        logger.info("Checking key %s in store on rank %s", key, dist.get_rank())
        store.check([key])
    else:
        logger.info("rank %s done", dist.get_rank())
def main() -> None:
    use_gpu = torch.cuda.is_available()
    dist.init_process_group(backend="nccl" if use_gpu else "gloo")
    dist.barrier()
    logger.info(f"Hello World from rank {dist.get_rank()}")
    host = os.environ['MASTER_ADDR']
    port = os.environ['MASTER_PORT']
    world_size = os.environ['WORLD_SIZE']
    logger.info("testing TCPStore")
    store = dist.TCPStore(
        host_name=host, port=int(port), world_size=int(world_size),
    )
    _run_test(store)
if __name__ == "__main__":
    main()
~~~
With the fix (TORCH_DISABLE_SHARE_RDZV_TCP_STORE=0 or just drop the option)
~~~
(pytorch_38) [kurman@devgpu011.cln5 ~/local/pytorch (main)]$ TORCH_DISABLE_SHARE_RDZV_TCP_STORE=0 python -m torch.distributed.run --rdzv-backend c10d --nproc-per-node 3 ~/workspace/dist-demo/stores.py
master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
WARNING:__main__:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Hello World from rank 1
Hello World from rank 2
Hello World from rank 0
testing TCPStore
testing TCPStore
testing TCPStore
rank 2 done
Rank 1 is sleeping
rank 0 done
Checking key lookup_key in store on rank 1
~~~
TORCH_DISABLE_SHARE_RDZV_TCP_STORE=1
~~~
(pytorch_38) [kurman@devgpu011.cln5 ~/local/pytorch (main)]$ TORCH_DISABLE_SHARE_RDZV_TCP_STORE=1 python -m torch.distributed.run --rdzv-backend c10d --nproc-per-node 3 ~/workspace/dist-demo/stores.py
master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
WARNING:__main__:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Hello World from rank 0
Hello World from rank 2
Hello World from rank 1
testing TCPStore
testing TCPStore
testing TCPStore
rank 0 done
rank 2 done
Rank 1 is sleeping
Checking key lookup_key in store on rank 1
[rank1]: Traceback (most recent call last):
[rank1]: File "/home/kurman/workspace/dist-demo/stores.py", line 46, in <module>
[rank1]: main()
[rank1]: File "/home/kurman/workspace/dist-demo/stores.py", line 42, in main
[rank1]: _run_test(store)
[rank1]: File "/home/kurman/workspace/dist-demo/stores.py", line 22, in _run_test
[rank1]: store.check([key])
[rank1]: torch.distributed.DistNetworkError: Connection reset by peer
E0605 17:40:22.853277 140249136719680 torch/distributed/elastic/multiprocessing/api.py:832] failed (exitcode: 1) local_rank: 1 (pid: 2279237) of binary: /home/kurman/.conda/envs/pytorch_38/bin/python
Traceback (most recent call last):
File "/home/kurman/.conda/envs/pytorch_38/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/kurman/.conda/envs/pytorch_38/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/data/users/kurman/pytorch/torch/distributed/run.py", line 904, in <module>
main()
File "/data/users/kurman/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/data/users/kurman/pytorch/torch/distributed/run.py", line 900, in main
run(args)
File "/data/users/kurman/pytorch/torch/distributed/run.py", line 891, in run
elastic_launch(
File "/data/users/kurman/pytorch/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/data/users/kurman/pytorch/torch/distributed/launcher/api.py", line 263, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
/home/kurman/workspace/dist-demo/stores.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-06-05_17:40:22
host : devgpu011.cln5.facebook.com
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 2279237)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
~~~
Differential Revision: D58180193
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128096
Approved by: https://github.com/shuqiangzhang
With inlining of NN modules, these tests no longer raise runtime errors because changing static ptrs induces a re-recording instead of a runtime error. The solution is to run the test with inlining disabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128529
Approved by: https://github.com/anijain2305
ghstack dependencies: #128528
With inlining of NN modules, these tests no longer raise runtime errors because changing static ptrs induces a re-recording instead of a runtime error. The solution is to run the test with inlining disabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128528
Approved by: https://github.com/anijain2305
This PR implements "V0" of AOTAutogradCache. Given an input to AOTAutograd, we calculate a cache key, then save an AOTAutogradCacheEntry.
Each AOTAutogradCacheEntry has:
- A CompiledForward and optionally a CompiledBackward
- A bunch of metadata.
CompiledForward and CompiledBackward each save the *key* to the FXGraphCache associated with the compiled object. FXGraphCache populates this key field as long as it's able to return a compiled graph given a set of inputs. We then load the same object from the FXGraphCache on an AOTAutogradCache hit.
On cache miss:
- Run AOTAutograd, up to AOTAutogradDispatch.post_compile.
- Save an AOTAutogradCacheEntry to the cache after compiling the necessary portions and receiving a cache key from FXGraphCache. In this we *always* compile the backwards ahead of time. The PR above this one implements backward lazy caching, so that we only save to the cache after compiling the backward in a lazy backward scenario.
- Return the resulting object
On cache hit:
- Run AOTAutogradCacheEntry.post_compile() on the cache key.
- This attempts to load the forward and backward graphs from FXGraphCache
- As long as we successfully load from FXGraphCache, it's a hit. We then rewrap the callable with post compile wrappers using our saved metadata.
For now, we ignore the fakified out and debug wrappers. We only save to the cache if Fakified out is turned off.
V0 Guards behavior:
FXGraphCache serializes guards that are needed in the shape_env based on the symint inputs to the graph. The invariant that AOTAutograd uses here is that the sources for symints given to it by dynamo are exactly the same as the ones it passes to inductor, for both the forward and backward passes. (This does *not* mean that the tensor values passed in are the same: only that their symints are). That is, AOTAutograd and Inductor never create new guards based on symints with *different sources* than those passed to it by inductor.
We don't currently store any AOTAutograd specific guards: my hypothesis is that FXGraphCache already stores these, as any guards generated by AOTAutograd should already be in the shape_env before calling into inductor, and we don't generate new guards post inductor. If this is needed, I'll add it in another diff.
Testing:
We'll start with some basic unit tests, but I'll be adding more and more complicated testing as the next step.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126791
Approved by: https://github.com/bdhirsh
Before: `softmax` definition uses `jagged_unary_pointwise()` (wrong)
After: `softmax` impl adjusts the `dim` arg to account for the difference in dimensionality between the outer NT and the NT's `_values`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119459
Approved by: https://github.com/soulitzer
Fixes #126950
`ptd_state_dict` with `broadcast_from_rank0=False` might miss two condition checks in `set_optimizer_state_dict`.
Here we add another condition, `full_state_dict=True`, with the corresponding tensor distribution without broadcasting when `broadcast_from_rank0=False`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128004
Approved by: https://github.com/fegin
Fixes #120570
## Description
Update torch.nanmean() docstring to mention input dtype requirement as either floating point type or complex.
Previously, the torch.mean() docstring had been updated in #120208 in a similar manner, but the torch.nanmean() docstring was not updated.
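A quick illustration of the documented dtype requirement (the exact error text may vary across versions):
```python
import torch

print(torch.nanmean(torch.tensor([1.0, float("nan"), 3.0])))  # tensor(2.): floating point is fine
try:
    torch.nanmean(torch.tensor([1, 2, 3]))  # integer input is rejected
except RuntimeError as e:
    print(e)
```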
## Checklist
- [X] The issue that is being fixed is referred in the description.
- [X] Only one issue is addressed in this pull request.
- [x] Labels from the issue that this PR is fixing are added to this pull request.
- [X] No unnecessary issues are included into this pull request.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128155
Approved by: https://github.com/malfet
Thus far TunableOp was implemented for gemm, bgemm, and scaled_mm. gemm_and_bias was notably missing. This PR closes that gap.
This PR also fixes a regression after #124362 disabled the numerical check by default. The env var to enable it no longer worked.
CC @xw285cornell
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128143
Approved by: https://github.com/Skylion007
Not requiring all functions to have types allows a lot of 'Any' types to slip in, which poison types and make mypy unable to properly typecheck the code. I want to flip the default so that new files are required to have fully typed defs and we can have a burndown list of files that fail to require full types.
The preceding stack of PRs (cut up simply to keep the number of file changes per PR reasonable) adds `# mypy: allow-untyped-defs` to any file which didn't immediately pass mypy with the flag flipped. Due to changing files and merge conflicts, it will probably be necessary to make several passes before landing this final PR, which turns the option on.
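For reference, the migration mechanics look roughly like this (the file contents are hypothetical): a not-yet-typed file opts out with the per-file comment, while new files must have fully typed defs once the default is flipped.
```python
# Per-file opt-out added to not-yet-typed files by the preceding PRs:
# mypy: allow-untyped-defs

def legacy_helper(x):  # untyped def: tolerated only because of the marker above
    return x + 1

def new_helper(x: int) -> int:  # what fully typed defs look like once the default flips
    return x + 1
```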
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127836
Approved by: https://github.com/oulgen, https://github.com/Skylion007
Summary: I admit I'm not 100% sure what I'm doing here. I'm hitting a bug in the FX graph cache when we try to evaluate a guards expression. We're creating guards that look like this:
```
Ne(CeilToInt(FloatTrueDiv(ToFloat(8*L['t0']) - 4.0, 8.0))*CeilToInt(FloatTrueDiv(ToFloat(8*L['t1']) - 4.0, 8.0)), CeilToInt(FloatTrueDiv(ToFloat(8*L['t1']) - 4.0, 8.0))) and ...
```
It looks like we have a facility to define these operators in the SYMPY_INTERP map and we're just missing FloatTrueDiv and ToFloat. What's surprising to me is that we're only hitting this problem with the FX graph cache enabled. We can create such guards, but we've never actually evaluated any?
Test Plan:
`TORCHINDUCTOR_FX_GRAPH_CACHE=1 python benchmarks/dynamo/torchbench.py --ci --accuracy --timing --explain --inductor --device cuda --inference --bfloat16 --only detectron2_fcos_r_50_fpn`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128418
Approved by: https://github.com/ezyang
Summary: Improve the convert fp32 to fp16 fx pass to use to_dtype node and const folding instead of inplace conversion.
Test Plan:
```
buck2 test @//mode/{opt,inplace} //glow/fb/fx/fba/tests:test_fba_pass_manager_builder
```
Differential Revision: D57803843
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127829
Approved by: https://github.com/Skylion007
Fixes #127905
### Description
Add docstring to torch/onnx/symbolic_opset9.py:sigmoid function
### Checklist
- [x] The issue that is being fixed is referred in the description
- [x] Only one issue is addressed in this pull request
- [x] Labels from the issue that this PR is fixing are added to this pull request
- [x] No unnecessary issues are included into this pull request
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128082
Approved by: https://github.com/titaiwangms
Summary:
Pass parameters from the request to the dump_nccl_trace_pickle handler.
The supported parameter names and values are all lowercase:
includecollectives={true, false}
includestacktraces={true, false}
onlyactive={true, false}
Example post is:
/handler/dump_nccl_trace_pickle?includecollectives=true&includestacktraces=false&onlyactive=true
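A hedged sketch of issuing such a request; the host, port, and HTTP verb are assumptions, and only the handler path and query parameters come from this PR:
```python
import urllib.request

# Sketch only: host, port, and the HTTP verb are assumptions; the path and
# query parameters are the ones documented above.
url = (
    "http://localhost:1234/handler/dump_nccl_trace_pickle"
    "?includecollectives=true&includestacktraces=false&onlyactive=true"
)
req = urllib.request.Request(url, data=b"", method="POST")
with urllib.request.urlopen(req) as resp:
    pickled_trace = resp.read()  # pickled flight-recorder dump
```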
Test Plan:
unit tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128307
Approved by: https://github.com/d4l3k
ghstack dependencies: #128191
Summary:
Add a unit test for the only_active flag of the _dump_nccl_trace API call.
With this flag, we only expect active records to be returned.
Test Plan:
Unit test.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128191
Approved by: https://github.com/d4l3k
Fixes #127897
### Description
Add docstring to torch/onnx/symbolic_opset9.py:sigmoid function
### Checklist
- [x] The issue that is being fixed is referred in the description
- [x] Only one issue is addressed in this pull request
- [x] Labels from the issue that this PR is fixing are added to this pull request
- [x] No unnecessary issues are included into this pull request
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128171
Approved by: https://github.com/titaiwangms
Summary:
The previous side effect pruning algorithm would keep many dead cell
variables alive. For example, in
https://github.com/pytorch/pytorch/issues/125078, the compiled function
has one return but there were three in the Dynamo graph due to two
dead cell variables not being pruned away.
This PR adds a corrected algorithm: "new cell variables" are alive if
they can be reached from one of the following:
1. any of the tx.symbolic_locals or tx.stack (that is, if they are
involved in a return from the function or are an intermediate variable
during a graph break). Example: an alive NestedUserFunctionVariable
2. "mutations to pre-existing objects". Example: appending a
NestedUserFunctionVariable to a global list
The new algorithm reflects this, but please let me know if there are
more cases to handle.
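A hedged illustration of the kind of closure that produces a dead cell variable (not the exact reproducer from the linked issue; names are made up):
```python
import torch

def fn(x):
    y = x + 1

    # `helper` closes over `y`, creating a new cell variable. It is never
    # returned, stored on a pre-existing object, or left on the stack at a
    # graph break, so the corrected pruning keeps it out of the graph outputs.
    def helper():
        return y * 2

    return x.sin()

compiled = torch.compile(fn)
print(compiled(torch.randn(3)))
```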
Test Plan:
- existing tests (afaict, test/dynamo/test_python_autograd is the best
SideEffects test case we have)
- see in test/dynamo/test_higher_order_ops that the expecttests changed
-- the functorch dynamo graphs no longer return dead cellvars.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128028
Approved by: https://github.com/jansel
We observed a significant compile time regression in torchtitan when turning on 2D parallel + torch.compile recently, so I decided to get a deeper understanding of why.
It turns out this is affecting **all the trainings** that have functional collectives captured in the graph, not only 2D parallel (2D parallel was just the job that happened to have collectives captured in the TP region).
The root cause is that during inductor lowering we call the comm analysis pass to get an estimated collective time for each collective node in the graph, and for every collective node checked we call `get_gpu_type()`, which under the hood calls `torch.utils.collect_env.run` to get the GPU info. However, this call is super expensive! It effectively spawns a new process and calls `nvidia-smi`, so the cost is **linear** in the number of collective nodes in the graph.
see https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py#L75
The fix is to add an lru cache to the function, so that we only call it once and reuse the cached result afterwards.
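A minimal sketch of the shape of the fix, assuming a `get_gpu_type`-style helper that shells out to `nvidia-smi` (the real function lives in inductor's comm analysis path; details here are illustrative):
```python
import functools
import subprocess

# Sketch only: the real helper lives in inductor's comm analysis code; this
# shows why caching turns N subprocess spawns into one per process.
@functools.lru_cache(maxsize=1)
def get_gpu_type() -> str:
    # Spawning nvidia-smi is expensive; with the cache it runs once instead of
    # once per collective node in the graph. (Assumes nvidia-smi is available.)
    out = subprocess.run(
        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
        capture_output=True, text=True,
    )
    return out.stdout.strip()
```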
torchtitan benchmark shows:
* before this fix: 2D parallel + fp8 compile time: 6min +
* after this fix: 2D parallel + fp8 compile time: 2min 48s (more than 100% improvement)
There's more room to improve the compile time, but this PR fixes the biggest regression I've found so far.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128363
Approved by: https://github.com/yf225
### Motivation
The Intel Gaudi accelerator (device name hpu) is seen to have a good pass rate with the pytorch framework UTs; however, being an out-of-tree device, we face challenges in adapting the device to natively run the existing pytorch UTs under pytorch/test. The UTs are nevertheless a good indicator of the device stack health, and as such we run them regularly with adaptations.
Although we can add the Gaudi/HPU device to generate the device-specific tests using the TORCH_TEST_DEVICES environment variable, we miss out on a lot of features such as executing for specific dtypes, skipping, and overriding opInfo. With significant changes introduced every Pytorch release, maintaining these adaptations becomes difficult and time consuming.
Hence with this PR we introduce the Gaudi device in the common_device_type framework, so that the tests are instantiated for Gaudi when the library is loaded.
The eventual goal is to introduce Gaudi out-of-tree support as an equivalent to in-tree devices.
### Changes
Add HPUTestBase, a subclass of DeviceTypeTestBase, specifying appropriate attributes for Gaudi/HPU.
Include code to check whether the Intel Gaudi software library is loaded and, if so, add the device to the list of devices considered for instantiation of device-type tests.
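A rough sketch of the shape of such a test base class; the attribute and hook names are assumptions, and the real change lives in torch/testing/_internal/common_device_type.py:
```python
# Rough sketch: attribute/hook names are assumptions; in the real change the
# class is registered only when the Intel Gaudi software library is detected.
from torch.testing._internal.common_device_type import DeviceTypeTestBase

class HPUTestBase(DeviceTypeTestBase):
    device_type = "hpu"
    primary_device = "hpu:0"

    @classmethod
    def get_primary_device(cls):
        return cls.primary_device
```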
### Additional Context
Please refer to the following RFC: https://github.com/pytorch/rfcs/pull/63/
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126970
Approved by: https://github.com/albanD
gen_static_runtime_ops hasn't been updated in a while. In preparation for https://github.com/pytorch/pytorch/pull/127675 in which I need to re-run the codegen step for cumprod, I want to land these changes beforehand in case there are any other issues that arise.
I added a number of ops to the blocklist:
```
+ "_nested_tensor_storage_offsets",
+ "_nested_get_values", # no CPU backend
+ "_nested_get_values_copy", # no CPU backend
+ "_nested_view_from_jagged", # testing needs to be patched
+ "_nested_view_from_jagged_copy", # testing needs to be patched
+ "_nested_view_from_buffer", # testing needs to be patched
+ "_nested_view_from_buffer_copy", # testing needs to be patched
+ "_int_mm", # testing needs to be patched
+ "_to_sparse_csc", # testing needs to be patched
+ "_to_sparse_csr", # testing needs to be patched
+ "segment_reduce", # testing needs to be patched
```
Most of these are added just because testing doesn't work right now.
Additionally, a few `fft` ops seem to have been removed from native_functions.yaml; I'm guessing it's unlikely FFT would have been used in many real models though.
Differential Revision: [D58329403](https://our.internmc.facebook.com/intern/diff/D58329403/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128299
Approved by: https://github.com/YuqingJ
Some features of third-party devices depend on TraceUtils.h, so some of the CUDA code was removed and split into NCCLUtils files.
In addition, some common functions still remain in TraceUtils.h since I'm not sure if other devices will use them later.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126969
Approved by: https://github.com/c-p-i-o
This PR properly registers the tensor used in the module compute as a parameter. This bug was hidden previously because all tensors on nn modules were considered constant by dynamo; with inlined NN modules, this is no longer the case.
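For context, a hedged minimal example of the distinction; the module and attribute names are illustrative:
```python
import torch
import torch.nn as nn

class M(nn.Module):
    def __init__(self):
        super().__init__()
        # A plain tensor attribute does not show up in .parameters() and, with
        # inlined NN modules, is no longer treated as a constant by dynamo.
        self.scale_tensor = torch.ones(4)
        # Registering it as a Parameter makes it a proper module parameter.
        self.scale_param = nn.Parameter(torch.ones(4))

    def forward(self, x):
        return x * self.scale_param + self.scale_tensor

print(list(dict(M().named_parameters())))  # ['scale_param']
```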
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128356
Approved by: https://github.com/anijain2305
ghstack dependencies: #128355
Today, inlining built-in nn modules is not compatible with parameter freezing. Freezing parameters and then constant-folding them through the graph relies on the assumption that they will not be inputs and will be static across calls to the same graph. When inlining built-in nn modules this assumption is broken and we reuse the same graph for different instances of the same nn module. There are three options: 1) abandon constant folding, 2) create a dispatcher layer (like cudagraphs) which will dispatch to the correct constant-folded graph for each distinct set of parameters, or 3) recompile.
This PR implements option 3 by introducing guards on the parameter pointers, since freezing is relatively rare and performance-sensitive. Option 2 had many more unknowns, and option 1 is not viable due to the drop in performance.
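An illustration of the pointer-guard idea (not dynamo's actual guard machinery): record the parameter pointers at compile time and treat any change as a reason to recompile.
```python
import torch.nn as nn

# Illustration only: the frozen, constant-folded graph stays valid only while
# these pointers are unchanged.
def param_signature(mod: nn.Module):
    return tuple(p.data_ptr() for p in mod.parameters())

m1, m2 = nn.Linear(4, 4), nn.Linear(4, 4)
print(param_signature(m1) == param_signature(m1))  # True: guard holds, reuse graph
print(param_signature(m1) == param_signature(m2))  # False: new instance -> recompile
```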
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128355
Approved by: https://github.com/anijain2305
We implemented a lowering for the avg_pool3d_backward operation and created tests for it.
We ran some benchmarks and achieved the following results:
```
[-------------- avgpool_3d_backwards --------------]
| Decomposed | Eager
16 threads: ----------------------------------------
(3, 5, 400, 200, 200) | 6061 | 11160
(3, 5, 300, 200, 200) | 4547 | 8372
(3, 5, 200, 200, 200) | 3032 | 5585
(3, 5, 300, 300, 300) | 10100 | 18840
(3, 5, 100, 100, 100) | 381 | 703
(3, 5, 100, 300, 200) | 2270 | 4190
(8, 8, 128, 128, 128) | 3397 | 6253
(2, 3, 150, 150, 150) | 520 | 947
(1, 3, 128, 128, 128) | 161 | 299
(8, 16, 64, 64, 64) | 851 | 1569
(1, 1, 50, 50, 50) | 17 | 11
(3, 5, 20, 40, 40) | 17 | 30
(3, 5, 10, 20, 20) | 17 | 11
(1, 1, 10, 10, 10) | 16 | 11
(3, 5, 5, 10, 10) | 17 | 11
(3, 5, 2, 5, 5) | 17 | 11
```
These were run on an RTX 3050, so we were not able to allocate larger tensors due to memory limitations.
We believe it would be beneficial to benchmark this on more recent hardware, just to check if the performance holds up with larger sizes.
Furthermore, we also refactored code from adaptive_avg_pool2d and adaptive_max_pool2d, to reduce code duplication.
We diffed the kernels and they are identical.
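A hedged sketch of how such a comparison might be reproduced; the shapes, kernel size, and iteration count are assumptions:
```python
import torch
from torch.utils.benchmark import Timer

# Sketch only: shapes, kernel size, and iteration count are assumptions.
x = torch.randn(3, 5, 100, 100, 100, device="cuda", requires_grad=True)

def run_backward(pool, inp):
    out = pool(inp)
    torch.autograd.grad(out, inp, torch.ones_like(out))

eager = lambda t: torch.nn.functional.avg_pool3d(t, kernel_size=3)
decomposed = torch.compile(eager)
run_backward(decomposed, x)  # warm-up: compile forward and backward

for name, fn in [("eager", eager), ("decomposed", decomposed)]:
    timer = Timer(stmt="run_backward(fn, x)",
                  globals={"run_backward": run_backward, "fn": fn, "x": x})
    print(name, timer.timeit(10))
```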
Fixes #127101
Co-authored-by: Martim Mendes <martimccmendes@tecnico.ulisboa.pt>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127722
Approved by: https://github.com/jansel
This PR intends to support aten operations with an `out` tensor.
Currently, AOT compile always does **NOT** keep input tensor mutations. According to the comments, this is because it has not encountered such a use case.
> For now there's no use case involving keeping input mutations in the graph (which we can only do in the inference case anyway). We can add this later if we need to.
However, for aten operations it is common for the `out` tensor to be an input parameter that needs to be mutated. This PR intends to support this by adding a `keep_inference_input_mutations` flag (`aot_inductor.keep_inference_input_mutations`). This flag gives the callee the flexibility to decide whether the AOT compile needs to keep input tensor mutations in the graph.
Take `clamp` as an example as follows.
```python
out_tensor = torch.randn(128, dtype=torch.float, device=device).fill_(-2.0)
inp_tensor = torch.randn(128, dtype=torch.float, device=device).fill_(1.0)
min_tensor = inp_tensor - 0.05
max_tensor = inp_tensor + 0.05
torch.clamp(input=inp_tensor, min=min_tensor, max=max_tensor, out=out_tensor)
```
W/O this PR
```python
def forward(self):
arg0_1: "f32[128]"; arg1_1: "f32[128]"; arg2_1: "f32[128]"; arg3_1: "f32[128]";
arg0_1, arg1_1, arg2_1, arg3_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
clamp_min: "f32[128]" = torch.ops.aten.clamp_min.Tensor(arg0_1, arg1_1); arg0_1 = arg1_1 = None
clamp_max: "f32[128]" = torch.ops.aten.clamp_max.Tensor(clamp_min, arg2_1); clamp_min = arg2_1 = None
return (clamp_max, clamp_max)
```
W/ this PR
```python
def forward(self):
arg0_1: "f32[128]"; arg1_1: "f32[128]"; arg2_1: "f32[128]"; arg3_1: "f32[128]";
arg0_1, arg1_1, arg2_1, arg3_1, = fx_pytree.tree_flatten_spec([], self._in_spec)
clamp_min: "f32[128]" = torch.ops.aten.clamp_min.Tensor(arg0_1, arg1_1); arg0_1 = arg1_1 = None
clamp_max: "f32[128]" = torch.ops.aten.clamp_max.Tensor(clamp_min, arg2_1); clamp_min = arg2_1 = None
copy_: "f32[128]" = torch.ops.aten.copy_.default(arg3_1, clamp_max); arg3_1 = clamp_max = None
return (copy_,)
```
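A hedged sketch of toggling the flag; the exact config path is an assumption based on the flag name mentioned above:
```python
import torch._inductor.config as inductor_config

# Assumption: the flag is surfaced as aot_inductor.keep_inference_input_mutations
# on the inductor config; with it set, the copy_ into the out tensor (arg3_1
# above) is kept in the exported graph.
inductor_config.aot_inductor.keep_inference_input_mutations = True
```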
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124926
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/angelayi
Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.
For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
## CI/CD security principles
_Audience_: Contributors and reviewers, especially if modifying the workflow files/build system.
PyTorch CI/CD security philosophy is based on finding a balance between open and transparent CI pipelines while keeping the environment efficient and safe.
PyTorch testing requirements are complex, and a large part of the code base can only be tested on specialized powerful hardware, such as GPU, making it a lucrative target for resource misuse. To prevent this, we require workflow run approval for PRs from non-member contributors. To keep the volume of those approvals relatively low, we easily extend write permissions to the repository to regular contributors.
More widespread write access to the repo presents challenges when it comes to reviewing changes, merging code into trunk, and creating releases. [Protected branches](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/managing-protected-branches/about-protected-branches) are used to restrict the ability to merge to the trunk/release branches only to the repository administrators and merge bot. The merge bot is responsible for mechanistically merging the change and validating reviews against the path-based rules defined in [merge_rules.yml](https://github.com/pytorch/pytorch/blob/main/.github/merge_rules.yaml). Once a PR has been reviewed by person(s) mentioned in these rules, leaving a `@pytorchbot merge` comment on the PR will initiate the merge process. To protect merge bot credentials from leaking, merge actions must be executed only on ephemeral runners (see definition below) using a specialized deployment environment.
To speed up the CI system, build steps of the workflow rely on the distributed caching mechanism backed by [sccache](https://github.com/mozilla/sccache), making them susceptible to cache corruption compromises. For that reason, binary artifacts generated during CI should not be executed in an environment that has access to any sensitive/non-public information and should not be published for use by a general audience. One should not have any expectation about the lifetime of those artifacts, although in practice they likely remain accessible for about two weeks after the PR has been closed.
To speed up CI system setup, PyTorch relies heavily on Docker to pre-build and pre-install the dependencies. To prevent a potentially malicious PR from altering images that were published in the past, ECR has been configured to use immutable tags.
To improve runner availability and more efficient resource utilization, some of the CI runners are non-ephemeral, i.e., workflow steps from completely unrelated PRs could be scheduled sequentially on the same runner, making them susceptible to reverse shell attacks. For that reason, PyTorch does not rely on the repository secrets mechanism, as these can easily be compromised in such attacks.
### Release pipelines security
To ensure safe binary releases, PyTorch release pipelines are built on the following principles:
- All binary builds/upload jobs must be run on ephemeral runners, i.e., on a machine that is allocated from the cloud to do the build and released back to the cloud after the build is finished. This protects those builds from interference from external actors, who potentially can get reverse shell access to a non-ephemeral runner and wait there for a binary build.
- All binary builds are cold-start builds, i.e., distributed caching/incremental builds are not permitted. This renders builds much slower than incremental CI builds but isolates them from potential compromises of the intermediate artifacts caching systems.
- All upload jobs are executed in [deployment environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) that are restricted to protected branches.
- Security credentials needed to upload binaries to PyPI/conda or stable indexes `download.pytorch.org/whl` are never uploaded to repo secrets storage/environment. This requires an extra manual step to publish the release but ensures that access to those would not be compromised by deliberate/accidental leaks of secrets stored in the cloud.
- No binary artifacts should be published to GitHub releases pages, as these are overwritable by anyone with write permission to the repo.