pytorch

frozenleaves/pytorch

Fork 0

mirror of https://github.com/pytorch/pytorch.git synced 2025-11-03 23:45:05 +08:00

Author	SHA1	Message	Date
Tushar Jain	70b9c4736d	fix fr reset api Summary: - there are various places that access fr's `entries_` field - if we empty the entries_ on reset, the accesses can result in an error - so we only perform a soft delete instead of clearing out the entries completely - only reset id_ on the reset - keep track of a reset_epoch which increments every time reset is called - dump_entries only returns entries from the latest epoch - APIs that access entries also check if the reset epoch matches - make the `next_` always track the index in the circular buffer - this change was needed to make the soft delete's implementation easier	2025-10-29 14:36:45 -07:00

Author

SHA1

Message

Date

Tushar Jain

70b9c4736d

fix fr reset api

Summary:
- there are various places that access fr's `entries_` field
- if we empty the entries_ on reset, the accesses can result in an error
- so we only perform a soft delete instead of clearing out the entries completely
  - only reset id_ on the reset
  - keep track of a reset_epoch which increments every time reset is called
  - dump_entries only returns entries from the latest epoch
  - APIs that access entries also check if the reset epoch matches
- make the `next_` always track the index in the circular buffer - this change was needed to make the soft delete's implementation easier

2025-10-29 14:36:45 -07:00

635 changed files with 7408 additions and 17580 deletions

									
										5

.ci/docker/build.sh
									
												View File
												
				@ -195,16 +195,13 @@ case "$tag" in

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)

				  pytorch-linux-jammy-xpu-n-py3)

				    ANACONDA_PYTHON_VERSION=3.10

				    GCC_VERSION=11

				    VISION=yes

				    XPU_VERSION=2025.2

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    if [[ $tag =~ "benchmarks" ]]; then

				      INDUCTOR_BENCHMARKS=yes

				    fi

				    ;;

				  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)

				    ANACONDA_PYTHON_VERSION=3.10

									
										2

.ci/docker/common/install_acl.sh
									
												View File
												
				@ -3,7 +3,7 @@

				set -eux

				ACL_VERSION=${ACL_VERSION:-"v52.6.0"}

				ACL_VERSION=${ACL_VERSION:-"v25.02"}

				ACL_INSTALL_DIR="/acl"

				# Clone ACL

									
										10

.ci/docker/common/install_conda.sh
									
												View File
												
				@ -49,20 +49,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

				    export SYSROOT_DEP="sysroot_linux-64=2.17"

				  fi

				# Install correct Python version

				# Also ensure sysroot is using a modern GLIBC to match system compilers

				if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then

				  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\

				             python="3.14.0" \

				             ${SYSROOT_DEP} \

				             -c conda-forge

				else

				  # Install correct Python version

				  # Also ensure sysroot is using a modern GLIBC to match system compilers

				  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\

				             python="$ANACONDA_PYTHON_VERSION" \

				             ${SYSROOT_DEP}

				fi

				  # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30

				  # which is provided in libstdcxx 12 and up.

				  conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge

									
										4

.ci/docker/common/install_rocm.sh
									
												View File
												
				@ -40,7 +40,11 @@ EOF

				    # Default url values

				    rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"

				    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"

				    # Add amdgpu repository

				    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`

				    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

				    # Add rocm repository

				    wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -

									
										4

.ci/docker/common/install_rocm_magma.sh
									
												View File
												
				@ -12,8 +12,8 @@ function do_install() {

				    rocm_version_nodot=${rocm_version//./}

				    # post merge of https://github.com/icl-utk-edu/magma/pull/65

				    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f

				    # https://github.com/icl-utk-edu/magma/pull/65

				    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec

				    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

				    rocm_dir="/opt/rocm"

									
										2

.ci/docker/manywheel/build.sh
									
												View File
												
				@ -97,7 +97,7 @@ case ${image} in

				    manylinux2_28-builder:xpu)

				        TARGET=xpu_final

				        GPU_IMAGE=amd64/almalinux:8

				        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"

				        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"

				        MANY_LINUX_VERSION="2_28"

				        ;;

				    *)

21

.ci/docker/requirements-ci.txt

View File

 @ -138,12 +138,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_binary_ufuncs.py
 numpy==1.22.4; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
 numpy==2.3.4; python_version >= "3.14"
 numpy==2.1.2; python_version >= "3.13"
 pandas==2.0.3; python_version < "3.13"
 pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
 pandas==2.3.3; python_version >= "3.14"
 pandas==2.2.3; python_version >= "3.13"
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
 @ -155,8 +153,7 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py
 optree==0.13.0 ; python_version < "3.14"
 optree==0.17.0 ; python_version >= "3.14"
 optree==0.13.0
 #Description: A library for tree manipulation
 #Pinned versions: 0.13.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
 @ -255,8 +252,7 @@ scikit-image==0.22.0
 #test that import:
 scipy==1.10.1 ; python_version <= "3.11"
 scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
 scipy==1.16.2 ; python_version >= "3.14"
 scipy==1.14.1 ; python_version >= "3.12"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
 @ -328,8 +324,7 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:
 lxml==5.3.0 ; python_version < "3.14"
 lxml==6.0.2 ; python_version >= "3.14"
 lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting
 PyGithub==2.3.0
 @ -339,9 +334,7 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:
 onnx==1.19.1 ; python_version < "3.14"
 # Unpin once Python 3.14 is supported. See  onnxruntime issue 26309.
 onnx==1.18.0 ; python_version == "3.14"
 onnx==1.19.1
 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
 @ -366,7 +359,7 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py
 # To build PyTorch itself
 pyyaml==6.0.3
 pyyaml==6.0.2
 pyzstd
 setuptools==78.1.1
 packaging==23.1

									
										5

.ci/docker/ubuntu-xpu/Dockerfile
									
												View File
												
				@ -54,15 +54,12 @@ ENV OPENSSL_DIR /opt/openssl

				RUN rm install_openssl.sh

				ARG INDUCTOR_BENCHMARKS

				ARG ANACONDA_PYTHON_VERSION

				ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION

				COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh

				COPY ./common/common_utils.sh common_utils.sh

				COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt

				COPY ci_commit_pins/timm.txt timm.txt

				COPY ci_commit_pins/torchbench.txt torchbench.txt

				RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt

				# Install XPU Dependencies

				ARG XPU_VERSION

									
										2

.ci/lumen_cli/pyproject.toml
									
												View File
												
				@ -6,7 +6,7 @@ dependencies = [

				    "GitPython==3.1.45",

				    "docker==7.1.0",

				    "pytest==7.3.2",

				    "uv==0.9.6"

				    "uv==0.9.5"

				]

				[tool.setuptools]

									
										8

.ci/magma-rocm/Makefile
									
												View File
												
				@ -1,7 +1,7 @@

				SHELL=/usr/bin/env bash

				DOCKER_CMD ?= docker

				DESIRED_ROCM ?= 7.1

				DESIRED_ROCM ?= 7.0

				DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))

				PACKAGE_NAME = magma-rocm

				# inherit this from underlying docker image, do not pass this env var to docker

				@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \

					magma-rocm/build_magma.sh

				.PHONY: all

				all: magma-rocm71

				all: magma-rocm70

				all: magma-rocm64

				@ -25,11 +24,6 @@ clean:

					$(RM) -r magma-*

					$(RM) -r output

				.PHONY: magma-rocm71

				magma-rocm71: DESIRED_ROCM := 7.1

				magma-rocm71:

					$(DOCKER_RUN)

				.PHONY: magma-rocm70

				magma-rocm70: DESIRED_ROCM := 7.0

				magma-rocm70:

									
										6

.ci/magma-rocm/build_magma.sh
									
												View File
												
				@ -6,8 +6,8 @@ set -eou pipefail

				# The script expects DESIRED_CUDA and PACKAGE_NAME to be set

				ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

				# post merge of https://github.com/icl-utk-edu/magma/pull/65

				MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f

				# https://github.com/icl-utk-edu/magma/pull/65

				MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec

				# Folders for the build

				PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata

				@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE

				# Fetch magma sources and verify checksum

				pushd ${PACKAGE_DIR}

				git clone https://github.com/icl-utk-edu/magma

				git clone https://github.com/jeffdaily/magma

				pushd magma

				git checkout ${MAGMA_VERSION}

				popd

									
										2

.ci/pytorch/build.sh
									
												View File
												
				@ -426,7 +426,7 @@ fi

				if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then

				  # export test times so that potential sharded tests that'll branch off this build will use consistent data

				  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build

				  PYTHONPATH=. python tools/stats/export_test_times.py

				  python tools/stats/export_test_times.py

				fi

				# don't do this for bazel or s390x or riscv64 as they don't use sccache

				if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

									
										6

.ci/pytorch/test.sh
									
												View File
												
				@ -572,8 +572,6 @@ fi

				if [[ "${TEST_CONFIG}" == *cpu* ]]; then

				  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)

				elif [[ "${TEST_CONFIG}" == *xpu* ]]; then

				  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)

				else

				  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)

				fi

				@ -667,8 +665,6 @@ test_perf_for_dashboard() {

				    device=cuda_b200

				  elif [[ "${TEST_CONFIG}" == *rocm* ]]; then

				    device=rocm

				  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then

				    device=xpu

				  fi

				  for mode in "${modes[@]}"; do

				@ -1761,7 +1757,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then

				  else

				    # Do this after checkout_install_torchbench to ensure we clobber any

				    # nightlies that torchbench may pull in

				    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then

				    if [[ "${TEST_CONFIG}" != *cpu* ]]; then

				      install_torchrec_and_fbgemm

				    fi

				    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"

									
										4

.github/actions/diskspace-cleanup/action.yml
									
										vendored
									
												View File
												
				@ -27,9 +27,7 @@ runs:

				            docker system prune -af

				            diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')

				            if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then

				                diskspace_cutoff_int=$((diskspace_cutoff + 0))

				                difference=$((100 - diskspace_cutoff_int))

				                echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."

				                echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."

				                echo "$msg"

				                exit 1

				            else

2

.github/ci_commit_pins/audio.txt vendored

View File

 @ -1 +1 @@
 b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
 bbe7363897764f9e758d851cd0340147d27f94

2

.github/ci_commit_pins/vision.txt vendored

View File

 @ -1 +1 @@
 cfbc5c2f1c798991715a6b06bb3ce46478c4487c
 d2ab791d437309f91e0486eb9fa7f00badc17

									
										2

.github/pytorch-probot.yml
									
										vendored
									
												View File
												
				@ -19,7 +19,6 @@ ciflow_push_tags:

				- ciflow/inductor-perf-test-nightly-rocm-mi300

				- ciflow/inductor-perf-test-nightly-rocm-mi355

				- ciflow/inductor-perf-test-nightly-x86-zen

				- ciflow/inductor-perf-test-nightly-xpu

				- ciflow/inductor-periodic

				- ciflow/inductor-rocm

				- ciflow/linux-aarch64

				@ -27,7 +26,6 @@ ciflow_push_tags:

				- ciflow/nightly

				- ciflow/op-benchmark

				- ciflow/periodic

				- ciflow/periodic-rocm-mi200

				- ciflow/periodic-rocm-mi300

				- ciflow/pull

				- ciflow/quantization-periodic

									
										89

.github/scripts/generate_binary_build_matrix.py
									
										vendored
									
												View File
												
				@ -11,17 +11,11 @@ architectures:

				    * Latest XPU

				"""

				import json

				import os

				import re

				from pathlib import Path

				from typing import Optional

				SCRIPT_DIR = Path(__file__).absolute().parent

				REPO_ROOT = SCRIPT_DIR.parent.parent

				# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this

				CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]

				CUDA_STABLE = "12.8"

				CUDA_ARCHES_FULL_VERSION = {

				@ -37,7 +31,8 @@ CUDA_ARCHES_CUDNN_VERSION = {

				    "13.0": "9",

				}

				ROCM_ARCHES = ["7.0", "7.1"]

				# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this

				ROCM_ARCHES = ["6.4", "7.0"]

				XPU_ARCHES = ["xpu"]

				@ -142,48 +137,9 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {

				}

				# Used by tools/nightly.py

				PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"

				NIGHTLY_SOURCE_MATRIX = {

				    "cpu": dict(

				        name="cpu",

				        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",

				        supported_platforms=["Linux", "macOS", "Windows"],

				        accelerator="cpu",

				    )

				}

				CUDA_NIGHTLY_SOURCE_MATRIX = {

				    f"cuda-{major}.{minor}": dict(

				        name=f"cuda-{major}.{minor}",

				        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",

				        supported_platforms=["Linux", "Windows"],

				        accelerator="cuda",

				    )

				    for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)

				}

				ROCM_NIGHTLY_SOURCE_MATRIX = {

				    f"rocm-{major}.{minor}": dict(

				        name=f"rocm-{major}.{minor}",

				        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",

				        supported_platforms=["Linux"],

				        accelerator="rocm",

				    )

				    for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)

				}

				XPU_NIGHTLY_SOURCE_MATRIX = {

				    "xpu": dict(

				        name="xpu",

				        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",

				        supported_platforms=["Linux"],

				        accelerator="xpu",

				    )

				}

				NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)

				NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)

				NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)

				def get_nccl_wheel_version(arch_version: str) -> str:

				    import re

				    requirements = map(

				        str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])

				    )

				@ -191,14 +147,17 @@ def get_nccl_wheel_version(arch_version: str) -> str:

				def read_nccl_pin(arch_version: str) -> str:

				    nccl_pin_path = (

				        REPO_ROOT

				        / ".ci"

				        / "docker"

				        / "ci_commit_pins"

				        / f"nccl-cu{arch_version[:2]}.txt"

				    from pathlib import Path

				    nccl_pin_path = os.path.join(

				        Path(__file__).absolute().parents[2],

				        ".ci",

				        "docker",

				        "ci_commit_pins",

				        f"nccl-cu{arch_version[:2]}.txt",

				    )

				    return nccl_pin_path.read_text().strip()

				    with open(nccl_pin_path) as f:

				        return f.read().strip()

				def validate_nccl_dep_consistency(arch_version: str) -> None:

				@ -206,8 +165,7 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:

				    wheel_ver = get_nccl_wheel_version(arch_version)

				    if not nccl_release_tag.startswith(f"v{wheel_ver}"):

				        raise RuntimeError(

				            f"{arch_version} NCCL release tag version {nccl_release_tag} "

				            f"does not correspond to wheel version {wheel_ver}"

				            f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"

				        )

				@ -454,14 +412,7 @@ def generate_wheels_matrix(

				    return ret

				arch_version = ""

				for arch_version in CUDA_ARCHES:

				    validate_nccl_dep_consistency(arch_version)

				del arch_version

				if __name__ == "__main__":

				    # Used by tools/nightly.py

				    (SCRIPT_DIR / "nightly_source_matrix.json").write_text(

				        json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"

				    )

				validate_nccl_dep_consistency("13.0")

				validate_nccl_dep_consistency("12.9")

				validate_nccl_dep_consistency("12.8")

				validate_nccl_dep_consistency("12.6")

									
										13

.github/workflows/_xpu-test.yml
									
										vendored
									
												View File
												
				@ -38,10 +38,6 @@ on:

				        default: ""

				        description: |

				          List of tests to include (empty string implies default list)

				      dashboard-tag:

				        required: false

				        type: string

				        default: ""

				      disable-monitor:

				        description: |

				          [Experimental] Disable utilization monitoring for tests.

				@ -62,11 +58,6 @@ on:

				        required: false

				        type: number

				        default: 1

				    secrets:

				      HUGGING_FACE_HUB_TOKEN:

				        required: false

				        description: |

				          HF Auth token to avoid rate limits when downloading models or datasets from hub

				permissions:

				  id-token: write

				  contents: read

				@ -205,8 +196,6 @@ jobs:

				          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}

				          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}

				          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}

				          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}

				          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

				        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}

				        run: |

				          # Fetch aws credential from IMDs

				@ -257,8 +246,6 @@ jobs:

				            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \

				            -e TESTS_TO_INCLUDE \

				            -e ZE_AFFINITY_MASK \

				            -e HUGGING_FACE_HUB_TOKEN \

				            -e DASHBOARD_TAG \

				            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \

				            --ulimit stack=10485760:83886080 \

				            --ulimit core=0 \

									
										2

.github/workflows/build-almalinux-images.yml
									
										vendored
									
												View File
												
				@ -36,7 +36,7 @@ jobs:

				    runs-on: linux.9xlarge.ephemeral

				    strategy:

				      matrix:

				        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]

				        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]

				    steps:

				      - name: Build docker image

				        uses: pytorch/pytorch/.github/actions/binary-docker-build@main

									
										2

.github/workflows/build-libtorch-images.yml
									
										vendored
									
												View File
												
				@ -52,8 +52,8 @@ jobs:

				          { tag: "cuda12.9" },

				          { tag: "cuda12.8" },

				          { tag: "cuda12.6" },

				          { tag: "rocm6.4"  },

				          { tag: "rocm7.0"  },

				          { tag: "rocm7.1"  },

				          { tag: "cpu"      },

				        ]

				    steps:

									
										2

.github/workflows/build-magma-rocm-linux.yml
									
										vendored
									
												View File
												
				@ -34,7 +34,7 @@ jobs:

				      id-token: write

				    strategy:

				      matrix:

				        rocm_version: ["71", "70"]

				        rocm_version: ["70", "64"]

				    steps:

				      - name: Checkout PyTorch

				        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

									
										2

.github/workflows/build-manywheel-images.yml
									
										vendored
									
												View File
												
				@ -54,8 +54,8 @@ jobs:

				          { name: "manylinuxaarch64-builder",       tag: "cuda12.9",          runner: "linux.arm64.2xlarge.ephemeral" },

				          { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" },

				          { name: "manylinuxaarch64-builder",       tag: "cuda12.6",          runner: "linux.arm64.2xlarge.ephemeral" },

				          { name: "manylinux2_28-builder",          tag: "rocm6.4",           runner: "linux.9xlarge.ephemeral" },

				          { name: "manylinux2_28-builder",          tag: "rocm7.0",           runner: "linux.9xlarge.ephemeral" },

				          { name: "manylinux2_28-builder",          tag: "rocm7.1",           runner: "linux.9xlarge.ephemeral" },

				          { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" },

				          { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" },

				          { name: "manylinux2_28-builder",          tag: "xpu",               runner: "linux.9xlarge.ephemeral" },

									
										9

.github/workflows/build-triton-wheel.yml
									
										vendored
									
												View File
												
				@ -55,7 +55,7 @@ jobs:

				        docker-image: ["pytorch/manylinux2_28-builder:cpu"]

				        include:

				          - device: "rocm"

				            rocm_version: "7.1"

				            rocm_version: "7.0"

				            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"

				          - device: "cuda"

				            rocm_version: ""

				@ -159,7 +159,12 @@ jobs:

				            WITH_CLANG_LDD="--with-clang-ldd"

				          fi

				          docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"

				          if [[ "${BUILD_DEVICE}" == xpu ]]; then

				            docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"

				            docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"

				          else

				            docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"

				          fi

				          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then

				            docker exec -t "${container_name}"  bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"

									
										2

.github/workflows/docker-builds.yml
									
										vendored
									
												View File
												
				@ -57,7 +57,6 @@ jobs:

				          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,

				          pytorch-linux-jammy-py3.10-clang12,

				          pytorch-linux-jammy-py3.13-clang12,

				          pytorch-linux-jammy-py3.14-clang12,

				          pytorch-linux-jammy-rocm-n-py3,

				          pytorch-linux-noble-rocm-n-py3,

				          pytorch-linux-jammy-rocm-n-py3-benchmarks,

				@ -67,7 +66,6 @@ jobs:

				          pytorch-linux-jammy-py3.12-halide,

				          pytorch-linux-jammy-xpu-n-1-py3,

				          pytorch-linux-jammy-xpu-n-py3,

				          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,

				          pytorch-linux-jammy-py3-clang18-asan,

				          pytorch-linux-jammy-py3-clang12-onnx,

				          pytorch-linux-jammy-linter,

									
										236

.github/workflows/generated-linux-binary-libtorch-nightly.yml
									
										generated
									
										vendored
									
												View File
												
				@ -384,6 +384,124 @@ jobs:

				      github-token: ${{ secrets.GITHUB_TOKEN }}

				    uses: ./.github/workflows/_binary-upload.yml

				  libtorch-rocm6_4-shared-with-deps-release-build:

				    if: ${{ github.repository_owner == 'pytorch' }}

				    uses: ./.github/workflows/_binary-build-linux.yml

				    needs: get-label-type

				    with:

				      PYTORCH_ROOT: /pytorch

				      PACKAGE_TYPE: libtorch

				      # TODO: This is a legacy variable that we eventually want to get rid of in

				      #       favor of GPU_ARCH_VERSION

				      DESIRED_CUDA: rocm6.4

				      GPU_ARCH_VERSION: "6.4"

				      GPU_ARCH_TYPE: rocm

				      DOCKER_IMAGE: libtorch-cxx11-builder

				      DOCKER_IMAGE_TAG_PREFIX: rocm6.4

				      LIBTORCH_CONFIG: release

				      LIBTORCH_VARIANT: shared-with-deps

				      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"

				      timeout-minutes: 300

				      build_name: libtorch-rocm6_4-shared-with-deps-release

				      build_environment: linux-binary-libtorch

				    secrets:

				      github-token: ${{ secrets.GITHUB_TOKEN }}

				  libtorch-rocm6_4-shared-with-deps-release-test:  # Testing

				    if: ${{ github.repository_owner == 'pytorch' }}

				    needs:

				      - libtorch-rocm6_4-shared-with-deps-release-build

				      - get-label-type

				    runs-on: linux.rocm.gpu.mi250

				    timeout-minutes: 240

				    env:

				      PYTORCH_ROOT: /pytorch

				      PACKAGE_TYPE: libtorch

				      # TODO: This is a legacy variable that we eventually want to get rid of in

				      #       favor of GPU_ARCH_VERSION

				      DESIRED_CUDA: rocm6.4

				      GPU_ARCH_VERSION: "6.4"

				      GPU_ARCH_TYPE: rocm

				      SKIP_ALL_TESTS: 1

				      DOCKER_IMAGE: libtorch-cxx11-builder

				      DOCKER_IMAGE_TAG_PREFIX: rocm6.4

				      LIBTORCH_CONFIG: release

				      LIBTORCH_VARIANT: shared-with-deps

				    permissions:

				      id-token: write

				      contents: read

				    steps:

				      - name: Setup ROCm

				        uses: ./.github/actions/setup-rocm

				      - uses: actions/download-artifact@v4.1.7

				        name: Download Build Artifacts

				        with:

				          name: libtorch-rocm6_4-shared-with-deps-release

				          path: "${{ runner.temp }}/artifacts/"

				      - name: Checkout PyTorch

				        uses: actions/checkout@v4

				        with:

				          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

				          submodules: recursive

				          path: pytorch

				          show-progress: false

				      - name: Clean PyTorch checkout

				        run: |

				          # Remove any artifacts from the previous checkouts

				          git clean -fxd

				        working-directory: pytorch

				      - name: ROCm set GPU_FLAG

				        run: |

				          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"

				      - name: configure aws credentials

				        id: aws_creds

				        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}

				        uses: aws-actions/configure-aws-credentials@v4

				        with:

				          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

				          aws-region: us-east-1

				          role-duration-seconds: 18000

				      - name: Calculate docker image

				        id: calculate-docker-image

				        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main

				        with:

				          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}

				          docker-image-name: libtorch-cxx11-builder

				          custom-tag-prefix: rocm6.4

				          docker-build-dir: .ci/docker

				          working-directory: pytorch

				      - name: Pull Docker image

				        uses: pytorch/test-infra/.github/actions/pull-docker-image@main

				        with:

				          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

				      - name: Test Pytorch binary

				        uses: ./pytorch/.github/actions/test-pytorch-binary

				        env:

				          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}

				      - name: Teardown ROCm

				        uses: ./.github/actions/teardown-rocm

				  libtorch-rocm6_4-shared-with-deps-release-upload:  # Uploading

				    if: ${{ github.repository_owner == 'pytorch' }}

				    permissions:

				      id-token: write

				      contents: read

				    needs: libtorch-rocm6_4-shared-with-deps-release-test

				    with:

				      PYTORCH_ROOT: /pytorch

				      PACKAGE_TYPE: libtorch

				      # TODO: This is a legacy variable that we eventually want to get rid of in

				      #       favor of GPU_ARCH_VERSION

				      DESIRED_CUDA: rocm6.4

				      GPU_ARCH_VERSION: "6.4"

				      GPU_ARCH_TYPE: rocm

				      DOCKER_IMAGE: libtorch-cxx11-builder

				      DOCKER_IMAGE_TAG_PREFIX: rocm6.4

				      LIBTORCH_CONFIG: release

				      LIBTORCH_VARIANT: shared-with-deps

				      build_name: libtorch-rocm6_4-shared-with-deps-release

				    secrets:

				      github-token: ${{ secrets.GITHUB_TOKEN }}

				    uses: ./.github/workflows/_binary-upload.yml

				  libtorch-rocm7_0-shared-with-deps-release-build:

				    if: ${{ github.repository_owner == 'pytorch' }}

				    uses: ./.github/workflows/_binary-build-linux.yml

				@ -501,121 +619,3 @@ jobs:

				    secrets:

				      github-token: ${{ secrets.GITHUB_TOKEN }}

				    uses: ./.github/workflows/_binary-upload.yml

				  libtorch-rocm7_1-shared-with-deps-release-build:

				    if: ${{ github.repository_owner == 'pytorch' }}

				    uses: ./.github/workflows/_binary-build-linux.yml

				    needs: get-label-type

				    with:

				      PYTORCH_ROOT: /pytorch

				      PACKAGE_TYPE: libtorch

				      # TODO: This is a legacy variable that we eventually want to get rid of in

				      #       favor of GPU_ARCH_VERSION

				      DESIRED_CUDA: rocm7.1

				      GPU_ARCH_VERSION: "7.1"

				      GPU_ARCH_TYPE: rocm

				      DOCKER_IMAGE: libtorch-cxx11-builder

				      DOCKER_IMAGE_TAG_PREFIX: rocm7.1

				      LIBTORCH_CONFIG: release

				      LIBTORCH_VARIANT: shared-with-deps

				      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"

				      timeout-minutes: 300

				      build_name: libtorch-rocm7_1-shared-with-deps-release

				      build_environment: linux-binary-libtorch

				    secrets:

				      github-token: ${{ secrets.GITHUB_TOKEN }}

				  libtorch-rocm7_1-shared-with-deps-release-test:  # Testing

				    if: ${{ github.repository_owner == 'pytorch' }}

				    needs:

				      - libtorch-rocm7_1-shared-with-deps-release-build

				      - get-label-type

				    runs-on: linux.rocm.gpu.mi250

				    timeout-minutes: 240

				    env:

				      PYTORCH_ROOT: /pytorch

				      PACKAGE_TYPE: libtorch

				      # TODO: This is a legacy variable that we eventually want to get rid of in

				      #       favor of GPU_ARCH_VERSION

				      DESIRED_CUDA: rocm7.1

				      GPU_ARCH_VERSION: "7.1"

				      GPU_ARCH_TYPE: rocm

				      SKIP_ALL_TESTS: 1

				      DOCKER_IMAGE: libtorch-cxx11-builder

				      DOCKER_IMAGE_TAG_PREFIX: rocm7.1

				      LIBTORCH_CONFIG: release

				      LIBTORCH_VARIANT: shared-with-deps

				    permissions:

				      id-token: write

				      contents: read

				    steps:

				      - name: Setup ROCm

				        uses: ./.github/actions/setup-rocm

				      - uses: actions/download-artifact@v4.1.7

				        name: Download Build Artifacts

				        with:

				          name: libtorch-rocm7_1-shared-with-deps-release

				          path: "${{ runner.temp }}/artifacts/"

				      - name: Checkout PyTorch

				        uses: actions/checkout@v4

				        with:

				          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

				          submodules: recursive

				          path: pytorch

				          show-progress: false

				      - name: Clean PyTorch checkout

				        run: |

				          # Remove any artifacts from the previous checkouts

				          git clean -fxd

				        working-directory: pytorch

				      - name: ROCm set GPU_FLAG

				        run: |

				          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"

				      - name: configure aws credentials

				        id: aws_creds

				        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}

				        uses: aws-actions/configure-aws-credentials@v4

				        with:

				          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

				          aws-region: us-east-1

				          role-duration-seconds: 18000

				      - name: Calculate docker image

				        id: calculate-docker-image

				        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main

				        with:

				          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}

				          docker-image-name: libtorch-cxx11-builder

				          custom-tag-prefix: rocm7.1

				          docker-build-dir: .ci/docker

				          working-directory: pytorch

				      - name: Pull Docker image

				        uses: pytorch/test-infra/.github/actions/pull-docker-image@main

				        with:

				          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

				      - name: Test Pytorch binary

				        uses: ./pytorch/.github/actions/test-pytorch-binary

				        env:

				          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}

				      - name: Teardown ROCm

				        uses: ./.github/actions/teardown-rocm

				  libtorch-rocm7_1-shared-with-deps-release-upload:  # Uploading

				    if: ${{ github.repository_owner == 'pytorch' }}

				    permissions:

				      id-token: write

				      contents: read

				    needs: libtorch-rocm7_1-shared-with-deps-release-test

				    with:

				      PYTORCH_ROOT: /pytorch

				      PACKAGE_TYPE: libtorch

				      # TODO: This is a legacy variable that we eventually want to get rid of in

				      #       favor of GPU_ARCH_VERSION

				      DESIRED_CUDA: rocm7.1

				      GPU_ARCH_VERSION: "7.1"

				      GPU_ARCH_TYPE: rocm

				      DOCKER_IMAGE: libtorch-cxx11-builder

				      DOCKER_IMAGE_TAG_PREFIX: rocm7.1

				      LIBTORCH_CONFIG: release

				      LIBTORCH_VARIANT: shared-with-deps

				      build_name: libtorch-rocm7_1-shared-with-deps-release

				    secrets:

				      github-token: ${{ secrets.GITHUB_TOKEN }}

				    uses: ./.github/workflows/_binary-upload.yml

1610

.github/workflows/generated-linux-binary-manywheel-nightly.yml generated vendored

View File

File diff suppressed because it is too large Load Diff

									
										148

.github/workflows/inductor-perf-test-nightly-xpu.yml
									
										vendored
									
												View File
											
				@ -1,148 +0,0 @@

				name: inductor-perf-nightly-xpu

				on:

				  push:

				    tags:

				      - ciflow/inductor-perf-test-nightly-xpu/*

				  schedule:

				    - cron: 30 17 * * *

				  workflow_dispatch:

				    inputs:

				      training:

				        description: Run training (on by default)?

				        required: false

				        type: boolean

				        default: true

				      inference:

				        description: Run inference (on by default)?

				        required: false

				        type: boolean

				        default: true

				      default:

				        description: Run inductor_default?

				        required: false

				        type: boolean

				        default: false

				      dynamic:

				        description: Run inductor_dynamic_shapes?

				        required: false

				        type: boolean

				        default: false

				      cppwrapper:

				        description: Run inductor_cpp_wrapper?

				        required: false

				        type: boolean

				        default: false

				      cudagraphs:

				        description: Run inductor_cudagraphs?

				        required: false

				        type: boolean

				        default: false

				      freezing_cudagraphs:

				        description: Run inductor_cudagraphs with freezing for inference?

				        required: false

				        type: boolean

				        default: false

				      aotinductor:

				        description: Run aot_inductor for inference?

				        required: false

				        type: boolean

				        default: false

				      maxautotune:

				        description: Run inductor_max_autotune?

				        required: false

				        type: boolean

				        default: false

				      benchmark_configs:

				        description: The list of configs used the benchmark

				        required: false

				        type: string

				        default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench

				concurrency:

				  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}

				  cancel-in-progress: true

				permissions: read-all

				jobs:

				  # Calls the reusable runner-determinator workflow (pinned to pytorch/pytorch@main).
				  # The build job in this workflow reads its `label-type` output as a runner prefix.
				  get-label-type:

				    name: get-label-type

				    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main

				    # Only runs when the repository owner is `pytorch`; scheduled events additionally
				    # require the repository to be exactly pytorch/pytorch.
				    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}

				    with:

				      triggering_actor: ${{ github.triggering_actor }}

				      # Falls back to the issue author when the event carries no pull request.
				      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}

				      # head_ref is preferred; ref_name is used when head_ref is empty.
				      curr_branch: ${{ github.head_ref || github.ref_name }}

				      curr_ref_type: ${{ github.ref_type }}

				      # NOTE(review): presumably opts this workflow out of the "lf" runner experiment - confirm against _runner-determinator.yml.
				      opt_out_experiments: lf

				  # Build job: calls the reusable _linux-build.yml workflow for the XPU benchmark image.
				  # Both test jobs in this workflow consume its `docker-image` and `test-matrix` outputs.
				  xpu-n-py3_10-inductor-benchmark-build:

				    name: xpu-n-py3.10-inductor-benchmark

				    uses: ./.github/workflows/_linux-build.yml

				    needs: get-label-type

				    with:

				      # Runner prefix comes from the get-label-type job's output.
				      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"

				      build-environment: linux-jammy-xpu-n-py3.10

				      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks

				      runner: linux.c7i.12xlarge

				      # Matrix below: huggingface perf in 5 shards, timm perf in 6 shards, torchbench perf in
				      # 6 shards; every entry names the "linux.idc.xpu" runner.
				      test-matrix: |

				        { include: [

				          { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },

				          { config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },

				          { config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },

				          { config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },

				          { config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },

				          { config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },

				          { config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },

				        ]}

				    secrets: inherit

				  # Test job for every trigger except workflow_dispatch (i.e. the tag push and the cron
				  # schedule declared in `on:`). Calls the reusable _xpu-test.yml workflow with a fixed
				  # dashboard tag.
				  xpu-n-py3_10-inductor-benchmark-test-nightly:

				    permissions:

				      id-token: write

				      contents: read

				    if: github.event_name != 'workflow_dispatch'

				    name: xpu-n-py3.10-inductor-benchmark

				    uses: ./.github/workflows/_xpu-test.yml

				    needs: xpu-n-py3_10-inductor-benchmark-build

				    with:

				      build-environment: linux-jammy-xpu-n-py3.10

				      # Hard-coded tag. NOTE(review): unlike the workflow_dispatch job's tag, this one has
				      # no `maxautotune-...` field - confirm that is intentional.
				      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false

				      # Image and matrix are taken from the build job's outputs.
				      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}

				      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}

				      timeout-minutes: 720

				      # Disable monitor in perf tests for more investigation
				      # (the two monitor intervals below are still passed even though the monitor is disabled)

				      disable-monitor: true

				      monitor-log-interval: 10

				      monitor-data-collect-interval: 2

				    secrets: inherit

				  # Test job for manual (workflow_dispatch) runs only. Calls the reusable _xpu-test.yml
				  # workflow and builds the dashboard tag from the dispatch inputs.
				  xpu-n-py3_10-inductor-benchmark-test:

				    permissions:

				      id-token: write

				      contents: read

				    if: github.event_name == 'workflow_dispatch'

				    name: xpu-n-py3.10-inductor-test

				    uses: ./.github/workflows/_xpu-test.yml

				    needs: xpu-n-py3_10-inductor-benchmark-build

				    with:

				      build-environment: linux-jammy-xpu-n-py3.10

				      # Each field of the tag is filled from the matching workflow_dispatch input.
				      # NOTE(review): the trailing `cudagraphs_low_precision` field is filled from
				      # `inputs.cudagraphs`; this workflow declares no `cudagraphs_low_precision` input - confirm intended.
				      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}

				      # Image and matrix are taken from the build job's outputs.
				      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}

				      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}

				      timeout-minutes: 720

				      # Monitor stays enabled for manual runs (the non-dispatch job sets this to true).
				      disable-monitor: false

				      monitor-log-interval: 15

				      monitor-data-collect-interval: 4

				    secrets: inherit

									
										84

.github/workflows/periodic-rocm-mi200.yml
									
										vendored
									
												View File
											
				@ -1,84 +0,0 @@

				name: periodic-rocm-mi200

				on:

				  schedule:

				    # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.

				    # Also run less frequently on weekends.

				    - cron: 45 0,8,16 * * 1-5

				    - cron: 45 4 * * 0,6

				    - cron: 45 4,12,20 * * 1-5

				    - cron: 45 12 * * 0,6

				    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests

				  push:

				    tags:

				      - ciflow/periodic/*

				      - ciflow/periodic-rocm-mi200/*

				    branches:

				      - release/*

				  workflow_dispatch:

				concurrency:

				  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}

				  cancel-in-progress: true

				permissions:

				  id-token: write

				  contents: read

				jobs:

				  # Calls the reusable llm_td_retrieval.yml workflow; skipped unless the repository owner
				  # is `pytorch`. The target-determination job waits on this job.
				  llm-td:

				    if: github.repository_owner == 'pytorch'

				    # Shares the display name "before-test" with the target-determination job.
				    name: before-test

				    uses: ./.github/workflows/llm_td_retrieval.yml

				    permissions:

				      id-token: write

				      contents: read

				  # Calls the reusable target_determination.yml workflow after llm-td; the ROCm test job
				  # in this workflow lists it in `needs`.
				  target-determination:

				    # Shares the display name "before-test" with the llm-td job.
				    name: before-test

				    uses: ./.github/workflows/target_determination.yml

				    needs: llm-td

				    permissions:

				      id-token: write

				      contents: read

				  # Calls the reusable runner-determinator workflow (pinned to pytorch/pytorch@main).
				  # The build job reads its `label-type` output as a runner prefix.
				  get-label-type:

				    name: get-label-type

				    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main

				    # Only runs when the repository owner is `pytorch`; scheduled events additionally
				    # require the repository to be exactly pytorch/pytorch.
				    if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'

				    with:

				      triggering_actor: ${{ github.triggering_actor }}

				      # Falls back to the issue author when the event carries no pull request.
				      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}

				      # head_ref is preferred; ref_name is used when head_ref is empty.
				      curr_branch: ${{ github.head_ref || github.ref_name }}

				      curr_ref_type: ${{ github.ref_type }}

				  # Build job: calls the reusable _linux-build.yml workflow for the ROCm py3.10 environment.
				  # The test job consumes its `docker-image` and `test-matrix` outputs.
				  linux-jammy-rocm-py3_10-build:

				    name: linux-jammy-rocm-py3.10

				    uses: ./.github/workflows/_linux-build.yml

				    needs: get-label-type

				    with:

				      # Runner prefix comes from the get-label-type job's output.
				      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"

				      build-environment: linux-jammy-rocm-py3.10

				      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3

				      # Matrix below: the "distributed" config in 3 shards, each on the "linux.rocm.gpu.4"
				      # runner with module:rocm and oncall:distributed listed as owners.
				      test-matrix: |

				        { include: [

				          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },

				          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },

				          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },

				        ]}

				    secrets: inherit

				  # Test job: calls the reusable _rocm-test.yml workflow once the build job and
				  # target-determination have finished.
				  linux-jammy-rocm-py3_10-test:

				    permissions:

				      id-token: write

				      contents: read

				    name: linux-jammy-rocm-py3.10

				    uses: ./.github/workflows/_rocm-test.yml

				    needs:

				      - linux-jammy-rocm-py3_10-build

				      - target-determination

				    with:

				      build-environment: linux-jammy-rocm-py3.10

				      # Image and matrix are taken from the build job's outputs.
				      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}

				      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}

				    secrets: inherit

									
										31

.github/workflows/periodic.yml
									
										vendored
									
												View File
												
				@ -204,6 +204,37 @@ jobs:

				      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}

				    secrets: inherit

				  linux-jammy-rocm-py3_10-build:

				    name: linux-jammy-rocm-py3.10

				    uses: ./.github/workflows/_linux-build.yml

				    needs: get-label-type

				    with:

				      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"

				      build-environment: linux-jammy-rocm-py3.10

				      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3

				      test-matrix: |

				        { include: [

				          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },

				          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },

				          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },

				        ]}

				    secrets: inherit

				  linux-jammy-rocm-py3_10-test:

				    permissions:

				      id-token: write

				      contents: read

				    name: linux-jammy-rocm-py3.10

				    uses: ./.github/workflows/_rocm-test.yml

				    needs:

				      - linux-jammy-rocm-py3_10-build

				      - target-determination

				    with:

				      build-environment: linux-jammy-rocm-py3.10

				      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}

				      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}

				    secrets: inherit

				  linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:

				    name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck

				    uses: ./.github/workflows/_linux-build.yml

									
										1

.github/workflows/upload-test-stats.yml
									
										vendored
									
												View File
												
				@ -6,7 +6,6 @@ on:

				      - pull

				      - trunk

				      - periodic

				      - periodic-rocm-mi200

				      - periodic-rocm-mi300

				      - inductor

				      - unstable

1

.gitignore vendored

View File

 @ -143,7 +143,6 @@ scripts/release_notes/*.json
 sccache-stats*.json
 lint.json
 merge_record.json
 .github/scripts/nightly_source_matrix.json
 # These files get copied over on invoking setup.py
 torchgen/packaged/*

									
										2

CMakeLists.txt
									
												View File
												
				@ -374,7 +374,7 @@ cmake_dependent_option(

				  "Build the lazy Torchscript backend, not compatible with mobile builds" ON

				  "NOT INTERN_BUILD_MOBILE" OFF)

				cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)

				cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"

				cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"

				                       OFF "USE_CUDA" OFF)

				cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON

				                        "CPU_AARCH64" OFF)

									
										8

aten/src/ATen/Context.cpp
									
												View File
												
				@ -825,14 +825,6 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {

				  display_vmap_fallback_warnings_ = enabled;

				}

				bool Context::warnOnAccumulateGradStreamMismatch() const {

				  return warn_on_accumulate_grad_stream_mismatch_;

				}

				void Context::setWarnOnAccumulateGradStreamMismatch(bool enabled) {

				  warn_on_accumulate_grad_stream_mismatch_ = enabled;

				}

				bool Context::isDefaultMobileCPUAllocatorSet() {

				  return prev_allocator_ptr_ != nullptr;

				}

									
										4

aten/src/ATen/Context.h
									
												View File
												
				@ -404,9 +404,6 @@ class TORCH_API Context {

				  void setDisplayVmapFallbackWarnings(bool enabled);

				  bool areVmapFallbackWarningsEnabled() const;

				  void setWarnOnAccumulateGradStreamMismatch(bool enabled);

				  bool warnOnAccumulateGradStreamMismatch() const;

				  bool isDefaultMobileCPUAllocatorSet();

				  void setDefaultMobileCPUAllocator();

				  void unsetDefaultMobileCPUAllocator();

				@ -497,7 +494,6 @@ class TORCH_API Context {

				  bool release_original_weights = false;

				#endif

				  bool display_vmap_fallback_warnings_ = false;

				  bool warn_on_accumulate_grad_stream_mismatch_ = true;

				  std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;

				  bool enable_sparse_tensor_invariant_checks = false;

				  bool allow_fp16_reduction_cpu = false;

									
										117

aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
									
												View File
												
				@ -19,13 +19,6 @@ inline namespace CPU_CAPABILITY {

				#error "Big endian is not supported."

				#endif

				// GCC does not properly optimize bf16 operators

				#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)

				#define BF16_ARITHMETIC_SUPPORTED() 1

				#else

				#define BF16_ARITHMETIC_SUPPORTED() 0

				#endif

				// Unlike the float16_t family of types, bfloat16_t is not available

				// when we're not targeting bfloat16 hardware support on some

				// platforms (but not Mac, so we have to be careful not to shadow the

				@ -359,72 +352,18 @@ class Vectorized<c10::BFloat16> : public Vectorized16<

				        other, &Vectorized<float>::name);                        \

				  }

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)

				  Vectorized frac() const;

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)

				#ifdef __ARM_FEATURE_BF16

				  // Flip sign bit

				  Vectorized<c10::BFloat16> neg() const {

				    return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));

				  }

				  // Fast reciprocal is fine because we are truncating results

				  Vectorized<c10::BFloat16> reciprocal() const {

				    auto x = vcvtq_low_f32_bf16(values);

				    auto y = vcvtq_high_f32_bf16(values);

				    x = vrecpeq_f32(x);

				    y = vrecpeq_f32(y);

				    return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);

				  }

				  // Clearing the sign bit

				  Vectorized<c10::BFloat16> abs() const {

				    return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);

				  }

				#else

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)

				  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)

				#endif

				// These functions are optimized on clang-21+

				#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)

				  Vectorized<c10::BFloat16> operator==(

				      const Vectorized<c10::BFloat16>& other) const {

				    return values == other.values;

				  }

				  Vectorized<c10::BFloat16> operator!=(

				      const Vectorized<c10::BFloat16>& other) const {

				    return values != other.values;

				  }

				  Vectorized<c10::BFloat16> operator<(

				      const Vectorized<c10::BFloat16>& other) const {

				    return values < other.values;

				  }

				  Vectorized<c10::BFloat16> operator<=(

				      const Vectorized<c10::BFloat16>& other) const {

				    return values <= other.values;

				  }

				  Vectorized<c10::BFloat16> operator>(

				      const Vectorized<c10::BFloat16>& other) const {

				    return values > other.values;

				  }

				  Vectorized<c10::BFloat16> operator>=(

				      const Vectorized<c10::BFloat16>& other) const {

				    return values >= other.values;

				  }

				#else

				  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)

				  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)

				  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)

				  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)

				  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)

				  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)

				#endif

				#undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD

				#undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD

				@ -473,52 +412,28 @@ template <>

				Vectorized<c10::BFloat16> inline operator+(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  return x + y;

				#else

				  return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b);

				#endif

				}

				template <>

				Vectorized<c10::BFloat16> inline operator-(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  return x - y;

				#else

				  return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b);

				#endif

				}

				template <>

				Vectorized<c10::BFloat16> inline operator*(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  return x * y;

				#else

				  return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b);

				#endif

				}

				template <>

				Vectorized<c10::BFloat16> inline operator/(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  return x / y;

				#else

				  return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);

				#endif

				}

				// frac. Implement this here so we can use subtraction

				@ -629,19 +544,12 @@ Vectorized<c10::BFloat16> inline fmadd(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b,

				    const Vectorized<c10::BFloat16>& c) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  bfloat16x8_t z = c;

				  return x * y + z;

				#else

				  // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16!  Also,

				  // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered

				  // elements, not the bottom and top half, so they don't seem

				  // particularly useful here. Ideally we would include dot product in

				  // the Vectorized interface...

				  return a * b + c;

				#endif

				}

				template <>

				@ -649,15 +557,8 @@ Vectorized<c10::BFloat16> inline fnmadd(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b,

				    const Vectorized<c10::BFloat16>& c) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  bfloat16x8_t z = c;

				  return (-x) * y + z;

				#else

				  // See NOTE [BF16 FMA] above.

				  return -a * b + c;

				#endif

				}

				template <>

				@ -665,15 +566,8 @@ Vectorized<c10::BFloat16> inline fmsub(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b,

				    const Vectorized<c10::BFloat16>& c) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  bfloat16x8_t z = c;

				  return x * y - z;

				#else

				  // See NOTE [BF16 FMA] above.

				  return a * b - c;

				#endif

				}

				template <>

				@ -681,15 +575,8 @@ Vectorized<c10::BFloat16> inline fnmsub(

				    const Vectorized<c10::BFloat16>& a,

				    const Vectorized<c10::BFloat16>& b,

				    const Vectorized<c10::BFloat16>& c) {

				#if BF16_ARITHMETIC_SUPPORTED()

				  bfloat16x8_t x = a;

				  bfloat16x8_t y = b;

				  bfloat16x8_t z = c;

				  return (-x) * y - z;

				#else

				  // See NOTE [BF16 FMA] above.

				  return -a * b - c;

				#endif

				}

				#endif // !defined(C10_MOBILE) && defined(__aarch64__)

									
										4

aten/src/ATen/cpu/vec/vec128/vec128_convert.h
									
												View File
												
				@ -6,9 +6,9 @@ namespace at::vec {

				inline namespace CPU_CAPABILITY {

				#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))

				// Enable auto-vectorization for clang-17+

				// Enable auto-vectorization for GCC-13+ and clang-17+

				// GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001

				#if defined(__clang__) && (__clang_major__ >= 17)

				#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))

				template <typename from_type, typename to_type>

				inline void convertImpl(

									
										7

aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
									
												View File
												
				@ -309,7 +309,7 @@ class Vectorized<float> {

				  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)

				  // Implementation copied from Arm Optimized Routines

				  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c

				  inline Vectorized<float> vexpq_f32_u20() const {

				  Vectorized<float> exp_u20() const {

				    // bail out to sleef if it's a special case:

				    // i.e. there's an input s.t. |input| > 87.3....

				    const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);

				@ -348,9 +348,6 @@ class Vectorized<float> {

				    return vfmaq_f32(scale, poly, scale);

				  }

				  Vectorized<float> exp_u20() const {

				    return vexpq_f32_u20();

				  }

				  Vectorized<float> fexp_u20() const {

				    return exp_u20();

				  }

				@ -637,7 +634,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {

				  // - exp(- x * x)

				  auto pow_2 = (*this) * (*this);

				  auto neg_pow_2 = pow_2 ^ neg_zero_vec;

				  auto tmp4 = neg_pow_2.vexpq_f32_u20();

				  auto tmp4 = neg_pow_2.exp();

				  auto tmp5 = tmp4 ^ neg_zero_vec;

				  // erf(x) = sign(x) * (1 - r * t * exp(- x * x))

				  auto tmp6 = t * tmp5;

									
										144

aten/src/ATen/cuda/CUDAGreenContext.cpp
									
												View File
												
				@ -1,90 +1,78 @@

				#include <ATen/cuda/CUDAGreenContext.h>

				#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)

				#include <c10/cuda/driver_api.h>

				#include <stdexcept>

				#include <vector>

				#define HAS_CUDA_GREEN_CONTEXT() 1

				#else

				#define HAS_CUDA_GREEN_CONTEXT() 0

				// Suppress unused private field warnings as this class is not supposed to be called

				C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")

				#endif

				namespace at::cuda {

				  GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				#if CUDA_HAS_GREEN_CONTEXT

				    int driver_version;

				    C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));

				    TORCH_CHECK(

				        driver_version >= 12080, "cuda driver too old to use green context!");

				    CUcontext pctx = nullptr;

				    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));

				    if (C10_UNLIKELY(!pctx)) {

				      TORCH_WARN(

				          "Attempted to create a green context but"

				          " there was no primary context! Creating a primary context...");

				GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				#if HAS_CUDA_GREEN_CONTEXT()

				  int driver_version;

				  C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));

				  TORCH_CHECK(

				      driver_version >= 12080, "cuda driver too old to use green context!");

				  CUcontext pctx = nullptr;

				  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));

				  if (C10_UNLIKELY(!pctx)) {

				    TORCH_WARN(

				        "Attempted to create a green context but"

				        " there was no primary context! Creating a primary context...");

				      cudaFree(0);

				    }

				    cudaFree(0);

				  }

				    CUdevice device;

				    device_id_ = device_id;

				    C10_CUDA_DRIVER_CHECK(

				        c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));

				   CUdevice device;

				  device_id_ = device_id;

				  C10_CUDA_DRIVER_CHECK(

				      c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));

				    // Get device resources

				    CUdevResource device_resource;

				    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(

				        device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));

				  // Get device resources

				  CUdevResource device_resource;

				  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(

				      device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));

				    // Split resources

				    std::vector<CUdevResource> result(1);

				    auto result_data = result.data();

				    unsigned int nb_groups = 1;

				    CUdevResource remaining;

				  // Split resources

				  std::vector<CUdevResource> result(1);

				  auto result_data = result.data();

				  unsigned int nb_groups = 1;

				  CUdevResource remaining;

				    C10_CUDA_DRIVER_CHECK(

				        c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(

				            result_data,

				            &nb_groups,

				            &device_resource,

				            &remaining,

				            0, // default flags

				            num_sms));

				  C10_CUDA_DRIVER_CHECK(

				      c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(

				          result_data,

				          &nb_groups,

				          &device_resource,

				          &remaining,

				          0, // default flags

				          num_sms));

				    TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");

				  TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");

				    // Generate resource descriptor

				    CUdevResourceDesc desc;

				    C10_CUDA_DRIVER_CHECK(

				        c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(

				            &desc, result_data, 1));

				  // Generate resource descriptor

				  CUdevResourceDesc desc;

				  C10_CUDA_DRIVER_CHECK(

				      c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(

				          &desc, result_data, 1));

				    // Create green context

				    // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:

				    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html

				    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(

				        &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));

				  // Create green context

				  // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:

				  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html

				  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(

				      &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));

				  // Convert to regular context

				  C10_CUDA_DRIVER_CHECK(

				      c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));

				  TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");

				    // Convert to regular context

				    C10_CUDA_DRIVER_CHECK(

				        c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));

				    TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");

				#else

				  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");

				    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");

				#endif

				  }

				  std::unique_ptr<GreenContext> GreenContext::create(

				      uint32_t num_sms,

				      std::optional<uint32_t> device_id) {

				#if HAS_CUDA_GREEN_CONTEXT()

				#if CUDA_HAS_GREEN_CONTEXT

				    if (!device_id.has_value()) {

				      device_id = at::cuda::current_device();

				    }

				    return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));

				    return std::make_unique<GreenContext>(device_id.value(), num_sms);

				#else

				    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");

				#endif

				@ -92,7 +80,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				  // Implement move operations

				  GreenContext::GreenContext(GreenContext&& other) noexcept{

				#if HAS_CUDA_GREEN_CONTEXT()

				#if CUDA_HAS_GREEN_CONTEXT

				    device_id_ = std::exchange(other.device_id_, -1);

				    green_ctx_ = std::exchange(other.green_ctx_, nullptr);

				    context_ = std::exchange(other.context_, nullptr);

				@ -103,7 +91,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				  }

				  GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{

				#if HAS_CUDA_GREEN_CONTEXT()

				#if CUDA_HAS_GREEN_CONTEXT

				    if (this != &other) {

				      // Clean up current resources

				      if (green_ctx_) {

				@ -132,7 +120,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				  }

				  GreenContext::~GreenContext() noexcept{

				#if HAS_CUDA_GREEN_CONTEXT()

				#if CUDA_HAS_GREEN_CONTEXT

				    C10_CUDA_DRIVER_CHECK(

				        c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));

				#else

				@ -140,9 +128,25 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				#endif

				  }

				  // Get the underlying CUDA context

				  CUcontext GreenContext::getContext() const {

				#if CUDA_HAS_GREEN_CONTEXT

				    // context_ is set by the constructor through cuCtxFromGreenCtx_ (or taken
				    // over from another instance by the move operations).
				    return context_;

				#else

				    // Built without green-context support: TORCH_CHECK is given a constant
				    // false condition, so this path never returns a context.
				    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");

				#endif

				  }

				  // Get the underlying green context

				#if CUDA_HAS_GREEN_CONTEXT

				  // Only compiled when CUDA_HAS_GREEN_CONTEXT is nonzero; the declaration in
				  // CUDAGreenContext.h is guarded by the same macro.
				  // Returns the stored green-context handle without transferring ownership.

				  CUgreenCtx GreenContext::getGreenContext() const {

				    return green_ctx_;

				  }

				#endif

				  // Make this context current

				  void GreenContext::setContext() {

				#if HAS_CUDA_GREEN_CONTEXT()

				#if CUDA_HAS_GREEN_CONTEXT

				    auto current_stream = c10::cuda::getCurrentCUDAStream();

				    parent_stream_ = current_stream.stream();

				@ -171,7 +175,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

				  }

				  void GreenContext::popContext() {

				#if HAS_CUDA_GREEN_CONTEXT()

				#if CUDA_HAS_GREEN_CONTEXT

				    // see above note about stream being hardcoded to the default stream

				    at::cuda::CUDAEvent ev;

				    ev.record(c10::cuda::getCurrentCUDAStream());

									
										41

aten/src/ATen/cuda/CUDAGreenContext.h
									
												View File
												
				@ -1,38 +1,53 @@

				#pragma once

				#include <ATen/cuda/CUDAEvent.h>

				#include <cuda.h>

				// Forward declare green context as opaque ptr

				typedef struct CUgreenCtx_st* CUgreenCtx;

				#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)

				#include <c10/cuda/driver_api.h>

				#include <cuda.h>

				#include <memory>

				#include <stdexcept>

				#include <vector>

				#define CUDA_HAS_GREEN_CONTEXT 1

				#else

				#define CUDA_HAS_GREEN_CONTEXT 0

				#endif

				namespace at::cuda {

				class TORCH_CUDA_CPP_API GreenContext {

				 public:

				  // Green context creation

				  static std::unique_ptr<GreenContext> create(

				      uint32_t num_sms,

				      std::optional<uint32_t> device_id);

				  ~GreenContext() noexcept;

				  GreenContext(uint32_t device_id, uint32_t num_sms);

				  static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);

				  // Delete copy constructor and assignment

				  GreenContext(const GreenContext&) = delete;

				  GreenContext& operator=(const GreenContext&) = delete;

				  // Implement move operations

				  GreenContext(GreenContext&& other) noexcept;

				  GreenContext& operator=(GreenContext&& other) noexcept;

				  ~GreenContext() noexcept;

				  // Get the underlying CUDA context

				  CUcontext getContext() const;

				  // Get the underlying green context

				#if CUDA_HAS_GREEN_CONTEXT

				  CUgreenCtx getGreenContext() const;

				#endif

				  // Make this context current

				  void setContext();

				  void popContext();

				 private:

				  GreenContext(uint32_t device_id, uint32_t num_sms);

				  // Implement move operations

				  GreenContext(GreenContext&& other) noexcept;

				  GreenContext& operator=(GreenContext&& other) noexcept;

				#if CUDA_HAS_GREEN_CONTEXT

				  int32_t device_id_ = -1;

				  CUgreenCtx green_ctx_ = nullptr;

				  CUcontext context_ = nullptr;

				  cudaStream_t parent_stream_ = nullptr;

				#endif

				};

				} // namespace at::cuda

									
										11

aten/src/ATen/cuda/CUDASparse.h
									
												View File
												
				@ -7,6 +7,17 @@

				#endif

				#if defined(USE_ROCM)

				// AT_USE_HIPSPARSE_GENERIC_API() evaluates to 1 on ROCm builds and 0 otherwise.
				// NOTE(review): both branches of the HIPSPARSE_VERSION test below define the
				// macro as 1, so the version check currently has no effect — confirm that
				// this is intentional before relying on it.

				// hipSparse const API added in v2.4.0

				#if HIPSPARSE_VERSION >= 200400

				#define AT_USE_HIPSPARSE_GENERIC_API() 1

				#else

				#define AT_USE_HIPSPARSE_GENERIC_API() 1

				#endif

				#else // USE_ROCM

				#define AT_USE_HIPSPARSE_GENERIC_API() 0

				#endif // USE_ROCM

				// cuSparse Generic API spsv function was added in CUDA 11.3.0

				#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)

				#define AT_USE_CUSPARSE_GENERIC_SPSV() 1

35

aten/src/ATen/cuda/Sleep.cu

View File

 @ -1,7 +1,6 @@
 #include <ATen/cuda/CUDAContextLight.h>
 #include <ATen/cuda/Sleep.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAStream.h>
 @ -25,22 +24,8 @@ __global__ void spin_kernel(int64_t cycles) {
 #endif
   }
 }
 // Per-host-thread pointer to the int used by the busy-wait kernels. It starts
 // out null and is allocated on first use by busy_wait_for_flag() / clear_flag()
 // through CUDACachingAllocator::raw_alloc; nothing in this file section frees it.
 // NOTE(review): because it is thread_local, a wait and its matching clear only
 // share a flag when both are issued from the same host thread — confirm callers.
 thread_local int *flag = nullptr;
 // Kernel: atomically writes 1 to *flag, then spins until an atomic read of
 // *flag returns a value other than 1.
 __global__ void busy_wait_for_flag_kernel(int *flag) {
   atomicExch(flag, 1);
   // atomicAdd with 0 leaves the value unchanged and returns it, i.e. it is
   // used here as an atomic load.
   while (atomicAdd(flag, 0) == 1) {
     // do nothing
   }
 }
 // Kernel: atomically writes 0 to *flag, which ends the spin loop of a
 // busy_wait_for_flag_kernel polling the same address.
 __global__ void clear_flag_kernel(int *flag) {
   atomicExch(flag, 0);
 }
 } // anonymous namespace
 void sleep(int64_t cycles) {
   dim3 grid(1);
   dim3 block(1);
 @ -48,26 +33,6 @@ void sleep(int64_t cycles) {
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 void busy_wait_for_flag() {
   if (!flag) {
     flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
   }
   dim3 grid(1);
   dim3 block(1);
   busy_wait_for_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 void clear_flag() {
   if (!flag) {
     flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
   }
   dim3 grid(1);
   dim3 block(1);
   clear_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 #ifdef USE_ROCM
 __global__ void flush_icache_kernel()
 {

									
										5

aten/src/ATen/cuda/Sleep.h
									
												View File
												
				@ -7,11 +7,6 @@ namespace at::cuda {

				// enqueues a kernel that spins for the specified number of cycles

				TORCH_CUDA_CU_API void sleep(int64_t cycles);

				// enqueues a kernel that spins until a flag is cleared by a

				// corresponding call to clear_flag()

				TORCH_CUDA_CU_API void busy_wait_for_flag();

				TORCH_CUDA_CU_API void clear_flag();

				// flushes instruction cache for ROCm; no-op for CUDA

				TORCH_CUDA_CU_API void flush_icache();

31

aten/src/ATen/cuda/detail/BLASConstants.cu

View File

 @ -2,6 +2,8 @@
 #include <ATen/Tensor.h>
 #include <ATen/cuda/Exceptions.h>
 #include <mutex>
 namespace at {
 namespace cuda {
 namespace detail {
 @ -10,36 +12,39 @@ __device__ __constant__ float cublas_one_device;
 __device__ __constant__ float cublas_zero_device;
 float *get_cublas_device_one() {
   static float *ptr = nullptr;
   static auto init_flag = [&]() {
   static c10::once_flag init_flag;
   c10::call_once(init_flag, []() {
     const float one = 1.f;
     AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
     AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
     return true;
   }();
   });
   float *ptr;
   AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
   return ptr;
 }
 float *get_cublas_device_zero() {
   static float *ptr = nullptr;
   static auto init_flag = [&]() {
   static c10::once_flag init_flag;
   c10::call_once(init_flag, []() {
     const float zero = 0.f;
     AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
     AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
     return true;
   }();
   });
   float *ptr;
   AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
   return ptr;
 }
 float *get_user_alpha_ptr() {
   static float *alpha_ptr;
   static bool init_flag [[maybe_unused]] = []() {
   static c10::once_flag init_flag;
   c10::call_once(init_flag, []() {
     AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
     return true;
   }();
   });
   return alpha_ptr;
 }

									
										31

aten/src/ATen/detail/MTIAHooksInterface.h
									
												View File
												
				@ -1,6 +1,5 @@

				#pragma once

				#include <c10/core/CachingDeviceAllocator.h>

				#include <c10/core/Device.h>

				#include <c10/util/Exception.h>

				@ -152,36 +151,6 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {

				  }

				  virtual bool isAvailable() const override;

				  /* MTIAGraph related APIs */

				  // Base-class default: invokes FAIL_MTIAHOOKS_FUNC with this function's name.
				  // NOTE(review): the macro is defined outside this chunk and presumably
				  // raises; the `return -1` would then only be a placeholder value.

				  virtual int64_t mtiagraphCreate(bool keep_graph = false) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				    return -1;

				  }

				  // Base-class default: ignores `handle` and `pool` and invokes
				  // FAIL_MTIAHOOKS_FUNC with this function's name (macro defined elsewhere;
				  // presumably raises — confirm).

				  virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				  }

				  // Base-class default: ignores `handle` and invokes FAIL_MTIAHOOKS_FUNC with
				  // this function's name (macro defined elsewhere; presumably raises — confirm).

				  virtual void mtiagraphCaptureEnd(int64_t handle) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				  }

				  // Base-class default: ignores `handle` and invokes FAIL_MTIAHOOKS_FUNC with
				  // this function's name (macro defined elsewhere; presumably raises — confirm).

				  virtual void mtiagraphInstantiate(int64_t handle) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				  }

				  // Base-class default: ignores `handle` and invokes FAIL_MTIAHOOKS_FUNC with
				  // this function's name (macro defined elsewhere; presumably raises — confirm).

				  virtual void mtiagraphReplay(int64_t handle) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				  }

				  // Base-class default: ignores `handle` and invokes FAIL_MTIAHOOKS_FUNC with
				  // this function's name (macro defined elsewhere; presumably raises — confirm).

				  virtual void mtiagraphReset(int64_t handle) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				  }

				  // Base-class default: ignores `handle` and invokes FAIL_MTIAHOOKS_FUNC with
				  // this function's name (macro defined elsewhere; presumably raises — confirm).
				  // The value-initialized MempoolId_t returned afterwards is only a
				  // placeholder, mirroring the `return -1` in mtiagraphCreate, so that control
				  // can never flow off the end of a non-void function.

				  virtual MempoolId_t mtiagraphPool(int64_t handle) const {

				    FAIL_MTIAHOOKS_FUNC(__func__);

				    return MempoolId_t{};

				  }

				};

				struct TORCH_API MTIAHooksArgs {};

									
										8

aten/src/ATen/functorch/BatchRulesViews.cpp
									
												View File
												
				@ -534,20 +534,20 @@ Tensor trace_decomp(const Tensor& tensor) {

				std::tuple<Tensor, std::optional<int64_t>> tril_batch_rule(

				    const Tensor& self,

				    std::optional<int64_t> self_bdim,

				    c10::SymInt diagonal = 0) {

				    int64_t diagonal = 0) {

				  TORCH_CHECK(self.dim() >= 2, "tril: The input tensor must have at least 2 dimensions.");

				  auto self_ = moveBatchDimToFront(self, self_bdim);

				  auto result = at::tril_symint(self_, std::move(diagonal));

				  auto result = at::tril(self_, diagonal);

				  return std::make_tuple(std::move(result), 0);

				}

				std::tuple<Tensor, std::optional<int64_t>> triu_batch_rule(

				    const Tensor& self,

				    std::optional<int64_t> self_bdim,

				    c10::SymInt diagonal = 0) {

				    int64_t diagonal = 0) {

				  TORCH_CHECK(self.dim() >= 2, "triu: The input tensor must have at least 2 dimensions.");

				  auto self_ = moveBatchDimToFront(self, self_bdim);

				  auto result = at::triu_symint(self_, std::move(diagonal));

				  auto result = at::triu(self_, diagonal);

				  return std::make_tuple(std::move(result), 0);

				}

9

aten/src/ATen/mps/MPSDevice.mm

View File

 @ -1,5 +1,7 @@
 //  Copyright © 2022 Apple Inc.
 #include <c10/util/CallOnce.h>
 #include <ATen/mps/IndexKernels.h>
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSDevice.h>
 @ -8,6 +10,9 @@
 namespace at::mps {
 static std::unique_ptr<MPSDevice> mps_device;
 static c10::once_flag mpsdev_init;
 static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
   // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
   // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
 @ -16,8 +21,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
 }
 MPSDevice* MPSDevice::getInstance() {
   static MPSDevice mps_device;
   return &mps_device;
   c10::call_once(mpsdev_init, [] { mps_device = std::unique_ptr<MPSDevice>(new MPSDevice()); });
   return mps_device.get();
 }
 MPSDevice::~MPSDevice() {

									
										13

aten/src/ATen/native/AveragePool2d.cpp
									
												View File
												
				@ -25,19 +25,18 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d)

				  // #20866, #22032: Guarantee this for the official C++ API?

				  TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,

				    "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");

				  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);

				  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);

				  const int64_t kH = kernel_size[0];

				  const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1];

				  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,

				    "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");

				  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);

				  const int dW = stride.empty() ? kW :

				                 stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);

				  const int64_t dH = stride.empty() ? kH : stride[0];

				  const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1];

				  TORCH_CHECK(padding.size() == 1 || padding.size() == 2,

				    "avg_pool2d: padding must either be a single int, or a tuple of two ints");

				  const int padH = safe_downcast<int, int64_t>(padding[0]);

				  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

				  const int64_t padH = padding[0];

				  const int64_t padW = padding.size() == 1 ? padH : padding[1];

				  TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,

				    "divisor must be not zero");

									
										4

aten/src/ATen/native/Convolution.cpp
									
												View File
												
				@ -410,8 +410,8 @@ struct ConvParams {

				      return false;

				    }

				    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();

				    // broken on cuDNN 9.8 - 9.14

				    if (cudnn_version >= 90800 && cudnn_version < 91500) {

				    // broken on cuDNN 9.8

				    if (cudnn_version >= 90800) {

				      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&

				          (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&

				          weight.dim() == 5) {

									
										2

aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp
									
												View File
												
				@ -139,7 +139,7 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou

				      }

				    );

				  } else {

				    AT_DISPATCH_ALL_TYPES_AND(kHalf, dtype, "smooth_l1_backward_cpu_out", [&] {

				    AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {

				    auto norm_val = norm.to<scalar_t>();

				    scalar_t beta_val(beta);

				    auto norm_val_vec = Vectorized<scalar_t>(norm_val);

									
										10

aten/src/ATen/native/cuda/Blas.cpp
									
												View File
												
				@ -170,14 +170,10 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const

				  #if defined(CUDA_VERSION) || defined(USE_ROCM)

				  const auto scalar_type = mat1.scalar_type();

				  return (beta.toComplexDouble() == 1.0

				    // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]

				    // is to use lt interface only when self is bias.

				    && self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()

				    && result.dim() == 2 && result.is_contiguous()

				    // Conditions for bias to be fusable

				    && (

				      self.is_contiguous() &&

				      // NOTE: fine to have 1-len dims to the left from the right-most one

				      (self.dim() == 1 || self.squeeze().dim() == 1) &&

				      self.sizes().back() == mat2_sizes[1]

				    )

				    && ( // some dtype restrictions

				      #ifndef USE_ROCM

				      scalar_type == at::ScalarType::Double ||

									
										54

aten/src/ATen/native/cuda/GroupedBlas.cpp
									
												View File
												
				@ -213,9 +213,9 @@ _f4_f4_bf16_grouped_mm_fbgemm(

				      const Tensor& mat_a,

				      const Tensor& mat_b,

				      const Tensor& scale_a,

				      const std::optional<Tensor>& global_scale_a,

				      const Tensor& global_scale_a,

				      const Tensor& scale_b,

				      const std::optional<Tensor>& global_scale_b,

				      const Tensor& global_scale_b,

				      const std::optional<Tensor>& offs,

				      const std::optional<Tensor>& bias,

				      Tensor& out) {

				@ -225,28 +225,14 @@ _f4_f4_bf16_grouped_mm_fbgemm(

				      "mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());

				  TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,

				      "mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());

				  std::optional<Tensor> combined_global_scale = std::nullopt;

				  if (global_scale_a.has_value() || global_scale_b.has_value()) {

				      // NVFP4

				      TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),

				          "For NVFP4 grouped gemm both of global_scale_{a,b} must have values")

				      TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,

				          "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());

				      TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,

				          "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());

				      TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,

				          "global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());

				      TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,

				          "global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());

				      combined_global_scale = global_scale_a.value().mul(global_scale_b.value());

				  } else {

				      // MXFP4

				      TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,

				          "scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());

				      TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,

				          "scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());

				  }

				  TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,

				      "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());

				  TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,

				      "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());

				  TORCH_CHECK_VALUE(global_scale_a.scalar_type() == at::kFloat,

				      "global_scale_a must be Float, got: ", global_scale_a.scalar_type());

				  TORCH_CHECK_VALUE(global_scale_b.scalar_type() == at::kFloat,

				      "global_scale_b must be Float, got: ", global_scale_b.scalar_type());

				  auto o = fbgemm_gpu::f4f4bf16_grouped_mm(

				      mat_a,

				@ -255,7 +241,7 @@ _f4_f4_bf16_grouped_mm_fbgemm(

				      scale_b,

				      offs.value(),

				      out,

				      combined_global_scale

				      global_scale_a.mul(global_scale_b)

				  );

				#else

				  TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")

				@ -485,10 +471,9 @@ namespace {

				using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;

				std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{

				std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 3> scale_grouped_kernel_dispatch = {{

				  { "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},

				  { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},

				  { "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},

				  { "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};

				} // anonymous namespace

				@ -614,21 +599,6 @@ _scaled_grouped_mm_cuda_v2(

				          offs.value(),

				          out);

				    }

				    case ScaledGemmImplementation::MXFP4_MXFP4: {

				      // scale shape checks

				      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);

				      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);

				      return _f4_f4_bf16_grouped_mm_fbgemm(

				          mat_a,

				          mat_b,

				          scale_a[0], /* block-scale A */

				          std::nullopt, /* global-scale A */

				          scale_b[0], /* block-scale B */

				          std::nullopt, /* global-scale B */

				          offs.value(),

				          std::nullopt, /* bias */

				          out);

				    }

				    case ScaledGemmImplementation::NVFP4_NVFP4: {

				      // scale shape checks

				      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);

2

aten/src/ATen/native/cuda/IndexKernelUtils.cu

View File

 @ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
     if (allow_neg_indices) {
         ind = (ind < 0) ? ind + ind_dim_size : ind;
     }
     CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
     CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
     int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
     if (off >= slice_size) return;
     auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);

									
										50

aten/src/ATen/native/cuda/ScaledBlas.cpp
									
												View File
												
				@ -794,24 +794,6 @@ void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const Sc

				  }

				}

				// Checks that DeepSeek-style (1x128 / 128x128) block scaling is usable.
				// On non-ROCm builds this requires a device whose compute-capability major
				// version is 9 and a cuBLAS / cublasLt version of at least 12.9 (both the
				// compile-time CUBLAS_VERSION and the runtime cublasLtGetVersion()).
				// TORCH_CHECK_NOT_IMPLEMENTED is invoked with the *required* condition, so it
				// reports an error when the condition is false. Under USE_ROCM the body is
				// compiled out and the function does nothing.

				void

				_check_deepseek_support() {

				#ifndef USE_ROCM

				  auto dprops = at::cuda::getCurrentDeviceProperties();

				  // Only on Hopper GPUs

				  TORCH_CHECK_NOT_IMPLEMENTED(

				    dprops->major == 9,

				    "DeepSeek style (1x128, 128x128) scaling only supported in CUDA for SM90");

				  // Only in cublasLt >= 12.9. The condition states what must hold; the

				  // previous form (`< 120900 || < 120900`) was inverted and rejected exactly

				  // the supported versions while accepting the unsupported ones.

				  TORCH_CHECK_NOT_IMPLEMENTED(

				    CUBLAS_VERSION >= 120900 && cublasLtGetVersion() >= 120900,

				    "DeepSeek style (1x128, 128x128) scaling requires cublasLt >= 12.9"

				  );

				#endif

				}

				Tensor&

				_scaled_block1x128_block1x128(

				          const Tensor& mat_a, const Tensor& mat_b,

				@ -820,12 +802,8 @@ _scaled_block1x128_block1x128(

				          const c10::ScalarType out_dtype,

				          const bool use_fast_accum,

				          Tensor& out) {

				#ifndef USE_ROCM

				  // Restrictions:

				  // A, B are FP8, scales are fp32, shape K//128

				  // CUDA: Only Hopper GPUs

				  _check_deepseek_support();

				  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",

				      mat_a.scalar_type(), mat_b.scalar_type());

				  TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,

				@ -843,12 +821,6 @@ _scaled_block1x128_block1x128(

				  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

				  return out;

				#else

				  TORCH_CHECK_NOT_IMPLEMENTED(

				    false,

				    "1x128 and 128x128 scaling not available with ROCm"

				  );

				#endif

				}

				Tensor&

				@ -859,12 +831,10 @@ _scaled_block128x128_block1x128(

				          const c10::ScalarType out_dtype,

				          const bool use_fast_accum,

				          Tensor& out) {

				#ifndef USE_ROCM

				  // Restrictions:

				  // A, B are FP8, scales are fp32, shape K//128

				  // CUDA: Only Hopper GPUs

				  _check_deepseek_support();

				  std::cout << "mat_b: " << mat_b.dim() << ", " << mat_b.sizes() << ", " << mat_b.strides() << std::endl;

				  std::cout << "scale_b: " << scale_b.dim() << ", " << scale_b.sizes() << ", " << scale_b.strides() << std::endl;

				  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",

				      mat_a.scalar_type(), mat_b.scalar_type());

				  TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,

				@ -882,12 +852,6 @@ _scaled_block128x128_block1x128(

				  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

				  return out;

				#else

				  TORCH_CHECK_NOT_IMPLEMENTED(

				    false,

				    "1x128 and 128x128 scaling not available with ROCm"

				  );

				#endif

				}

				Tensor&

				@ -898,12 +862,8 @@ _scaled_block1x128_block128x128(

				          const c10::ScalarType out_dtype,

				          const bool use_fast_accum,

				          Tensor& out) {

				#ifndef USE_ROCM

				  // Restrictions:

				  // A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128

				  // CUDA: Only Hopper GPUs

				  _check_deepseek_support();

				  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",

				      mat_a.scalar_type(), mat_b.scalar_type());

				  TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,

				@ -921,12 +881,6 @@ _scaled_block1x128_block128x128(

				  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

				  return out;

				#else

				  TORCH_CHECK_NOT_IMPLEMENTED(

				    false,

				    "1x128 and 128x128 scaling not available with ROCm"

				  );

				#endif

				}

				Tensor&

9

aten/src/ATen/native/cuda/ScatterGatherKernel.cu

View File

 @ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
       auto offsets = offset_calc.get(i);
       int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
       CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
         && "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
       CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
         && "scatter gather kernel index out of bounds");
       f(
         (scalar_t*)(self_ptr + offsets[0]),
 @ -406,8 +406,9 @@ struct _cuda_scatter_fill_internal_kernel {
       auto offsets = offset_calc.get(i);
       int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
       CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
         && "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
       CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
         && "index out of bounds"
       );
       f(
         (scalar_t*)(self_ptr + offsets[0]),

6

aten/src/ATen/native/cuda/layer_norm_kernel.cu

View File

 @ -141,8 +141,7 @@ WelfordDataLN cuWelfordOnlineSum(
   if constexpr (!rms_norm){
     U delta = val - curr_sum.mean;
     U new_count = curr_sum.count + 1.f;
 //Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
 #if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
 #if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
     U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
 #else
     U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
 @ -164,8 +163,7 @@ WelfordDataLN cuWelfordCombine(
     U count = dataA.count + dataB.count;
     U mean, sigma2;
     if (count > decltype(dataB.count){0}) {
 //Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
 #if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
 #if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
       auto coef = __builtin_amdgcn_rcpf(count);
 #else
       auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division

									
										28

aten/src/ATen/native/mps/kernels/BinaryKernel.metal
									
												View File
												
				@ -86,28 +86,6 @@ struct zeta_functor {

				  }

				};

				// Binary functor that forwards to c10::metal::logaddexp.
				// Two overloads are selected by the element type T:
				//  - floating-point T: operands are passed through and the result is T;
				//  - integral T: both operands are converted to float and the result is float.
				struct logaddexp_functor {

				  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>

				  inline T operator()(const T a, const T b) {

				    return c10::metal::logaddexp(a, b);

				  }

				  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>

				  inline float operator()(const T a, const T b) {

				    return c10::metal::logaddexp(float(a), float(b));

				  }

				};

				// Binary functor that forwards to c10::metal::logaddexp2.
				// Two overloads are selected by the element type T:
				//  - floating-point T: operands are passed through and the result is T;
				//  - integral T: both operands are converted to float and the result is float.
				struct logaddexp2_functor {

				  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>

				  inline T operator()(const T a, const T b) {

				    return c10::metal::logaddexp2(a, b);

				  }

				  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>

				  inline float operator()(const T a, const T b) {

				    return c10::metal::logaddexp2(float(a), float(b));

				  }

				};

				struct xlog1py_functor {

				  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>

				  inline T operator()(const T a, const T b) {

				@ -399,10 +377,6 @@ REGISTER_FLOAT_BINARY_OP(fmin);

				REGISTER_FLOAT_BINARY_OP(nextafter);

				REGISTER_FLOAT_BINARY_OP(zeta);

				REGISTER_INT2FLOAT_BINARY_OP(zeta);

				REGISTER_FLOAT_BINARY_OP(logaddexp);

				REGISTER_INT2FLOAT_BINARY_OP(logaddexp);

				REGISTER_FLOAT_BINARY_OP(logaddexp2);

				REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);

				REGISTER_FLOAT_BINARY_OP(xlog1py);

				REGISTER_INT2FLOAT_BINARY_OP(xlog1py);

				REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);

				@ -489,8 +463,6 @@ REGISTER_BINARY_OP(add, float2, float2);

				REGISTER_BINARY_OP(add, half2, half2);

				REGISTER_BINARY_OP(sub, float2, float2);

				REGISTER_BINARY_OP(sub, half2, half2);

				REGISTER_BINARY_OP(logaddexp, float2, float2);

				REGISTER_BINARY_OP(logaddexp, half2, half2);

				REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);

				REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);

				REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);

10

aten/src/ATen/native/mps/operations/BinaryKernel.mm

View File

 @ -89,14 +89,6 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
   lib.exec_binary_kernel(iter, "zeta");
 }
 // Executes the binary kernel named "logaddexp" from `lib` over `iter`.
 static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
   lib.exec_binary_kernel(iter, "logaddexp");
 }
 // Executes the binary kernel named "logaddexp2" from `lib` over `iter`.
 static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
   lib.exec_binary_kernel(iter, "logaddexp2");
 }
 static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
   TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
   lib.exec_binary_kernel(iter, "xlog1py");
 @ -219,8 +211,6 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
 REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
 REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
 REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
 REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
 REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
 REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)

26

aten/src/ATen/native/mps/operations/BinaryOps.mm

View File

 @ -17,6 +17,8 @@
 #include <ATen/ops/ge_native.h>
 #include <ATen/ops/gt_native.h>
 #include <ATen/ops/le_native.h>
 #include <ATen/ops/logaddexp2_native.h>
 #include <ATen/ops/logaddexp_native.h>
 #include <ATen/ops/logical_and_native.h>
 #include <ATen/ops/logical_or_native.h>
 #include <ATen/ops/logical_xor_native.h>
 @ -275,6 +277,30 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
   }
 }
 // Structured-kernel implementation of logaddexp for MPS.
 // Builds an MPSGraph block that applies exponentWithTensor to each cast
 // operand, adds the two results, and applies logarithmWithTensor to the sum;
 // the block is then handed to mps::binaryOpTensor together with the
 // "logaddexp_out_mps" op name.
 // NOTE(review): this is the direct log(exp(a) + exp(b)) formulation with no
 // max-subtraction; presumably the exponentials overflow for large inputs --
 // confirm whether a numerically stable form is needed here.
 TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
   mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
     MPSGraph* mpsGraph = cachedGraph->graph();
     MPSGraphTensor* sumTensor =
         [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
                             secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
                                        name:nil];
     return [mpsGraph logarithmWithTensor:sumTensor name:nil];
   };
   mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
 }
 // Structured-kernel implementation of logaddexp2 for MPS.
 // Builds an MPSGraph block that applies exponentBase2WithTensor to each cast
 // operand, adds the two results, and applies logarithmBase2WithTensor to the
 // sum; the block is then handed to mps::binaryOpTensor together with the
 // "logaddexp2_out_mps" op name.
 // NOTE(review): this is the direct log2(2^a + 2^b) formulation with no
 // max-subtraction; presumably the powers overflow for large inputs --
 // confirm whether a numerically stable form is needed here.
 TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
   mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
     MPSGraph* mpsGraph = cachedGraph->graph();
     MPSGraphTensor* sumTensor =
         [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
                             secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
                                        name:nil];
     return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
   };
   mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
 }
 TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
   mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
     MPSGraph* mpsGraph = cachedGraph->graph();

8

aten/src/ATen/native/mps/operations/LossOps.mm

View File

 @ -370,7 +370,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg,
                                                      onValue:-1.0f
                                                     offValue:0.0f
                                                         name:nil];
       oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, [inputTensor dataType]);
       oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, inputTensor.dataType);
       if (isWeightsArrayValid) {
         oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
                                                  secondaryTensor:weightTensor
 @ -705,7 +705,6 @@ static void smooth_l1_loss_template(const Tensor& input,
   TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.");
   TORCH_CHECK(input.is_mps());
   TORCH_CHECK(target.is_mps());
   TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "MPS doesn't know how to do square_i64");
   if ((input.numel() == 0) || (target.numel() == 0)) {
     reduction == Reduction::Mean ? output.fill_(std::numeric_limits<float>::quiet_NaN()) : output.zero_();
     return;
 @ -772,7 +771,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
       MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
       MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
       MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:[inputTensor dataType]];
       MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:MPSDataTypeFloat32];
       // xn - yn
       MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
                                                           secondaryTensor:targetTensor
 @ -798,8 +797,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
                                                                   name:@"lossTensor"];
       MPSGraphTensor* outputTensor = lossTensor;
       if (reduction == Reduction::Mean) {
         MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel()
                                                           dataType:[lossTensor dataType]];
         MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel() dataType:MPSDataTypeFloat32];
         outputTensor = [mpsGraph divisionWithPrimaryTensor:lossTensor secondaryTensor:numelTensor name:nil];
       }
       MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor

4

aten/src/ATen/native/mps/operations/Normalization.mm

View File

 @ -84,9 +84,6 @@ std::tuple<Tensor&, Tensor&, Tensor&> batch_norm_mps_out(const Tensor& self,
                                                          Tensor& output,
                                                          Tensor& save_mean,
                                                          Tensor& save_var) {
   TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "Long batch norm is not supported with MPS");
   TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()),
                               "Batch norm for complex is not supported for MPS");
   using namespace at::native::mps;
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
 @ -921,7 +918,6 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int axis = input_ndim - normalized_ndim;
   MPSStream* stream = getCurrentMPSStream();
   TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
   @autoreleasepool {
     mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
       // which kernel variant to use based on the normalized axis N size

3

aten/src/ATen/native/mps/operations/ReduceOps.mm

View File

 @ -1028,18 +1028,15 @@ TORCH_IMPL_FUNC(prod_out_mps)
 }
 // Structured-kernel implementation of amax for MPS.
 // Rejects complex inputs up front, then delegates to reduction_out_mps with
 // MPSReductionType::AMAX and no explicit dtype (std::nullopt).
 TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
   TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
   reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
 }
 // Structured-kernel implementation of amin for MPS.
 // Rejects complex inputs up front, then delegates to reduction_out_mps with
 // MPSReductionType::AMIN and no explicit dtype (std::nullopt).
 TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
   TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
   reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
 }
 TORCH_IMPL_FUNC(aminmax_out_mps)
 (const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
   TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
   reduction_out_mps(input_t,
                     dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
                     keepdim,

1

aten/src/ATen/native/mps/operations/Sort.mm

View File

 @ -31,7 +31,6 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
     indices.copy_(values.toType(at::ScalarType::Long));
     return;
   }
   TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
   // issue #154890, raising error to prevent crash within MPSGraph until
   // workaround is implemented.
   TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");

									
										18

aten/src/ATen/native/native_functions.yaml
									
												View File
												
				@ -3622,7 +3622,8 @@

				  structured: True

				  structured_inherits: TensorIteratorBase

				  dispatch:

				    CPU, CUDA, MPS: logaddexp_out

				    CPU, CUDA: logaddexp_out

				    MPS: logaddexp_out_mps

				  tags: pointwise

				- func: logaddexp(Tensor self, Tensor other) -> Tensor

				@ -3634,7 +3635,8 @@

				  structured: True

				  structured_inherits: TensorIteratorBase

				  dispatch:

				    CPU, CUDA, MPS: logaddexp2_out

				    CPU, CUDA: logaddexp2_out

				    MPS: logaddexp2_out_mps

				  tags: pointwise

				- func: logaddexp2(Tensor self, Tensor other) -> Tensor

				@ -8865,11 +8867,11 @@

				  autogen: bitwise_right_shift.Scalar_Tensor_out

				  tags: pointwise

				- func: tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)

				- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)

				  structured_delegate: tril.out

				  variants: method

				- func: triu_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)

				- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)

				  structured_delegate: triu.out

				  variants: method

				@ -8993,25 +8995,25 @@

				- func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor

				  variants: method, function

				- func: triu.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)

				- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)

				  structured: True

				  dispatch:

				    CPU: triu_cpu

				    CUDA: triu_cuda

				    MPS: triu_mps_out

				- func: triu(Tensor self, SymInt diagonal=0) -> Tensor

				- func: triu(Tensor self, int diagonal=0) -> Tensor

				  structured_delegate: triu.out

				  variants: method, function

				- func: tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)

				- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)

				  structured: True

				  dispatch:

				    CPU: tril_cpu

				    CUDA: tril_cuda

				    MPS: tril_mps_out

				- func: tril(Tensor self, SymInt diagonal=0) -> Tensor

				- func: tril(Tensor self, int diagonal=0) -> Tensor

				  structured_delegate: tril.out

				  variants: method, function

									
										22

aten/src/ATen/native/sparse/SparseTensor.cpp
									
												View File
												
				@ -467,28 +467,6 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe

				      !options.has_layout() || options.layout() == kSparse,

				      "expected sparse layout, but got layout ",

				      options.layout());

				  if (indices.numel() > 0) {

				    Tensor min_indices =

				        std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));

				    Tensor cpu_min_indices;

				    if (!indices.is_cpu()) {

				      cpu_min_indices = min_indices.to(at::DeviceType::CPU);

				    } else {

				      cpu_min_indices = min_indices;

				    }

				    auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();

				    for (const auto d : c10::irange(indices.size(0))) {

				      int64_t min_index_in_dim = cpu_min_indices_accessor[d];

				      TORCH_CHECK(

				          min_index_in_dim >= 0,

				          "found negative index ",

				          min_index_in_dim,

				          " for dim ",

				          d);

				    }

				  }

				  return at::native::_sparse_coo_tensor_unsafe(

				      indices,

				      values,

									
										6

benchmarks/dynamo/common.py
									
												View File
												
				@ -1837,10 +1837,6 @@ class BenchmarkRunner:

				    def skip_models_for_cuda(self):

				        return set()

				    @property

				    def skip_models_for_xpu(self):

				        return set()

				    @property

				    def skip_models_for_cpu(self):

				        return set()

				@ -3931,8 +3927,6 @@ def run(runner, args, original_dir=None):

				            runner.skip_models.update(runner.skip_models_for_cpu_aarch64)

				    elif args.devices == ["cuda"]:

				        runner.skip_models.update(runner.skip_models_for_cuda)

				    elif args.devices == ["xpu"]:

				        runner.skip_models.update(runner.skip_models_for_xpu)

				    if not args.multiprocess:

				        runner.skip_models.update(runner.skip_multiprocess_models)

									
										59

benchmarks/dynamo/genai_layers/benchmark.py
									
												View File
												
				@ -56,20 +56,6 @@ def list_benchmarks():

				    print(f"Available benchmarks: {list(BENCHMARK_REGISTRY.keys())}")

				def _run_benchmark(

				    benchmark_cls,

				    script_args,

				):

				    benchmark = benchmark_cls(script_args)

				    benchmark.benchmark()

				    benchmark.report_geomean_speedup()

				    if script_args.print_benchmark_result:

				        print(f"Benchmarking results {benchmark.name}:")

				        print(benchmark.profiling_results)

				    if script_args.visualize:

				        benchmark.visualize()

				def run_benchmark(

				    benchmark_name: str,

				    script_args,

				@ -85,7 +71,10 @@ def run_benchmark(

				    print("=" * 60)

				    benchmark_class = BENCHMARK_REGISTRY[benchmark_name]

				    _run_benchmark(benchmark_class, script_args)

				    benchmark = benchmark_class(script_args)

				    benchmark.benchmark()

				    if script_args.visualize:

				        benchmark.visualize()

				    return True

				@ -98,7 +87,10 @@ def run_all_benchmarks(script_args):

				    for name, cls in BENCHMARK_REGISTRY.items():

				        print(f"\n{'=' * 20} {name.upper()} {'=' * 20}")

				        _run_benchmark(cls, script_args)

				        benchmark = cls(script_args)

				        benchmark.benchmark()

				        if script_args.visualize:

				            benchmark.visualize()

				        print()

				@ -157,43 +149,8 @@ Examples:

				        help="Whether to exit with an error message for accuracy failure",

				    )

				    parser.add_argument(

				        "--print-benchmark-result",

				        action="store_true",

				        help="Whether to print the raw benchmarking result. Easier to quickly check the benchmark results on a server without GUI",

				    )

				    parser.add_argument(

				        "--custom-compile-name",

				        type=str,

				        default=None,

				        help="Name for the curve with customized compilation options",

				    )

				    parser.add_argument(

				        "--custom-compile-options",

				        type=str,

				        default=None,

				        help="Json string for the custom compile options.",

				    )

				    args = parser.parse_args()

				    if args.custom_compile_options:

				        import json

				        try:

				            args.custom_compile_options = json.loads(args.custom_compile_options)

				        except json.decoder.JSONDecodeError as e:

				            raise RuntimeError(

				                f"Invalid json string for --custom-compile-options: {args.custom_compile_options}"

				            ) from e

				        if not args.custom_compile_options:

				            raise RuntimeError("Found no options for --custom-compile-options")

				        if not args.custom_compile_name:

				            raise RuntimeError("Missing label name for the custom compilation")

				    # Handle list option

				    if args.list:

				        list_benchmarks()

									
										18

benchmarks/dynamo/genai_layers/kernels.py
									
												View File
												
				@ -8,15 +8,6 @@ import torch

				import torch.nn.functional as F

				# more important shapes used by internal models

				extra_shapes_for_norm = (

				    (1152 * 500, 384),

				    (1152 * 500, 512),

				    (1152 * 1000, 384),

				    (1152 * 1000, 512),

				)

				class CrossEntropyForward(BenchmarkKernel):

				    def __init__(self, script_args):

				        super().__init__(script_args)

				@ -355,7 +346,7 @@ class RMSNormForward(BenchmarkKernel):

				            (32768, 65536),

				            (16384, 131072),

				            (8192, 262144),

				        ) + extra_shapes_for_norm

				        )

				    def get_memory_bytes(self, args, kwargs) -> int:

				        x, w = args

				@ -447,7 +438,8 @@ class RMSNormBackward(BenchmarkKernel):

				            (32768, 4096),

				            (32768, 8192),

				            (32768, 16384),

				        ) + extra_shapes_for_norm

				            (32768, 32768),

				        )

				    def get_memory_bytes(self, args, kwargs) -> int:

				        x, w, dy = args

				@ -561,7 +553,7 @@ class LayerNormForward(BenchmarkKernel):

				            (32768, 16384),

				            (32768, 32768),

				            (32768, 65536),

				        ) + extra_shapes_for_norm

				        )

				    def get_memory_bytes(self, args, kwargs) -> int:

				        x, w = args

				@ -635,7 +627,7 @@ class LayerNormBackward(BenchmarkKernel):

				            (32768, 16384),

				            (32768, 32768),

				            (32768, 65536),

				        ) + extra_shapes_for_norm

				        )

				    def get_memory_bytes(self, args, kwargs) -> int:

				        x, w, dy = args

									
										90

benchmarks/dynamo/genai_layers/utils.py
									
												View File
												
				@ -6,7 +6,6 @@ from dataclasses import dataclass

				from typing import Any, Optional

				import matplotlib.pyplot as plt

				from scipy.stats import gmean

				import torch

				from torch._inductor.runtime.benchmarking import benchmarker

				@ -108,18 +107,6 @@ class BenchmarkKernel:

				        for backend in self.available_backends:

				            args_ref, kwargs_ref = self.clone_inputs(args, kwargs)

				            res[backend] = getattr(self, backend)(args_ref, kwargs_ref)()

				        if (

				            "compiled" in self.available_backends

				            and self.script_args.custom_compile_options

				        ):

				            torch._dynamo.reset()  # cause recompile

				            with torch._inductor.config.patch(self.script_args.custom_compile_options):

				                args_ref, kwargs_ref = self.clone_inputs(args, kwargs)

				                res[self.script_args.custom_compile_name] = self.compiled(

				                    args_ref, kwargs_ref

				                )()

				        gold = res["eager"]

				        tol = {}

				@ -128,7 +115,7 @@ class BenchmarkKernel:

				                "atol": self.script_args.tolerance,

				                "rtol": self.script_args.tolerance,

				            }

				        for backend in res:

				        for backend in self.available_backends:

				            if backend == "eager":

				                continue

				            try:

				@ -147,83 +134,37 @@ class BenchmarkKernel:

				                    print("Exit right away since --exit-on-accuracy-failure is set")

				                    sys.exit(1)

				    def benchmark_single_shape_for_backend(
				        self, backend, args, kwargs, setting, fn=None
				    ) -> bool:
				        """Benchmark one backend on one input setting and record the result.

				        Args:
				            backend: Label under which the measurement is stored in
				                ``self.profiling_results``. When ``fn`` is not given it is
				                also the name of the method on ``self`` that builds the
				                callable to time.
				            args, kwargs: Inputs; they are cloned via ``self.clone_inputs``
				                before use.
				            setting: Description of the input setting, used in messages and
				                stored with the measurement.
				            fn: Optional factory used instead of ``getattr(self, backend)``.

				        Returns:
				            True when a measurement was recorded, False when running the
				            backend raised. On failure the backend is dropped from
				            ``self.available_backends`` if it is listed there.
				        """
				        if fn is None:
				            fn = getattr(self, backend)
				        args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
				        try:
				            avg_time = benchmark_kernel_in_milliseconds(fn(args_ref, kwargs_ref))
				        except Exception as e:
				            print(
				                f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
				            )
				            # A label benchmarked through an explicit `fn` (e.g. the custom
				            # compile name) need not be a member of available_backends; an
				            # unconditional remove() would raise ValueError and hide the
				            # failure status from the caller.
				            if backend in self.available_backends:
				                self.available_backends.remove(backend)  # noqa: B909
				            return False
				        mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
				        perf = Performance(setting, avg_time, mem_bytes)
				        print(f"{self.name} kernel on {backend} backend. {perf}")
				        self.profiling_results[backend].append(perf)
				        return True

				    def benchmark_single_shape(

				        self, args, kwargs=None, should_check_accuracy=True, setting: str = ""

				    ):

				        for backend in self.available_backends:

				            self.benchmark_single_shape_for_backend(backend, args, kwargs, setting)

				        if (

				            "compiled" in self.available_backends

				            and self.script_args.custom_compile_options

				        ):

				            torch._dynamo.reset()  # cause recompile

				            with torch._inductor.config.patch(self.script_args.custom_compile_options):

				                status = self.benchmark_single_shape_for_backend(

				                    self.script_args.custom_compile_name,

				                    args,

				                    kwargs,

				                    setting,

				                    fn=self.compiled,

				            args_ref, kwargs_ref = self.clone_inputs(args, kwargs)

				            try:

				                avg_time = benchmark_kernel_in_milliseconds(

				                    getattr(self, backend)(args_ref, kwargs_ref)

				                )

				            if not status:

				                self.script_args.custom_compile_options = (

				                    None  # once fail, don't run again

				            except Exception as e:

				                print(

				                    f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"

				                )

				                self.available_backends.remove(backend)  # noqa: B909

				                continue

				            mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)

				            perf = Performance(setting, avg_time, mem_bytes)

				            print(f"{self.name} kernel on {backend} backend. {perf}")

				            self.profiling_results[backend].append(perf)

				        if should_check_accuracy:

				            self.check_accuracy(args, kwargs)

				    def visualize(self) -> None:

				        device_name = torch.cuda.get_device_name(0)

				        visualize_comparison(

				            self.profiling_results,

				            title=f"{self.name} ({device_name})",

				            title=f"{self.name}",

				            output_path=f"{self.name}_bench",

				        )

				        return

				    def report_geomean_speedup(self) -> None:
				        """Print the geometric-mean speedup of every non-eager backend.

				        Eager measurements are indexed by their ``setting``; each other
				        backend's measurements are compared against the eager latency for
				        the same setting. A backend latency of 0 contributes a speedup of
				        0.0. Backends with no measurements print nothing.
				        """
				        print(f"Geomean speedup for benchmark {self.name}")

				        # Index the eager baseline by setting for per-measurement lookup.
				        eager_result = {}
				        for baseline in self.profiling_results["eager"]:
				            eager_result[baseline.setting] = baseline
				        print(f"  eager {len(eager_result)} data points")

				        for backend, measurements in self.profiling_results.items():
				            if backend == "eager":
				                continue

				            speeduplist = []
				            for measurement in measurements:
				                # Look up the baseline first so a missing setting raises
				                # regardless of the backend latency.
				                baseline_latency = eager_result[measurement.setting].latency
				                own_latency = measurement.latency
				                if own_latency != 0:
				                    speeduplist.append(baseline_latency / own_latency)
				                else:
				                    speeduplist.append(0.0)

				            if len(speeduplist) == 0:
				                continue
				            print(
				                f"  {backend} {len(speeduplist)} data points, {gmean(speeduplist):.2f}x speedup"
				            )

				def get_backend_colors() -> dict[str, str]:

				    """Get consistent color scheme for different backends."""

				@ -311,6 +252,5 @@ def visualize_comparison(

				        os.makedirs("pics", exist_ok=True)

				        full_path = os.path.join("pics", output_path + ".png")

				        plt.savefig(full_path, dpi=300, bbox_inches="tight", facecolor="white")

				        print(f"Chart saved to {full_path}")

				    plt.close()

									
										7

benchmarks/dynamo/timm_models.py
									
												View File
												
				@ -74,8 +74,7 @@ REQUIRE_HIGHER_TOLERANCE = {

				REQUIRE_HIGHER_TOLERANCE_AMP = {}

				REQUIRE_EVEN_HIGHER_TOLERANCE = {

				    "deit_base_distilled_patch16_224",

				    "vit_base_patch16_siglip_256",

				    "beit_base_patch16_224",

				}

				# These models need higher tolerance in MaxAutotune mode

				@ -355,9 +354,7 @@ class TimmRunner(BenchmarkRunner):

				        if is_training:

				            from torch._inductor import config as inductor_config

				            if name == "beit_base_patch16_224":

				                tolerance = 16 * 1e-2

				            elif name in REQUIRE_EVEN_HIGHER_TOLERANCE or (

				            if name in REQUIRE_EVEN_HIGHER_TOLERANCE or (

				                inductor_config.max_autotune

				                and name in REQUIRE_EVEN_HIGHER_TOLERANCE_MAX_AUTOTUNE

				            ):

									
										4

benchmarks/dynamo/torchbench.py
									
												View File
												
				@ -124,10 +124,6 @@ class TorchBenchmarkRunner(BenchmarkRunner):

				    def skip_models_for_cuda(self):

				        return self._skip["device"]["cuda"]

				    @property

				    def skip_models_for_xpu(self):

				        return self._skip["device"]["xpu"]

				    @property

				    def skip_models_for_freezing_cuda(self):

				        return self._skip["freezing"]["cuda"]

									
										3

benchmarks/dynamo/torchbench.yaml
									
												View File
												
				@ -217,9 +217,6 @@ skip:

				    cuda: []

				    xpu:

				      - *DETECTRON2_MODELS

				  test:

				    training:

				      - *DETECTRON2_MODELS

									
										6

c10/cuda/CUDAStream.cpp
									
												View File
												
				@ -15,6 +15,7 @@ namespace c10::cuda {

				namespace {

				// Global stream state and constants

				c10::once_flag init_flag;

				DeviceIndex num_gpus = -1;

				constexpr int kStreamsPerPoolBits = 5;

				constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;

				@ -225,10 +226,7 @@ void initDeviceStreamState(DeviceIndex device_index) {

				// Init front-end to ensure initialization only occurs once

				void initCUDAStreamsOnce() {

				  // Inits default streams (once, globally)

				  auto static init_flag [[maybe_unused]] = [] {

				    initGlobalStreamState();

				    return true;

				  }();

				  c10::call_once(init_flag, initGlobalStreamState);

				  if (current_streams) {

				    return;

									
										60

c10/metal/special_math.h
									
												View File
												
				@ -1,4 +1,4 @@

				// Implementation of special math functions for Metal

				// Implementation of special math functions for Metal

				#pragma once

				#include <c10/metal/expm1f.h>

				#include <c10/metal/igamma.h>

				@ -624,64 +624,6 @@ inline T spherical_bessel_j0(T x) {

				  return static_cast<T>(::metal::sin(x) / x);

				}

				// logaddexp for scalar floating-point T, i.e. log(exp(a) + exp(b)),
				// evaluated in float and cast back to T.
				// Uses the max-shifted form max(a, b) + log1p(exp(-|a - b|)) so the
				// exponential argument is never positive.
				template <typename T>

				inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(

				    T a,

				    T b) {

				  float a0 = static_cast<float>(a);

				  float b0 = static_cast<float>(b);

				  // Equal infinities are returned directly: a0 - b0 below would be
				  // inf - inf, which is NaN.
				  if (::metal::isinf(a0) && a0 == b0) {

				    return static_cast<T>(a0);

				  } else {

				    float m0 = ::metal::max(a0, b0);

				    return static_cast<T>(

				        m0 + ::c10::metal::log1p(::metal::exp(-::metal::abs(a0 - b0))));

				  }

				}

				// The function is ported from mlx

				template <typename T>

				inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {

				  if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||

				      ::metal::isnan(b.y)) {

				    return T(NAN, NAN);

				  }

				  T maxval = a.x > b.x ? a : b;

				  T minval = a.x < b.x ? a : b;

				  constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;

				  if (minval.x == -inf || maxval.x == inf) {

				    return maxval;

				  }

				  float2 maxval_ = static_cast<float2>(maxval);

				  float2 minval_ = static_cast<float2>(minval);

				  float m = ::metal::exp(minval_.x - maxval_.x);

				  float2 dexp{

				      m * ::metal::cos(minval_.y - maxval_.y),

				      m * ::metal::sin(minval_.y - maxval_.y),

				  };

				  return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));

				}

				template <typename T>

				inline T logaddexp2(T a, T b) {

				  constexpr auto log_2 = float(0.693147180559945309417232121458176);

				  constexpr auto inv_log_2 = float(1) / log_2;

				  float a0 = static_cast<float>(a);

				  float b0 = static_cast<float>(b);

				  if (::metal::isinf(a0) && a0 == b0) {

				    return static_cast<T>(a0);

				  } else {

				    float m0 = ::metal::max(a0, b0);

				    return static_cast<T>(

				        m0 +

				        ::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) *

				            inv_log_2);

				  }

				}

				template <typename T>

				inline float xlog1py(T x, T y) {

				  if (::metal::isnan(y)) {

									
										18

c10/metal/utils.h
									
												View File
												
				@ -322,24 +322,6 @@ inline float log1p(float x) {

				  return rc;

				}

				// The function is ported from mlx

				inline float2 log1p(float2 in) {

				  float x = in.x;

				  float y = in.y;

				  float zabs = ::metal::precise::sqrt(x * x + y * y);

				  float theta = ::metal::atan2(y, x + 1);

				  if (zabs < 0.5f) {

				    float r = x * (2 + x) + y * y;

				    if (r == 0) { // handle underflow

				      return {x, theta};

				    }

				    return {0.5f * log1p(r), theta};

				  } else {

				    auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);

				    return {::metal::log(z0), theta};

				  }

				}

				template <typename T1, typename T2 = T1>

				struct pair {

				  T1 first;

									
										2

c10/mobile/CPUProfilingAllocator.cpp
									
												View File
												
				@ -34,7 +34,7 @@ struct MemEvent {

				bool overlaps(const MemBlock& a, const MemBlock& b) {

				  // two blocks dont overlap if

				  // |---a--------|--------------b--------|

				  // start_a     end_a <= start_b       end_b

				  // strat_a     end_a <= start_b       end_b

				  return !(

				      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));

				}

									
										4

c10/test/util/TypeList_test.cpp
									
												View File
												
				@ -239,7 +239,7 @@ struct Class2 {

				struct mapper_call_func {

				  template <class T>

				  auto operator()(T) {

				  decltype(auto) operator()(T) {

				    return T::type::func();

				  }

				};

				@ -254,7 +254,7 @@ TEST(TypeListTest, MapTypesToValues_members) {

				struct mapper_call_nonexistent_function {

				  template <class T>

				  auto operator()(T) {

				  decltype(auto) operator()(T) {

				    return T::type::this_doesnt_exist();

				  }

				};

									
										2

c10/util/Bitset.h
									
												View File
												
				@ -33,7 +33,7 @@ struct bitset final {

				  constexpr bitset() noexcept = default;

				  constexpr bitset(const bitset&) noexcept = default;

				  constexpr bitset(bitset&&) noexcept = default;

				  // there is an issue for gcc 5.3.0 when define default function as constexpr

				  // there is an issure for gcc 5.3.0 when define default function as constexpr

				  // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.

				  bitset& operator=(const bitset&) noexcept = default;

				  bitset& operator=(bitset&&) noexcept = default;

									
										4

c10/util/C++17.h
									
												View File
												
				@ -53,7 +53,7 @@ namespace guts {

				// member functions.

				namespace detail {

				template <class F, class Tuple, std::size_t... INDEX>

				C10_HOST_DEVICE constexpr auto apply_impl(

				C10_HOST_DEVICE constexpr decltype(auto) apply_impl(

				    F&& f,

				    Tuple&& t,

				    std::index_sequence<INDEX...>) {

				@ -62,7 +62,7 @@ C10_HOST_DEVICE constexpr auto apply_impl(

				} // namespace detail

				template <class F, class Tuple>

				C10_HOST_DEVICE constexpr auto apply(F&& f, Tuple&& t) {

				C10_HOST_DEVICE constexpr decltype(auto) apply(F&& f, Tuple&& t) {

				  return detail::apply_impl(

				      std::forward<F>(f),

				      std::forward<Tuple>(t),

									
										2

c10/util/Exception.h
									
												View File
												
				@ -469,7 +469,7 @@ C10_API std::string GetExceptionString(const std::exception& e);

				namespace c10::detail {

				template <typename... Args>

				auto torchCheckMsgImpl(const char* /*msg*/, const Args&... args) {

				decltype(auto) torchCheckMsgImpl(const char* /*msg*/, const Args&... args) {

				  return ::c10::str(args...);

				}

				inline C10_API const char* torchCheckMsgImpl(const char* msg) {

									
										2

c10/util/StringUtil.h
									
												View File
												
				@ -135,7 +135,7 @@ struct _str_wrapper<> final {

				// Convert a list of string-like arguments into a single string.

				template <typename... Args>

				inline auto str(const Args&... args) {

				inline decltype(auto) str(const Args&... args) {

				  return detail::_str_wrapper<

				      typename detail::CanonicalizeStrTypes<Args>::type...>::call(args...);

				}

									
										2

c10/util/TypeList.h
									
												View File
												
				@ -507,7 +507,7 @@ struct map_types_to_values<typelist<Types...>> final {

				} // namespace detail

				template <class TypeList, class Func>

				auto map_types_to_values(Func&& func) {

				decltype(auto) map_types_to_values(Func&& func) {

				  return detail::map_types_to_values<TypeList>::call(std::forward<Func>(func));

				}

									
										20

c10/xpu/XPUCachingAllocator.cpp
									
												View File
												
				@ -554,17 +554,6 @@ class DeviceCachingAllocator {

				    }

				  }

				  double getMemoryFraction() {

				    if (!set_fraction) {

				      return 1.0;

				    }

				    c10::xpu::DeviceProp device_prop;

				    c10::xpu::get_device_properties(&device_prop, device_index);

				    return static_cast<double>(allowed_memory_maximum) /

				        static_cast<double>(device_prop.global_mem_size);

				  }

				  void setMemoryFraction(double fraction) {

				    c10::xpu::DeviceProp device_prop;

				    c10::xpu::get_device_properties(&device_prop, device_index);

				@ -735,11 +724,6 @@ class XPUAllocator : public DeviceAllocator {

				    device_allocators[device]->resetAccumulatedStats();

				  }

				  double getMemoryFraction(DeviceIndex device) {

				    assertValidDevice(device);

				    return device_allocators[device]->getMemoryFraction();

				  }

				  void setMemoryFraction(double fraction, DeviceIndex device) {

				    assertValidDevice(device);

				    TORCH_CHECK_VALUE(

				@ -793,10 +777,6 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {

				  return allocator.recordStream(dataPtr, stream);

				}

				double getMemoryFraction(DeviceIndex device) {

				  return allocator.getMemoryFraction(device);

				}

				void setMemoryFraction(double fraction, DeviceIndex device) {

				  return allocator.setMemoryFraction(fraction, device);

				}

									
										2

c10/xpu/XPUCachingAllocator.h
									
												View File
												
				@ -25,8 +25,6 @@ C10_XPU_API void raw_delete(void* ptr);

				C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);

				C10_XPU_API double getMemoryFraction(DeviceIndex device);

				C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);

				} // namespace c10::xpu::XPUCachingAllocator

									
										7

c10/xpu/XPUFunctions.cpp
									
												View File
												
				@ -1,3 +1,4 @@

				#include <c10/util/CallOnce.h>

				#include <c10/util/Exception.h>

				#include <c10/xpu/XPUFunctions.h>

				@ -32,6 +33,7 @@ namespace {

				 *    one iGPU and enumerate all iGPUs on that platform.

				 * 3. If neither dGPUs nor iGPUs are found, conclude that no GPUs are available.

				 */

				c10::once_flag init_flag;

				thread_local DeviceIndex curDeviceIndex = 0;

				struct DevicePool {

				@ -147,10 +149,7 @@ inline void initGlobalDevicePoolState() {

				}

				inline void initDevicePoolCallOnce() {

				  auto static init_flag [[maybe_unused]] = [] {

				    initGlobalDevicePoolState();

				    return true;

				  }();

				  c10::call_once(init_flag, initGlobalDevicePoolState);

				}

				void initDeviceProperties(DeviceProp* device_prop, DeviceIndex device) {

									
										6

c10/xpu/XPUStream.cpp
									
												View File
												
				@ -12,6 +12,7 @@ namespace c10::xpu {

				namespace {

				// Global stream state and constants

				c10::once_flag init_flag;

				DeviceIndex num_gpus = -1;

				constexpr int kStreamsPerPoolBits = 5;

				constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;

				@ -162,10 +163,7 @@ void initDeviceStreamState(DeviceIndex device) {

				}

				void initXPUStreamsOnce() {

				  auto static init_flag [[maybe_unused]] = [] {

				    initGlobalStreamState();

				    return true;

				  }();

				  c10::call_once(init_flag, initGlobalStreamState);

				  if (current_streams) {

				    return;

									
										10

caffe2/serialize/crc_alt.h
									
												View File
												
				@ -38,7 +38,7 @@ uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);

				/// compute CRC32 (bitwise algorithm)

				uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);

				/// compute CRC32 (half-byte algorithm)

				/// compute CRC32 (half-byte algoritm)

				uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);

				#ifdef CRC32_USE_LOOKUP_TABLE_BYTE

				@ -96,7 +96,7 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo

				  #define __BIG_ENDIAN    4321

				#endif

				// define endianness and some integer data types

				// define endianess and some integer data types

				#if defined(_MSC_VER) || defined(__MINGW32__)

				  // Windows always little endian

				  #define __BYTE_ORDER __LITTLE_ENDIAN

				@ -168,7 +168,7 @@ namespace

				  /// zlib's CRC32 polynomial

				  const uint32_t Polynomial = 0xEDB88320;

				  /// swap endianness

				  /// swap endianess

				  static inline uint32_t swap(uint32_t x)

				  {

				  #if defined(__GNUC__) || defined(__clang__)

				@ -229,7 +229,7 @@ uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)

				}

				/// compute CRC32 (half-byte algorithm)

				/// compute CRC32 (half-byte algoritm)

				uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)

				{

				  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF

				@ -662,7 +662,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)

				  // - if you append length(B) zeros to A and call it A' (think of it as AAAA000)

				  //   and   prepend length(A) zeros to B and call it B' (think of it as 0000BBB)

				  //   then exists a C' = A' ^ B'

				  // - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X

				  // - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X

				  // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')

				  // - the trick is to compute crc(A') based on crc(A)

				  //                       and crc(B') based on crc(B)

									
										2

caffe2/serialize/inline_container.h
									
												View File
												
				@ -76,7 +76,7 @@ typedef struct mz_zip_archive mz_zip_archive;

				// 2) Writing with 1-pass sequential access

				//      -> We must take care not to require updating values that have already

				//         been written. We place the variable-length index at the end and do

				//         not put any index into the header to fulfill this constraint.

				//         not put any indicies into the header to fulfill this constraint.

				// The model.json, which contains all the metadata information,

				// should be written as the last file. One reason is that the size of tensor

									
										2

caffe2/serialize/inline_container_test.cc
									
												View File
												
				@ -519,7 +519,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoadWithAllocator) {

				  std::tie(data_ptr, size) = reader.getRecord("key1", &overrideAllocator);

				  EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);

				  EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes);

				  // allocate with base allocator

				  // allcoate with base allocator

				  std::tie(data_ptr, size) = reader.getRecord("key1");

				  EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);

				  EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes + kBytes1);

									
										4

docs/source/autograd.md
									
												View File
												
				@ -423,10 +423,8 @@ Also see {ref}`saved-tensors-hooks-doc`.

				```{eval-rst}

				.. autofunction:: torch.autograd.graph.get_gradient_edge

				```

				```{eval-rst}

				.. autofunction:: torch.autograd.graph.set_warn_on_accumulate_grad_stream_mismatch

				```

				% This module needs to be documented. Adding here in the meantime

									
										54

docs/source/notes/libtorch_stable_abi.md
									
												View File
												
				@ -2,9 +2,9 @@

				## Overview

				The LibTorch Stable ABI (Application Binary Interface) provides a limited interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases. This limited set of APIs is not intended to replace existing LibTorch, but rather to provide a stable foundation for a majority of custom extension use cases. If there is any API you would like to see added to the stable ABI, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).

				The LibTorch Stable ABI (Application Binary Interface) provides an interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases.

				The limited stable ABI consists of three main components:

				The stable ABI consists of three main components:

				1. **Stable C headers** - Low-level C API implemented by libtorch (primarily `torch/csrc/inductor/aoti_torch/c/shim.h`)

				2. **Header-only C++ library** - Standalone utilities implemented in only headers such that there is no dependence on libtorch (`torch/headeronly/*`)

				@ -14,8 +14,8 @@ We discuss each of these in detail

				### `torch/headeronly`

				The inlined C++ headers living in [`torch/headeronly`](https://github.com/pytorch/pytorch/tree/main/torch/headeronly) are completely decoupled from LibTorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the

				`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`, as well as a libtorch-independent version of `TORCH_CHECK` that is `STD_TORCH_CHECK`. You can trust all APIs in the `torch::headeronly` namespace to not depend on `libtorch.so`. These APIs are also globally listed in [torch/header_only_apis.txt](https://github.com/pytorch/pytorch/blob/main/torch/header_only_apis.txt).

				This is a set of inlined C++ headers are completely decoupled from libtorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the

				`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`.

				### `torch/csrc/stable`

				@ -34,14 +34,8 @@ We are continuing to improve coverage in our `torch/csrc/stable` APIs. Please fi

				### Stable C headers

				The stable C headers started by AOTInductor form the foundation of the stable ABI. Presently, the available C headers include:

				- [torch/csrc/inductor/aoti_torch/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/c/shim.h): Includes C-style shim APIs for commonly used regarding Tensors, dtypes, CUDA, and the like.

				- [torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h): Includes C-style shim APIs for ATen ops from `native_functions.yaml` (e.g. `aoti_torch_aten_new_empty`).

				- [torch/csrc/inductor/aoti_torch/generated/c_shim_*.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated): Includes C-style shim APIs for specific backend kernels dispatched from `native_functions.yaml` (e.g. `aoti_torch_cuda_pad`). These APIs should only be used for the specific backend they are named after (e.g. `aoti_torch_cuda_pad` should only be used within CUDA kernels), as they opt out of the dispatcher.

				- [torch/csrc/stable/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/stable/c/shim.h): We are building out more ABIs to logically live in `torch/csrc/stable/c` instead of continuing the AOTI naming that no longer makes sense for our general use case.

				These headers are promised to be ABI stable across releases and adhere to a stronger backwards compatibility policy than LibTorch. Specifically, we promise not to modify them for at least 2 years after they are released. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs. Further, the stack-based APIs discussed below which allow the user to call into the PyTorch dispatcher do not provide strong guarantees on forward and backward compatibility of the underlying op that is called.

				The stable C headers used by AOTInductor form the foundation of the stable ABI. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs.

				 Further, the stack-based APIs discussed below which allow the user to call the PyTorch dispatcher don't provide strong guarantees on forward and backward compatibility.

				Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`

				which will handle all the rough edges of the C API for the user.

				@ -128,38 +122,12 @@ The above is relevant in two places:

				    }

				    ```

				2. `torch_call_dispatcher`

				2. `aoti_torch_call_dispatcher`

				    This API allows you to call the PyTorch dispatcher from C/C++ code. It has the following signature:

				    ```cpp

				    torch_call_dispatcher(const char* opName, const char* overloadName, StableIValue* stack, uint64_t extension_build_version);

				    aoti_torch_call_dispatcher(const char* opName, const char* overloadName, StableIValue* stack);

				    ```

				    `torch_call_dispatcher` will call the op overload defined by a given `opName`, `overloadName`, a stack of

				    StableIValues and the `TORCH_ABI_VERSION` of the user extension. This call will populate any return values of the

				    op into the stack in their StableIValue form, with `ret0` at index 0, `ret1` at index 1, and so on.

				    We caution against using this API to call functions that have been registered to the dispatcher by other extensions

				    unless the caller can guarantee that the signature they expect matches that which the custom extension has

				    registered.

				### Versioning and Forward/Backward compatibility guarantees

				We provide a `TORCH_ABI_VERSION` macro in `torch/headeronly/version.h` of the form

				```

				[ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ][ byte ]

				[MAJ   ][ MIN  ][PATCH ][                 ABI TAG              ]

				```

				In the present phase of development, APIs in the C-shim will be versioned based on major.minor.patch release that they are first introduced in, with 2.10 being the first release where this will be enforced. The ABI tag is reserved for future use.

				Extensions can select the minimum abi version to be compatible with using:

				```

				#define TORCH_TARGET_VERSION (((0ULL + major) << 56) | ((0ULL + minor) << 48))

				```

				before including any stable headers or by passing the equivalent `-D` option to the compiler. Otherwise, the default will be the current `TORCH_ABI_VERSION`.

				The above ensures that if a user defines `TORCH_TARGET_VERSION` to be 0x0209000000000000 (2.9) and attempts to use a C shim API `foo` that was introduced in version 2.10, a compilation error will be raised. Similarly, the C++ wrapper APIs in `torch/csrc/stable` are compatible with older libtorch binaries up to the TORCH_ABI_VERSION they are exposed in and forward compatible with newer libtorch binaries.

				    `aoti_torch_call_dispatcher` will call the op overload defined by a given `opName`, `overloadName`, and a stack of

				    StableIValues. This call will populate any return values of the op into the stack in their StableIValue form,

				    with `ret0` at index 0, `ret1` at index 1, and so on.

									
										1

docs/source/xpu.md
									
												View File
												
				@ -76,7 +76,6 @@

				    :nosignatures:

				     empty_cache

				     get_per_process_memory_fraction

				     max_memory_allocated

				     max_memory_reserved

				     mem_get_info

									
										2

setup.py
									
												View File
												
				@ -1106,7 +1106,7 @@ class build_ext(setuptools.command.build_ext.build_ext):

				                continue

				            self.copy_file(source_lib, target_lib)

				            # Delete old rpath and add @loader_lib to the rpath

				            # This should prevent deallocate from attempting to package another instance

				            # This should prevent delocate from attempting to package another instance

				            # of OpenMP library in torch wheel as well as loading two libomp.dylib into

				            # the address space, as libraries are cached by their unresolved names

				            install_name_tool_args = [

									
										2

test/distributed/_composable/fsdp/test_fully_shard_autograd.py
									
												View File
												
				@ -266,7 +266,7 @@ class TestFullyShardPostAccGradHookMultiThread(FSDPTestMultiThread):

				        model(inp).sum().backward()

				        param_names = {param_name for param_name, _ in model.named_parameters()}

				        self.assertEqual(param_names, set(param_name_to_hook_count.keys()))

				        for count in param_name_to_hook_count.values():

				        for param_name, count in param_name_to_hook_count.items():

				            self.assertEqual(count, 1)

									
										2

test/distributed/_composable/fsdp/test_fully_shard_training.py
									
												View File
												
				@ -827,7 +827,7 @@ class TestFullyShardShardPlacementFnMultiProcess(FSDPTest):

				        torch.manual_seed(42 + self.rank)

				        inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type.type)

				        for _ in range(5):

				        for iter_idx in range(5):

				            ref_loss = ref_model(inp).sum()

				            loss = model(inp).sum()

				            self.assertEqual(ref_loss, loss)

									
										1

test/distributed/elastic/multiprocessing/api_test.py
									
												View File
												
				@ -800,7 +800,6 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_CI):

				                    stderr_redirects={0: stderr_redir},

				                    ret_vals={0: queue},

				                    queue_finished_reading_event=worker_finished_event_mock,

				                    numa_options=None,

				                )

				                self.assertEqual("hello_0", queue.get())

				                if stdout_redir:

									
										14

test/distributed/fsdp/test_distributed_checkpoint.py
									
												View File
												
				@ -31,17 +31,17 @@ if TEST_WITH_DEV_DBG_ASAN:

				    sys.exit(0)

				_DISTRIBUTED_STATE_DICT_IMPLS = {

				_DISTRIBUTED_STATE_DICT_IMPLS = (

				    StateDictType.LOCAL_STATE_DICT,

				    StateDictType.SHARDED_STATE_DICT,

				}

				)

				class TestDistributedCheckpoint(FSDPTest):

				    @property

				    def world_size(self):

				        if torch.accelerator.is_available():

				            gpu_cnt = torch.accelerator.device_count()

				        if torch.cuda.is_available():

				            gpu_cnt = torch.cuda.device_count()

				            if gpu_cnt < 2:

				                return gpu_cnt

				        return 2

				@ -93,9 +93,7 @@ class TestDistributedCheckpoint(FSDPTest):

				        # TODO: add resharding test case.

				devices = ("cuda", "hpu", "xpu")

				instantiate_device_type_tests(

				    TestDistributedCheckpoint, globals(), only_for=devices, allow_xpu=True

				)

				devices = ("cuda", "hpu")

				instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices)

				if __name__ == "__main__":

				    run_tests()

									
										4

test/distributed/fsdp/test_fsdp_apply.py
									
												View File
												
				@ -36,8 +36,8 @@ device_type = torch.device(get_devtype())

				class TestApply(FSDPTest):

				    @property

				    def world_size(self):

				        if torch.accelerator.is_available():

				            gpu_cnt = torch.accelerator.device_count()

				        if torch.cuda.is_available():

				            gpu_cnt = torch.cuda.device_count()

				            if gpu_cnt < 2:

				                return gpu_cnt

				        return 2

									
										5

test/distributed/fsdp/test_fsdp_misc.py
									
												View File
												
				@ -514,17 +514,18 @@ class TestFSDPMiscMultiProcess(FSDPTest):

				    def test_fsdp_cpu_training(self):

				        """Tests FSDP training on CPU."""

				        gloo_pg = dist.new_group(backend="gloo")

				        for ss in [

				        for ss in [  # noqa: F841

				            ShardingStrategy.NO_SHARD,

				            ShardingStrategy.FULL_SHARD,

				            ShardingStrategy.SHARD_GRAD_OP,

				            ShardingStrategy.HYBRID_SHARD,

				            ShardingStrategy._HYBRID_SHARD_ZERO2,

				        ]:

				            torch.manual_seed(42)

				            model = MyModel()

				            ref_model = DDP(deepcopy(model), process_group=gloo_pg)

				            model = FSDP(

				                model,

				                sharding_strategy=ss,

				                auto_wrap_policy=always_wrap_policy,

				                process_group=gloo_pg,

				                device_id=torch.device("cpu"),

									
										35

test/distributed/nn/jit/test_instantiator.py
									
												View File
												
				@ -2,6 +2,7 @@

				# Owner(s): ["oncall: distributed"]

				import sys

				from pathlib import Path

				import torch

				import torch.distributed as dist

				@ -44,19 +45,53 @@ class TestInstantiator(TestCase):

				        self.assertEqual(return_type_str, "Tuple[Tensor, int, str]")

				    def test_instantiate_scripted_remote_module_template(self):

				        dir_path = Path(instantiator.INSTANTIATED_TEMPLATE_DIR_PATH)

				        # Cleanup.

				        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")

				        for file_path in file_paths:

				            file_path.unlink()

				        # Check before run.

				        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")

				        num_files_before = len(list(file_paths))

				        self.assertEqual(num_files_before, 0)

				        generated_module = instantiator.instantiate_scriptable_remote_module_template(

				            MyModuleInterface

				        )

				        self.assertTrue(hasattr(generated_module, "_remote_forward"))

				        self.assertTrue(hasattr(generated_module, "_generated_methods"))

				        # Check after run.

				        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")

				        num_files_after = len(list(file_paths))

				        self.assertEqual(num_files_after, 1)

				    def test_instantiate_non_scripted_remote_module_template(self):

				        dir_path = Path(instantiator.INSTANTIATED_TEMPLATE_DIR_PATH)

				        # Cleanup.

				        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")

				        for file_path in file_paths:

				            file_path.unlink()

				        # Check before run.

				        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")

				        num_files_before = len(list(file_paths))

				        self.assertEqual(num_files_before, 0)

				        generated_module = (

				            instantiator.instantiate_non_scriptable_remote_module_template()

				        )

				        self.assertTrue(hasattr(generated_module, "_remote_forward"))

				        self.assertTrue(hasattr(generated_module, "_generated_methods"))

				        # Check after run.

				        file_paths = dir_path.glob(f"{instantiator._FILE_PREFIX}*.py")

				        num_files_after = len(list(file_paths))

				        self.assertEqual(num_files_after, 1)

				if __name__ == "__main__":

				    run_tests()

Compare commits

1 Commits ciflow/pul ... pr165642

5 .ci/docker/build.sh Unescape Escape View File

2 .ci/docker/common/install_acl.sh Unescape Escape View File

10 .ci/docker/common/install_conda.sh Unescape Escape View File

4 .ci/docker/common/install_rocm.sh Unescape Escape View File

4 .ci/docker/common/install_rocm_magma.sh Unescape Escape View File

2 .ci/docker/manywheel/build.sh Unescape Escape View File

21 .ci/docker/requirements-ci.txt Unescape Escape View File

5 .ci/docker/ubuntu-xpu/Dockerfile Unescape Escape View File

2 .ci/lumen_cli/pyproject.toml Unescape Escape View File

8 .ci/magma-rocm/Makefile Unescape Escape View File

6 .ci/magma-rocm/build_magma.sh Unescape Escape View File

2 .ci/pytorch/build.sh Unescape Escape View File

6 .ci/pytorch/test.sh Unescape Escape View File

4 .github/actions/diskspace-cleanup/action.yml vendored Unescape Escape View File

2 .github/ci_commit_pins/audio.txt vendored Unescape Escape View File

2 .github/ci_commit_pins/vision.txt vendored Unescape Escape View File

2 .github/pytorch-probot.yml vendored Unescape Escape View File

89 .github/scripts/generate_binary_build_matrix.py vendored Unescape Escape View File

13 .github/workflows/_xpu-test.yml vendored Unescape Escape View File

2 .github/workflows/build-almalinux-images.yml vendored Unescape Escape View File

2 .github/workflows/build-libtorch-images.yml vendored Unescape Escape View File

2 .github/workflows/build-magma-rocm-linux.yml vendored Unescape Escape View File

2 .github/workflows/build-manywheel-images.yml vendored Unescape Escape View File

9 .github/workflows/build-triton-wheel.yml vendored Unescape Escape View File

2 .github/workflows/docker-builds.yml vendored Unescape Escape View File

236 .github/workflows/generated-linux-binary-libtorch-nightly.yml generated vendored Unescape Escape View File

1610 .github/workflows/generated-linux-binary-manywheel-nightly.yml generated vendored View File

148 .github/workflows/inductor-perf-test-nightly-xpu.yml vendored Unescape Escape View File

84 .github/workflows/periodic-rocm-mi200.yml vendored Unescape Escape View File

31 .github/workflows/periodic.yml vendored Unescape Escape View File

1 .github/workflows/upload-test-stats.yml vendored Unescape Escape View File

1 .gitignore vendored Unescape Escape View File

2 CMakeLists.txt Unescape Escape View File

8 aten/src/ATen/Context.cpp Unescape Escape View File

4 aten/src/ATen/Context.h Unescape Escape View File

117 aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h Unescape Escape View File

4 aten/src/ATen/cpu/vec/vec128/vec128_convert.h Unescape Escape View File

7 aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h Unescape Escape View File

144 aten/src/ATen/cuda/CUDAGreenContext.cpp Unescape Escape View File

41 aten/src/ATen/cuda/CUDAGreenContext.h Unescape Escape View File

11 aten/src/ATen/cuda/CUDASparse.h Unescape Escape View File

35 aten/src/ATen/cuda/Sleep.cu Unescape Escape View File

5 aten/src/ATen/cuda/Sleep.h Unescape Escape View File

31 aten/src/ATen/cuda/detail/BLASConstants.cu Unescape Escape View File

31 aten/src/ATen/detail/MTIAHooksInterface.h Unescape Escape View File

8 aten/src/ATen/functorch/BatchRulesViews.cpp Unescape Escape View File

9 aten/src/ATen/mps/MPSDevice.mm Unescape Escape View File

13 aten/src/ATen/native/AveragePool2d.cpp Unescape Escape View File

4 aten/src/ATen/native/Convolution.cpp Unescape Escape View File

2 aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp Unescape Escape View File

10 aten/src/ATen/native/cuda/Blas.cpp Unescape Escape View File

54 aten/src/ATen/native/cuda/GroupedBlas.cpp Unescape Escape View File

2 aten/src/ATen/native/cuda/IndexKernelUtils.cu Unescape Escape View File

50 aten/src/ATen/native/cuda/ScaledBlas.cpp Unescape Escape View File

9 aten/src/ATen/native/cuda/ScatterGatherKernel.cu Unescape Escape View File

6 aten/src/ATen/native/cuda/layer_norm_kernel.cu Unescape Escape View File

28 aten/src/ATen/native/mps/kernels/BinaryKernel.metal Unescape Escape View File

10 aten/src/ATen/native/mps/operations/BinaryKernel.mm Unescape Escape View File

26 aten/src/ATen/native/mps/operations/BinaryOps.mm Unescape Escape View File

8 aten/src/ATen/native/mps/operations/LossOps.mm Unescape Escape View File

4 aten/src/ATen/native/mps/operations/Normalization.mm Unescape Escape View File

3 aten/src/ATen/native/mps/operations/ReduceOps.mm Unescape Escape View File

1 aten/src/ATen/native/mps/operations/Sort.mm Unescape Escape View File

18 aten/src/ATen/native/native_functions.yaml Unescape Escape View File

22 aten/src/ATen/native/sparse/SparseTensor.cpp Unescape Escape View File

6 benchmarks/dynamo/common.py Unescape Escape View File

59 benchmarks/dynamo/genai_layers/benchmark.py Unescape Escape View File

18 benchmarks/dynamo/genai_layers/kernels.py Unescape Escape View File

90 benchmarks/dynamo/genai_layers/utils.py Unescape Escape View File

7 benchmarks/dynamo/timm_models.py Unescape Escape View File

4 benchmarks/dynamo/torchbench.py Unescape Escape View File

3 benchmarks/dynamo/torchbench.yaml Unescape Escape View File

6 c10/cuda/CUDAStream.cpp Unescape Escape View File

60 c10/metal/special_math.h Unescape Escape View File

18 c10/metal/utils.h Unescape Escape View File

2 c10/mobile/CPUProfilingAllocator.cpp Unescape Escape View File

4 c10/test/util/TypeList_test.cpp Unescape Escape View File

2 c10/util/Bitset.h Unescape Escape View File

1 Commits

ciflow/pul ... pr165642

5

.ci/docker/build.sh

View File

2

.ci/docker/common/install_acl.sh

View File

10

.ci/docker/common/install_conda.sh

View File

4

.ci/docker/common/install_rocm.sh

View File

4

.ci/docker/common/install_rocm_magma.sh

View File

2

.ci/docker/manywheel/build.sh

View File

21

.ci/docker/requirements-ci.txt

View File

5

.ci/docker/ubuntu-xpu/Dockerfile

View File

2

.ci/lumen_cli/pyproject.toml

View File

8

.ci/magma-rocm/Makefile

View File

6

.ci/magma-rocm/build_magma.sh

View File

2

.ci/pytorch/build.sh

View File

6

.ci/pytorch/test.sh

View File

4

.github/actions/diskspace-cleanup/action.yml vendored

View File

2

.github/ci_commit_pins/audio.txt vendored

View File

2

.github/ci_commit_pins/vision.txt vendored

View File

2

.github/pytorch-probot.yml vendored

View File

89

.github/scripts/generate_binary_build_matrix.py vendored

View File

13

.github/workflows/_xpu-test.yml vendored

View File

2

.github/workflows/build-almalinux-images.yml vendored

View File

2

.github/workflows/build-libtorch-images.yml vendored

View File

2

.github/workflows/build-magma-rocm-linux.yml vendored

View File

2

.github/workflows/build-manywheel-images.yml vendored

View File

9

.github/workflows/build-triton-wheel.yml vendored

View File

2

.github/workflows/docker-builds.yml vendored

View File

236

.github/workflows/generated-linux-binary-libtorch-nightly.yml generated vendored

View File

1610

.github/workflows/generated-linux-binary-manywheel-nightly.yml generated vendored

View File

148

.github/workflows/inductor-perf-test-nightly-xpu.yml vendored

View File

84

.github/workflows/periodic-rocm-mi200.yml vendored

View File

31

.github/workflows/periodic.yml vendored

View File

1

.github/workflows/upload-test-stats.yml vendored

View File

1

.gitignore vendored

View File

2

CMakeLists.txt

View File

8

aten/src/ATen/Context.cpp

View File

4

aten/src/ATen/Context.h

View File

117

aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h

View File

4

aten/src/ATen/cpu/vec/vec128/vec128_convert.h

View File

7

aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h

View File

144

aten/src/ATen/cuda/CUDAGreenContext.cpp

View File

41

aten/src/ATen/cuda/CUDAGreenContext.h

View File

11

aten/src/ATen/cuda/CUDASparse.h

View File

35

aten/src/ATen/cuda/Sleep.cu

View File

5

aten/src/ATen/cuda/Sleep.h

View File

31

aten/src/ATen/cuda/detail/BLASConstants.cu

View File

31

aten/src/ATen/detail/MTIAHooksInterface.h

View File

8

aten/src/ATen/functorch/BatchRulesViews.cpp

View File

9

aten/src/ATen/mps/MPSDevice.mm

View File

13

aten/src/ATen/native/AveragePool2d.cpp

View File

4

aten/src/ATen/native/Convolution.cpp

View File

2

aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp

View File

10

aten/src/ATen/native/cuda/Blas.cpp

View File

54

aten/src/ATen/native/cuda/GroupedBlas.cpp

View File

2

aten/src/ATen/native/cuda/IndexKernelUtils.cu

View File

50

aten/src/ATen/native/cuda/ScaledBlas.cpp

View File

9

aten/src/ATen/native/cuda/ScatterGatherKernel.cu

View File

6

aten/src/ATen/native/cuda/layer_norm_kernel.cu

View File

28

aten/src/ATen/native/mps/kernels/BinaryKernel.metal

View File

10

aten/src/ATen/native/mps/operations/BinaryKernel.mm

View File

26

aten/src/ATen/native/mps/operations/BinaryOps.mm

View File

8

aten/src/ATen/native/mps/operations/LossOps.mm

View File

4

aten/src/ATen/native/mps/operations/Normalization.mm

View File

3

aten/src/ATen/native/mps/operations/ReduceOps.mm

View File

1

aten/src/ATen/native/mps/operations/Sort.mm

View File

18

aten/src/ATen/native/native_functions.yaml

View File

22

aten/src/ATen/native/sparse/SparseTensor.cpp

View File

6

benchmarks/dynamo/common.py

View File

59

benchmarks/dynamo/genai_layers/benchmark.py

View File

18

benchmarks/dynamo/genai_layers/kernels.py

View File

90

benchmarks/dynamo/genai_layers/utils.py

View File

7

benchmarks/dynamo/timm_models.py

View File

4

benchmarks/dynamo/torchbench.py

View File

3

benchmarks/dynamo/torchbench.yaml

View File

6

c10/cuda/CUDAStream.cpp

View File

60

c10/metal/special_math.h

View File

18

c10/metal/utils.h

View File

2

c10/mobile/CPUProfilingAllocator.cpp

View File

4

c10/test/util/TypeList_test.cpp

View File

2

c10/util/Bitset.h

View File

4

c10/util/C++17.h

View File