Compare commits


7 Commits

Author SHA1 Message Date
a3795cfaea lint 2025-09-16 11:00:20 -07:00
a9d5c00727 more memory? 2025-09-15 21:42:41 +00:00
c1102ca308 more memory? 2025-09-15 21:37:26 +00:00
7856f8d7f4 fixed i think 2025-09-13 13:52:39 +00:00
973c3b531a oops 2025-09-13 13:52:39 +00:00
d5a496e7f1 check 2025-09-13 13:52:39 +00:00
051e544ef6 [BE] Make PyObjectSlot use a global PyInterpreter
ghstack-source-id: 826f3f0c155cd0677776949b0c1c82395a95a1e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158409

[BE] Remove pyinterpreter struct from pyobjslot

ghstack-source-id: 826f3f0c155cd0677776949b0c1c82395a95a1e8
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159511
2025-09-13 13:52:39 +00:00
380 changed files with 3777 additions and 10082 deletions


@ -7,4 +7,4 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh


@ -0,0 +1,40 @@
#!/bin/bash
# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex -o pipefail
version=${DOCS_VERSION:-nightly}
echo "version: $version"
# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages
if [ "$version" == "main" ]; then
version=nightly
fi
git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"
git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status
if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi
popd


@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
(#185)
Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
else:
variants = ('_v2', '')
+ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+ return getattr(self.lib, fname)
+
for variant in variants:
try:
return getattr(self.lib, f'{fname}{variant}')


@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi
echo "Environment variables:"
env
@ -1582,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests


@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu


@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
caba63f0fa29ef9e3d566699f32f11c07c8bda4e


@ -1 +1 @@
d119fc86140785e7efc8f125c17153544d1e0f20
f510715882304796a96e33028b4f6de1b026c2c7


@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073


@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version
@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \


@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}
# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package


@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs
- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()


@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}
- name: Setup GPU_FLAG for docker run


@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@ -135,9 +131,6 @@ jobs:
- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Test
id: test
env:
@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \


@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@ -151,7 +136,7 @@ jobs:
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -161,29 +146,27 @@ jobs:
# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |


@ -43,11 +43,6 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@ -107,7 +102,7 @@ jobs:
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -121,9 +116,10 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720


@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },

.gitignore (vendored)

@ -389,5 +389,3 @@ android/pytorch_android_torchvision/.cxx
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py


@ -964,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1409,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',


@ -874,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM"
OFF)
cmake_dependent_option(
@ -891,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()


@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* dat
ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
@ -435,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);


@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;


@ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
* See Note [Acquire lock when using random generators]
*/
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
} else {
TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
// no-op case
}
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}
/**
@ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
* Gets the current seed of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return state_->seed_;
}
@ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
* and size of the internal state.
*/
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
*/
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
// see Note [Why enforce RNG offset % 4 == 0?]
// Note: If you use CUDNN RNN's, calling
// set_philox_offset_per_thread instead of set_offset will cause the
// cudnn RNN rng state to become stale.
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->philox_offset_per_thread_ = offset;
} else {
state_->offset_intragraph_ = offset;
}
state_->philox_offset_per_thread_ = offset;
}
/**
* Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
*/
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
return state_->philox_offset_per_thread_;
} else {
return state_->offset_intragraph_;
}
return state_->philox_offset_per_thread_;
}
/**


@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1
/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 0
/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>
#ifdef __cplusplus
@ -157,26 +159,6 @@ typedef enum {
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
/*! \brief FP8 data types */
kDLFloat8_e3m4 = 7U,
kDLFloat8_e4m3 = 8U,
kDLFloat8_e4m3b11fnuz = 9U,
kDLFloat8_e4m3fn = 10U,
kDLFloat8_e4m3fnuz = 11U,
kDLFloat8_e5m2 = 12U,
kDLFloat8_e5m2fnuz = 13U,
kDLFloat8_e8m0fnu = 14U,
/*! \brief FP6 data types
* Setting bits != 6 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat6_e2m3fn = 15U,
kDLFloat6_e3m2fn = 16U,
/*! \brief FP4 data types
* Setting bits != 4 is currently unspecified, and the producer must ensure it is set
* while the consumer must stop importing if the value is unexpected.
*/
kDLFloat4_e2m1fn = 17U,
} DLDataTypeCode;
/*!
@ -190,12 +172,6 @@ typedef enum {
* - int8: type_code = 0, bits = 8, lanes = 1
* - std::complex<float>: type_code = 5, bits = 64, lanes = 1
* - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
* - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
* - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
* - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
*
* When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
* for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
*/
typedef struct {
/*!
@ -253,12 +229,12 @@ typedef struct {
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
const int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
*/
int64_t* strides;
const int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
@ -293,7 +269,7 @@ typedef struct DLManagedTensor {
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;
// bit masks used in the DLManagedTensorVersioned
// bit masks used in in the DLManagedTensorVersioned
/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@ -306,14 +282,6 @@ typedef struct DLManagedTensor {
*/
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
/*
* \brief bit mask to indicate that whether a sub-byte type is packed or padded.
*
* The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
* be set by the producer to signal that a tensor of sub-byte type is padded.
*/
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
/*!
* \brief A versioned and managed C Tensor object, manage memory of DLTensor.
*


@ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
POINTWISE_BOXED(fill_.Scalar);
POINTWISE_BOXED(zero_);
// This is special because this op doesn't return anything
m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);
#undef UNARY_POINTWISE
#undef UNARY_POINTWISE_ALL

View File

@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
// TODO: contiguous can be made to preserve the memory format
// of the input. However since the above reshape clobbers h and w
// it may not be safe to do that, since channels_last contiguous
// may think oc and the last dim correspond to h,w?
// may think oc and and the last dim correspond to h,w?
// It is not clear, however from initial looking around it feels that
// this may not be correct.
// In this case channels last will likely require custom implementation


@ -1,4 +1,3 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <cstdint>


@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
int64_t inputH = input_.size(heightDim);
int64_t inputW = input_.size(widthDim);
TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
"fractional_max_pool3d_out(): pool time ", poolSizeT,
" too large relative to input time ", inputT);
TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
"fractional_max_pool3d_out(): pool width ", poolSizeW,
" too large relative to input width ", inputW);
TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
"fractional_max_pool3d_out(): pool height ", poolSizeH,
" too large relative to input height ", inputH);


@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
for (const auto d : c10::irange(index.dim())) {
if (d == dim) {
continue;
}


@ -52,7 +52,6 @@ void apply_triu_tril_single(
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.
if (upper) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {


@ -85,11 +85,11 @@ void cpu_max_unpool(
if constexpr (is_3d) {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width, ")");
"x", output_height, "x", output_width);
} else {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
"x", output_width);
}
}


@ -317,17 +317,6 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();


@ -416,7 +416,6 @@ struct ReduceOp {
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}


@ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel(
index_t* result_ptr,
int64_t size,
int64_t result_size) {
CUDA_KERNEL_ASSERT_PRINTF(
result_size == cumsum_ptr[size - 1],
if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
"Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
result_size,
cumsum_ptr[size - 1]);
__FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]);
CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
}
int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;


@ -5,20 +5,12 @@
namespace at::native {
__global__ void weight_int8pack_mm_kernel(
const float* x,
const int8_t* w,
const float* scale,
float* out,
int B,
int K,
int N) {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;
if (b >= B || n >= N)
return;
if (b >= B || n >= N) return;
float acc = 0.0f;
for (int k = 0; k < K; ++k) {
@ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel(
out[b * N + n] = acc * scale[n];
}
void launch_weight_int8pack_mm_cuda_kernel(
const Tensor& x,
const Tensor& w_int8,
const Tensor& scale,
Tensor& out) {
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
@ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel(
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B,
K,
N);
B, K, N);
}
// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(
const at::Tensor& x,
const at::Tensor& w_int8,
const at::Tensor& scale) {
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda(
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");
TORCH_CHECK(
x.size(1) == w_int8.size(1),
"K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(
w_int8.size(0) == scale.size(0),
"Output dim mismatch: w.size(0) != scale.size(0)");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");
// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
@ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda(
auto scale_f32 = scale.to(at::kFloat);
// --- Allocate output ---
auto out = at::empty({B, N}, x_f32.options());
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));
// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(
x_f32, w_int8_contiguous, scale_f32, out);
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);
return out.to(x.dtype());
return out;
}
} // namespace at::native


@ -482,9 +482,7 @@ auto build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@ -704,9 +702,7 @@ auto build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)


@ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
// fusions
// ---------------------------------------------------------------------
void raw_miopen_convolution_add_relu_out(
void raw_miopen_convolution_relu_out(
const Tensor& output,
const Tensor& input,
const Tensor& weight,
const Tensor& z,
float alpha,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
@ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out(
int64_t groups,
bool benchmark,
bool deterministic) {
raw_miopen_convolution_forward_out(
output,
auto dataType = getMiopenDataType(input);
miopenConvolutionMode_t c_mode = miopenConvolution;
ConvolutionArgs args{ input, output, weight };
args.handle = getMiopenHandle();
at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
setConvolutionParams(
&args.params,
args.handle,
input,
weight,
padding,
stride,
dilation,
groups,
deterministic,
memory_format);
args.idesc.set(input, memory_format);
args.wdesc.set(weight, memory_format, 0);
args.odesc.set(output, memory_format);
args.cdesc.set(
dataType,
c_mode,
input.dim() - 2,
args.params.padding,
args.params.stride,
args.params.dilation,
args.params.groups,
benchmark,
deterministic);
at::Tensor alpha_mul_z_add_bias =
at::native::reshape_bias(input.dim(), bias).add(z, alpha);
output.add_(alpha_mul_z_add_bias);
output.relu_();
TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());
// Create the fusion plan
miopenFusionPlanDescriptor_t fusePlanDesc;
miopenFusionOpDescriptor_t convoOp;
miopenFusionOpDescriptor_t biasOp;
miopenFusionOpDescriptor_t activOp;
MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));
// compile fusion plan
MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));
// Set the Args
float alpha = static_cast<float>(1);
float beta = static_cast<float>(0);
float activ_alpha = static_cast<float>(0);
float activ_beta = static_cast<float>(0);
float activ_gamma = static_cast<float>(0);
miopenOperatorArgs_t fusionArgs;
MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));
miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);
// Cleanup
miopenDestroyFusionPlan(fusePlanDesc);
}
static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
Tensor miopen_convolution_add_relu(
const Tensor& input_t,
const Tensor& weight_t,
const Tensor& z_t,
const Tensor& z,
const std::optional<Scalar>& alpha,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
Tensor z = z_t;
if (z.suggest_memory_format() != memory_format) {
z = z.to(memory_format);
}
z = z.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
// MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function:
// y = act ( alpha1 * conv(x) + alpha2 * z + bias )
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_add_relu",
input,
weight,
z,
_alpha,
_bias,
stride,
padding,
stride,
dilation,
groups,
benchmark,
true); // deterministic
false // deterministic
);
return output_t;
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
contig_output_t.add_(alpha_mul_z_add_bias);
contig_output_t.relu_();
return contig_output_t;
}
Tensor miopen_convolution_relu(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_add_relu_out(
output_t,
input,
weight,
output_t, // use output_t as z to satisfy MIOpen API
0, // alpha
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
true); // deterministic
// MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
&& input_t.scalar_type() == at::kFloat
&& input_t.ndimension() == 4) {
return output_t;
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(input_t.suggest_memory_format()));
if (output_t.numel() == 0) {
return output_t;
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());
raw_miopen_convolution_relu_out(
output_t,
input_t,
weight_t,
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
false // deterministic
);
return output_t;
}
else {
// fallback
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input->options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_relu",
input,
weight,
padding,
stride,
dilation,
groups,
benchmark,
false // deterministic
);
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);
if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());
at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
contig_output_t.add_(reshaped_bias);
contig_output_t.relu_();
return contig_output_t;
}
}
REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)


@ -0,0 +1,48 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
MPSGraphFFTScalingModeNone = 0L,
MPSGraphFFTScalingModeSize = 1L,
MPSGraphFFTScalingModeUnitary = 2L,
};
@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end
@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;
@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
axes:(NSArray<NSNumber*>* _Nonnull)axes
descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
@end
// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))
// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif


@ -0,0 +1,196 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;
@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;
@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
@property(readwrite, nonatomic) NSUInteger groups;
@end
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
#endif
@interface MPSGraph (VenturaOps)
#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
MPSGraphResizeNearestRoundingModeCeil = 2L,
MPSGraphResizeNearestRoundingModeFloor = 3L,
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};
// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif
- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
weightsTensor:(MPSGraphTensor* _Nonnull)weights
descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
weightsTensor:(MPSGraphTensor* _Nonnull)weights
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)
convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
sourceTensor:(MPSGraphTensor* _Nonnull)source
outputShape:(MPSShape* _Nonnull)outputShape
forwardConvolutionDescriptor:
(MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axis:(NSInteger)axis
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
descending:(BOOL)descending
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
sizeTensor:(MPSGraphTensor* _Nonnull)size
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
centerResult:(BOOL)centerResult
alignCorners:(BOOL)alignCorners
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
input:(MPSGraphTensor* _Nonnull)input
scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
layout:(MPSGraphTensorNamedDataLayout)layout
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
samplingMode:(MPSGraphResizeMode)samplingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
layout:(MPSGraphTensorNamedDataLayout)layout
normalizeCoordinates:(BOOL)normalizeCoordinates
relativeCoordinates:(BOOL)relativeCoordinates
alignCorners:(BOOL)alignCorners
paddingMode:(MPSGraphPaddingMode)paddingMode
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
constantValue:(double)constantValue
name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;
@end
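The declarations above are the Ventura-era MPSGraph entry points (sort, argSort, matrix inverse, nearest/bilinear resize, grid sampling) that the MPS backend targets on macOS 13+. A minimal Python-side sketch of the user-facing calls these roughly back, assuming an MPS-capable machine; the op-to-method mapping noted in the comments is approximate.

import torch

if torch.backends.mps.is_available():
    x = torch.randn(4, 8, device="mps")
    vals, idx = torch.sort(x, dim=1, descending=True)  # sortWithTensor:axis:descending:name:
    order = torch.argsort(x, dim=1)                     # argSortWithTensor:axis:name:
    up = torch.nn.functional.interpolate(
        x.view(1, 1, 4, 8), scale_factor=2, mode="nearest")  # resizeNearestWithTensor:...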

View File

@ -9,6 +9,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>
@ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
check_mps_shape(mpsShape);
auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
auto storage_numel = src.storage().nbytes() / src.element_size();
TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
"MPSGaph does not support tensor dims larger than INT_MAX");
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
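For reference, the quantities compared in the INT_MAX check above have direct Python-level counterparts; a small sketch with arbitrary shapes (the nonzero offset comes from slicing):

import torch

base = torch.randn(6)
view = base[2:]  # shares storage with a nonzero storage_offset
storage_numel = view.untyped_storage().nbytes() // view.element_size()
print(storage_numel, view.storage_offset())  # 6 and 2; the guard above is on the former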

View File

@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
#else
#include <ATen/native/EmbeddingBag.h>
using at::native::EmbeddingBagMode;
#endif
template <typename idx_type_t = uint32_t>
struct EmbeddingBagParams {
::c10::metal::array<idx_type_t, 2> weight_strides;
::c10::metal::array<idx_type_t, 2> output_strides;
::c10::metal::array<idx_type_t, 2> max_indices_strides;
idx_type_t per_sample_weights_strides;
idx_type_t num_indices;
idx_type_t num_bags;
idx_type_t feature_size;
EmbeddingBagMode mode;
int64_t padding_idx;
};

View File

@ -1,212 +0,0 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
template <EmbeddingBagMode M, typename T>
struct ReductionOpInit {
inline opmath_t<T> operator()() {
return 0;
}
};
template <typename T>
struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()() {
return static_cast<opmath_t<T>>(-INFINITY);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOp {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides);
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t per_sample_weights_index,
constant T* per_sample_weights,
uint32_t per_sample_weights_strides) {
if (per_sample_weights_strides) {
T per_sample_weight = per_sample_weights
[per_sample_weights_strides * per_sample_weights_index];
return static_cast<opmath_t<T>>(per_sample_weight) *
static_cast<opmath_t<T>>(weight_val) +
out_val;
} else {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return static_cast<opmath_t<T>>(weight_val) + out_val;
}
};
template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> {
inline opmath_t<T> operator()(
T weight_val,
opmath_t<T> out_val,
uint32_t,
constant T*,
uint32_t) {
return max(static_cast<opmath_t<T>>(weight_val), out_val);
}
};
template <EmbeddingBagMode M, typename T>
struct ReductionOpFinal {
inline T operator()(opmath_t<T> val, uint32_t) {
return static_cast<T>(val);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
auto out = val / count;
return static_cast<T>((count == 0) ? 0 : out);
}
};
template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
inline T operator()(opmath_t<T> val, uint32_t count) {
return static_cast<T>((count == 0) ? 0 : val);
}
};
template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl(
constant T* weight,
constant I* indices,
constant I* offsets,
constant T* per_sample_weights,
device T* output,
device I* offset2bag,
device I* bag_size,
device I* max_indices,
constant EmbeddingBagParams<uint32_t>& params,
uint tid) {
auto num_indices = params.num_indices;
auto num_bags = params.num_bags;
auto feature_size = params.feature_size;
auto padding_idx = params.padding_idx;
auto per_sample_weights_strides = params.per_sample_weights_strides;
constant auto& output_strides = params.output_strides;
constant auto& weight_strides = params.weight_strides;
constant auto& max_indices_strides = params.max_indices_strides;
auto bag_idx = tid / feature_size;
auto feature_idx = tid % feature_size;
output += bag_idx * output_strides[0] + feature_idx * output_strides[1];
uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
bool is_last_bag = bag_idx + 1 == num_bags;
uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
uint32_t indices_end = is_last_bag * (num_indices) +
(!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));
auto out_val = ReductionOpInit<M, T>()();
uint32_t bag_size_ = 0;
for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
indices_idx++) {
I weight_idx = indices[indices_idx];
bool pad = (weight_idx == padding_idx);
T weight_val = weight
[static_cast<uint32_t>(weight_idx) * weight_strides[0] +
feature_idx * weight_strides[1]];
bag_size_ += static_cast<uint32_t>(!pad);
auto tmp_val = ReductionOp<M, T>()(
weight_val,
out_val,
indices_idx,
per_sample_weights,
per_sample_weights_strides);
out_val = pad ? out_val : tmp_val;
}
*output = ReductionOpFinal<M, T>()(out_val, bag_size_);
}
#define DISPATCH_IMPL(MODE) \
return embedding_bag_impl<MODE>( \
weight, \
indices, \
offsets, \
per_sample_weights, \
output, \
offset2bag, \
bag_size, \
max_indices, \
params, \
tid)
template <typename T, typename I>
kernel void embedding_bag(
constant T* weight [[buffer(0)]],
constant I* indices [[buffer(1)]],
constant I* offsets [[buffer(2)]],
constant T* per_sample_weights [[buffer(3)]],
device T* output [[buffer(4)]],
device I* offset2bag [[buffer(5)]],
device I* bag_size [[buffer(6)]],
device I* max_indices [[buffer(7)]],
constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
uint tid [[thread_position_in_grid]]) {
switch (params.mode) {
case EmbeddingBagMode::SUM:
DISPATCH_IMPL(EmbeddingBagMode::SUM);
case EmbeddingBagMode::MEAN:
DISPATCH_IMPL(EmbeddingBagMode::MEAN);
case EmbeddingBagMode::MAX:
DISPATCH_IMPL(EmbeddingBagMode::MAX);
}
}
#define REGISTER_EMBEDDING_BAG_OP(T, I) \
template [[host_name("embedding_bag_" #T "_" #I)]] \
kernel void embedding_bag<T, I>( \
constant T * weight [[buffer(0)]], \
constant I * indices [[buffer(1)]], \
constant I * offsets [[buffer(2)]], \
constant T * per_sample_weights [[buffer(3)]], \
device T * output [[buffer(4)]], \
device I * offset2bag [[buffer(5)]], \
device I * bag_size [[buffer(6)]], \
device I * max_indices [[buffer(7)]], \
constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);
REGISTER_EMBEDDING_BAG_OP(half, int);
REGISTER_EMBEDDING_BAG_OP(half, long);
REGISTER_EMBEDDING_BAG_OP(bfloat, int);
REGISTER_EMBEDDING_BAG_OP(bfloat, long);
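The kernel removed above computes one (bag, feature) output element per thread, walking indices[offsets[bag] : offsets[bag + 1]] and applying the SUM/MEAN/MAX reduction. The same computation is reachable through the public API; a minimal sketch with made-up values:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 3)
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 4])  # two bags: indices[0:4] and indices[4:8]
out_sum = F.embedding_bag(indices, weight, offsets, mode="sum")
out_max = F.embedding_bag(indices, weight, offsets, mode="max")
print(out_sum.shape)  # torch.Size([2, 3])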

View File

@ -8,6 +8,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,12 +1,23 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#include <fmt/format.h>
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
@ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;
descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;
descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;
descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
bias_shape = bias_opt.value().sizes();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
std::string bias_shape_key;
if (bias_defined) {
bias_shape_key = std::to_string(bias_shape[0]);
@ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
bias_shape_key = "nobias";
}
std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
mps::getTensorsStringKey({input_t, weight_t}),
bias_defined,
bias_shape_key);
std::string key;
if (is3DConv) {
key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}
MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
MPSStream* stream = getCurrentMPSStream();
std::string mem_format_key;
switch (memory_format) {
case at::MemoryFormat::Contiguous:
mem_format_key = "Contiguous";
break;
case at::MemoryFormat::ChannelsLast:
mem_format_key = "ChannelsLast";
break;
default:
assert(0 && "Check should have been done earlier\n");
}
MPSShape* mps_input_shape = getMPSShape(input_size);
std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
is_channels_last,
getTensorsStringKey({grad_output_t, weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
} else {
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
getTensorsStringKey({grad_output_t, weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
MPSStream* stream = getCurrentMPSStream();
MPSShape* mps_weight_shape = getMPSShape(weight_size);
std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
is3DConv ? "3d_" : "",
getArrayRefString(stride),
getArrayRefString(dilation),
getArrayRefString(padding),
groups,
getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
std::string key;
if (is3DConv) {
key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
} else {
key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSShape* inputShape = getMPSShape(input_t);
bool isDepthwiseConv =
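The 3D descriptor setup and the string cache keys rebuilt above are exercised by ordinary Conv3d calls on the MPS device; a rough sketch, assuming macOS 13.2+ so the MPSGraphConvolution3DOpDescriptor gate above is satisfied (shapes arbitrary):

import torch

if torch.backends.mps.is_available():
    conv = torch.nn.Conv3d(2, 4, kernel_size=3, padding=1).to("mps")
    x = torch.randn(1, 2, 8, 8, 8, device="mps", requires_grad=True)
    y = conv(x)            # NCDHW input, OIDHW weights, as in fill_conv3d_desc above
    y.sum().backward()     # hits the backward-input / backward-weights graphs keyed above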

View File

@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

View File

@ -5,6 +5,8 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,179 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/TensorUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/EmbeddingBag.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <fmt/format.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/empty.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/EmbeddingBag_metallib.h>
#endif
namespace {
std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
return {indices.scalar_type() == commonType ? indices : indices.toType(commonType),
offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)};
}
} // namespace
namespace mps {
static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
const Tensor& weight,
const Tensor& indices_,
const Tensor& offsets_,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
if (indices_.dim() == 1) {
TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
}
TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());
Tensor indices, offsets;
std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
auto indices_arg = TensorArg(indices, "indices", 1);
checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
auto offsets_arg = TensorArg(offsets, "offsets", 1);
checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
auto weight_arg = TensorArg(weight, "weight", 1);
int64_t num_indices = indices.size(0);
int64_t num_bags = offsets.size(0);
if (include_last_offset) {
num_bags -= 1;
}
int64_t feature_size = weight.size(1);
auto bag_size = at::empty(offsets.sizes(), indices.options());
auto offset2bag = at::empty({indices.size(0)}, indices.options());
auto output = at::empty({num_bags, feature_size}, weight.options());
Tensor max_indices;
if (mode == EmbeddingBagMode::MAX) {
max_indices = at::empty({num_bags, feature_size}, indices.options());
} else {
max_indices = at::empty({0}, indices.options());
}
EmbeddingBagParams<uint32_t> params;
for (const auto dim : c10::irange(weight.dim())) {
params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));
if (mode == EmbeddingBagMode::MAX) {
params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
}
}
bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;
params.num_indices = num_indices;
params.num_bags = num_bags;
params.feature_size = feature_size;
params.mode = static_cast<EmbeddingBagMode>(mode);
params.padding_idx = padding_idx;
auto num_threads = output.numel();
MPSStream* stream = getCurrentMPSStream();
dispatch_sync_with_rethrow(stream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
auto pipeline_state = lib.getPipelineStateForFunc(
fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));
getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
[computeEncoder setComputePipelineState:pipeline_state];
mtl_setArgs(computeEncoder,
weight,
indices,
offsets,
use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
output,
offset2bag,
bag_size,
max_indices,
params);
mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
getMPSProfiler().endProfileKernel(pipeline_state);
}
});
return std::tuple<Tensor, Tensor, Tensor, Tensor>(
std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
}
} // namespace mps
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return mps::_embedding_bag_mps_impl(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
const Tensor& weight,
const Tensor& indices,
const Tensor& offsets,
const bool scale_grad_by_freq,
const int64_t mode,
bool sparse,
const std::optional<Tensor>& per_sample_weights_opt,
bool include_last_offset,
int64_t padding_idx) {
return _embedding_bag_mps(weight,
indices,
offsets,
scale_grad_by_freq,
mode,
sparse,
per_sample_weights_opt,
include_last_offset,
padding_idx);
}
} // namespace at::native
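One detail handled in the host code above is include_last_offset, which drops the implicit final bag; a short illustration of the resulting bag count, using arbitrary data:

import torch
import torch.nn.functional as F

weight = torch.randn(10, 3)
indices = torch.tensor([1, 2, 4, 5, 4, 3])
offsets = torch.tensor([0, 2, 6])  # last entry marks the end of the final bag
out = F.embedding_bag(indices, weight, offsets, mode="mean", include_last_offset=True)
print(out.shape)  # torch.Size([2, 3]): num_bags = offsets.numel() - 1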

View File

@ -1,4 +1,6 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -10,6 +12,20 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif
#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
// Redispatch the constructor to the actual implementation
id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
return self;
}
@end
#endif
namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {

View File

@ -2,6 +2,7 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

View File

@ -17,6 +17,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

View File

@ -6,7 +6,9 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -4,6 +4,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>

View File

@ -4,6 +4,7 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -5,6 +5,7 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -2,6 +2,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,7 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -1,6 +1,7 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -4,6 +4,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

View File

@ -2351,7 +2351,6 @@
dispatch:
CPU: _embedding_bag_forward_only_cpu
CUDA: _embedding_bag_forward_only_cuda
MPS: _embedding_bag_forward_only_mps
autogen: _embedding_bag_forward_only.out
- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@ -2373,7 +2372,6 @@
dispatch:
CPU: _embedding_bag_cpu
CUDA: _embedding_bag_cuda
MPS: _embedding_bag_mps
autogen: _embedding_bag.out
tags: core
@ -2519,7 +2517,7 @@
dispatch:
CompositeExplicitAutograd: empty_like
QuantizedCPU, QuantizedCUDA: empty_like_quantized
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
autogen: empty_like.out
@ -4374,7 +4372,7 @@
variants: function, method
dispatch:
CPU: narrow_copy_dense_cpu
SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse
SparseCPU, SparseCUDA: narrow_copy_sparse
CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
tags: view_copy
@ -6494,7 +6492,7 @@
device_guard: False
dispatch:
CompositeExplicitAutograd: unsqueeze
SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
SparseCPU, SparseCUDA: unsqueeze_sparse
QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
tags: core
@ -6662,7 +6660,7 @@
- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CompositeExplicitAutograd: zeros_out
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out
SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out
- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
dispatch:
@ -10261,7 +10259,7 @@
structured_delegate: any.all_out
variants: method, function
dispatch:
SparseCPU, SparseCUDA, SparseMPS: any_sparse
SparseCPU, SparseCUDA: any_sparse
tags: core
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -10701,7 +10699,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
CUDA: foreach_tensor_div_list_kernel_cuda
MTIA: foreach_tensor_div_list_kernel_mtia
- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10709,7 +10706,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
CUDA: foreach_tensor_div_list_kernel_cuda_
MTIA: foreach_tensor_div_list_kernel_mtia_
autogen: _foreach_div.List_out
- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@ -10733,7 +10729,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
CUDA: foreach_tensor_div_tensor_kernel_cuda
MTIA: foreach_tensor_div_tensor_kernel_mtia
- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@ -10741,7 +10736,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
CUDA: foreach_tensor_div_tensor_kernel_cuda_
MTIA: foreach_tensor_div_tensor_kernel_mtia_
autogen: _foreach_div.Tensor_out
- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/ceil_div.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
@ -22,11 +21,10 @@
namespace at::native {
namespace {
template <typename T>
__global__ void ChooseQuantizationParamsKernelImpl(
const int64_t* fake_quant_on,
const T* x_min,
const T* x_max,
const float* x_min,
const float* x_max,
int32_t qmin,
int32_t qmax,
int size,
@ -95,44 +93,34 @@ __global__ void ChooseQuantizationParamsKernelImpl(
}
}
__device__ inline bool isinf_device(float v) {
return ::isinf(v);
}
__device__ inline bool isinf_device(c10::BFloat16 v) {
return ::isinf(static_cast<float>(v));
}
// CUDA kernel to compute Moving Average Min/Max of the tensor.
// It uses the running_min and running_max along with averaging const, c.
// The formula used to compute the new min/max is as follows
//
// running_min = (1 - c) * running_min + c * x_min, if running_min != inf
// running_min = x_min, if running_min == inf
template <typename T>
__global__ void MovingAverageMinMax(
const int64_t* observer_on,
const T* x_min,
const T* x_max,
T* running_min,
T* running_max,
const float* x_min,
const float* x_max,
float* running_min,
float* running_max,
const float averaging_const,
const int size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (*observer_on == 1) {
if (i < size) {
T curr_min = x_min[i];
T curr_max = x_max[i];
float curr_min = x_min[i];
float curr_max = x_max[i];
T averaging_const_t = static_cast<T>(averaging_const);
float adjusted_min = ::isinf(running_min[i])
? curr_min
: (running_min[i]) + averaging_const * (curr_min - (running_min[i]));
T adjusted_min = isinf_device(running_min[i]) ? curr_min
: (running_min[i]) +
averaging_const_t * (curr_min - (running_min[i]));
T adjusted_max = isinf_device(running_max[i]) ? curr_max
: (running_max[i]) +
averaging_const_t * (curr_max - (running_max[i]));
float adjusted_max = ::isinf(running_max[i])
? curr_max
: (running_max[i]) + averaging_const * (curr_max - (running_max[i]));
running_min[i] = adjusted_min;
running_max[i] = adjusted_max;
@ -154,51 +142,40 @@ void _calculate_moving_average(
at::Tensor x_min, x_max;
int64_t* observer_on_data = observer_on.data_ptr<int64_t>();
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
if (per_row_fq) {
std::tie(x_min, x_max) = at::aminmax(x, 1);
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
});
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
size);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
std::tie(x_min, x_max) = at::aminmax(x);
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
scalar_t* x_max_data = x_max.data_ptr<scalar_t>();
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
});
float* x_min_data = x_min.data_ptr<float>();
float* x_max_data = x_max.data_ptr<float>();
// Moving Average Min/Max observer for activations
MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
observer_on_data,
x_min_data,
x_max_data,
running_min_data,
running_max_data,
averaging_const,
1 /*size*/);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -221,44 +198,34 @@ void _calc_moving_avg_qparams_helper(
cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
if (per_row_fq) {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<
num_blocks,
num_threads,
0,
cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
int num_threads = std::min(size, (int64_t)512);
const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
size,
symmetric_quant,
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
AT_DISPATCH_FLOATING_TYPES_AND(
at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
});
float* running_min_data = running_min.data_ptr<float>();
float* running_max_data = running_max.data_ptr<float>();
ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
fake_quant_on_data,
running_min_data,
running_max_data,
qmin,
qmax,
1, // size
symmetric_quant, // preserve_sparsity
scale_ptr,
zp_ptr);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
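The running-min/max update implemented by MovingAverageMinMax above follows running = running + c * (current - running), with inf treated as "uninitialized". The same rule is exposed through the quantization observers; a small sketch (averaging constant and tensor values chosen arbitrarily):

import torch
from torch.ao.quantization.observer import MovingAverageMinMaxObserver

obs = MovingAverageMinMaxObserver(averaging_constant=0.01)
obs(torch.tensor([1.0, 2.0, 3.0]))   # first call: running stats take the batch min/max
obs(torch.tensor([0.0, 10.0]))       # then: running += 0.01 * (batch_stat - running)
print(obs.min_val, obs.max_val)      # 0.99 and 3.07
scale, zero_point = obs.calculate_qparams()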

View File

@ -42,7 +42,7 @@ TEST(MPSObjCInterfaceTest, MPSCustomKernel) {
id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: [NSString stringWithUTF8String:CUSTOM_KERNEL]
options: nil
error: &error];
TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);
TORCH_CHECK(customKernelLibrary, "Failed to to create custom kernel library, error: ", error.localizedDescription.UTF8String);
id<MTLFunction> customFunction = [customKernelLibrary newFunctionWithName: @"add_arrays"];
TORCH_CHECK(customFunction, "Failed to create function state object for the kernel");

View File

@ -76,23 +76,4 @@ int32_t getGlobalIdxFromDevice(DeviceIndex device) {
return device_global_idxs[device];
}
// Check if a device can access the memory of a peer device directly.
bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer) {
if (device == -1) {
device = c10::xpu::current_device();
}
if (peer == -1) {
peer = c10::xpu::current_device();
}
check_device_index(device);
check_device_index(peer);
// A device can always access itself
if (device == peer) {
return true;
}
return c10::xpu::get_raw_device(device).ext_oneapi_can_access_peer(
c10::xpu::get_raw_device(peer),
sycl::ext::oneapi::peer_access::access_supported);
}
} // namespace at::xpu

View File

@ -17,6 +17,4 @@ TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device);
TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device);
TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer);
} // namespace at::xpu

View File

@ -72,12 +72,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -55,12 +55,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -733,7 +733,7 @@ def timed(
time_total = 0
# Don't collect outputs, to correctly measure timing
for i in range(times):
for _ in range(times):
# If batch_size is 1, it too often collides with other non batch size
# dimensions resulting in errors.
if batch_size and batch_size > 1:
@ -1106,13 +1106,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
elif args.torchscript_jit_trace:
frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
if kwargs["hf_llm"]:
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model.forward = torch._dynamo.run(model)
frozen_model_iter_fn = model_iter_fn
else:
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
for rep in trange(args.repeat, desc="running benchmark"):
inputs = (
@ -1126,10 +1120,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
maybe_mark_step(args)
# interleave the runs to handle frequency scaling and load changes
with (
maybe_mark_profile(p=p, mark="expected"),
torch.compiler.set_stance("force_eager"),
):
with maybe_mark_profile(p=p, mark="expected"):
timings[rep, 0], expected_output = timed(
model,
model_iter_fn,
@ -2242,12 +2233,11 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_1st_run_OOM"
@ -2264,12 +2254,11 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
with torch.compiler.set_stance("force_eager"):
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_2nd_run_OOM"
@ -2553,11 +2542,7 @@ class BenchmarkRunner:
)
baseline_timings = experiment(
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
model, example_inputs, mark="expected", **experiment_kwargs
)
if self.args.export_aot_inductor:
@ -2625,11 +2610,7 @@ class BenchmarkRunner:
)
backend_timings = experiment(
self.model_iter_fn,
model,
example_inputs,
mark="expected",
**experiment_kwargs,
model, example_inputs, mark="expected", **experiment_kwargs
)
timings = np.stack((baseline_timings, backend_timings), axis=1)
result_summary = latency_experiment_summary(
@ -2648,17 +2629,9 @@ class BenchmarkRunner:
tag=None,
batch_size=None,
):
niters = 5
if getattr(self, "hf_llm", False):
# If we're benchmarking an llm, we want to use the generate function
self.model_iter_fn = self.generate
niters = 1
if self.args.xla:
with self.pick_grad(name, self.args.training):
return experiment(
self.model_iter_fn, *self.maybe_cast(model, example_inputs)
)
return experiment(*self.maybe_cast(model, example_inputs))
def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()
@ -2723,22 +2696,17 @@ class BenchmarkRunner:
with maybe_snapshot_memory(
self.args.snapshot_memory, f"eager_{self.args.only}"
):
with torch.compiler.set_stance("force_eager"):
eager_latency, eager_peak_mem, _ = warmup(
eager_latency, eager_peak_mem, _ = warmup(
self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=niters,
niters=1,
)
if self.args.use_warm_peak_memory:
_, eager_peak_mem, _ = warmup(
self.model_iter_fn,
copy.deepcopy(model),
example_inputs,
"eager",
niters=1,
)
if (
self.args.export_aot_inductor
@ -2747,13 +2715,7 @@ class BenchmarkRunner:
):
optimized_model_iter_fn = optimize_ctx
else:
if getattr(self, "hf_llm", False):
# If it's an llm, we want to optimize model.forward, and use
# the generate function
model = optimize_ctx(model)
optimized_model_iter_fn = self.model_iter_fn
else:
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
with maybe_snapshot_memory(
self.args.snapshot_memory, f"compiled_{self.args.only}"
@ -2831,13 +2793,7 @@ class BenchmarkRunner:
f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
)
experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)
results.append(
experiment(
self.model_iter_fn, model, example_inputs, **experiment_kwargs
)
)
results.append(experiment(model, example_inputs, **experiment_kwargs))
return " ".join(map(str, results))
def minify_model(
@ -4128,7 +4084,7 @@ def run(runner, args, original_dir=None):
# Overwrite 'translation_validation' config, if specified.
torch.fx.experimental._config.translation_validation = False
experiment = functools.partial(experiment, args)
experiment = functools.partial(experiment, args, runner.model_iter_fn)
if args.only and should_diff_branch(args):
import git
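For context, the two measurement paths toggled in the hunks above boil down to running the eager baseline under a forced-eager stance and running the compiled path through torch._dynamo.run; a stripped-down sketch with a dummy model and inputs:

import torch

def iter_fn(mod, inputs):
    return mod(*inputs)

model = torch.nn.Linear(8, 8)
inputs = (torch.randn(2, 8),)
frozen_iter_fn = torch._dynamo.run(iter_fn)       # reuse already-compiled code, no new compiles
with torch.compiler.set_stance("force_eager"):    # baseline measurement stays uncompiled
    expected = iter_fn(model, inputs)
actual = frozen_iter_fn(model, inputs)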

View File

@ -7,7 +7,6 @@ import os
import re
import subprocess
import sys
import types
import warnings
@ -129,12 +128,6 @@ with open(MODELS_FILENAME) as fh:
assert len(BATCH_SIZE_KNOWN_MODELS)
try:
from .huggingface_llm_models import HF_LLM_MODELS
except ImportError:
from huggingface_llm_models import HF_LLM_MODELS
def get_module_cls_by_model_name(model_cls_name):
_module_by_model_name = {
"Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",
@ -425,8 +418,11 @@ class HuggingfaceRunner(BenchmarkRunner):
use_eval_mode = self.args.use_eval_mode
dtype = torch.float32
reset_rng_state()
# Get batch size
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
if model_name in BATCH_SIZE_KNOWN_MODELS:
batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
elif batch_size is None:
@ -444,46 +440,14 @@ class HuggingfaceRunner(BenchmarkRunner):
f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}" # noqa: G004
)
# Get model and example inputs
if model_name in HF_LLM_MODELS:
benchmark_cls = HF_LLM_MODELS[model_name]
model, example_inputs = benchmark_cls.get_model_and_inputs(
model_name, device
)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# Set this flag so that when we test for speedup, we use
# model.generate instead of using model.forward
self.hf_llm = True
def generate(self, _, example_inputs, collect_outputs=True):
return model.generate(**example_inputs)
self.generate = types.MethodType(generate, self)
else:
self.hf_llm = False
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
example_inputs = generate_inputs_for_model(
model_cls, model, model_name, batch_size, device, include_loss_args=True
)
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)
if (
is_training
@ -496,6 +460,12 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()
# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size
@ -560,8 +530,7 @@ class HuggingfaceRunner(BenchmarkRunner):
def forward_pass(self, mod, inputs, collect_outputs=True):
with self.autocast(**self.autocast_arg):
res = mod(**inputs)
return res.logits if self.hf_llm else res
return mod(**inputs)
def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
cloned_inputs = clone_inputs(inputs)

View File

@ -9,16 +9,9 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it
device:
cpu:
- meta-llama/Llama-3.2-1B
- google/gemma-2-2b
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
cpu: []
control_flow:
- AllenaiLongformerBase
@ -74,11 +67,6 @@ batch_size:
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
meta-llama/Llama-3.2-1B: 8
google/gemma-2-2b: 8
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8
tolerance:

View File

@ -1,102 +0,0 @@
import subprocess
import sys
import torch
def pip_install(package):
subprocess.check_call([sys.executable, "-m", "pip", "install", package])
try:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
except ModuleNotFoundError:
print("Installing HuggingFace Transformers...")
pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
WhisperForConditionalGeneration,
WhisperProcessor,
)
class Benchmark:
@staticmethod
def get_model_and_inputs(model_name, device):
raise NotImplementedError("get_model_and_inputs() not implemented")
class WhisperBenchmark(Benchmark):
SAMPLE_RATE = 16000
DURATION = 30.0 # seconds
@staticmethod
def get_model_and_inputs(model_name, device):
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
model.config.forced_decoder_ids = None
model.generation_config.do_sample = False
model.generation_config.temperature = 0.0
num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
audio = torch.randn(num_samples) * 0.1
inputs = dict(
processor(
audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
)
)
inputs["input_features"] = inputs["input_features"].to(device)
decoder_start_token = model.config.decoder_start_token_id
inputs["decoder_input_ids"] = torch.tensor(
[[decoder_start_token]], device=device
)
return model, inputs
class TextGenerationBenchmark(Benchmark):
INPUT_LENGTH = 1000
OUTPUT_LENGTH = 2000
@staticmethod
def get_model_and_inputs(model_name, device):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval()
model.generation_config.do_sample = False
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
model.generation_config.pad_token_id = tokenizer.eos_token_id
model.generation_config.temperature = 0.0
vocab_size = tokenizer.vocab_size
input_ids = torch.randint(
low=0,
high=vocab_size,
size=(1, TextGenerationBenchmark.INPUT_LENGTH),
device=device,
dtype=torch.long,
)
example_inputs = {"input_ids": input_ids}
return model, example_inputs
HF_LLM_MODELS: dict[str, Benchmark] = {
"meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
"google/gemma-2-2b": TextGenerationBenchmark,
"google/gemma-3-4b-it": TextGenerationBenchmark,
"openai/whisper-tiny": WhisperBenchmark,
"Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}
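
The module deleted above is a small registry: `HF_LLM_MODELS` maps a HuggingFace model id to a `Benchmark` subclass, and the runner only ever calls that subclass's `get_model_and_inputs` staticmethod. A stripped-down, runnable sketch of the dispatch is below; the dummy class and registry entries are illustrative stand-ins, not the real Whisper or text-generation benchmarks.

class Benchmark:
    @staticmethod
    def get_model_and_inputs(model_name, device):
        raise NotImplementedError("get_model_and_inputs() not implemented")


class DummyTextGenerationBenchmark(Benchmark):
    # Hypothetical stand-in: returns placeholders instead of a real HF model.
    @staticmethod
    def get_model_and_inputs(model_name, device):
        return f"<{model_name} on {device}>", {"input_ids": [0, 1, 2]}


# Registry keyed by model id, mirroring HF_LLM_MODELS in the deleted file.
REGISTRY = {"meta-llama/Llama-3.2-1B": DummyTextGenerationBenchmark}


def load(model_name, device):
    benchmark_cls = REGISTRY[model_name]
    # The runner never instantiates the class; the staticmethod does the work.
    return benchmark_cls.get_model_and_inputs(model_name, device)


model, example_inputs = load("meta-llama/Llama-3.2-1B", "cpu")
print(model, example_inputs)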

View File

@@ -46,8 +46,3 @@ TrOCRForCausalLM,64
XGLMForCausalLM,32
XLNetLMHeadModel,16
YituTechConvBert,32
meta-llama/Llama-3.2-1B,8
google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8

View File

@@ -61,22 +61,6 @@ struct C10_API Storage {
allocator,
resizable)) {}
// Creates storage with pre-allocated memory buffer. Allocator is given for
// potential future reallocations, however it can be nullptr if the storage
// is non-resizable
Storage(
use_byte_size_t /*use_byte_size*/,
SymInt size_bytes,
at::DataPtr data_ptr,
at::Allocator* allocator = nullptr,
bool resizable = false)
: storage_impl_(c10::make_intrusive<StorageImpl>(
StorageImpl::use_byte_size_t(),
std::move(size_bytes),
std::move(data_ptr),
allocator,
resizable)) {}
protected:
explicit Storage(unsafe_borrow_t, const Storage& rhs)
: storage_impl_(c10::intrusive_ptr<c10::StorageImpl>::reclaim(

View File

@@ -10,9 +10,9 @@ namespace c10::cuda {
void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const uint32_t line_number,
const char* /*filename*/,
const char* /*function_name*/,
const int /*line_number*/,
const bool include_device_assertions) {
const auto cuda_error = static_cast<cudaError_t>(err);
const auto cuda_kernel_failure = include_device_assertions
@@ -41,7 +41,7 @@ void c10_cuda_check_implementation(
}
#endif
throw c10::AcceleratorError(
{function_name, filename, line_number}, err, check_message);
{__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
}
} // namespace c10::cuda

View File

@@ -91,7 +91,7 @@ C10_CUDA_API void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const uint32_t line_number,
const int line_number,
const bool include_device_assertions);
} // namespace c10::cuda

View File

@@ -1,22 +0,0 @@
// Shim header for filesystem for compilers that are too old to have it not
// in the experimental namespace
#if __has_include(<filesystem>)
#include <filesystem>
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
#else
#error "Neither <filesystem> nor <experimental/filesystem> is available."
#endif
namespace c10 {
#if __has_include(<filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::filesystem;
#elif __has_include(<experimental/filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::experimental::filesystem;
#endif
} // namespace c10

View File

@@ -52,7 +52,7 @@ struct maybe_bool {
template <typename src_t>
struct maybe_bool<true, src_t> {
C10_HOST_DEVICE static inline decltype(auto) apply(src_t src) {
// Don't use bool operator so as to also compile for ComplexHalf.
// Don't use bool operator so as to to also compile for ComplexHalf.
return src.real() || src.imag();
}
};

Some files were not shown because too many files have changed in this diff.