mirror of https://github.com/pytorch/pytorch.git
synced 2025-11-02 23:15:01 +08:00

Compare commits: 7 Commits, gh/PaliC/2 ... remove_pyi

| SHA1 |
|---|
| a3795cfaea |
| a9d5c00727 |
| c1102ca308 |
| 7856f8d7f4 |
| 973c3b531a |
| d5a496e7f1 |
| 051e544ef6 |
@@ -7,4 +7,4 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
40  .ci/pytorch/functorch_doc_push_script.sh  Executable file
@@ -0,0 +1,40 @@
#!/bin/bash

# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"

set -ex -o pipefail

version=${DOCS_VERSION:-nightly}
echo "version: $version"

# Build functorch docs
pushd $pt_checkout/functorch/docs
make html
popd

git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages

if [ "$version" == "main" ]; then
version=nightly
fi

git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"

git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status

if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi

popd
@@ -1,25 +0,0 @@
From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
From: Michael Wang <13521008+isVoid@users.noreply.github.com>
Date: Tue, 1 Apr 2025 17:28:05 -0700
Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
 (#185)

Co-authored-by: isVoid <isVoid@users.noreply.github.com>
---
 numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index 1641bf77..233e9ed7 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -365,6 +365,9 @@ def _find_api(self, fname):
     else:
         variants = ('_v2', '')

+    if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+        return getattr(self.lib, fname)
+
     for variant in variants:
         try:
             return getattr(self.lib, f'{fname}{variant}')
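(Illustrative sketch, not part of the diff above: the lookup that this patch short-circuits, in standalone Python. `lib` stands in for numba's loaded libcuda handle; the real code lives in `numba_cuda/numba/cuda/cudadrv/driver.py`.)

```python
def find_api(lib, fname):
    # Per the patch: pin these two symbols to their unsuffixed driver entry
    # points so a future CUDA release that adds _v2/_v3 variants cannot
    # silently change their semantics.
    if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
        return getattr(lib, fname)
    # Otherwise prefer the newest versioned variant, as in the original code.
    for variant in ("_v2", ""):
        try:
            return getattr(lib, f"{fname}{variant}")
        except AttributeError:
            continue
    raise AttributeError(fname)
```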
@@ -32,16 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
git config --global --add safe.directory /var/lib/jenkins/workspace
fi

# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
if [ -n "$NUMBA_CUDA_DIR" ]; then
NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
pushd "$NUMBA_CUDA_DIR"
patch -p4 <"$NUMBA_PATCH"
popd
fi

echo "Environment variables:"
env
@@ -1582,7 +1572,6 @@ test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
distributed/elastic/timer/api_test distributed/elastic/timer/local_timer_example distributed/elastic/timer/local_timer_test \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

# Dynamo tests
1  .github/actionlint.yaml  vendored
@@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
2  .github/ci_commit_pins/audio.txt  vendored
@@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
caba63f0fa29ef9e3d566699f32f11c07c8bda4e

2  .github/ci_commit_pins/vllm.txt  vendored
@@ -1 +1 @@
d119fc86140785e7efc8f125c17153544d1e0f20
f510715882304796a96e33028b4f6de1b026c2c7

2  .github/ci_commit_pins/xla.txt  vendored
@@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073
29  .github/ci_configs/vllm/Dockerfile.tmp_vllm  vendored
@@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version

@@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
3  .github/scripts/prepare_vllm_wheels.sh  vendored
@@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}

# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1

pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package
14  .github/workflows/_docs.yml  vendored
@@ -75,6 +75,10 @@ jobs:
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 30
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
@@ -207,6 +211,16 @@ jobs:
path: cppdocs/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/cppdocs

- name: Upload functorch Docs Preview
uses: seemethere/upload-artifact-s3@baba72d0712b404f646cebe0730933554ebce96a # v5.1.0
if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'functorch' && steps.build-docs.outcome == 'success' }}
with:
retention-days: 14
s3-bucket: doc-previews
if-no-files-found: error
path: functorch_ghpages/nightly/
s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs

- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
2  .github/workflows/_linux-test.yml  vendored
@@ -169,7 +169,7 @@ jobs:
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
with:
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}

- name: Setup GPU_FLAG for docker run
33  .github/workflows/_rocm-test.yml  vendored
@@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@@ -135,9 +131,6 @@ jobs:

- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

- name: Test
id: test
env:
@@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \
45  .github/workflows/build-vllm-wheel.yml  vendored
@@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@@ -151,7 +136,7 @@ jobs:

- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl

@@ -161,29 +146,27 @@ jobs:

# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
contents: read
container:
image: continuumio/miniconda3:4.12.0
environment: ${{ ((github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'nightly-wheel-upload' || '' }}
environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"

- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |
@@ -43,11 +43,6 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@@ -107,7 +102,7 @@ jobs:
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@@ -121,9 +116,10 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
4  .github/workflows/inductor-periodic.yml  vendored
@@ -39,7 +39,7 @@ jobs:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@@ -62,7 +62,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
2  .gitignore  vendored
@@ -389,5 +389,3 @@ android/pytorch_android_torchvision/.cxx

# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py
@@ -964,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@@ -1409,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',
@@ -874,7 +874,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM"
OFF)

cmake_dependent_option(
@@ -891,7 +891,7 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
endif()

# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
set(USE_FBGEMM_GENAI ON)
endif()
@@ -65,24 +65,14 @@ DLDataType getDLDataType(const Tensor& t) {
break;
// TODO(#146647): use macro here instead of spelling out each shell dtype
case ScalarType::Float8_e5m2:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2;
break;
case ScalarType::Float8_e5m2fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e5m2fnuz;
break;
case ScalarType::Float8_e4m3fn:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fn;
break;
case ScalarType::Float8_e4m3fnuz:
dtype.code = DLDataTypeCode::kDLFloat8_e4m3fnuz;
break;
case ScalarType::Float8_e8m0fnu:
dtype.code = DLDataTypeCode::kDLFloat8_e8m0fnu;
TORCH_CHECK_BUFFER(false, "float8 types are not supported by dlpack");
break;
case ScalarType::Float4_e2m1fn_x2:
dtype.code = DLDataTypeCode::kDLFloat4_e2m1fn;
dtype.lanes = 2;
dtype.bits = 4;
TORCH_CHECK_BUFFER(false, "float4 types are not supported by dlpack");
break;
case ScalarType::QInt8:
case ScalarType::QUInt8:
@@ -187,11 +177,7 @@ static Device getATenDevice(DLDeviceType type, c10::DeviceIndex index, void* data

ScalarType toScalarType(const DLDataType& dtype) {
ScalarType stype = ScalarType::Undefined;
if (dtype.code != DLDataTypeCode::kDLFloat4_e2m1fn) {
TORCH_CHECK_BUFFER(
dtype.lanes == 1,
"ATen does not support lanes != 1 for dtype code", std::to_string(dtype.code));
}
TORCH_CHECK_BUFFER(dtype.lanes == 1, "ATen does not support lanes != 1");
switch (dtype.code) {
case DLDataTypeCode::kDLUInt:
switch (dtype.bits) {
@@ -283,73 +269,6 @@ ScalarType toScalarType(const DLDataType& dtype) {
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2 bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e5m2fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e5m2fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e5m2fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fn:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fn;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fn bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e4m3fnuz:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e4m3fnuz;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e4m3fnuz bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat8_e8m0fnu:
switch (dtype.bits) {
case 8:
stype = ScalarType::Float8_e8m0fnu;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat8_e8m0fnu bits ", std::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat4_e2m1fn:
switch (dtype.bits) {
case 4:
switch (dtype.lanes) {
case 2:
stype = ScalarType::Float4_e2m1fn_x2;
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn lanes ", std::to_string(dtype.lanes));
}
break;
default:
TORCH_CHECK_BUFFER(
false, "Unsupported kDLFloat4_e2m1fn bits ", std::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK_BUFFER(false, "Unsupported code ", std::to_string(dtype.code));
}
@@ -435,8 +354,8 @@ T* toDLPackImpl(const Tensor& src) {
atDLMTensor->tensor.dl_tensor.device = torchDeviceToDLDevice(src.device());
atDLMTensor->tensor.dl_tensor.ndim = static_cast<int32_t>(src.dim());
atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src);
atDLMTensor->tensor.dl_tensor.shape = const_cast<int64_t*>(view.sizes().data());
atDLMTensor->tensor.dl_tensor.strides = const_cast<int64_t*>(view.strides().data());
atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
atDLMTensor->tensor.dl_tensor.byte_offset = 0;
fillVersion(&atDLMTensor->tensor);
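(Illustrative sketch, not part of the diff above: the user-visible surface of the DLPack dtype handling shown in these hunks. Supported dtypes round-trip zero-copy; per the `TORCH_CHECK_BUFFER` variant above, float8 export is rejected. The exact exception type and message are assumptions.)

```python
import torch
from torch.utils.dlpack import from_dlpack, to_dlpack

t = torch.arange(6, dtype=torch.float32).reshape(2, 3)
u = from_dlpack(to_dlpack(t))        # supported dtypes still round-trip
assert u.data_ptr() == t.data_ptr()  # zero-copy: same underlying storage

f8 = t.to(torch.float8_e5m2)
try:
    to_dlpack(f8)                    # rejected by the check shown above
except Exception as e:
    print(e)                         # e.g. "float8 types are not supported by dlpack"
```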
@@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;
@@ -266,14 +266,11 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(
 * See Note [Acquire lock when using random generators]
 */
void CUDAGeneratorImpl::set_current_seed(uint64_t seed) {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
} else {
TORCH_CHECK(state_->seed_ == seed, "CUDAGeneratorImpl::set_current_seed can be called during stream capture only if new seed is the same as the original seed.");
// no-op case
}
at::cuda::assertNotCapturing(
"Cannot call CUDAGeneratorImpl::set_current_seed");
state_->seed_ = seed;
state_->philox_offset_per_thread_ = 0;
no_reset_rnn_state_.clear();
}

/**
@@ -302,6 +299,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const {
 * Gets the current seed of CUDAGeneratorImpl.
 */
uint64_t CUDAGeneratorImpl::current_seed() const {
// Debatable if current_seed() should be allowed in captured regions.
// Conservatively disallow it for now.
at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed");
return state_->seed_;
}

@@ -346,6 +346,8 @@ c10::intrusive_ptr<c10::TensorImpl> CUDAGeneratorImpl::get_state() const {
 * and size of the internal state.
 */
void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
at::cuda::assertNotCapturing(
"Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing.");
static const size_t seed_size = sizeof(uint64_t);
static const size_t offset_size = sizeof(int64_t);
static const size_t total_size = seed_size + offset_size;
@@ -400,27 +402,15 @@ c10::intrusive_ptr<c10::GeneratorImpl> CUDAGeneratorImpl::graphsafe_get_state()
 */
void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) {
// see Note [Why enforce RNG offset % 4 == 0?]

// Note: If you use CUDNN RNN's, calling
// set_philox_offset_per_thread instead of set_offset will cause the
// cudnn RNN rng state to become stale.
TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4");
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
state_->philox_offset_per_thread_ = offset;
} else {
state_->offset_intragraph_ = offset;
}
state_->philox_offset_per_thread_ = offset;
}

/**
 * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl.
 */
uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const {
if (C10_LIKELY(at::cuda::currentStreamCaptureStatus() == at::cuda::CaptureStatus::None)) {
return state_->philox_offset_per_thread_;
} else {
return state_->offset_intragraph_;
}
return state_->philox_offset_per_thread_;
}

/**
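(Illustrative sketch, not part of the diff above: the two variants shown differ in whether generator seeding may run during CUDA graph capture. Under the capture-tolerant variant, re-seeding with the original seed is a no-op and a different seed raises; under the `assertNotCapturing` variant, any re-seed during capture raises. A user-level sketch, assuming a CUDA build:)

```python
import torch

torch.cuda.manual_seed(1234)      # fine outside capture: also resets the philox offset
x = torch.empty(8, device="cuda")

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    x.normal_()                   # RNG use inside capture goes through graph-safe state
    # torch.cuda.manual_seed(...) here hits the checks shown above: at most
    # a same-seed no-op in the capture-tolerant variant, an error otherwise.
g.replay()
```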
@@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1

/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 0

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -32,7 +32,9 @@
#define DLPACK_DLL
#endif

// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdint.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stddef.h>

#ifdef __cplusplus
@@ -157,26 +159,6 @@ typedef enum {
kDLComplex = 5U,
/*! \brief boolean */
kDLBool = 6U,
/*! \brief FP8 data types */
kDLFloat8_e3m4 = 7U,
kDLFloat8_e4m3 = 8U,
kDLFloat8_e4m3b11fnuz = 9U,
kDLFloat8_e4m3fn = 10U,
kDLFloat8_e4m3fnuz = 11U,
kDLFloat8_e5m2 = 12U,
kDLFloat8_e5m2fnuz = 13U,
kDLFloat8_e8m0fnu = 14U,
/*! \brief FP6 data types
 * Setting bits != 6 is currently unspecified, and the producer must ensure it is set
 * while the consumer must stop importing if the value is unexpected.
 */
kDLFloat6_e2m3fn = 15U,
kDLFloat6_e3m2fn = 16U,
/*! \brief FP4 data types
 * Setting bits != 4 is currently unspecified, and the producer must ensure it is set
 * while the consumer must stop importing if the value is unexpected.
 */
kDLFloat4_e2m1fn = 17U,
} DLDataTypeCode;

/*!
@@ -190,12 +172,6 @@ typedef enum {
 * - int8: type_code = 0, bits = 8, lanes = 1
 * - std::complex<float>: type_code = 5, bits = 64, lanes = 1
 * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
 * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
 * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
 * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
 *
 * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
 * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
 */
typedef struct {
/*!
@@ -253,12 +229,12 @@ typedef struct {
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
int64_t* shape;
const int64_t* shape;
/*!
 * \brief strides of the tensor (in number of elements, not bytes)
 * can be NULL, indicating tensor is compact and row-majored.
 */
int64_t* strides;
const int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
uint64_t byte_offset;
} DLTensor;
@@ -293,7 +269,7 @@ typedef struct DLManagedTensor {
void (*deleter)(struct DLManagedTensor * self);
} DLManagedTensor;

// bit masks used in the DLManagedTensorVersioned
// bit masks used in in the DLManagedTensorVersioned

/*! \brief bit mask to indicate that the tensor is read only. */
#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
@@ -306,14 +282,6 @@ typedef struct DLManagedTensor {
 */
#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)

/*
 * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
 *
 * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
 * be set by the producer to signal that a tensor of sub-byte type is padded.
 */
#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)

/*!
 * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
 *
@@ -171,8 +171,6 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {

POINTWISE_BOXED(fill_.Scalar);
POINTWISE_BOXED(zero_);
// This is special because this op doesn't return anything
m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata);

#undef UNARY_POINTWISE
#undef UNARY_POINTWISE_ALL
@@ -81,7 +81,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) {
// TODO: contiguous can be made to preserve the memory format
// of the input. However since the above reshape clobbers h and w
// it may not be safe to do that, since channels_last contiguous
// may think oc and the last dim correspond to h,w?
// may think oc and and the last dim correspond to h,w?
// It is not clear, however from initial looking around it feels that
// this may not be correct.
// In this case channels last will likely require custom implementation
@@ -1,4 +1,3 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <cstdint>
@@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
int64_t inputH = input_.size(heightDim);
int64_t inputW = input_.size(widthDim);

TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
"fractional_max_pool3d_out(): pool time ", poolSizeT,
" too large relative to input time ", inputT);
TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
"fractional_max_pool3d_out(): pool width ", poolSizeW,
" too large relative to input width ", inputW);
TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
"fractional_max_pool3d_out(): pool height ", poolSizeH,
" too large relative to input height ", inputH);
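(Illustrative sketch, not part of the diff above: the shape constraint both check variants enforce, `output + pool - 1 < input` per spatial dimension; the longer variant additionally asserts `pool <= input` explicitly.)

```python
import torch

m = torch.nn.FractionalMaxPool3d(kernel_size=2, output_size=(6, 6, 6))
y = m(torch.randn(1, 1, 8, 8, 8))  # ok: 6 + 2 - 1 = 7 < 8 along each dim

# output_size=(7, 7, 7) would violate 7 + 2 - 1 < 8 and trip the
# TORCH_CHECK above ("pool time ... too large relative to input time").
```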
@@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
for (const auto d : c10::irange(index.dim())) {
if (d == dim) {
continue;
}
@@ -52,7 +52,6 @@ void apply_triu_tril_single(
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.

if (upper) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
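(Illustrative sketch, not part of the diff above: why the `std::clamp` matters. `diagonal` is a user-controlled int64, so without clamping, `i + k` inside the kernel could overflow for extreme values; clamped, extreme diagonals simply degenerate to the all-zero or all-kept result.)

```python
import torch

x = torch.randn(3, 4)
assert torch.triu(x, diagonal=2**62).eq(0).all()  # diagonal far above the band: all zeros
assert torch.tril(x, diagonal=2**62).equal(x)     # same diagonal keeps everything for tril
```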
@@ -85,11 +85,11 @@ void cpu_max_unpool(
if constexpr (is_3d) {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_depth,
"x", output_height, "x", output_width, ")");
"x", output_height, "x", output_width);
} else {
TORCH_CHECK(false, "Found an invalid max index: ", optional_error_index.value(),
" (output volumes are of size ", output_height,
"x", output_width, ")");
"x", output_width);
}
}
@@ -317,17 +317,6 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();
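(Illustrative sketch, not part of the diff above: the empty-input path the early return covers, via the public API. `nonzero_static` pads or truncates the result to exactly `size` rows using `fill_value`.)

```python
import torch

x = torch.zeros(0, device="cuda")
# Empty input: no kernel grid-size math is needed; per the early return,
# the output is simply filled with fill_value.
out = torch.nonzero_static(x, size=4, fill_value=-1)
print(out.shape)  # torch.Size([4, 1]) for a 1-D input
```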
@@ -416,7 +416,6 @@ struct ReduceOp {
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
@@ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel(
index_t* result_ptr,
int64_t size,
int64_t result_size) {
CUDA_KERNEL_ASSERT_PRINTF(
result_size == cumsum_ptr[size - 1],
if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
"Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
result_size,
cumsum_ptr[size - 1]);
__FILE__, __LINE__, __func__,blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1 ]);
CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
}

int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;
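(Illustrative sketch, not part of the diff above: the invariant both kernel variants check. When `output_size` is passed, it must equal `repeats.sum()`; a mismatch fails via the device-side assert shown in the hunk.)

```python
import torch

r = torch.tensor([2, 3], device="cuda")
x = torch.tensor([10, 20], device="cuda")
y = torch.repeat_interleave(x, r, output_size=5)  # ok: 5 == r.sum()
# output_size=4 would trip the kernel assert above, since 4 != r.sum().
```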
@@ -5,20 +5,12 @@

namespace at::native {

__global__ void weight_int8pack_mm_kernel(
const float* x,
const int8_t* w,
const float* scale,
float* out,
int B,
int K,
int N) {
__global__ void weight_int8pack_mm_kernel(const float* x, const int8_t* w, const float* scale, float* out, int B, int K, int N) {
// one thread per output element: [B, N]
int b = blockIdx.y * blockDim.y + threadIdx.y;
int n = blockIdx.x * blockDim.x + threadIdx.x;

if (b >= B || n >= N)
return;
if (b >= B || n >= N) return;

float acc = 0.0f;
for (int k = 0; k < K; ++k) {
@@ -28,11 +20,7 @@ __global__ void weight_int8pack_mm_kernel(
out[b * N + n] = acc * scale[n];
}

void launch_weight_int8pack_mm_cuda_kernel(
const Tensor& x,
const Tensor& w_int8,
const Tensor& scale,
Tensor& out) {
void launch_weight_int8pack_mm_cuda_kernel(const Tensor& x, const Tensor& w_int8, const Tensor& scale, Tensor& out) {
const int B = x.size(0);
const int K = x.size(1);
const int N = w_int8.size(0);
@@ -47,16 +35,12 @@ void launch_weight_int8pack_mm_cuda_kernel(
w_int8.data_ptr<int8_t>(),
scale.data_ptr<float>(),
out.data_ptr<float>(),
B,
K,
N);
B, K, N);
}

// Main GPU entry point
at::Tensor _weight_int8pack_mm_cuda(
const at::Tensor& x,
const at::Tensor& w_int8,
const at::Tensor& scale) {
at::Tensor _weight_int8pack_mm_cuda(const at::Tensor& x, const at::Tensor& w_int8, const at::Tensor& scale) {
// --- Check inputs ---
TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor");
TORCH_CHECK(w_int8.is_cuda(), "w must be a CUDA tensor");
@@ -66,16 +50,12 @@ at::Tensor _weight_int8pack_mm_cuda(
TORCH_CHECK(w_int8.dim() == 2, "w must be 2D");
TORCH_CHECK(scale.dim() == 1, "scale must be 1D");

TORCH_CHECK(
x.size(1) == w_int8.size(1),
"K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(
w_int8.size(0) == scale.size(0),
"Output dim mismatch: w.size(0) != scale.size(0)");
TORCH_CHECK(x.size(1) == w_int8.size(1), "K dimension mismatch: x.size(1) != w.size(1)");
TORCH_CHECK(w_int8.size(0) == scale.size(0), "Output dim mismatch: w.size(0) != scale.size(0)");

// --- Determine shapes ---
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim
auto B = x.size(0); // batch size
auto N = w_int8.size(0); // output dim

// Ensure inputs are in the correct types for the kernel
auto x_f32 = x.to(at::kFloat);
@@ -83,13 +63,12 @@ at::Tensor _weight_int8pack_mm_cuda(
auto scale_f32 = scale.to(at::kFloat);

// --- Allocate output ---
auto out = at::empty({B, N}, x_f32.options());
auto out = at::empty({B, N}, x.options().dtype(at::kFloat));

// --- Launch kernel ---
launch_weight_int8pack_mm_cuda_kernel(
x_f32, w_int8_contiguous, scale_f32, out);
launch_weight_int8pack_mm_cuda_kernel(x_f32, w_int8_contiguous, scale_f32, out);

return out.to(x.dtype());
return out;
}

} // namespace at::native
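(Illustrative sketch, not part of the diff above: a Python reference for what `weight_int8pack_mm_kernel` computes per the accumulation loop shown, one fp32 dot product per `[B, N]` output element, scaled per output channel.)

```python
import torch

def weight_int8pack_mm_ref(x, w_int8, scale):
    # out[b, n] = (sum_k x[b, k] * w[n, k]) * scale[n], accumulated in fp32
    return (x.float() @ w_int8.float().t()) * scale.float()

x = torch.randn(4, 16)
w = torch.randint(-128, 128, (8, 16), dtype=torch.int8)
s = torch.rand(8)
out = weight_int8pack_mm_ref(x, w, s)  # shape [4, 8]
```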
@@ -482,9 +482,7 @@ auto build_graph(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
@@ -704,9 +702,7 @@ auto build_graph_nestedtensor(
auto scaled_dot_product_flash_attention_options =
fe::graph::SDPA_attributes()
.set_name("CUDNN_SDPA_NESTEDTENSOR")
.set_is_inference(return_softmaxstats == false)
// TODO(eqy): switch to this API once cuDNN FE is upgraded
// .set_generate_stats(return_softmaxstats)
.set_generate_stats(return_softmaxstats)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale)
.set_seq_len_q(SEQ_LEN_Q_)
@@ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
// fusions
// ---------------------------------------------------------------------

void raw_miopen_convolution_add_relu_out(
void raw_miopen_convolution_relu_out(
const Tensor& output,
const Tensor& input,
const Tensor& weight,
const Tensor& z,
float alpha,
const Tensor& bias,
IntArrayRef stride,
IntArrayRef padding,
@@ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out(
int64_t groups,
bool benchmark,
bool deterministic) {
raw_miopen_convolution_forward_out(
output,
auto dataType = getMiopenDataType(input);
miopenConvolutionMode_t c_mode = miopenConvolution;
ConvolutionArgs args{ input, output, weight };
args.handle = getMiopenHandle();
at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
setConvolutionParams(
&args.params,
args.handle,
input,
weight,
padding,
stride,
dilation,
groups,
deterministic,
memory_format);
args.idesc.set(input, memory_format);
args.wdesc.set(weight, memory_format, 0);
args.odesc.set(output, memory_format);
args.cdesc.set(
dataType,
c_mode,
input.dim() - 2,
args.params.padding,
args.params.stride,
args.params.dilation,
args.params.groups,
benchmark,
deterministic);
at::Tensor alpha_mul_z_add_bias =
at::native::reshape_bias(input.dim(), bias).add(z, alpha);
output.add_(alpha_mul_z_add_bias);
output.relu_();

TensorDescriptor bdesc;
bdesc.set(bias.expand({1, bias.size(0)}), output.dim());

// Create the fusion plan
miopenFusionPlanDescriptor_t fusePlanDesc;
miopenFusionOpDescriptor_t convoOp;
miopenFusionOpDescriptor_t biasOp;
miopenFusionOpDescriptor_t activOp;
MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));

// compile fusion plan
MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));

// Set the Args
float alpha = static_cast<float>(1);
float beta = static_cast<float>(0);
float activ_alpha = static_cast<float>(0);
float activ_beta = static_cast<float>(0);
float activ_gamma = static_cast<float>(0);
miopenOperatorArgs_t fusionArgs;
MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));

miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);

// Cleanup
miopenDestroyFusionPlan(fusePlanDesc);
}

static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@@ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
Tensor miopen_convolution_add_relu(
const Tensor& input_t,
const Tensor& weight_t,
const Tensor& z_t,
const Tensor& z,
const std::optional<Scalar>& alpha,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);
Tensor z = z_t;
if (z.suggest_memory_format() != memory_format) {
z = z.to(memory_format);
}
z = z.contiguous(memory_format);

// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}
// MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function:
// y = act ( alpha1 * conv(x) + alpha2 * z + bias )

auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);

auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());

raw_miopen_convolution_add_relu_out(
output_t,
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };

Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_add_relu",
input,
weight,
z,
_alpha,
_bias,
stride,
padding,
stride,
dilation,
groups,
benchmark,
true); // deterministic
false // deterministic
);

return output_t;
auto contig_output_t = self_or_new_memory_format(output_t, memory_format);

if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}

auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());

at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
contig_output_t.add_(alpha_mul_z_add_bias);
contig_output_t.relu_();

return contig_output_t;
}

Tensor miopen_convolution_relu(
const Tensor& input_t,
const Tensor& weight_t,
const std::optional<Tensor>& bias_t,
const std::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups) {
auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
const Tensor input = input_t.contiguous(memory_format);
const Tensor weight = weight_t.contiguous(memory_format);

// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input.sizes(), weight.sizes(), padding, stride, dilation),
input.options().memory_format(memory_format));
if (output_t.numel() == 0) {
return output_t;
}

auto& ctx = at::globalContext();
bool benchmark = ctx.benchmarkCuDNN();
auto _bias = bias_t.has_value()
? bias_t.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());

raw_miopen_convolution_add_relu_out(
output_t,
input,
weight,
output_t, // use output_t as z to satisfy MIOpen API
0, // alpha
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
true); // deterministic
// MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
&& input_t.scalar_type() == at::kFloat
&& input_t.ndimension() == 4) {

return output_t;
// FuseFrozenConvAddRelu performs some tensor shape checking
Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input_t.options().memory_format(input_t.suggest_memory_format()));
if (output_t.numel() == 0) {
return output_t;
}

auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{output_t.size(1)},
optTypeMetaToScalarType(output_t.options().dtype_opt()),
output_t.options().layout_opt(),
output_t.options().device_opt(),
output_t.options().pinned_memory_opt());

raw_miopen_convolution_relu_out(
output_t,
input_t,
weight_t,
_bias,
stride,
padding,
dilation,
groups,
benchmark, // benchmark
false // deterministic
);

return output_t;
}
else {
// fallback

auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);

TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 };

Tensor output_t = at::detail::empty_cuda(
conv_output_size(
input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
input->options().memory_format(memory_format));
if (output_t.numel() == 0){
return output_t;
}
// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{output_t, "result", 0};
miopen_convolution_forward_out(
output,
"miopen_convolution_relu",
input,
weight,
padding,
stride,
dilation,
groups,
benchmark,
false // deterministic
);

auto contig_output_t = self_or_new_memory_format(output_t, memory_format);

if (!output_t.is_same(contig_output_t)) {
contig_output_t.copy_(output_t);
}

auto _bias = bias.has_value()
? bias.value()
: at::zeros(
{contig_output_t.size(1)},
optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
contig_output_t.options().layout_opt(),
contig_output_t.options().device_opt(),
contig_output_t.options().pinned_memory_opt());

at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
|
||||
contig_output_t.add_(reshaped_bias);
|
||||
contig_output_t.relu_();
|
||||
|
||||
return contig_output_t;
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)
|
||||
|
||||
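For reference, the unfused fallback above is the cuDNN-style epilogue y = act(alpha1 * conv(x) + alpha2 * z + bias) spelled out with plain ATen ops. A minimal standalone sketch (a hypothetical helper for illustration, not part of this diff; assumes 4-d NCHW inputs and alpha1 == 1):

    #include <ATen/ATen.h>

    // Sketch: unfused equivalent of y = relu(conv(x) + alpha * z + bias),
    // mirroring the fallback path in the diff above. Names are illustrative.
    at::Tensor conv_add_relu_fallback(
        const at::Tensor& x,
        const at::Tensor& w,
        const at::Tensor& bias, // shape: [out_channels]
        const at::Tensor& z,
        double alpha,
        at::IntArrayRef stride,
        at::IntArrayRef padding,
        at::IntArrayRef dilation,
        int64_t groups) {
      auto y = at::convolution(x, w, /*bias=*/{}, stride, padding, dilation,
                               /*transposed=*/false, /*output_padding=*/{0, 0}, groups);
      // Reshape bias to [1, C, 1, 1] so it broadcasts over NCHW, then add alpha * z.
      auto b = bias.reshape({1, bias.size(0), 1, 1});
      y = y.add(b).add(z, alpha);
      return y.relu_();
    }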
aten/src/ATen/native/mps/MPSGraphSonomaOps.h (new file, 48 lines)
@@ -0,0 +1,48 @@
#pragma once

#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))

typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
  MPSGraphFFTScalingModeNone = 0L,
  MPSGraphFFTScalingModeSize = 1L,
  MPSGraphFFTScalingModeUnitary = 2L,
};

@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end

@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;

@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                                      axes:(NSArray<NSNumber*>* _Nonnull)axes
                                                descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
                                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                                    axes:(NSArray<NSNumber*>* _Nonnull)axes
                                              descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
                                                    name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                                    axes:(NSArray<NSNumber*>* _Nonnull)axes
                                              descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
                                                    name:(NSString* _Nullable)name;
@end

// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))

// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif
aten/src/ATen/native/mps/MPSGraphVenturaOps.h (new file, 196 lines)
@@ -0,0 +1,196 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>

@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;

@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;

@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;

@property(readwrite, nonatomic) NSUInteger groups;

@end

@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;

#endif

@interface MPSGraph (VenturaOps)

#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))

typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
  MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
  MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
  MPSGraphResizeNearestRoundingModeCeil = 2L,
  MPSGraphResizeNearestRoundingModeFloor = 3L,
  MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
  MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};

// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif

- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
                                            weightsTensor:(MPSGraphTensor* _Nonnull)weights
                                               descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
                                                     name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)
    convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
                                          weightsTensor:(MPSGraphTensor* _Nonnull)weights
                                            outputShape:(MPSShape* _Nonnull)outputShape
                           forwardConvolutionDescriptor:
                               (MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
                                                   name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)
    convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
                                              sourceTensor:(MPSGraphTensor* _Nonnull)source
                                               outputShape:(MPSShape* _Nonnull)outputShape
                              forwardConvolutionDescriptor:
                                  (MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
                                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                               axis:(NSInteger)axis
                                               name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                      axis:(NSInteger)axis
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                      axis:(NSInteger)axis
                                descending:(BOOL)descending
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                descending:(BOOL)descending
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                         axis:(NSInteger)axis
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                         axis:(NSInteger)axis
                                   descending:(BOOL)descending
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                   axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                   descending:(BOOL)descending
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                   axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                         sizeTensor:(MPSGraphTensor* _Nonnull)size
                                nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                       centerResult:(BOOL)centerResult
                                       alignCorners:(BOOL)alignCorners
                                             layout:(MPSGraphTensorNamedDataLayout)layout
                                               name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                         sizeTensor:(MPSGraphTensor* _Nonnull)size
                                  scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                             layout:(MPSGraphTensorNamedDataLayout)layout
                                               name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                          sizeTensor:(MPSGraphTensor* _Nonnull)size
                                        centerResult:(BOOL)centerResult
                                        alignCorners:(BOOL)alignCorners
                                              layout:(MPSGraphTensorNamedDataLayout)layout
                                                name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                          sizeTensor:(MPSGraphTensor* _Nonnull)size
                                   scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                              layout:(MPSGraphTensorNamedDataLayout)layout
                                                name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                      input:(MPSGraphTensor* _Nonnull)input
                                        nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                               centerResult:(BOOL)centerResult
                                               alignCorners:(BOOL)alignCorners
                                                     layout:(MPSGraphTensorNamedDataLayout)layout
                                                       name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                      input:(MPSGraphTensor* _Nonnull)input
                                          scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                        nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                                     layout:(MPSGraphTensorNamedDataLayout)layout
                                                       name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                       input:(MPSGraphTensor* _Nonnull)input
                                                centerResult:(BOOL)centerResult
                                                alignCorners:(BOOL)alignCorners
                                                      layout:(MPSGraphTensorNamedDataLayout)layout
                                                        name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                       input:(MPSGraphTensor* _Nonnull)input
                                           scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                                      layout:(MPSGraphTensorNamedDataLayout)layout
                                                        name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
                                      coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
                                                layout:(MPSGraphTensorNamedDataLayout)layout
                                  normalizeCoordinates:(BOOL)normalizeCoordinates
                                   relativeCoordinates:(BOOL)relativeCoordinates
                                          alignCorners:(BOOL)alignCorners
                                           paddingMode:(MPSGraphPaddingMode)paddingMode
                                          samplingMode:(MPSGraphResizeMode)samplingMode
                                         constantValue:(double)constantValue
                                                  name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
                                      coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
                                                layout:(MPSGraphTensorNamedDataLayout)layout
                                  normalizeCoordinates:(BOOL)normalizeCoordinates
                                   relativeCoordinates:(BOOL)relativeCoordinates
                                          alignCorners:(BOOL)alignCorners
                                           paddingMode:(MPSGraphPaddingMode)paddingMode
                                   nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                         constantValue:(double)constantValue
                                                  name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

@end
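The MPSDataTypeComplex* and MPSDataTypeBFloat16 defines above compose a dtype from category flag bits plus an element bit-width in the low bits. A small C++ sketch of that convention; the MPSDataTypeComplexBit value is taken from the define above, while the other flag values follow Apple's public headers and should be treated as assumptions:

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t kFloatBit = 0x10000000;             // MPSDataTypeFloatBit (assumed)
    constexpr uint32_t kAlternateEncodingBit = 0x80000000; // MPSDataTypeAlternateEncodingBit (assumed)
    constexpr uint32_t kComplexBit = 0x01000000;           // MPSDataTypeComplexBit, as defined above

    // MPSDataTypeComplexFloat32 = FloatBit | ComplexBit | 64 (two 32-bit parts).
    constexpr uint32_t kComplexFloat32 = kFloatBit | kComplexBit | 64;
    // MPSDataTypeBFloat16 reuses Float16's width code (FloatBit | 16) plus the
    // alternate-encoding flag, exactly as in the Sonoma header above.
    constexpr uint32_t kBFloat16 = kAlternateEncodingBit | (kFloatBit | 16);

    int main() {
      // The element bit-width is recoverable by masking off the flag bits.
      std::printf("complex64 width field: %u\n", (unsigned)(kComplexFloat32 & 0xFFFF)); // 64
      std::printf("bfloat16 width field: %u\n", (unsigned)(kBFloat16 & 0xFFFF));        // 16
    }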
@@ -9,6 +9,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>
@@ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
  MPSShape* mpsStrides = getMPSShape(_tensor.strides());
  check_mps_shape(mpsShape);

  auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
  auto storage_numel = src.storage().nbytes() / src.element_size();
  TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
              "MPSGraph does not support tensor dims larger than INT_MAX");
  MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
@@ -1,25 +0,0 @@
#pragma once
#include <c10/metal/common.h>

#ifdef __METAL__
enum class EmbeddingBagMode { SUM = 0, MEAN, MAX };
#else
#include <ATen/native/EmbeddingBag.h>
using at::native::EmbeddingBagMode;
#endif

template <typename idx_type_t = uint32_t>
struct EmbeddingBagParams {
  ::c10::metal::array<idx_type_t, 2> weight_strides;
  ::c10::metal::array<idx_type_t, 2> output_strides;
  ::c10::metal::array<idx_type_t, 2> max_indices_strides;

  idx_type_t per_sample_weights_strides;

  idx_type_t num_indices;
  idx_type_t num_bags;
  idx_type_t feature_size;

  EmbeddingBagMode mode;
  int64_t padding_idx;
};
@@ -1,212 +0,0 @@
#include <ATen/native/mps/kernels/EmbeddingBag.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>

using namespace metal;
using namespace c10::metal;

template <EmbeddingBagMode M, typename T>
struct ReductionOpInit {
  inline opmath_t<T> operator()() {
    return 0;
  }
};

template <typename T>
struct ReductionOpInit<EmbeddingBagMode::MAX, T> {
  inline opmath_t<T> operator()() {
    return static_cast<opmath_t<T>>(-INFINITY);
  }
};

template <EmbeddingBagMode M, typename T>
struct ReductionOp {
  inline opmath_t<T> operator()(
      T weight_val,
      opmath_t<T> out_val,
      uint32_t per_sample_weights_index,
      constant T* per_sample_weights,
      uint32_t per_sample_weights_strides);
};

template <typename T>
struct ReductionOp<EmbeddingBagMode::SUM, T> {
  inline opmath_t<T> operator()(
      T weight_val,
      opmath_t<T> out_val,
      uint32_t per_sample_weights_index,
      constant T* per_sample_weights,
      uint32_t per_sample_weights_strides) {
    if (per_sample_weights_strides) {
      T per_sample_weight = per_sample_weights
          [per_sample_weights_strides * per_sample_weights_index];
      return static_cast<opmath_t<T>>(per_sample_weight) *
          static_cast<opmath_t<T>>(weight_val) +
          out_val;
    } else {
      return static_cast<opmath_t<T>>(weight_val) + out_val;
    }
  }
};

template <typename T>
struct ReductionOp<EmbeddingBagMode::MEAN, T> {
  inline opmath_t<T> operator()(
      T weight_val,
      opmath_t<T> out_val,
      uint32_t,
      constant T*,
      uint32_t) {
    return static_cast<opmath_t<T>>(weight_val) + out_val;
  }
};

template <typename T>
struct ReductionOp<EmbeddingBagMode::MAX, T> {
  inline opmath_t<T> operator()(
      T weight_val,
      opmath_t<T> out_val,
      uint32_t,
      constant T*,
      uint32_t) {
    return max(static_cast<opmath_t<T>>(weight_val), out_val);
  }
};

template <EmbeddingBagMode M, typename T>
struct ReductionOpFinal {
  inline T operator()(opmath_t<T> val, uint32_t) {
    return static_cast<T>(val);
  }
};

template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MEAN, T> {
  inline T operator()(opmath_t<T> val, uint32_t count) {
    auto out = val / count;
    return static_cast<T>((count == 0) ? 0 : out);
  }
};

template <typename T>
struct ReductionOpFinal<EmbeddingBagMode::MAX, T> {
  inline T operator()(opmath_t<T> val, uint32_t count) {
    return static_cast<T>((count == 0) ? 0 : val);
  }
};

template <EmbeddingBagMode M, typename T, typename I>
void embedding_bag_impl(
    constant T* weight,
    constant I* indices,
    constant I* offsets,
    constant T* per_sample_weights,
    device T* output,
    device I* offset2bag,
    device I* bag_size,
    device I* max_indices,
    constant EmbeddingBagParams<uint32_t>& params,
    uint tid) {
  auto num_indices = params.num_indices;
  auto num_bags = params.num_bags;
  auto feature_size = params.feature_size;
  auto padding_idx = params.padding_idx;
  auto per_sample_weights_strides = params.per_sample_weights_strides;
  constant auto& output_strides = params.output_strides;
  constant auto& weight_strides = params.weight_strides;
  constant auto& max_indices_strides = params.max_indices_strides;

  auto bag_idx = tid / feature_size;
  auto feature_idx = tid % feature_size;

  output += bag_idx * output_strides[0] + feature_idx * output_strides[1];

  uint32_t offsets_end = min(bag_idx + 1, num_bags - 1);
  bool is_last_bag = bag_idx + 1 == num_bags;
  uint32_t indices_start = static_cast<uint32_t>(offsets[bag_idx]);
  uint32_t indices_end = is_last_bag * (num_indices) +
      (!is_last_bag) * (static_cast<uint32_t>(offsets[offsets_end]));

  auto out_val = ReductionOpInit<M, T>()();

  uint32_t bag_size_ = 0;

  for (uint32_t indices_idx = indices_start; indices_idx < indices_end;
       indices_idx++) {
    I weight_idx = indices[indices_idx];
    bool pad = (weight_idx == padding_idx);
    T weight_val = weight
        [static_cast<uint32_t>(weight_idx) * weight_strides[0] +
         feature_idx * weight_strides[1]];

    bag_size_ += static_cast<uint32_t>(!pad);

    auto tmp_val = ReductionOp<M, T>()(
        weight_val,
        out_val,
        indices_idx,
        per_sample_weights,
        per_sample_weights_strides);

    out_val = pad ? out_val : tmp_val;
  }

  *output = ReductionOpFinal<M, T>()(out_val, bag_size_);
}

#define DISPATCH_IMPL(MODE) \
  return embedding_bag_impl<MODE>( \
      weight, \
      indices, \
      offsets, \
      per_sample_weights, \
      output, \
      offset2bag, \
      bag_size, \
      max_indices, \
      params, \
      tid)

template <typename T, typename I>
kernel void embedding_bag(
    constant T* weight [[buffer(0)]],
    constant I* indices [[buffer(1)]],
    constant I* offsets [[buffer(2)]],
    constant T* per_sample_weights [[buffer(3)]],
    device T* output [[buffer(4)]],
    device I* offset2bag [[buffer(5)]],
    device I* bag_size [[buffer(6)]],
    device I* max_indices [[buffer(7)]],
    constant EmbeddingBagParams<uint32_t>& params [[buffer(8)]],
    uint tid [[thread_position_in_grid]]) {
  switch (params.mode) {
    case EmbeddingBagMode::SUM:
      DISPATCH_IMPL(EmbeddingBagMode::SUM);
    case EmbeddingBagMode::MEAN:
      DISPATCH_IMPL(EmbeddingBagMode::MEAN);
    case EmbeddingBagMode::MAX:
      DISPATCH_IMPL(EmbeddingBagMode::MAX);
  }
}

#define REGISTER_EMBEDDING_BAG_OP(T, I) \
  template [[host_name("embedding_bag_" #T "_" #I)]] \
  kernel void embedding_bag<T, I>( \
      constant T * weight [[buffer(0)]], \
      constant I * indices [[buffer(1)]], \
      constant I * offsets [[buffer(2)]], \
      constant T * per_sample_weights [[buffer(3)]], \
      device T * output [[buffer(4)]], \
      device I * offset2bag [[buffer(5)]], \
      device I * bag_size [[buffer(6)]], \
      device I * max_indices [[buffer(7)]], \
      constant EmbeddingBagParams<uint32_t> & params [[buffer(8)]], \
      uint tid [[thread_position_in_grid]]);

REGISTER_EMBEDDING_BAG_OP(float, int);
REGISTER_EMBEDDING_BAG_OP(float, long);
REGISTER_EMBEDDING_BAG_OP(half, int);
REGISTER_EMBEDDING_BAG_OP(half, long);
REGISTER_EMBEDDING_BAG_OP(bfloat, int);
REGISTER_EMBEDDING_BAG_OP(bfloat, long);
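The REGISTER_EMBEDDING_BAG_OP instantiations above stringize the template arguments into a host_name of the form "embedding_bag_<T>_<I>", and the host side (see the fmt::format call in the removed EmbeddingBag.mm later in this diff) must build exactly that string to look the pipeline up. A minimal C++ sketch of the naming contract:

    #include <fmt/format.h>
    #include <string>

    // Sketch: must match REGISTER_EMBEDDING_BAG_OP(T, I)'s
    // host_name("embedding_bag_" #T "_" #I).
    std::string embedding_bag_kernel_name(const std::string& weight_ty,
                                          const std::string& index_ty) {
      return fmt::format("embedding_bag_{}_{}", weight_ty, index_ty);
    }

    // e.g. embedding_bag_kernel_name("float", "int") == "embedding_bag_float_int",
    // matching the instantiation registered by REGISTER_EMBEDDING_BAG_OP(float, int).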
@@ -8,6 +8,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>

#ifndef AT_PER_OPERATOR_HEADERS
@@ -1,12 +1,23 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#include <fmt/format.h>

#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
  return self;
}

@end

#endif

namespace at::native {

@@ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
  descriptor_.paddingFront = paddingDepth;
  descriptor_.paddingBack = paddingDepth;

  descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
  // PyTorch always uses NCDHW memory layout for 3D tensors
  descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;

  descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
  // PyTorch always uses OIDHW memory layout for 3D weights
  descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;

  descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@@ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
  if (bias_defined)
    bias_shape = bias_opt.value().sizes();

  std::string mem_format_key;
  switch (memory_format) {
    case at::MemoryFormat::Contiguous:
      mem_format_key = "Contiguous";
      break;
    case at::MemoryFormat::ChannelsLast:
      mem_format_key = "ChannelsLast";
      break;
    default:
      assert(0 && "Check should have been done earlier\n");
  }

  std::string bias_shape_key;
  if (bias_defined) {
    bias_shape_key = std::to_string(bias_shape[0]);
@@ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
    bias_shape_key = "nobias";
  }

  std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
                                is3DConv ? "3d_" : "",
                                getArrayRefString(stride),
                                getArrayRefString(dilation),
                                getArrayRefString(padding),
                                groups,
                                is_channels_last,
                                mps::getTensorsStringKey({input_t, weight_t}),
                                bias_defined,
                                bias_shape_key);
  std::string key;
  if (is3DConv) {
    key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
        std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
        std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
        std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
        mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;

  } else {
    key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
        std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
        std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
        mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
  }

  MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
  MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
@@ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
  @autoreleasepool {
    MPSStream* stream = getCurrentMPSStream();

    std::string mem_format_key;
    switch (memory_format) {
      case at::MemoryFormat::Contiguous:
        mem_format_key = "Contiguous";
        break;
      case at::MemoryFormat::ChannelsLast:
        mem_format_key = "ChannelsLast";
        break;
      default:
        assert(0 && "Check should have been done earlier\n");
    }

    MPSShape* mps_input_shape = getMPSShape(input_size);
    std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
                                  is3DConv ? "3d_" : "",
                                  getArrayRefString(stride),
                                  getArrayRefString(dilation),
                                  getArrayRefString(padding),
                                  groups,
                                  is_channels_last,
                                  getTensorsStringKey({grad_output_t, weight_t}));
    std::string key;
    if (is3DConv) {
      key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
          ":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
          std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
          std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
          getTensorsStringKey({grad_output_t, weight_t});

    } else {
      key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
          std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
          std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
          getTensorsStringKey({grad_output_t, weight_t});
    }
    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
      auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
      auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@@ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
    MPSStream* stream = getCurrentMPSStream();

    MPSShape* mps_weight_shape = getMPSShape(weight_size);
    std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
                                  is3DConv ? "3d_" : "",
                                  getArrayRefString(stride),
                                  getArrayRefString(dilation),
                                  getArrayRefString(padding),
                                  groups,
                                  getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
    std::string key;
    if (is3DConv) {
      key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
          std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
          std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
          std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
          getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
    } else {
      key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
          std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
          std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
          getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
    }
    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
      MPSShape* inputShape = getMPSShape(input_t);
      bool isDepthwiseConv =
@@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>

@@ -5,6 +5,8 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@@ -1,179 +0,0 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/TensorUtils.h>
#include <ATen/core/Tensor.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/EmbeddingBag.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/EmbeddingBag.h>

#include <fmt/format.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_embedding_bag_forward_only_native.h>
#include <ATen/ops/_embedding_bag_native.h>
#include <ATen/ops/empty.h>
#endif

namespace at::native {

#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/EmbeddingBag_metallib.h>
#endif

namespace {

std::pair<Tensor, Tensor> promoteIndicesAndOffsets(const Tensor& indices, const Tensor& offsets) {
  const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type());
  return {indices.scalar_type() == commonType ? indices : indices.toType(commonType),
          offsets.scalar_type() == commonType ? offsets : offsets.toType(commonType)};
}

} // namespace
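promoteIndicesAndOffsets only converts the tensor whose dtype differs from the common type. A small C++ sketch of that behavior, assuming c10::promoteTypes' usual integer lattice (int32 + int64 promotes to int64):

    #include <ATen/ATen.h>

    // Sketch: mixed int32/int64 indices and offsets end up in one common dtype,
    // mirroring promoteIndicesAndOffsets above.
    void promotion_example() {
      auto indices = at::tensor({0, 1, 2}, at::kInt);
      auto offsets = at::tensor({0, 2}, at::kLong);
      auto common = c10::promoteTypes(offsets.scalar_type(), indices.scalar_type()); // kLong
      auto promoted_indices =
          indices.scalar_type() == common ? indices : indices.toType(common);
      // offsets already match `common`, so they would be returned unchanged.
    }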
namespace mps {

static std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps_impl(
    const Tensor& weight,
    const Tensor& indices_,
    const Tensor& offsets_,
    const bool scale_grad_by_freq,
    const int64_t mode,
    bool sparse,
    const std::optional<Tensor>& per_sample_weights_opt,
    bool include_last_offset,
    int64_t padding_idx) {
  TORCH_CHECK(indices_.dim() == 1, "input has to be a 1D Tensor, but got Tensor of dimension ", indices_.dim());
  if (indices_.dim() == 1) {
    TORCH_CHECK(offsets_.dim() == 1, "offsets has to be a 1D Tensor, but got Tensor of dimension ", offsets_.dim());
  }
  TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim());

  Tensor indices, offsets;
  std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_);
  auto indices_arg = TensorArg(indices, "indices", 1);
  checkScalarTypes("embedding_bag_mps", indices_arg, {kLong, kInt});
  auto offsets_arg = TensorArg(offsets, "offsets", 1);
  checkScalarTypes("embedding_bag_mps", offsets_arg, {kLong, kInt});
  checkSameType("embedding_bag_mps", indices_arg, offsets_arg);
  auto weight_arg = TensorArg(weight, "weight", 1);

  int64_t num_indices = indices.size(0);
  int64_t num_bags = offsets.size(0);
  if (include_last_offset) {
    num_bags -= 1;
  }
  int64_t feature_size = weight.size(1);

  auto bag_size = at::empty(offsets.sizes(), indices.options());
  auto offset2bag = at::empty({indices.size(0)}, indices.options());
  auto output = at::empty({num_bags, feature_size}, weight.options());

  Tensor max_indices;

  if (mode == EmbeddingBagMode::MAX) {
    max_indices = at::empty({num_bags, feature_size}, indices.options());
  } else {
    max_indices = at::empty({0}, indices.options());
  }

  EmbeddingBagParams<uint32_t> params;

  for (const auto dim : c10::irange(weight.dim())) {
    params.weight_strides[dim] = safe_downcast<uint32_t, int64_t>(weight.stride(dim));
    params.output_strides[dim] = safe_downcast<uint32_t, int64_t>(output.stride(dim));

    if (mode == EmbeddingBagMode::MAX) {
      params.max_indices_strides[dim] = safe_downcast<uint32_t, int64_t>(max_indices.stride(dim));
    }
  }

  bool use_per_sample_weights = per_sample_weights_opt.has_value() && per_sample_weights_opt->defined();
  params.per_sample_weights_strides = use_per_sample_weights ? per_sample_weights_opt->stride(0) : 0;

  params.num_indices = num_indices;
  params.num_bags = num_bags;
  params.feature_size = feature_size;
  params.mode = static_cast<EmbeddingBagMode>(mode);
  params.padding_idx = padding_idx;

  auto num_threads = output.numel();
  MPSStream* stream = getCurrentMPSStream();

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
      auto pipeline_state = lib.getPipelineStateForFunc(
          fmt::format("embedding_bag_{}_{}", scalarToMetalTypeString(weight), scalarToMetalTypeString(indices)));

      getMPSProfiler().beginProfileKernel(pipeline_state, "embedding_bag", {weight, indices, offsets});
      [computeEncoder setComputePipelineState:pipeline_state];
      mtl_setArgs(computeEncoder,
                  weight,
                  indices,
                  offsets,
                  use_per_sample_weights ? per_sample_weights_opt : std::nullopt,
                  output,
                  offset2bag,
                  bag_size,
                  max_indices,
                  params);

      mtl_dispatch1DJob(computeEncoder, pipeline_state, num_threads);
      getMPSProfiler().endProfileKernel(pipeline_state);
    }
  });

  return std::tuple<Tensor, Tensor, Tensor, Tensor>(
      std::move(output), std::move(offset2bag), std::move(bag_size), std::move(max_indices));
}

} // namespace mps

std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_mps(const Tensor& weight,
                                                              const Tensor& indices,
                                                              const Tensor& offsets,
                                                              const bool scale_grad_by_freq,
                                                              const int64_t mode,
                                                              bool sparse,
                                                              const std::optional<Tensor>& per_sample_weights_opt,
                                                              bool include_last_offset,
                                                              int64_t padding_idx) {
  return mps::_embedding_bag_mps_impl(weight,
                                      indices,
                                      offsets,
                                      scale_grad_by_freq,
                                      mode,
                                      sparse,
                                      per_sample_weights_opt,
                                      include_last_offset,
                                      padding_idx);
}

std::tuple<Tensor, Tensor, Tensor, Tensor> _embedding_bag_forward_only_mps(
    const Tensor& weight,
    const Tensor& indices,
    const Tensor& offsets,
    const bool scale_grad_by_freq,
    const int64_t mode,
    bool sparse,
    const std::optional<Tensor>& per_sample_weights_opt,
    bool include_last_offset,
    int64_t padding_idx) {
  return _embedding_bag_mps(weight,
                            indices,
                            offsets,
                            scale_grad_by_freq,
                            mode,
                            sparse,
                            per_sample_weights_opt,
                            include_last_offset,
                            padding_idx);
}

} // namespace at::native
@@ -1,4 +1,6 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@@ -10,6 +12,20 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif

#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
  // Redispatch the constructor to the actual implementation
  id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
  return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}

- (nonnull id)copyWithZone:(nullable NSZone*)zone {
  return self;
}
@end
#endif

namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {
@@ -2,6 +2,7 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

@@ -17,6 +17,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>

@@ -6,7 +6,9 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -4,6 +4,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>

@@ -4,6 +4,7 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -5,6 +5,7 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -2,6 +2,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -1,6 +1,7 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS

@@ -1,6 +1,7 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

@@ -4,6 +4,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
@@ -2351,7 +2351,6 @@
  dispatch:
    CPU: _embedding_bag_forward_only_cpu
    CUDA: _embedding_bag_forward_only_cuda
    MPS: _embedding_bag_forward_only_mps
  autogen: _embedding_bag_forward_only.out

- func: _rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
@@ -2373,7 +2372,6 @@
  dispatch:
    CPU: _embedding_bag_cpu
    CUDA: _embedding_bag_cuda
    MPS: _embedding_bag_mps
  autogen: _embedding_bag.out
  tags: core

@@ -2519,7 +2517,7 @@
  dispatch:
    CompositeExplicitAutograd: empty_like
    QuantizedCPU, QuantizedCUDA: empty_like_quantized
    SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
    SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
  autogen: empty_like.out
@@ -4374,7 +4372,7 @@
  variants: function, method
  dispatch:
    CPU: narrow_copy_dense_cpu
    SparseCPU, SparseCUDA, SparseMPS: narrow_copy_sparse
    SparseCPU, SparseCUDA: narrow_copy_sparse
    CompositeExplicitAutogradNonFunctional: narrow_copy_dense_symint
  tags: view_copy

@@ -6494,7 +6492,7 @@
  device_guard: False
  dispatch:
    CompositeExplicitAutograd: unsqueeze
    SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
    SparseCPU, SparseCUDA: unsqueeze_sparse
    QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
  tags: core
@@ -6662,7 +6660,7 @@
- func: zeros.out(SymInt[] size, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CompositeExplicitAutograd: zeros_out
    SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zeros_sparse_out
    SparseCPU, SparseCUDA, SparseMeta: zeros_sparse_out

- func: zeros_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
  dispatch:
@@ -10261,7 +10259,7 @@
  structured_delegate: any.all_out
  variants: method, function
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: any_sparse
    SparseCPU, SparseCUDA: any_sparse
  tags: core

- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -10701,7 +10699,6 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
    CUDA: foreach_tensor_div_list_kernel_cuda
    MTIA: foreach_tensor_div_list_kernel_mtia

- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10709,7 +10706,6 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
    CUDA: foreach_tensor_div_list_kernel_cuda_
    MTIA: foreach_tensor_div_list_kernel_mtia_
  autogen: _foreach_div.List_out

- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
@@ -10733,7 +10729,6 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
    CUDA: foreach_tensor_div_tensor_kernel_cuda
    MTIA: foreach_tensor_div_tensor_kernel_mtia

- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
  device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
@@ -10741,7 +10736,6 @@
  dispatch:
    CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
    CUDA: foreach_tensor_div_tensor_kernel_cuda_
    MTIA: foreach_tensor_div_tensor_kernel_mtia_
  autogen: _foreach_div.Tensor_out

- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
@@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/ceil_div.h>
#include <ATen/native/cuda/Loops.cuh>
#include <c10/cuda/CUDAGuard.h>
@@ -22,11 +21,10 @@
namespace at::native {

namespace {
template <typename T>
__global__ void ChooseQuantizationParamsKernelImpl(
    const int64_t* fake_quant_on,
    const T* x_min,
    const T* x_max,
    const float* x_min,
    const float* x_max,
    int32_t qmin,
    int32_t qmax,
    int size,
@@ -95,44 +93,34 @@ __global__ void ChooseQuantizationParamsKernelImpl(
  }
}

__device__ inline bool isinf_device(float v) {
  return ::isinf(v);
}
__device__ inline bool isinf_device(c10::BFloat16 v) {
  return ::isinf(static_cast<float>(v));
}

// CUDA kernel to compute Moving Average Min/Max of the tensor.
// It uses the running_min and running_max along with averaging const, c.
// The formula used to compute the new min/max is as follows
//
// running_min = (1 - c) * running_min + c * x_min, if running_min != inf
// running_min = x_min, if running_min == inf
template <typename T>
__global__ void MovingAverageMinMax(
    const int64_t* observer_on,
    const T* x_min,
    const T* x_max,
    T* running_min,
    T* running_max,
    const float* x_min,
    const float* x_max,
    float* running_min,
    float* running_max,
    const float averaging_const,
    const int size) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (*observer_on == 1) {
    if (i < size) {
      T curr_min = x_min[i];
      T curr_max = x_max[i];
      float curr_min = x_min[i];
      float curr_max = x_max[i];

      T averaging_const_t = static_cast<T>(averaging_const);
      float adjusted_min = ::isinf(running_min[i])
          ? curr_min
          : (running_min[i]) + averaging_const * (curr_min - (running_min[i]));

      T adjusted_min = isinf_device(running_min[i]) ? curr_min
          : (running_min[i]) +
              averaging_const_t * (curr_min - (running_min[i]));

      T adjusted_max = isinf_device(running_max[i]) ? curr_max
          : (running_max[i]) +
              averaging_const_t * (curr_max - (running_max[i]));
      float adjusted_max = ::isinf(running_max[i])
          ? curr_max
          : (running_max[i]) + averaging_const * (curr_max - (running_max[i]));

      running_min[i] = adjusted_min;
      running_max[i] = adjusted_max;
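A scalar C++ sketch of the per-element update the kernel applies, with the same inf guard described in the comment above (illustrative only; note that (1 - c) * r + c * x equals r + c * (x - r), the form the kernel uses):

    #include <cmath>

    // running = (1 - c) * running + c * current, except the first observation
    // replaces an uninitialized (+/-inf) running value outright.
    float moving_average_update(float running, float current, float c) {
      return std::isinf(running) ? current : running + c * (current - running);
    }
    // e.g. moving_average_update(INFINITY, 0.5f, 0.01f) == 0.5f
    //      moving_average_update(0.5f, 1.5f, 0.01f)    == 0.51f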
@@ -154,51 +142,40 @@ void _calculate_moving_average(
  at::Tensor x_min, x_max;

  int64_t* observer_on_data = observer_on.data_ptr<int64_t>();
  float* running_min_data = running_min.data_ptr<float>();
  float* running_max_data = running_max.data_ptr<float>();
  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();

  if (per_row_fq) {
    std::tie(x_min, x_max) = at::aminmax(x, 1);
    float* x_min_data = x_min.data_ptr<float>();
    float* x_max_data = x_max.data_ptr<float>();
    int num_threads = std::min(size, (int64_t)512);
    const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
    AT_DISPATCH_FLOATING_TYPES_AND(
        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
          scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();

          // Moving Average Min/Max observer for activations
          MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
              observer_on_data,
              x_min_data,
              x_max_data,
              running_min_data,
              running_max_data,
              averaging_const,
              size);
        });
    // Moving Average Min/Max observer for activations
    MovingAverageMinMax<<<num_blocks, num_threads, 0, cuda_stream>>>(
        observer_on_data,
        x_min_data,
        x_max_data,
        running_min_data,
        running_max_data,
        averaging_const,
        size);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    std::tie(x_min, x_max) = at::aminmax(x);
    AT_DISPATCH_FLOATING_TYPES_AND(
        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* x_min_data = x_min.data_ptr<scalar_t>();
          scalar_t* x_max_data = x_max.data_ptr<scalar_t>();

          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();

          // Moving Average Min/Max observer for activations
          MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
              observer_on_data,
              x_min_data,
              x_max_data,
              running_min_data,
              running_max_data,
              averaging_const,
              1 /*size*/);
        });
    float* x_min_data = x_min.data_ptr<float>();
    float* x_max_data = x_max.data_ptr<float>();
    // Moving Average Min/Max observer for activations
    MovingAverageMinMax<<<1, 1, 0, cuda_stream>>>(
        observer_on_data,
        x_min_data,
        x_max_data,
        running_min_data,
        running_max_data,
        averaging_const,
        1 /*size*/);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
@@ -221,44 +198,34 @@ void _calc_moving_avg_qparams_helper(
  cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream();
  int64_t* fake_quant_on_data = fake_quant_on.data_ptr<int64_t>();
  if (per_row_fq) {
    AT_DISPATCH_FLOATING_TYPES_AND(
        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
          int num_threads = std::min(size, (int64_t)512);
          const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
          ChooseQuantizationParamsKernelImpl<<<
              num_blocks,
              num_threads,
              0,
              cuda_stream>>>(
              fake_quant_on_data,
              running_min_data,
              running_max_data,
              qmin,
              qmax,
              size,
              symmetric_quant,
              scale_ptr,
              zp_ptr);
        });
    float* running_min_data = running_min.data_ptr<float>();
    float* running_max_data = running_max.data_ptr<float>();
    int num_threads = std::min(size, (int64_t)512);
    const uint64_t num_blocks = ceil_div<uint64_t>(size, num_threads);
    ChooseQuantizationParamsKernelImpl<<<num_blocks, num_threads, 0, cuda_stream>>>(
        fake_quant_on_data,
        running_min_data,
        running_max_data,
        qmin,
        qmax,
        size,
        symmetric_quant,
        scale_ptr,
        zp_ptr);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND(
        at::kBFloat16, x.scalar_type(), "aminmax_kernel", [&] {
          scalar_t* running_min_data = running_min.data_ptr<scalar_t>();
          scalar_t* running_max_data = running_max.data_ptr<scalar_t>();
          ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
              fake_quant_on_data,
              running_min_data,
              running_max_data,
              qmin,
              qmax,
              1, // size
              symmetric_quant, // preserve_sparsity
              scale_ptr,
              zp_ptr);
        });
    float* running_min_data = running_min.data_ptr<float>();
    float* running_max_data = running_max.data_ptr<float>();
    ChooseQuantizationParamsKernelImpl<<<1, 1, 0, cuda_stream>>>(
        fake_quant_on_data,
        running_min_data,
        running_max_data,
        qmin,
        qmax,
        1, // size
        symmetric_quant, // preserve_sparsity
        scale_ptr,
        zp_ptr);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
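ChooseQuantizationParamsKernelImpl maps a (min, max) range to a scale and zero point. A hedged host-side sketch of the standard affine formulation; the kernel's exact rounding and its symmetric_quant / preserve_sparsity handling may differ:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Standard affine quantization arithmetic: force the range to contain zero,
    // derive scale from the span, and round the zero point into [qmin, qmax].
    void choose_qparams(float min_val, float max_val, int32_t qmin, int32_t qmax,
                        float& scale, int32_t& zero_point) {
      min_val = std::min(min_val, 0.f);
      max_val = std::max(max_val, 0.f);
      scale = (max_val - min_val) / static_cast<float>(qmax - qmin);
      if (scale == 0.f) scale = 1.f; // degenerate all-zero range
      float zp = qmin - min_val / scale;
      zero_point = static_cast<int32_t>(std::lround(
          std::clamp(zp, static_cast<float>(qmin), static_cast<float>(qmax))));
    }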
@@ -42,7 +42,7 @@ TEST(MPSObjCInterfaceTest, MPSCustomKernel) {
  id<MTLLibrary> customKernelLibrary = [device newLibraryWithSource: [NSString stringWithUTF8String:CUSTOM_KERNEL]
                                                            options: nil
                                                              error: &error];
  TORCH_CHECK(customKernelLibrary, "Failed to create custom kernel library, error: ", error.localizedDescription.UTF8String);
  TORCH_CHECK(customKernelLibrary, "Failed to to create custom kernel library, error: ", error.localizedDescription.UTF8String);

  id<MTLFunction> customFunction = [customKernelLibrary newFunctionWithName: @"add_arrays"];
  TORCH_CHECK(customFunction, "Failed to create function state object for the kernel");
@@ -76,23 +76,4 @@ int32_t getGlobalIdxFromDevice(DeviceIndex device) {
return device_global_idxs[device];
}

// Check if a device can access the memory of a peer device directly.
bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer) {
if (device == -1) {
device = c10::xpu::current_device();
}
if (peer == -1) {
peer = c10::xpu::current_device();
}
check_device_index(device);
check_device_index(peer);
// A device can always access itself
if (device == peer) {
return true;
}
return c10::xpu::get_raw_device(device).ext_oneapi_can_access_peer(
    c10::xpu::get_raw_device(peer),
    sycl::ext::oneapi::peer_access::access_supported);
}

} // namespace at::xpu

@@ -17,6 +17,4 @@ TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device);

TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device);

TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer);

} // namespace at::xpu
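
PyTorch exposes the analogous peer-access query for CUDA devices from Python; whether an XPU binding exists is not shown in this diff, so the snippet below uses the CUDA API purely for illustration.

import torch

if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    # Mirrors the C++ logic above: a device can always access itself;
    # otherwise ask the runtime whether peer-to-peer access is supported.
    can_p2p = torch.cuda.can_device_access_peer(0, 1)
    print(f"device 0 -> device 1 peer access: {can_p2p}")
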
@@ -72,12 +72,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
"timm_vovnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)

@@ -55,12 +55,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
"timm_nfnet",
"torchrec_dlrm",
"vgg16",
# LLM
"meta-llama/Llama-3.2-1B",
"google/gemma-2-2b",
"google/gemma-3-4b-it",
"openai/whisper-tiny",
"Qwen/Qwen3-0.6B",
}
)
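
These hunks adjust the flaky-model sets that the CI comparison scripts ignore when checking a run's CSV against the expected results. A rough sketch of how such a check works; the function and column names here are assumed for illustration, not the scripts' exact code.

import csv

def check_status(actual_csv, expected_csv, flaky_models):
    # Compare per-model statuses, skipping models marked flaky.
    def load(path):
        with open(path) as fh:
            return {row["name"]: row["accuracy"] for row in csv.DictReader(fh)}
    actual, expected = load(actual_csv), load(expected_csv)
    return {
        name: (expected[name], status)
        for name, status in actual.items()
        if name in expected and name not in flaky_models and status != expected[name]
    }
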
@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass_due_to_skip,0
google/gemma-2-2b,pass_due_to_skip,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass_due_to_skip,0
Qwen/Qwen3-0.6B,pass_due_to_skip,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
google/gemma-2-2b,pass,0
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
Qwen/Qwen3-0.6B,pass,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_accuracy,0
openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0
YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,5
google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,5

@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0
@@ -733,7 +733,7 @@ def timed(

time_total = 0
# Dont collect outputs to correctly measure timing
for i in range(times):
for _ in range(times):
# If batch_size is 1, it too often collides with other non batch size
# dimensions resulting in errors.
if batch_size and batch_size > 1:

@@ -1106,13 +1106,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
elif args.torchscript_jit_trace:
frozen_model_iter_fn = torchscript_jit_trace(model, example_inputs)
else:
if kwargs["hf_llm"]:
    # If it's an llm, we want to optimize model.forward, and use
    # the generate function
    model.forward = torch._dynamo.run(model)
    frozen_model_iter_fn = model_iter_fn
else:
    frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)
frozen_model_iter_fn = torch._dynamo.run(model_iter_fn)

for rep in trange(args.repeat, desc="running benchmark"):
inputs = (
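
The speedup_experiment hunk above shows two variants side by side: one path wraps model_iter_fn directly with torch._dynamo.run, the other optimizes only the forward pass and drives it through the HuggingFace generate loop. A hedged Python sketch of the latter pattern; model and example_inputs are placeholders, and this variant wraps model.forward rather than the module itself.

import torch

def run_llm(model, example_inputs):
    # Replay previously compiled graphs for the forward pass without
    # triggering new compilation, while the decoding loop stays in generate().
    model.forward = torch._dynamo.run(model.forward)
    with torch.no_grad():
        return model.generate(**example_inputs)
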
@@ -1126,10 +1120,7 @@ def speedup_experiment(args, model_iter_fn, model, example_inputs, **kwargs):
maybe_mark_step(args)

# interleave the runs to handle frequency scaling and load changes
with (
    maybe_mark_profile(p=p, mark="expected"),
    torch.compiler.set_stance("force_eager"),
):
with maybe_mark_profile(p=p, mark="expected"):
timings[rep, 0], expected_output = timed(
    model,
    model_iter_fn,
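
torch.compiler.set_stance("force_eager"), which appears in the hunk above, temporarily forces torch.compile'd functions to run eagerly, so the baseline timing exercises the same call path as the compiled run. A minimal sketch of the context-manager form; fn is a placeholder, and the API is available in recent PyTorch releases.

import torch

@torch.compile
def fn(x):
    return torch.sin(x) + torch.cos(x)

x = torch.randn(8)
fn(x)  # compiled execution

with torch.compiler.set_stance("force_eager"):
    fn(x)  # same call, but runs eagerly for a fair baseline measurement
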
@@ -2242,12 +2233,11 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
with torch.compiler.set_stance("force_eager"):
    model_copy = self.deepcopy_and_maybe_parallelize(model)
    self.init_optimizer(name, current_device, model_copy.parameters())
    correct_result = self.run_n_iterations(
        model_copy, clone_inputs(example_inputs), self.model_iter_fn
    )
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
    model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_1st_run_OOM"

@@ -2264,12 +2254,11 @@ class BenchmarkRunner:
reset_rng_state()
model_copy = None
try:
with torch.compiler.set_stance("force_eager"):
    model_copy = self.deepcopy_and_maybe_parallelize(model)
    self.init_optimizer(name, current_device, model_copy.parameters())
    correct_rerun_result = self.run_n_iterations(
        model_copy, clone_inputs(example_inputs), self.model_iter_fn
    )
model_copy = self.deepcopy_and_maybe_parallelize(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
    model_copy, clone_inputs(example_inputs), self.model_iter_fn
)
except Exception as e:
accuracy_status = (
"eager_2nd_run_OOM"

@@ -2553,11 +2542,7 @@ class BenchmarkRunner:
)

baseline_timings = experiment(
    self.model_iter_fn,
    model,
    example_inputs,
    mark="expected",
    **experiment_kwargs,
    model, example_inputs, mark="expected", **experiment_kwargs
)

if self.args.export_aot_inductor:

@@ -2625,11 +2610,7 @@ class BenchmarkRunner:
)

backend_timings = experiment(
    self.model_iter_fn,
    model,
    example_inputs,
    mark="expected",
    **experiment_kwargs,
    model, example_inputs, mark="expected", **experiment_kwargs
)
timings = np.stack((baseline_timings, backend_timings), axis=1)
result_summary = latency_experiment_summary(

@@ -2648,17 +2629,9 @@ class BenchmarkRunner:
tag=None,
batch_size=None,
):
niters = 5
if getattr(self, "hf_llm", False):
    # If we're benchmarking an llm, we want to use the generate function
    self.model_iter_fn = self.generate
    niters = 1

if self.args.xla:
with self.pick_grad(name, self.args.training):
return experiment(
    self.model_iter_fn, *self.maybe_cast(model, example_inputs)
)
return experiment(*self.maybe_cast(model, example_inputs))

def warmup(fn, model, example_inputs, mode, niters=5):
gc.collect()

@@ -2723,22 +2696,17 @@ class BenchmarkRunner:
with maybe_snapshot_memory(
    self.args.snapshot_memory, f"eager_{self.args.only}"
):
with torch.compiler.set_stance("force_eager"):
    eager_latency, eager_peak_mem, _ = warmup(
eager_latency, eager_peak_mem, _ = warmup(
    self.model_iter_fn, copy.deepcopy(model), example_inputs, "eager"
)
if self.args.use_warm_peak_memory:
    _, eager_peak_mem, _ = warmup(
        self.model_iter_fn,
        copy.deepcopy(model),
        example_inputs,
        "eager",
        niters=niters,
        niters=1,
    )
if self.args.use_warm_peak_memory:
    _, eager_peak_mem, _ = warmup(
        self.model_iter_fn,
        copy.deepcopy(model),
        example_inputs,
        "eager",
        niters=1,
    )

if (
self.args.export_aot_inductor

@@ -2747,13 +2715,7 @@ class BenchmarkRunner:
):
optimized_model_iter_fn = optimize_ctx
else:
if getattr(self, "hf_llm", False):
    # If it's an llm, we want to optimize model.forward, and use
    # the generate function
    model = optimize_ctx(model)
    optimized_model_iter_fn = self.model_iter_fn
else:
    optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)
optimized_model_iter_fn = optimize_ctx(self.model_iter_fn)

with maybe_snapshot_memory(
    self.args.snapshot_memory, f"compiled_{self.args.only}"

@@ -2831,13 +2793,7 @@ class BenchmarkRunner:
f"{ok:3}/{total:3} +{frames_third_pass} frames {compilation_time:3.0f}s"
)

experiment_kwargs["hf_llm"] = getattr(self, "hf_llm", False)

results.append(
    experiment(
        self.model_iter_fn, model, example_inputs, **experiment_kwargs
    )
)
results.append(experiment(model, example_inputs, **experiment_kwargs))
return " ".join(map(str, results))

def minify_model(

@@ -4128,7 +4084,7 @@ def run(runner, args, original_dir=None):
# Overwrite 'translation_validation' config, if specified.
torch.fx.experimental._config.translation_validation = False

experiment = functools.partial(experiment, args)
experiment = functools.partial(experiment, args, runner.model_iter_fn)

if args.only and should_diff_branch(args):
import git

@@ -7,7 +7,6 @@ import os
import re
import subprocess
import sys
import types
import warnings

@@ -129,12 +128,6 @@ with open(MODELS_FILENAME) as fh:
assert len(BATCH_SIZE_KNOWN_MODELS)

try:
    from .huggingface_llm_models import HF_LLM_MODELS
except ImportError:
    from huggingface_llm_models import HF_LLM_MODELS

def get_module_cls_by_model_name(model_cls_name):
_module_by_model_name = {
"Speech2Text2Decoder": "transformers.models.speech_to_text_2.modeling_speech_to_text_2",

@@ -425,8 +418,11 @@ class HuggingfaceRunner(BenchmarkRunner):
use_eval_mode = self.args.use_eval_mode
dtype = torch.float32
reset_rng_state()

# Get batch size
model_cls, config = self._get_model_cls_and_config(model_name)
model = self._download_model(model_name)
model = model.to(device, dtype=dtype)
if self.args.enable_activation_checkpointing:
    model.gradient_checkpointing_enable()
if model_name in BATCH_SIZE_KNOWN_MODELS:
batch_size_default = BATCH_SIZE_KNOWN_MODELS[model_name]
elif batch_size is None:

@@ -444,46 +440,14 @@ class HuggingfaceRunner(BenchmarkRunner):
f"Running smaller batch size={batch_size} for {model_name}, orig batch_size={batch_size_default}"  # noqa: G004
)

# Get model and example inputs
if model_name in HF_LLM_MODELS:
    benchmark_cls = HF_LLM_MODELS[model_name]
    model, example_inputs = benchmark_cls.get_model_and_inputs(
        model_name, device
    )
example_inputs = generate_inputs_for_model(
    model_cls, model, model_name, batch_size, device, include_loss_args=True
)

    # Set this flag so that when we test for speedup, we use
    # model.generate instead of using model.forward
    self.hf_llm = True

    def generate(self, _, example_inputs, collect_outputs=True):
        return model.generate(**example_inputs)

    self.generate = types.MethodType(generate, self)

else:
    self.hf_llm = False

    model_cls, config = self._get_model_cls_and_config(model_name)
    model = self._download_model(model_name)
    model = model.to(device, dtype=dtype)

    example_inputs = generate_inputs_for_model(
        model_cls, model, model_name, batch_size, device, include_loss_args=True
    )

# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)

# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False

if self.args.enable_activation_checkpointing:
model.gradient_checkpointing_enable()
# So we can check for correct gradients without eliminating the dropout computation
for attr in dir(config):
if "drop" in attr and isinstance(getattr(config, attr), float):
setattr(config, attr, 1e-30)

if (
is_training

@@ -496,6 +460,12 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()

# Turning off kv cache for torchbench models. This is not the right
# thing to do, but the pt2 dashboard is outdated. Real transformers
# benchmarks will be added soon using a different infra.
if hasattr(model, "config") and hasattr(model.config, "use_cache"):
model.config.use_cache = False

self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size
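
One detail worth noting in the load_model hunk above: the runner binds a per-instance generate method with types.MethodType so the harness can call self.generate with the same signature as its other iteration functions. A standalone sketch of that binding pattern; the class and names here are illustrative.

import types

class Runner:
    def __init__(self, model):
        self.model = model

def generate(self, _, example_inputs, collect_outputs=True):
    # Bound per instance: the harness calls runner.generate(model, inputs)
    # like any model_iter_fn, but we delegate to the module's generate().
    return self.model.generate(**example_inputs)

runner = Runner(model=None)  # placeholder model for illustration
runner.generate = types.MethodType(generate, runner)
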
@@ -560,8 +530,7 @@ class HuggingfaceRunner(BenchmarkRunner):

def forward_pass(self, mod, inputs, collect_outputs=True):
with self.autocast(**self.autocast_arg):
res = mod(**inputs)
return res.logits if self.hf_llm else res
return mod(**inputs)

def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
cloned_inputs = clone_inputs(inputs)

@@ -9,16 +9,9 @@ skip:
# Fails with even batch size = 1
- GPTJForCausalLM
- GPTJForQuestionAnswering
# Model too big
- google/gemma-3-4b-it

device:
cpu:
- meta-llama/Llama-3.2-1B
- google/gemma-2-2b
- google/gemma-3-4b-it
- openai/whisper-tiny
- Qwen/Qwen3-0.6B
cpu: []

control_flow:
- AllenaiLongformerBase

@@ -74,11 +67,6 @@ batch_size:
XGLMForCausalLM: 4
XLNetLMHeadModel: 2
YituTechConvBert: 2
meta-llama/Llama-3.2-1B: 8
google/gemma-2-2b: 8
google/gemma-3-4b-it: 8
openai/whisper-tiny: 8
Qwen/Qwen3-0.6B: 8

tolerance:

@@ -1,102 +0,0 @@
import subprocess
import sys

import torch


def pip_install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


try:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )
except ModuleNotFoundError:
    print("Installing HuggingFace Transformers...")
    pip_install("git+https://github.com/huggingface/transformers.git#egg=transformers")
finally:
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        WhisperForConditionalGeneration,
        WhisperProcessor,
    )


class Benchmark:
    @staticmethod
    def get_model_and_inputs(model_name, device):
        raise NotImplementedError("get_model_and_inputs() not implemented")


class WhisperBenchmark(Benchmark):
    SAMPLE_RATE = 16000
    DURATION = 30.0  # seconds

    @staticmethod
    def get_model_and_inputs(model_name, device):
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
        model.config.forced_decoder_ids = None

        model.generation_config.do_sample = False
        model.generation_config.temperature = 0.0

        num_samples = int(WhisperBenchmark.DURATION * WhisperBenchmark.SAMPLE_RATE)
        audio = torch.randn(num_samples) * 0.1
        inputs = dict(
            processor(
                audio, sampling_rate=WhisperBenchmark.SAMPLE_RATE, return_tensors="pt"
            )
        )
        inputs["input_features"] = inputs["input_features"].to(device)

        decoder_start_token = model.config.decoder_start_token_id
        inputs["decoder_input_ids"] = torch.tensor(
            [[decoder_start_token]], device=device
        )

        return model, inputs


class TextGenerationBenchmark(Benchmark):
    INPUT_LENGTH = 1000
    OUTPUT_LENGTH = 2000

    @staticmethod
    def get_model_and_inputs(model_name, device):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
        model.eval()

        model.generation_config.do_sample = False
        model.generation_config.use_cache = True
        model.generation_config.cache_implementation = "static"
        model.generation_config.max_new_tokens = TextGenerationBenchmark.OUTPUT_LENGTH
        model.generation_config.pad_token_id = tokenizer.eos_token_id
        model.generation_config.temperature = 0.0

        vocab_size = tokenizer.vocab_size
        input_ids = torch.randint(
            low=0,
            high=vocab_size,
            size=(1, TextGenerationBenchmark.INPUT_LENGTH),
            device=device,
            dtype=torch.long,
        )
        example_inputs = {"input_ids": input_ids}

        return model, example_inputs


HF_LLM_MODELS: dict[str, Benchmark] = {
    "meta-llama/Llama-3.2-1B": TextGenerationBenchmark,
    "google/gemma-2-2b": TextGenerationBenchmark,
    "google/gemma-3-4b-it": TextGenerationBenchmark,
    "openai/whisper-tiny": WhisperBenchmark,
    "Qwen/Qwen3-0.6B": TextGenerationBenchmark,
}
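
For reference, the huggingface_llm_models.py module shown in the hunk above is consumed roughly as follows; a hedged usage sketch, with the device choice and printing purely illustrative.

import torch
from huggingface_llm_models import HF_LLM_MODELS  # the module in the hunk above

device = "cuda" if torch.cuda.is_available() else "cpu"
benchmark_cls = HF_LLM_MODELS["Qwen/Qwen3-0.6B"]
model, example_inputs = benchmark_cls.get_model_and_inputs("Qwen/Qwen3-0.6B", device)
with torch.no_grad():
    output_ids = model.generate(**example_inputs)
print(output_ids.shape)
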
@@ -46,8 +46,3 @@ TrOCRForCausalLM,64
XGLMForCausalLM,32
XLNetLMHeadModel,16
YituTechConvBert,32
meta-llama/Llama-3.2-1B,8
google/gemma-2-2b,8
google/gemma-3-4b-it,8
openai/whisper-tiny,8
Qwen/Qwen3-0.6B,8
@@ -61,22 +61,6 @@ struct C10_API Storage {
allocator,
resizable)) {}

// Creates storage with pre-allocated memory buffer. Allocator is given for
// potential future reallocations, however it can be nullptr if the storage
// is non-resizable
Storage(
    use_byte_size_t /*use_byte_size*/,
    SymInt size_bytes,
    at::DataPtr data_ptr,
    at::Allocator* allocator = nullptr,
    bool resizable = false)
    : storage_impl_(c10::make_intrusive<StorageImpl>(
          StorageImpl::use_byte_size_t(),
          std::move(size_bytes),
          std::move(data_ptr),
          allocator,
          resizable)) {}

protected:
explicit Storage(unsafe_borrow_t, const Storage& rhs)
    : storage_impl_(c10::intrusive_ptr<c10::StorageImpl>::reclaim(
@@ -10,9 +10,9 @@ namespace c10::cuda {

void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const uint32_t line_number,
const char* /*filename*/,
const char* /*function_name*/,
const int /*line_number*/,
const bool include_device_assertions) {
const auto cuda_error = static_cast<cudaError_t>(err);
const auto cuda_kernel_failure = include_device_assertions

@@ -41,7 +41,7 @@ void c10_cuda_check_implementation(
}
#endif
throw c10::AcceleratorError(
{function_name, filename, line_number}, err, check_message);
{__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
}

} // namespace c10::cuda

@@ -91,7 +91,7 @@ C10_CUDA_API void c10_cuda_check_implementation(
const int32_t err,
const char* filename,
const char* function_name,
const uint32_t line_number,
const int line_number,
const bool include_device_assertions);

} // namespace c10::cuda
@@ -1,22 +0,0 @@
// Shim header for filesystem for compilers that are too old to have it not
// in the experimental namespace

#if __has_include(<filesystem>)
#include <filesystem>
#elif __has_include(<experimental/filesystem>)
#include <experimental/filesystem>
#else
#error "Neither <filesystem> nor <experimental/filesystem> is available."
#endif

namespace c10 {

#if __has_include(<filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::filesystem;
#elif __has_include(<experimental/filesystem>)
// NOLINTNEXTLINE(misc-unused-alias-decls)
namespace filesystem = std::experimental::filesystem;
#endif

} // namespace c10
@@ -52,7 +52,7 @@ struct maybe_bool {
template <typename src_t>
struct maybe_bool<true, src_t> {
C10_HOST_DEVICE static inline decltype(auto) apply(src_t src) {
// Don't use bool operator so as to also compile for ComplexHalf.
// Don't use bool operator so as to to also compile for ComplexHalf.
return src.real() || src.imag();
}
};
Some files were not shown because too many files have changed in this diff.