mirror of https://github.com/pytorch/pytorch.git
synced 2025-11-11 14:28:03 +08:00
Compare commits
128 Commits
v2.6.0-rc7...gh/mhorowi
| SHA1 | Author | Date | |
|---|---|---|---|
| 515abb7744 | |||
| 8621b9ff0c | |||
| 4e0de50eb5 | |||
| f406207af2 | |||
| ad2faec8bb | |||
| b29fc52f82 | |||
| bb574abe73 | |||
| d25e6e623f | |||
| e19f493f02 | |||
| 8fae4397b4 | |||
| 8a04018329 | |||
| 571cd92d7c | |||
| 60c54467db | |||
| 0d6d29af38 | |||
| 65d0a25289 | |||
| 52f31cc238 | |||
| e87f07d3b8 | |||
| 625b4edb97 | |||
| fe9365f3f5 | |||
| 8f40446770 | |||
| 1ebdfd5605 | |||
| f1ff8bc1c5 | |||
| 9d05c8110d | |||
| bf711a9cce | |||
| 6178be822d | |||
| 6bcda3a21a | |||
| b472d82c96 | |||
| 63e1f97f4b | |||
| e0c8abda76 | |||
| 23b8ea3094 | |||
| 82a45d19b4 | |||
| 3f62054de1 | |||
| 7968732f5b | |||
| da67a6a7bb | |||
| fbfc530442 | |||
| 04bb82f097 | |||
| 810808d97d | |||
| 3e1f587514 | |||
| 9f90583ca2 | |||
| c37185c76a | |||
| 075905b7bd | |||
| 72fd7abb35 | |||
| b4f4c75e19 | |||
| b5d8d2444a | |||
| b7ad52abb0 | |||
| 57c46af47a | |||
| b731ced91f | |||
| ceb664aca6 | |||
| ab04f3aee1 | |||
| dbe4b69df0 | |||
| 9f5ebf3fc6 | |||
| 2533a5a843 | |||
| fb93462904 | |||
| 602c86a420 | |||
| a7509e98c5 | |||
| 39cacc1d81 | |||
| 82ce888273 | |||
| 0b75b7ff2b | |||
| c170248b78 | |||
| e3fe5f62b6 | |||
| d48b16a725 | |||
| b0c3d39e0d | |||
| ee5bceaee6 | |||
| 5dabe2d464 | |||
| d47a80246a | |||
| 7edeb1005a | |||
| c85323c5e8 | |||
| 2f0fe82f6d | |||
| dc23f1944a | |||
| 7667235a23 | |||
| 520ba556cd | |||
| cf538efd0c | |||
| 15ee2960e1 | |||
| 30b61e521c | |||
| e3ddc0ca33 | |||
| 0f78be5573 | |||
| 725526abc5 | |||
| d83a049232 | |||
| 7cc3a591c2 | |||
| 84f791381a | |||
| cd1b5924d5 | |||
| 30e2b322a1 | |||
| 91261107e0 | |||
| 18785c1af9 | |||
| a5fb07af27 | |||
| 497f89ff83 | |||
| da76e912a4 | |||
| dcb128d495 | |||
| 5ad7d5304c | |||
| 357e261b1e | |||
| 9701c50bdc | |||
| b25f64b613 | |||
| 79cf8fa751 | |||
| 1e2b841675 | |||
| fda43c98d1 | |||
| 20df80a669 | |||
| f7b9533c3f | |||
| fbbafd0320 | |||
| 4d0775462e | |||
| 2903cf0ad8 | |||
| f892f9862a | |||
| 4d8357e912 | |||
| cb354f8b47 | |||
| 06075d3d18 | |||
| d68403df3b | |||
| 6cb6e8d790 | |||
| 1dd6f21029 | |||
| bd7d81db9e | |||
| db81a3f31c | |||
| 1b3f8b7589 | |||
| dfe5669076 | |||
| cd50bd8477 | |||
| de313f1155 | |||
| bd199bc754 | |||
| 688f44824b | |||
| fd65bd755d | |||
| 0b96413dbf | |||
| e5f08c0cbf | |||
| e228381846 | |||
| 42d4eec5f3 | |||
| e647b6d590 | |||
| 0ddb33ba22 | |||
| c632e29774 | |||
| a8fa98ccef | |||
| 24a5a2ef25 | |||
| be27dbf2b8 | |||
| fb02b40d27 | |||
| 82aaf64422 |
@@ -9,7 +9,7 @@ install_ubuntu() {
 # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
 apt-get install -y cargo
 echo "Checking out sccache repo"
-git clone https://github.com/mozilla/sccache -b v0.8.2
+git clone https://github.com/mozilla/sccache -b v0.9.0
 cd sccache
 echo "Building sccache"
 cargo build --release

@@ -30,10 +30,10 @@ dill==0.3.7
 #Pinned versions: 0.3.7
 #test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py

-expecttest==0.2.1
+expecttest==0.3.0
 #Description: method for writing tests where test framework auto populates
 # the expected output based on previous runs
-#Pinned versions: 0.2.1
+#Pinned versions: 0.3.0
 #test that import:

 fbscribelogger==0.1.7
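
Since the pin bump touches expecttest, here is a minimal sketch of the "framework auto-populates the expected output" workflow described in the comment above; the test class and values are illustrative, not taken from the diff.

```python
# Run once with EXPECTTEST_ACCEPT=1 to record the inline expectation, then
# normally to compare against it (assumes expecttest>=0.3.0 is installed).
import unittest
import expecttest


class TestFormatting(expecttest.TestCase):
    def test_repr(self) -> None:
        # On an "accept" run the framework rewrites the expected string below
        # in place with the actual value; later runs assert equality with it.
        self.assertExpectedInline(str(sorted({3, 1, 2})), """[1, 2, 3]""")


if __name__ == "__main__":
    unittest.main()
```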
@@ -280,9 +280,9 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 #test that import:

 #lintrunner is supported on aarch64-linux only from 0.12.4 version
-lintrunner==0.12.5
+lintrunner==0.12.7
 #Description: all about linters!
-#Pinned versions: 0.12.5
+#Pinned versions: 0.12.7
 #test that import:

 redis>=4.0.0

@@ -247,7 +247,7 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
 fi

 if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
-set -e
+set -e -o pipefail

 get_bazel

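This hunk, and several of the ones below, appends `-o pipefail` to the existing `set` flags. The motivation, shown from Python for convenience (illustrative only, not part of the diff): without pipefail a pipeline's exit status is that of its last command, so an early failure is silently masked.

```python
import subprocess

# The failing `false` is hidden by the succeeding `cat` unless pipefail is on.
for flags in ([], ["-o", "pipefail"]):
    out = subprocess.run(
        ["bash", *flags, "-c", "false | cat; echo pipeline status: $?"],
        capture_output=True, text=True,
    ).stdout
    print(flags or "default", "->", out.strip())
# default        -> pipeline status: 0
# ['-o', 'pipefail'] -> pipeline status: 1
```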
@@ -3,7 +3,7 @@
 # Common setup for all Jenkins scripts
 # shellcheck source=./common_utils.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-set -ex
+set -ex -o pipefail

 # Required environment variables:
 # $BUILD_ENVIRONMENT (should be set by your Docker image)

@@ -160,7 +160,7 @@ function install_torchvision() {
 }

 function install_tlparse() {
-pip_install --user "tlparse==0.3.25"
+pip_install --user "tlparse==0.3.30"
 PATH="$(python -m site --user-base)/bin:$PATH"
 }

@@ -40,7 +40,7 @@ echo "Building PyTorch C++ API docs..."
 rm -rf cppdocs
 git clone https://github.com/pytorch/cppdocs

-set -ex
+set -ex -o pipefail

 # Generate ATen files
 pushd "${pt_checkout}"

@@ -5,7 +5,7 @@ pt_checkout="/var/lib/jenkins/workspace"
 source "$pt_checkout/.ci/pytorch/common_utils.sh"
 echo "functorch_doc_push_script.sh: Invoked with $*"

-set -ex
+set -ex -o pipefail

 version=${DOCS_VERSION:-nightly}
 echo "version: $version"

@@ -6,7 +6,7 @@
 # return the same thing, ex checks for for rocm, CUDA, and changing the path
 # where sccache is installed, and not changing /etc/environment.

-set -ex
+set -ex -o pipefail

 install_binary() {
 echo "Downloading sccache binary from S3 repo"

@@ -7,7 +7,7 @@ source "$pt_checkout/.ci/pytorch/common_utils.sh"

 echo "python_doc_push_script.sh: Invoked with $*"

-set -ex
+set -ex -o pipefail

 # for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
 # the order of operations goes:

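For readers less used to nested bash defaults, the precedence the comment above introduces for `${1:-${DOCS_INSTALL_PATH:-docs/}}` can be mirrored in Python like this (illustrative only, not part of the diff):

```python
import os
import sys

# Same precedence as ${1:-${DOCS_INSTALL_PATH:-docs/}}: use the first positional
# argument if set and non-empty, else $DOCS_INSTALL_PATH if set and non-empty,
# else the literal default "docs/".
arg1 = sys.argv[1] if len(sys.argv) > 1 else ""
install_path = arg1 or os.environ.get("DOCS_INSTALL_PATH", "") or "docs/"
print(install_path)
```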
@@ -63,7 +63,7 @@ build_docs () {
 echo "(tried to echo the WARNINGS above the ==== line)"
 echo =========================
 fi
-set -ex
+set -ex -o pipefail
 return $code
 }

@@ -180,7 +180,7 @@ def smoke_test_cuda(
 # torch.compile is available on macos-arm64 and Linux for python 3.8-3.13
 if (
 torch_compile_check == "enabled"
-and sys.version_info < (3, 13, 0)
+and sys.version_info < (3, 14, 0)
 and target_os in ["linux", "linux-aarch64", "macos-arm64", "darwin"]
 ):
 smoke_test_compile("cuda" if torch.cuda.is_available() else "cpu")

@@ -4,7 +4,7 @@
 # (This is set by default in the Docker images we build, so you don't
 # need to set it yourself.

-set -ex
+set -ex -o pipefail

 # Suppress ANSI color escape sequences
 export TERM=vt100

@@ -313,6 +313,7 @@ test_dynamo_wrapped_shard() {
--exclude-jit-executor \
--exclude-distributed-tests \
--exclude-torch-export-tests \
--exclude-aot-dispatch-tests \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose \
--upload-artifacts-while-running

@@ -1243,7 +1244,7 @@ EOF
 }

 test_bazel() {
-set -e
+set -e -o pipefail

 # bazel test needs sccache setup.
 # shellcheck source=./common-build.sh

@@ -38,7 +38,7 @@ if [[ $PYLONG_API_CHECK == 0 ]]; then
 echo "PyLong_AsUnsignedLong -> THPUtils_unpackUInt32 / THPUtils_unpackUInt64"
 exit 1
 fi
-set -ex
+set -ex -o pipefail

 "$SCRIPT_HELPERS_DIR"/build_pytorch.bat

@@ -1,5 +1,5 @@
 #!/bin/bash
-set -ex
+set -ex -o pipefail

 SCRIPT_PARENT_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
 # shellcheck source=./common.sh

@@ -41,7 +41,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
 python -m pip install z3-solver==4.12.2.0

 # Install tlparse for test\dynamo\test_structured_trace.py UTs.
-python -m pip install tlparse==0.3.25
+python -m pip install tlparse==0.3.30

 # Install parameterized
 python -m pip install parameterized==0.8.1

@@ -173,8 +173,10 @@ conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_p
 source activate "$tmp_env_name"

 pip install -q "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests
-retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq llvm-openmp=14.0.6 cmake ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions
 retry pip install -qr "${pytorch_rootdir}/requirements.txt" || true
+# TODO : Remove me later (but in the interim, use Anaconda cmake, to find Anaconda installed OpenMP)
+retry pip uninstall -y cmake
+retry conda install ${EXTRA_CONDA_INSTALL_FLAGS} -yq llvm-openmp=14.0.6 cmake ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing_extensions

 # For USE_DISTRIBUTED=1 on macOS, need libuv and pkg-config to find libuv.
 export USE_DISTRIBUTED=1

@@ -75,9 +75,8 @@ export PYTORCH_BUILD_NUMBER=1
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
-TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
+TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-# Only linux Python < 3.13 are supported wheels for triton
 TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
 if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
 TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)

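TRITON_CONSTRAINT is a PEP 508 environment marker that pip evaluates on the installing machine, so the appended requirement only applies where the marker is true. A small illustration, assuming the third-party `packaging` library is available (the triton version below is a placeholder, not taken from this diff):

```python
from packaging.markers import Marker

# The relaxed constraint from the hunk above: no python_version cap any more.
constraint = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
requirement = f"triton==3.1.0; {constraint}"  # version is a placeholder
print(requirement)
print("marker matches this machine:", constraint.evaluate())  # True only on x86_64 Linux
```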
@@ -1,6 +1,6 @@
 boto3==1.35.42
 hypothesis==6.56.4
-expecttest==0.2.1
+expecttest==0.3.0
 fbscribelogger==0.1.7
 librosa>=0.6.2
 mpmath==1.3.0

.github/scripts/lintrunner.sh (vendored, 2 changes)

@@ -19,7 +19,7 @@ fi

 # if lintrunner is not installed, install it
 if ! command -v lintrunner &> /dev/null; then
-python3 -m pip install lintrunner==0.12.5
+python3 -m pip install lintrunner==0.12.7
 fi

 # This has already been cached in the docker image

.github/workflows/_binary-build-linux.yml (vendored, 21 changes)

@@ -206,21 +206,6 @@ jobs:
 git clean -fxd
 working-directory: pytorch

-- name: Checkout pytorch/builder to builder dir
-uses: malfet/checkout@silent-checkout
-with:
-ref: main
-submodules: recursive
-repository: pytorch/builder
-path: builder
-quiet-checkout: true
-
-- name: Clean pytorch/builder checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: builder
-
 - name: Check if the job is disabled
 id: filter
 uses: ./pytorch/.github/actions/filter-test-configs

@@ -246,7 +231,6 @@ jobs:
 mkdir -p artifacts/
 container_name=$(docker run \
 -e BINARY_ENV_FILE \
--e BUILDER_ROOT \
 -e BUILD_ENVIRONMENT \
 -e DESIRED_CUDA \
 -e DESIRED_DEVTOOLSET \

@@ -264,7 +248,6 @@ jobs:
 --tty \
 --detach \
 -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
--v "${GITHUB_WORKSPACE}/builder:/builder" \
 -v "${RUNNER_TEMP}/artifacts:/artifacts" \
 -w / \
 "${DOCKER_IMAGE}"

@@ -272,10 +255,8 @@ jobs:
 docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
 if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
 docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
-elif [[ ${{ inputs.PACKAGE_TYPE }} == "manywheel" || ${{ inputs.PACKAGE_TYPE }} == "libtorch" ]]; then
-docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
 else
-docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/${{ inputs.PACKAGE_TYPE }}/build.sh"
+docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
 fi

 - name: Chown artifacts

.github/workflows/build-libtorch-images.yml (vendored, 2 changes)

@@ -87,7 +87,7 @@ jobs:
 runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
 strategy:
 matrix:
-rocm_version: ["6.1", "6.2.4"]
+rocm_version: ["6.2.4", "6.3"]
 env:
 GPU_ARCH_TYPE: rocm
 GPU_ARCH_VERSION: ${{ matrix.rocm_version }}

.github/workflows/build-manywheel-images.yml (vendored, 2 changes)

@@ -178,7 +178,7 @@ jobs:
 runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
 strategy:
 matrix:
-rocm_version: ["6.1", "6.2.4"]
+rocm_version: ["6.2.4", "6.3"]
 env:
 GPU_ARCH_TYPE: rocm-manylinux_2_28
 GPU_ARCH_VERSION: ${{ matrix.rocm_version }}

.github/workflows/build-triton-wheel.yml (vendored, 5 changes)

@@ -44,7 +44,7 @@ jobs:
 strategy:
 fail-fast: false
 matrix:
-py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
+py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
 device: ["cuda", "rocm", "xpu"]
 docker-image: ["pytorch/manylinux-builder:cpu", "pytorch/manylinux2_28-builder:cpu"]
 exclude:

@@ -114,6 +114,9 @@ jobs:
 3.13)
 PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python
 ;;
+3.13t)
+PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python
+;;
 *)
 echo "Unsupported python version ${PY_VERS}"
 exit 1

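The new "3.13t" matrix entry above maps to the free-threaded CPython 3.13 build (the cp313t ABI tag). A hedged way to check at runtime which variant is running; Py_GIL_DISABLED is assumed to be the build-time flag that 3.13 exposes for this, and the snippet is illustrative only:

```python
import sys
import sysconfig

# On CPython 3.13+, this config var is 1 for the free-threaded ("t") build
# and 0/None for the regular GIL build.
print(sys.version)
print("free-threaded build:", bool(sysconfig.get_config_var("Py_GIL_DISABLED")))
```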
.github/workflows/docker-release.yml (vendored, 2 changes)

@@ -165,7 +165,7 @@ jobs:

 validate:
 needs: build
-uses: pytorch/builder/.github/workflows/validate-docker-images.yml@main
+uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main
 with:
 channel: nightly
 ref: main

.github/workflows/lint.yml (vendored, 6 changes)

@@ -207,8 +207,8 @@ jobs:
 conda activate "${CONDA_ENV}"

 # Test tools
-PYTHONPATH=$(pwd) pytest tools/test/test_*.py
-PYTHONPATH=$(pwd) pytest .github/scripts/test_*.py
+PYTHONPATH=$(pwd) pytest tools/test -o "python_files=test*.py"
+PYTHONPATH=$(pwd) pytest .github/scripts -o "python_files=test*.py"

 test_run_test:
 name: Test `run_test.py` is usable without boto3

@@ -229,7 +229,7 @@ jobs:
 - name: Install dependencies
 run: |
 python3 -m pip install --upgrade pip
-pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
+pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.3.* fbscribelogger==0.1.* numpy==1.24.*
 pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
 - name: Run run_test.py (nonretryable)
 run: |

.github/workflows/linux-aarch64.yml (vendored, 3 changes)

@@ -41,6 +41,9 @@ jobs:
 { config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" },
 { config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" },
 { config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" },
+{ config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
+{ config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
+{ config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
 ]}
 secrets: inherit

.github/workflows/pull.yml (vendored, 69 changes)
@ -214,73 +214,6 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-focal-py3_9-clang10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-py3_11-clang10-build:
|
||||
name: linux-focal-py3.11-clang10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-py3.11-clang10
|
||||
docker-image-name: pytorch-linux-focal-py3.11-clang10
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-py3_11-clang10-test:
|
||||
name: linux-focal-py3.11-clang10
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-focal-py3_11-clang10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-py3.11-clang10
|
||||
docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-py3_12-clang10-build:
|
||||
name: linux-focal-py3.12-clang10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-py3.12-clang10
|
||||
docker-image-name: pytorch-linux-focal-py3.12-clang10
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-py3_12-clang10-test:
|
||||
name: linux-focal-py3.12-clang10
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-focal-py3_12-clang10-build
|
||||
with:
|
||||
build-environment: linux-focal-py3.12-clang10
|
||||
docker-image: ${{ needs.linux-focal-py3_12-clang10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-py3_12-clang10-build.outputs.test-matrix }}
|
||||
timeout-minutes: 600
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-py3_13-clang10-build:
|
||||
name: linux-focal-py3.13-clang10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
@ -296,6 +229,8 @@ jobs:
|
||||
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
|
||||
.github/workflows/slow.yml (vendored, 20 changes)
@ -47,14 +47,14 @@ jobs:
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-sm86-build:
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
|
||||
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
|
||||
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
|
||||
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -64,16 +64,16 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
linux-focal-cuda12_4-py3_10-gcc9-sm86-test:
|
||||
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-focal-cuda12_1-py3_10-gcc9-sm86-build
|
||||
- linux-focal-cuda12_4-py3_10-gcc9-sm86-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.test-matrix }}
|
||||
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-focal-py3_9-clang10-build:
|
||||
|
||||
.github/workflows/test-check-binary.yml (vendored, new file, 40 changes)

@@ -0,0 +1,40 @@
+name: Test check_binary
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/test-check-binary.yml
+      - .ci/pytorch/check_binary.sh
+      - .ci/pytorch//smoke_test/smoke_test.py
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  check_binary_linux_cpu:
+    if: github.repository_owner == 'pytorch'
+    name: Test check_binary.sh for Linux CPU
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      docker-image: python:3.11
+      docker-build-dir: "skip-docker-build"
+      script: |
+        pushd .ci/pytorch/
+        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
+        DESIRED_PYTHON=3.11 DESIRED_CUDA=cpu PACKAGE_TYPE=manywheel ./check_binary.sh
+        popd
+
+  check_binary_linux_cuda:
+    if: github.repository_owner == 'pytorch'
+    name: Test check_binary.sh for Linux CUDA
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.4xlarge.nvidia.gpu
+      docker-image: python:3.11
+      docker-build-dir: "skip-docker-build"
+      script: |
+        pushd .ci/pytorch/
+        pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124
+        DESIRED_PYTHON=3.11 DESIRED_CUDA=cu124 PACKAGE_TYPE=manywheel ./check_binary.sh
+        popd

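The new workflow above installs a nightly torch wheel and then runs check_binary.sh, which drives smoke_test.py. As a rough illustration of the kind of sanity check involved, here is a minimal stand-alone sketch; it is not the actual script, and the specific checks are assumptions about what a smoke test typically covers:

```python
import torch

# Basic import, version, and math sanity checks in the spirit of the smoke test.
print("torch", torch.__version__)
x = torch.rand(4, 4)
assert torch.allclose(x @ torch.eye(4), x)  # a trivial matmul round-trips
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
```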
.github/workflows/trunk.yml (vendored, 37 changes)
@ -45,43 +45,12 @@ jobs:
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
libtorch-linux-focal-cuda12_1-py3_7-gcc9-debug-build:
|
||||
name: libtorch-linux-focal-cuda12.1-py3.7-gcc9-debug
|
||||
libtorch-linux-focal-cuda12_4-py3_10-gcc9-debug-build:
|
||||
name: libtorch-linux-focal-cuda12.4-py3.10-gcc9-debug
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: libtorch-linux-focal-cuda12.1-py3.7-gcc9
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
|
||||
build-generates-artifacts: false
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.4xlarge"
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
# no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
|
||||
linux-focal-cuda12_1-py3_10-gcc9-no-ops-build:
|
||||
name: linux-focal-cuda12.1-py3.10-gcc9-no-ops
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-focal-cuda12.1-py3.10-gcc9-no-ops
|
||||
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 1 },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
libtorch-linux-focal-cuda12_4-py3_7-gcc9-debug-build:
|
||||
name: libtorch-linux-focal-cuda12.4-py3.7-gcc9-debug
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: libtorch-linux-focal-cuda12.4-py3.7-gcc9
|
||||
build-environment: libtorch-linux-focal-cuda12.4-py3.10-gcc9
|
||||
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
|
||||
build-generates-artifacts: false
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
|
||||
@ -73,6 +73,8 @@ include_patterns = [
|
||||
'aten/src/ATen/native/cudnn/*.cpp',
|
||||
'aten/src/ATen/native/mkldnn/xpu/**/*.h',
|
||||
'aten/src/ATen/native/mkldnn/xpu/**/*.cpp',
|
||||
'aten/src/ATen/native/Tensor*.h',
|
||||
'aten/src/ATen/native/Tensor*.cpp',
|
||||
'c10/**/*.h',
|
||||
'c10/**/*.cpp',
|
||||
'torch/csrc/**/*.h',
|
||||
@ -143,7 +145,7 @@ init_command = [
|
||||
'--dry-run={{DRYRUN}}',
|
||||
'numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11"',
|
||||
'numpy==2.1.0 ; python_version >= "3.12"',
|
||||
'expecttest==0.2.1',
|
||||
'expecttest==0.3.0',
|
||||
'mypy==1.13.0',
|
||||
'sympy==1.13.0 ; python_version >= "3.9"',
|
||||
'types-requests==2.27.25',
|
||||
@ -1711,7 +1713,7 @@ command = [
|
||||
'@{{PATHSFILE}}'
|
||||
]
|
||||
include_patterns = [
|
||||
'torch/**/does-not-exist.py'
|
||||
"torch/_inductor/**/*.py",
|
||||
]
|
||||
is_formatter = true
|
||||
|
||||
|
||||
@ -997,8 +997,6 @@ if(NOT MSVC)
|
||||
append_cxx_flag_if_supported("-Wnarrowing" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-missing-field-initializers"
|
||||
CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-type-limits" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-array-bounds" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-unused-parameter" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-strict-overflow" CMAKE_CXX_FLAGS)
|
||||
@ -1076,7 +1074,6 @@ if(NOT MSVC)
|
||||
set(WERROR FALSE)
|
||||
endif()
|
||||
endif()
|
||||
append_cxx_flag_if_supported("-Wno-unused-but-set-variable" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-fstandalone-debug" CMAKE_CXX_FLAGS_DEBUG)
|
||||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
|
||||
@ -1093,6 +1090,7 @@ if(NOT MSVC)
|
||||
append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)
|
||||
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 13)
|
||||
append_cxx_flag_if_supported("-Wno-dangling-reference" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-error=dangling-reference" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-Wno-error=redundant-move" CMAKE_CXX_FLAGS)
|
||||
endif()
|
||||
|
||||
@ -1,8 +1,6 @@
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/DeviceAccelerator.h>
|
||||
#include <c10/core/impl/VirtualGuardImpl.h>
|
||||
|
||||
namespace at::accelerator {
|
||||
namespace at {
|
||||
|
||||
std::optional<c10::DeviceType> getAccelerator(bool checked) {
|
||||
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
|
||||
@ -39,8 +37,8 @@ std::optional<c10::DeviceType> getAccelerator(bool checked) {
|
||||
#undef DETECT_AND_ASSIGN_ACCELERATOR
|
||||
}
|
||||
|
||||
bool isAccelerator(c10::DeviceType device_type) {
|
||||
switch (device_type) {
|
||||
bool isAccelerator(c10::DeviceType d) {
|
||||
switch (d) {
|
||||
case at::kCUDA:
|
||||
case at::kMTIA:
|
||||
case at::kXPU:
|
||||
@ -54,50 +52,4 @@ bool isAccelerator(c10::DeviceType device_type) {
|
||||
}
|
||||
}
|
||||
|
||||
c10::DeviceIndex deviceCount() {
|
||||
const auto device_type = getAccelerator(false);
|
||||
if (!device_type.has_value()) {
|
||||
return static_cast<c10::DeviceIndex>(0);
|
||||
}
|
||||
c10::impl::VirtualGuardImpl impl(device_type.value());
|
||||
return static_cast<c10::DeviceIndex>(impl.deviceCount());
|
||||
}
|
||||
|
||||
void setDeviceIndex(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
c10::impl::VirtualGuardImpl impl(device_type);
|
||||
impl.setDevice({device_type, device_index});
|
||||
}
|
||||
|
||||
c10::DeviceIndex getDeviceIndex() {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
c10::impl::VirtualGuardImpl impl(device_type);
|
||||
return static_cast<c10::DeviceIndex>(impl.getDevice().index());
|
||||
}
|
||||
|
||||
void setCurrentStream(c10::Stream stream) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
TORCH_CHECK(
|
||||
device_type == stream.device_type(),
|
||||
"stream's device type ",
|
||||
c10::DeviceTypeName(stream.device_type()),
|
||||
" doesn't match the current accelerator ",
|
||||
c10::DeviceTypeName(device_type));
|
||||
c10::impl::VirtualGuardImpl impl(device_type);
|
||||
impl.exchangeStream(stream);
|
||||
}
|
||||
|
||||
c10::Stream getCurrentStream(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
c10::impl::VirtualGuardImpl impl(device_type);
|
||||
return impl.getStream({device_type, device_index});
|
||||
}
|
||||
|
||||
void synchronizeDevice(c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
c10::impl::VirtualGuardImpl impl(device_type);
|
||||
// impl.synchronizeDevice should can be safely called from any device
|
||||
impl.synchronizeDevice(device_index);
|
||||
}
|
||||
|
||||
} // namespace at::accelerator
|
||||
} // namespace at
|
||||
|
||||
@@ -6,8 +6,6 @@
#include <ATen/detail/MTIAHooksInterface.h>
#include <optional>

namespace at::accelerator {

// This file defines the top level Accelerator concept for PyTorch.
// A device is an accelerator per the definition here if:
// - It is mutually exclusive with all other accelerators

@@ -17,39 +15,13 @@ namespace at::accelerator {
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1

namespace at {

// Ensures that only one accelerator is available (at
// compile time if possible) and return it.
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);

// Check if the given device type is an accelerator.
TORCH_API bool isAccelerator(c10::DeviceType device_type);
TORCH_API bool isAccelerator(c10::DeviceType d);

// Return the number of the device available. Note that this is *REQUIRED* to
// not raise any exception.
TORCH_API c10::DeviceIndex deviceCount();

// Set the current device index to the given device index.
TORCH_API void setDeviceIndex(c10::DeviceIndex device_index);

// Get the current device index.
TORCH_API c10::DeviceIndex getDeviceIndex();

// Set the current stream to a given stream. Note that this API doesn't change
// the current device index.
TORCH_API void setCurrentStream(c10::Stream stream);

// Get the current stream of the given device index.
TORCH_API c10::Stream getCurrentStream(c10::DeviceIndex device_index);

// Wait (by blocking the calling thread) until all the work previously enqueued
// on the given device index has been completed.
TORCH_API void synchronizeDevice(c10::DeviceIndex device_index);

} // namespace at::accelerator

namespace at {
// Keep BC only
using at::accelerator::getAccelerator;
using at::accelerator::isAccelerator;
} // namespace at

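The header above spells out the accelerator contract: at most one mutually exclusive accelerator (CUDA, MTIA, XPU, HIP, MPS, or PrivateUse1) is expected to be present. As a rough Python-side illustration of that idea (not part of this diff; the probes used are standard public torch APIs, and the helper name is made up):

```python
from typing import Optional

import torch


def detect_accelerator() -> Optional[str]:
    # Probe which backend this build exposes; per the header's definition,
    # at most one of these should report availability at runtime.
    if torch.cuda.is_available():  # covers both CUDA and ROCm/HIP builds
        return "cuda"
    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        return "mps"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return None


print("accelerator:", detect_accelerator())
```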
@ -92,8 +92,8 @@ class MatrixRef {
|
||||
/// The declaration here is extra complicated so that "arrayRef = {}"
|
||||
/// continues to select the move assignment operator.
|
||||
template <typename U>
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
|
||||
std::enable_if_t<std::is_same_v<U, T>, MatrixRef<T>>& operator=(
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
|
||||
U&& Temporary) = delete;
|
||||
|
||||
/// Disallow accidental assignment from a temporary.
|
||||
|
||||
@ -1129,6 +1129,7 @@ TEST(ListTest, canAccessOptionalStringByReference) {
|
||||
EXPECT_EQ("two", str1);
|
||||
EXPECT_FALSE(str2.has_value());
|
||||
EXPECT_TRUE(strRef1.has_value());
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
EXPECT_EQ("two", strRef1.value().get());
|
||||
EXPECT_FALSE(strRef2.has_value());
|
||||
}
|
||||
|
||||
@@ -66,7 +66,7 @@ struct TORCH_API EnumType : public NamedType {
 }

 const QualifiedName& qualifiedClassName() const {
-// NOLINTLEXTLINE(bugprone-unchecked-optional-access)
+// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
 return name().value();
 }

@ -95,7 +95,7 @@ struct TORCH_API Argument {
|
||||
const TypePtr& real_type() const {
|
||||
return real_type_;
|
||||
}
|
||||
std::optional<int32_t> N() const {
|
||||
const std::optional<int32_t>& N() const {
|
||||
return N_;
|
||||
}
|
||||
const std::optional<IValue>& default_value() const {
|
||||
@ -651,11 +651,11 @@ template<>
|
||||
hash = c10::hash_combine(hash, type_hash);
|
||||
hash = c10::hash_combine(hash, kwarg_only_hash);
|
||||
// hashing optional fields if they exist
|
||||
if (arg.default_value()) {
|
||||
auto default_value_hash = c10::hash<c10::IValue>{}(arg.default_value().value());
|
||||
if (arg.default_value().has_value()) {
|
||||
auto default_value_hash = c10::hash<c10::IValue>{}(*arg.default_value());
|
||||
hash = c10::hash_combine(hash, default_value_hash);
|
||||
}
|
||||
if (arg.N()) {
|
||||
if (arg.N().has_value()) {
|
||||
auto N_hash = std::hash<int64_t>{}(*arg.N());
|
||||
hash = c10::hash_combine(hash, N_hash);
|
||||
}
|
||||
|
||||
@ -1546,11 +1546,11 @@ struct WeakOrStrongCompilationUnit {
|
||||
}
|
||||
|
||||
bool holdingStrongRef() const {
|
||||
return strong_ptr_ != std::nullopt;
|
||||
return strong_ptr_.has_value();
|
||||
}
|
||||
|
||||
bool holdingEmptyStrongRef() const {
|
||||
return holdingStrongRef() && *strong_ptr_ == nullptr;
|
||||
return strong_ptr_ == nullptr;
|
||||
}
|
||||
|
||||
std::optional<std::shared_ptr<torch::jit::CompilationUnit>> strong_ptr_;
|
||||
|
||||
@ -625,13 +625,13 @@ struct TORCH_API TensorType : public SharedType {
|
||||
return strides_;
|
||||
}
|
||||
|
||||
std::optional<at::Device> device() const {
|
||||
const std::optional<at::Device>& device() const {
|
||||
return device_;
|
||||
}
|
||||
std::optional<at::ScalarType> scalarType() const {
|
||||
const std::optional<at::ScalarType>& scalarType() const {
|
||||
return scalar_type_;
|
||||
}
|
||||
std::optional<bool> requiresGrad() const {
|
||||
const std::optional<bool>& requiresGrad() const {
|
||||
return requires_grad_;
|
||||
}
|
||||
bool requires_grad() const override {
|
||||
@ -656,7 +656,7 @@ struct TORCH_API TensorType : public SharedType {
|
||||
const auto& shape = sizes();
|
||||
|
||||
for (size_t i = 0; i < shape.size(); i++) {
|
||||
if (!shape[i]) {
|
||||
if (!shape[i].has_value()) {
|
||||
return std::optional<size_t>{};
|
||||
}
|
||||
prod *= shape[i].value();
|
||||
|
||||
@ -292,7 +292,7 @@ TensorTypePtr TensorType::create(
|
||||
scalar_type, device, symbol_sizes, sprops, requires_grad, undefined);
|
||||
} else {
|
||||
// strides are all null, but still have number of strides equal to number of ranks
|
||||
TORCH_INTERNAL_ASSERT(sizes.sizes() && sizes.size());
|
||||
TORCH_INTERNAL_ASSERT(sizes.sizes().has_value() && sizes.size());
|
||||
auto symbol_sizes = SymbolicShape(*sizes.sizes());
|
||||
return TensorType::create(
|
||||
scalar_type, device, symbol_sizes, VaryingShape<Stride>(*sizes.size()), requires_grad, undefined);
|
||||
|
||||
@ -61,8 +61,8 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
} else {
|
||||
out << "Tensor";
|
||||
}
|
||||
if (auto ndim = value->sizes().size()) {
|
||||
bool has_valid_strides_info = *ndim > 0 &&
|
||||
if (auto ndim = value->sizes().size(); ndim.has_value()) {
|
||||
bool has_valid_strides_info = ndim > 0 &&
|
||||
value->strides().isComplete() && value->strides().size() == ndim;
|
||||
|
||||
out << "(";
|
||||
@ -87,7 +87,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
if (i > 0) {
|
||||
out << ", ";
|
||||
}
|
||||
out << *value->strides()[i];
|
||||
out << value->strides()[i].value();
|
||||
}
|
||||
out << "]";
|
||||
}
|
||||
@ -903,7 +903,7 @@ bool ListType::isSubtypeOfExt(const Type& rhs_, std::ostream* why_not) const {
|
||||
|
||||
std::string TupleType::str() const {
|
||||
std::stringstream ss;
|
||||
if (schema_ && name()) {
|
||||
if (schema_ && name().has_value()) {
|
||||
ss << name()->qualifiedName();
|
||||
} else {
|
||||
ss << "(";
|
||||
|
||||
@ -106,6 +106,7 @@ static hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status error)
|
||||
namespace {
|
||||
|
||||
static cublasOperation_t _cublasOpFromChar(char op) {
|
||||
// NOLINTNEXTLINE(bugprone-switch-missing-default-case)
|
||||
switch (op) {
|
||||
case 'n':
|
||||
case 'N':
|
||||
|
||||
@ -466,6 +466,6 @@ void CUDAHooks::deviceSynchronize(DeviceIndex device_index) const {
|
||||
using at::CUDAHooksRegistry;
|
||||
using at::RegistererCUDAHooksRegistry;
|
||||
|
||||
REGISTER_CUDA_HOOKS(CUDAHooks);
|
||||
REGISTER_CUDA_HOOKS(CUDAHooks)
|
||||
|
||||
} // namespace at::cuda::detail
|
||||
|
||||
@ -127,8 +127,8 @@ RETTYPE NAME(ARG1 a1, ARG2 a2, ARG3 a3, ARG4 a4) {
|
||||
#define NVRTC_STUB2(NAME, A1, A2) _STUB_2(NVRTC, NAME, nvrtcResult, A1, A2)
|
||||
#define NVRTC_STUB3(NAME, A1, A2, A3) _STUB_3(NVRTC, NAME, nvrtcResult, A1, A2, A3)
|
||||
|
||||
NVRTC_STUB2(nvrtcVersion, int*, int*);
|
||||
NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const);
|
||||
NVRTC_STUB2(nvrtcVersion, int*, int*)
|
||||
NVRTC_STUB2(nvrtcAddNameExpression, nvrtcProgram, const char * const)
|
||||
|
||||
nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
|
||||
const char *src,
|
||||
@ -143,32 +143,32 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
|
||||
return fn(prog, src, name, numHeaders, headers, includeNames);
|
||||
}
|
||||
|
||||
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *);
|
||||
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *);
|
||||
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *);
|
||||
NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *)
|
||||
NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *)
|
||||
NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *)
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
|
||||
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *);
|
||||
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *);
|
||||
NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *)
|
||||
NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *)
|
||||
#endif
|
||||
NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *);
|
||||
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult);
|
||||
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*);
|
||||
NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *);
|
||||
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **);
|
||||
NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *)
|
||||
_STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult)
|
||||
NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
|
||||
NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
|
||||
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)
|
||||
|
||||
CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *);
|
||||
CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *);
|
||||
CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t);
|
||||
CUDA_STUB2(cuGetErrorString, CUresult, const char **);
|
||||
CUDA_STUB1(cuCtxGetCurrent, CUcontext *);
|
||||
CUDA_STUB1(cuCtxSetCurrent, CUcontext);
|
||||
CUDA_STUB1(cuModuleUnload, CUmodule);
|
||||
CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *);
|
||||
CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice);
|
||||
CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *);
|
||||
CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
|
||||
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);
|
||||
CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction);
|
||||
CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
|
||||
CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
||||
CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
|
||||
CUDA_STUB2(cuGetErrorString, CUresult, const char **)
|
||||
CUDA_STUB1(cuCtxGetCurrent, CUcontext *)
|
||||
CUDA_STUB1(cuCtxSetCurrent, CUcontext)
|
||||
CUDA_STUB1(cuModuleUnload, CUmodule)
|
||||
CUDA_STUB3(cuDevicePrimaryCtxGetState, CUdevice, unsigned int *, int *)
|
||||
CUDA_STUB2(cuDevicePrimaryCtxRetain, CUcontext *, CUdevice)
|
||||
CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
|
||||
CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
|
||||
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
|
||||
CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
|
||||
CUresult CUDAAPI
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
#include <ATen/cuda/tunable/Tunable.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/StringUtil.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <torch/version.h>
|
||||
|
||||
#ifndef _WIN32
|
||||
@ -435,8 +436,8 @@ void TuningContext::EnableTunableOp(bool value) {
|
||||
}
|
||||
|
||||
bool TuningContext::IsTunableOpEnabled() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_ENABLED");
|
||||
if (env != nullptr && strcmp(env, "1") == 0) {
|
||||
static const bool eval = c10::utils::get_env("PYTORCH_TUNABLEOP_ENABLED") == "1";
|
||||
if (eval) {
|
||||
return true;
|
||||
}
|
||||
return enable_;
|
||||
@ -462,16 +463,16 @@ void TuningContext::EnableRecordUntuned(bool value) {
|
||||
}
|
||||
|
||||
bool TuningContext::IsTuningEnabled() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_TUNING");
|
||||
if (env != nullptr && strcmp(env, "0") == 0) {
|
||||
static const bool eval = c10::utils::get_env("PYTORCH_TUNABLEOP_TUNING") == "0";
|
||||
if (eval) {
|
||||
return false;
|
||||
}
|
||||
return tuning_enable_;
|
||||
}
|
||||
|
||||
bool TuningContext::IsRecordUntunedEnabled() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_RECORD_UNTUNED");
|
||||
if (env != nullptr && strcmp(env, "1") == 0) {
|
||||
static const bool eval = c10::utils::get_env("PYTORCH_TUNABLEOP_RECORD_UNTUNED") == "1";
|
||||
if (eval) {
|
||||
return true;
|
||||
}
|
||||
return record_untuned_enable_;
|
||||
@ -479,8 +480,8 @@ bool TuningContext::IsRecordUntunedEnabled() const {
|
||||
|
||||
std::ofstream& TuningContext::GetUntunedFile(){
|
||||
if (!untuned_file_.is_open()) {
|
||||
const char *env = std::getenv("PYTORCH_TUNABLEOP_UNTUNED_FILENAME");
|
||||
std::string filename = (env == nullptr) ? "tunableop_untuned.csv" : env;
|
||||
const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_UNTUNED_FILENAME");
|
||||
std::string filename = (!env.has_value()) ? "tunableop_untuned.csv" : env.value();
|
||||
|
||||
std::string device = c10::str(int(c10::cuda::current_device()));
|
||||
std::size_t found = filename.rfind('.');
|
||||
@ -517,9 +518,9 @@ void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) {
|
||||
}
|
||||
|
||||
int TuningContext::GetMaxTuningDurationMs() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS");
|
||||
if (env != nullptr) {
|
||||
int val = atoi(env);
|
||||
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS");
|
||||
if (env.has_value()) {
|
||||
int val = stoi(env.value());
|
||||
return val < 0 ? 0 : val;
|
||||
}
|
||||
return max_tuning_duration_ms_;
|
||||
@ -530,9 +531,9 @@ void TuningContext::SetMaxTuningIterations(int max_iter) {
|
||||
}
|
||||
|
||||
int TuningContext::GetMaxTuningIterations() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS");
|
||||
if (env != nullptr) {
|
||||
int val = atoi(env);
|
||||
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS");
|
||||
if (env.has_value()) {
|
||||
int val = stoi(env.value());
|
||||
return val < 0 ? 0 : val;
|
||||
}
|
||||
return max_tuning_iterations_;
|
||||
@ -543,9 +544,9 @@ void TuningContext::SetMaxWarmupDurationMs(int max_duration_ms) {
|
||||
}
|
||||
|
||||
int TuningContext::GetMaxWarmupDurationMs() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS");
|
||||
if (env != nullptr) {
|
||||
int val = atoi(env);
|
||||
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS");
|
||||
if (env.has_value()) {
|
||||
int val = stoi(env.value());
|
||||
return val < 0 ? 0 : val;
|
||||
}
|
||||
return max_warmup_duration_ms_;
|
||||
@ -556,9 +557,9 @@ void TuningContext::SetMaxWarmupIterations(int max_iter) {
|
||||
}
|
||||
|
||||
int TuningContext::GetMaxWarmupIterations() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS");
|
||||
if (env != nullptr) {
|
||||
int val = atoi(env);
|
||||
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS");
|
||||
if (env.has_value()) {
|
||||
int val = stoi(env.value());
|
||||
return val < 0 ? 0 : val;
|
||||
}
|
||||
return max_warmup_iterations_;
|
||||
@ -569,8 +570,8 @@ void TuningContext::EnableICacheFlush(bool value) {
|
||||
}
|
||||
|
||||
bool TuningContext::IsICacheFlushEnabled() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED");
|
||||
if (env != nullptr && strcmp(env, "0") == 0) {
|
||||
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED");
|
||||
if (env == "0") {
|
||||
return false;
|
||||
}
|
||||
return icache_flush_;
|
||||
@ -581,10 +582,10 @@ void TuningContext::SetRotatingBufferSize(int size) {
|
||||
}
|
||||
|
||||
int TuningContext::GetRotatingBufferSize() const {
|
||||
static const char *env = std::getenv("PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE");
|
||||
if (env != nullptr) {
|
||||
static const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE");
|
||||
if (env.has_value()) {
|
||||
constexpr int MB = 1024 * 1024;
|
||||
int val = atoi(env);
|
||||
int val = stoi(env.value());
|
||||
return val < 0 ? 0 : val * MB; // env var is specified as MB, returned as bytes
|
||||
}
|
||||
else {
|
||||
@ -604,8 +605,8 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() {
|
||||
manager_initialized_ = true;
|
||||
if (GetFilename().empty()) {
|
||||
// if SetFilename() was not already called, call it now with the default or env var
|
||||
const char *env = std::getenv("PYTORCH_TUNABLEOP_FILENAME");
|
||||
std::string filename = (env == nullptr) ? "tunableop_results.csv" : env;
|
||||
const auto env = c10::utils::get_env("PYTORCH_TUNABLEOP_FILENAME");
|
||||
std::string filename = (!env.has_value()) ? "tunableop_results.csv" : env.value();
|
||||
SetFilename(filename, true);
|
||||
}
|
||||
auto filename = GetFilename();
|
||||
|
||||
@ -42,8 +42,9 @@ static Tensor materializeGradWrappers(const Tensor& tensor, int64_t current_leve
|
||||
if (!wrapper) {
|
||||
return makeTensorWrapper(tensor, current_level, /*is_immutable=*/true);
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(wrapper->level().value() <= current_level, "escaped?");
|
||||
if (wrapper->level() == current_level) {
|
||||
auto level = wrapper->level();
|
||||
TORCH_INTERNAL_ASSERT(level.has_value() && level <= current_level, "escaped?");
|
||||
if (level == current_level) {
|
||||
TORCH_INTERNAL_ASSERT(tensor.defined());
|
||||
return tensor;
|
||||
}
|
||||
|
||||
@ -54,6 +54,8 @@ struct BinaryRandomPointwiseBatchRuleHelper<F, Func, typelist<T1, T2, T...>> {
|
||||
static Tensor apply(const Tensor& tensor, const Tensor& other, T... extra_args) {
|
||||
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
|
||||
auto maybe_layer = maybeCurrentDynamicLayer();
|
||||
TORCH_INTERNAL_ASSERT(maybe_layer.has_value())
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
auto cur_level = maybe_layer->layerId();
|
||||
RandomnessType randomness = maybe_layer->randomness();
|
||||
|
||||
|
||||
@ -19,6 +19,7 @@ struct NewBlahBatchRuleHelperSymInt<F, Func, typelist<A, B, T...>> {
|
||||
std::optional<int64_t> batch_dim,
|
||||
SymIntArrayRef shape,
|
||||
T... extra_args) {
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
const auto bdim_size = tensor.sym_size(batch_dim.value());
|
||||
c10::SmallVector<c10::SymInt> new_shape;
|
||||
new_shape.reserve(shape.size() + 1);
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
|
||||
namespace at::functorch {
|
||||
|
||||
Tensor moveBatchDimToFront(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim) {
|
||||
Tensor moveBatchDimToFront(Tensor tensor, std::optional<int64_t> maybe_batch_dim) {
|
||||
if (!maybe_batch_dim.has_value()) {
|
||||
return tensor;
|
||||
}
|
||||
|
||||
@ -30,7 +30,7 @@ TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x);
|
||||
|
||||
TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x);
|
||||
|
||||
Tensor moveBatchDimToFront(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
|
||||
Tensor moveBatchDimToFront(Tensor tensor, std::optional<int64_t> maybe_batch_dim);
|
||||
int64_t rankWithoutBatchDim(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
|
||||
int64_t numelWithoutBatchDim(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
|
||||
std::optional<int64_t> valIfNonempty(std::optional<int64_t> maybe_empty, int64_t new_val);
|
||||
@ -243,9 +243,8 @@ inline void boxed_existing_bdim_all_batch_rule(
|
||||
const auto num_arguments = static_cast<int64_t>(schema.arguments().size());
|
||||
|
||||
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
|
||||
auto maybe_layer = maybeCurrentDynamicLayer();
|
||||
const auto maybe_layer = maybeCurrentDynamicLayer();
|
||||
vmap_check_escaped(maybe_layer, "boxed_existing_bdim_all_batch_rule");
|
||||
int64_t cur_level = maybe_layer->layerId();
|
||||
|
||||
const auto arguments = torch::jit::last(stack, num_arguments);
|
||||
if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) {
|
||||
@ -257,6 +256,8 @@ inline void boxed_existing_bdim_all_batch_rule(
|
||||
SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
|
||||
SmallVector<int64_t, 5> tensor_pos;
|
||||
int64_t batch_size = 0;
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
int64_t cur_level = maybe_layer->layerId();
|
||||
|
||||
find_and_unpack_tensors(
|
||||
stack, num_arguments, cur_level,
|
||||
|
||||
@ -492,6 +492,7 @@ _scaled_dot_product_flash_attention_batch_rule(
|
||||
) {
|
||||
if (dropout_p > 0) {
|
||||
auto maybe_layer = maybeCurrentDynamicLayer();
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
RandomnessType randomness = maybe_layer->randomness();
|
||||
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
|
||||
}
|
||||
@ -543,6 +544,7 @@ fourOutputs _scaled_dot_product_efficient_attention_batch_rule(
|
||||
) {
|
||||
if (dropout_p > 0) {
|
||||
auto maybe_layer = maybeCurrentDynamicLayer();
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
RandomnessType randomness = maybe_layer->randomness();
|
||||
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
|
||||
}
|
||||
@ -585,6 +587,7 @@ _scaled_dot_product_cudnn_attention_batch_rule(
|
||||
) {
|
||||
if (dropout_p > 0) {
|
||||
auto maybe_layer = maybeCurrentDynamicLayer();
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
RandomnessType randomness = maybe_layer->randomness();
|
||||
check_randomness(randomness, query_bdim.has_value() || key_bdim.has_value() || value_bdim.has_value());
|
||||
}
|
||||
|
||||
@ -90,6 +90,7 @@ static Tensor binary_cross_entropy_plumbing(
const std::optional<Tensor>& weight, int64_t reduction) {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "binary_cross_entropy_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(target, cur_level)
@ -126,6 +127,7 @@ static Tensor binary_cross_entropy_backward_plumbing(
const std::optional<Tensor>& weight_opt, int64_t reduction) {
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "binary_cross_entropy_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({grad, input, target, weight_opt}, cur_level)) {
@ -57,7 +57,7 @@ embedding_dense_backward_batch_rule(
c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq) {
Tensor grad = grad_;
Tensor indices = indices_;
if (!indices_bdim && grad_bdim) {
if (!indices_bdim.has_value() && grad_bdim) {
const auto bdim_size = grad.sym_size(*grad_bdim);
grad = reshape_dim_into(*grad_bdim, -1, grad);
auto result = at::embedding_dense_backward_symint(
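The only change in this hunk is spelling `!indices_bdim` as `!indices_bdim.has_value()`. For `std::optional` the two are equivalent; the explicit form simply makes clear that the test is about presence, not about the contained value. A tiny self-contained illustration:

```cpp
#include <cassert>
#include <cstdint>
#include <optional>

int main() {
  std::optional<int64_t> indices_bdim;   // no batch dimension
  std::optional<int64_t> grad_bdim = 0;  // batched at dim 0
  // operator! and !has_value() are interchangeable for std::optional...
  assert(!indices_bdim == !indices_bdim.has_value());
  // ...so this reads the same as the original condition, just more explicitly.
  bool take_branch = !indices_bdim.has_value() && grad_bdim.has_value();
  assert(take_branch);
}
```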
@ -162,12 +162,12 @@ grid_sample_backward_helper_in(
static std::tuple<Tensor, std::optional<int64_t>, Tensor, std::optional<int64_t>>
grid_sample_backward_helper_out(
std::tuple<Tensor, Tensor> bw_out,
std::optional<int64_t> grad_input_out_bdim,
std::optional<int64_t> grad_grid_out_bdim,
int64_t grad_input_out_bdim,
int64_t grad_grid_out_bdim,
int64_t bdim_size) {
auto& [grad_input, grad_grid] = bw_out;
grad_input = reshape_dim_outof(*grad_input_out_bdim, bdim_size, grad_input);
grad_grid = reshape_dim_outof(*grad_grid_out_bdim, bdim_size, grad_grid);
grad_input = reshape_dim_outof(grad_input_out_bdim, bdim_size, grad_input);
grad_grid = reshape_dim_outof(grad_grid_out_bdim, bdim_size, grad_grid);
return std::make_tuple(std::move(grad_input), grad_input_out_bdim, std::move(grad_grid), grad_grid_out_bdim);
}
@ -218,6 +218,8 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> batch_norm_backward_plumbing(
c10::MaybeOwned<Tensor> running_var_maybe_owned = at::borrow_from_optional_tensor(running_var_opt);
const Tensor& running_var = *running_var_maybe_owned;
// NB: not sure why these are optional...these are required from the forward
TORCH_INTERNAL_ASSERT(save_mean_opt.has_value());
TORCH_INTERNAL_ASSERT(save_rstd_opt.has_value());
const Tensor& save_mean = *save_mean_opt;
const Tensor& save_rstd = *save_rstd_opt;
TORCH_INTERNAL_ASSERT(save_mean.defined());
@ -226,6 +228,7 @@ std::tuple<at::Tensor,at::Tensor,at::Tensor> batch_norm_backward_plumbing(
// plumbing
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "batch_norm_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

auto [grad_out_value, grad_out_bdim] = unwrapTensorAtLevel(grad_out, cur_level);
@ -298,6 +301,7 @@ static std::tuple<Tensor,Tensor,Tensor> native_group_norm_plumbing(

auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "native_group_norm_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({input, weight_opt, bias_opt}, cur_level)) {
@ -380,6 +384,7 @@ static std::tuple<Tensor,Tensor,Tensor> native_group_norm_backward_plumbing(
// plumbing
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "native_group_norm_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({grad_out, input, mean, rstd, weight_opt}, cur_level)) {
@ -579,6 +584,7 @@ static std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_p
// plumbing
auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "native_layer_norm_backward_plumbing");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();
if (!areAnyBatchedAtLevel({grad_out, input, mean, rstd, weight_opt, bias_opt}, cur_level)) {
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
@ -721,6 +727,7 @@ struct NativeBatchNormBackwardBatchRuleHelper {

auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "NativeBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({grad_out, input, weight_opt, running_mean_opt,
@ -751,6 +758,7 @@ struct CudnnBatchNormBackwardBatchRuleHelper {

auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "CudnnBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
@ -779,6 +787,7 @@ struct MiopenBatchNormBackwardBatchRuleHelper {

auto maybe_layer = maybeCurrentDynamicLayer();
vmap_check_escaped(maybe_layer, "MiopenBatchNormBackwardBatchRuleHelper.apply");
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
int64_t cur_level = maybe_layer->layerId();

if (!areAnyBatchedAtLevel({input, grad_out, weight, running_mean_opt,
@ -28,8 +28,10 @@ max_pool_with_indices_batch_rule_helper(
return std::make_tuple(std::move(std::get<0>(result)), 0, std::move(std::get<1>(result)), 0);
}
// Tensor[B, N, logical_rank...] -> Tensor[B * N, logical_rank...]
auto bdim_size = self.size(*self_bdim);
auto self_ = reshape_dim_into(*self_bdim, 0, self);
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto bdim_size = self.size(self_bdim.value());
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
auto self_ = reshape_dim_into(self_bdim.value(), 0, self);
auto result = pooling_fn(
self_, kernel_size, stride, padding, dilation, ceil_mode);
return std::make_tuple(
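The `Tensor[B, N, logical_rank...] -> Tensor[B * N, logical_rank...]` comment is the core trick of this batch rule: fold the vmap dimension into the batch dimension the pooling kernel already understands, run the op once, then split it back out. In plain ATen terms (a standalone sketch with arbitrarily chosen shapes; `reshape_dim_into` / `reshape_dim_outof` are the functorch helpers that generalize this to an arbitrary dim):

```cpp
#include <ATen/ATen.h>

int main() {
  int64_t B = 4, N = 2, C = 3, H = 8, W = 8;
  at::Tensor self = at::randn({B, N, C, H, W});      // vmap dim B in front
  at::Tensor flat = self.reshape({B * N, C, H, W});  // ~ reshape_dim_into(0, 0, self)
  at::Tensor pooled = at::max_pool2d(flat, /*kernel_size=*/{2, 2});
  // ~ reshape_dim_outof(0, B, pooled): split B back out of the batch dim
  at::Tensor out = pooled.reshape({B, N, C, H / 2, W / 2});
  TORCH_CHECK(out.size(0) == B && out.size(1) == N);
}
```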
@ -25,6 +25,7 @@ Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) {
c10::SmallVector<SymInt> shapeVec(1, maybe_layer->batchSize());
shapeVec.reserve(shape.size() + 1);
shapeVec.insert(shapeVec.end(), shape.begin(), shape.end());
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness);
if (randomness == RandomnessType::Different) {
@ -38,9 +39,11 @@ template <typename F, F Func, typename... ExtraArgs>
Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) {
c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
auto maybe_layer = maybeCurrentDynamicLayer();
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
const auto cur_level = maybe_layer->layerId();
auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level);
self_value = moveBatchDimToFront(self_value, self_bdim);
self_value = moveBatchDimToFront(std::move(self_value), self_bdim);
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
RandomnessType randomness = maybe_layer->randomness();
check_randomness(randomness);
TORCH_CHECK(
@ -1732,11 +1732,10 @@ std::tuple<Tensor,Tensor,Tensor> _convolution_double_backward( const std::option
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> ggI_maybe_owned = at::borrow_from_optional_tensor(ggI_opt);
const Tensor& ggI = *ggI_maybe_owned;
const Tensor& ggW_r = ggW_r_opt.value_or(Tensor());
Tensor ggW = ggW_r_opt.value_or(Tensor());
const Tensor& ggb = ggb_opt.value_or(Tensor());

auto ggW = ggW_r;
auto gO = gO_r;
auto weight = weight_r;
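The `ggW` change replaces a const reference bound to the temporary returned by `value_or` (followed by a copy into `auto ggW`) with a single by-value `Tensor`. A small generic illustration of why the old spelling is redundant, using `std::string` rather than `Tensor`:

```cpp
#include <optional>
#include <string>

int main() {
  std::optional<std::string> maybe;
  // value_or returns by value, so binding a const reference only
  // lifetime-extends a temporary; copying from it afterwards makes a second object.
  const std::string& ref = maybe.value_or(std::string("fallback"));
  std::string copy1 = ref;                                        // old pattern: temporary + copy
  std::string direct = maybe.value_or(std::string("fallback"));   // new pattern: one object
  (void)copy1; (void)direct;
}
```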
@ -251,20 +251,12 @@ Tensor kl_div(const Tensor& input, const Tensor& target, int64_t reduction, bool
}

Tensor binary_cross_entropy_cpu(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

Tensor loss = at::empty_like(input);
return at::native::binary_cross_entropy_out_cpu(
input, target, weight, reduction, loss);
input, target, weight_opt, reduction, loss);
}

Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& loss) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

Tensor loss_squeezed = at::squeeze(loss);

auto iter = TensorIteratorConfig()
@ -297,8 +289,8 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target,
});
});

if (weight.defined()) {
loss.mul_(weight);
if (weight_opt.has_value() && weight_opt->defined()) {
loss.mul_(*weight_opt);
}
if (reduction != at::Reduction::None) {
Tensor loss_reduced = apply_loss_reduction(loss, reduction);
@ -308,20 +300,12 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target,
}

Tensor binary_cross_entropy_backward_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

Tensor grad_input = at::empty_like(input);
return at::native::binary_cross_entropy_backward_out_cpu(
grad, input, target, weight, reduction, grad_input);
grad, input, target, weight_opt, reduction, grad_input);
}

Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& grad_input) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

Tensor grad_input_squeezed = at::squeeze(grad_input);

auto iter = TensorIteratorConfig()
@ -350,8 +334,8 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor&
});
});

if (weight.defined()) {
grad_input.mul_(weight);
if (weight_opt.has_value() && weight_opt->defined()) {
grad_input.mul_(*weight_opt);
}
if (reduction == at::Reduction::Mean) {
grad_input.div_(input.numel());
@ -360,23 +344,17 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor&
}

Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, const std::optional<Tensor>& pos_weight_opt, int64_t reduction) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
c10::MaybeOwned<Tensor> pos_weight_maybe_owned = at::borrow_from_optional_tensor(pos_weight_opt);
const Tensor& pos_weight = *pos_weight_maybe_owned;

auto log_sigmoid_input = at::log_sigmoid(input);
if (pos_weight.defined()) {
if (pos_weight_opt.has_value() && pos_weight_opt->defined()) {
// pos_weight need to be broadcasted, thus mul(target) is not inplace.
auto log_weight = (pos_weight - 1).mul(target).add_(1);
auto log_weight = (*pos_weight_opt- 1).mul(target).add_(1);
log_sigmoid_input.mul_(log_weight);
}

Tensor loss = (1 - target).mul_(input).sub_(log_sigmoid_input);

if (weight.defined()) {
loss.mul_(weight);
if (weight_opt.has_value() && weight_opt->defined()) {
loss.mul_(*weight_opt);
}

return apply_loss_reduction(loss, reduction);
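Every hunk in this file removes the same boilerplate: materializing a possibly-undefined `Tensor` reference from `weight_opt` through `c10::MaybeOwned` / `at::borrow_from_optional_tensor` and then testing `weight.defined()`. The replacement queries the optional at its single point of use. A side-by-side sketch of the two equivalent spellings (illustrative only, not the kernel code itself):

```cpp
#include <optional>
#include <ATen/ATen.h>

// Old spelling: materialize a (possibly undefined) Tensor reference up front.
void scale_old(at::Tensor& loss, const std::optional<at::Tensor>& weight_opt) {
  c10::MaybeOwned<at::Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
  const at::Tensor& weight = *weight_maybe_owned;
  if (weight.defined()) {
    loss.mul_(weight);
  }
}

// New spelling: consult the optional directly where it is used.
void scale_new(at::Tensor& loss, const std::optional<at::Tensor>& weight_opt) {
  if (weight_opt.has_value() && weight_opt->defined()) {
    loss.mul_(*weight_opt);
  }
}

int main() {
  at::Tensor loss = at::ones({3});
  std::optional<at::Tensor> w = at::full({3}, 2.0);
  scale_new(loss, w);
  scale_old(loss, w);
  TORCH_CHECK(loss.allclose(at::full({3}, 4.0)));
}
```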
@ -659,20 +659,12 @@ Tensor cross_entropy_loss_symint(
}

Tensor & nll_loss_out(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

Tensor total_weight = at::empty({0}, self.options());
return std::get<0>(at::nll_loss_forward_out(output, total_weight, self, target, weight, reduction, ignore_index));
return std::get<0>(at::nll_loss_forward_out(output, total_weight, self, target, weight_opt, reduction, ignore_index));
}

Tensor nll_loss_symint(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, c10::SymInt ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

return std::get<0>(at::nll_loss_forward_symint(self, target, weight, reduction, std::move(ignore_index)));
return std::get<0>(at::nll_loss_forward_symint(self, target, weight_opt, reduction, std::move(ignore_index)));
}

Tensor nll_loss_nd_symint(
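Several of these wrappers end by handing the elementwise loss to `apply_loss_reduction`, which just folds it according to the reduction flag. A rough standalone equivalent (not the internal helper itself):

```cpp
#include <ATen/ATen.h>
#include <ATen/core/Reduction.h>

// Rough equivalent of the reduction step used by these loss kernels
// (at::Reduction::None / Mean / Sum).
at::Tensor reduce_loss(const at::Tensor& loss, int64_t reduction) {
  if (reduction == at::Reduction::Mean) {
    return loss.mean();
  } else if (reduction == at::Reduction::Sum) {
    return loss.sum();
  }
  return loss;  // at::Reduction::None: return the elementwise loss untouched
}

int main() {
  at::Tensor l = at::arange(4, at::kFloat);  // 0, 1, 2, 3
  TORCH_CHECK(reduce_loss(l, at::Reduction::Sum).item<float>() == 6.0f);
}
```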
@ -424,14 +424,10 @@ std::tuple<Tensor, Tensor> nll_loss2d_forward_cpu(
const Tensor& target, const std::optional<Tensor>& weight_opt,
int64_t reduction,
int64_t ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

auto output = at::empty({0}, self.options());
auto total_weight = at::empty({0}, self.options());
at::native::nll_loss2d_forward_out_cpu(
self, target, weight, reduction, ignore_index, output, total_weight);
self, target, weight_opt, reduction, ignore_index, output, total_weight);
return std::make_tuple(output, total_weight);
}

@ -465,16 +461,12 @@ Tensor nll_loss2d_backward_cpu(
int64_t reduction,
int64_t ignore_index,
const Tensor& total_weight) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

auto grad_input = at::zeros_like(self);
at::native::nll_loss2d_backward_out_cpu(
grad_output,
self,
target,
weight,
weight_opt,
reduction,
ignore_index,
total_weight,
@ -483,20 +475,12 @@ Tensor nll_loss2d_backward_cpu(
}

Tensor & nll_loss2d_out(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, int64_t ignore_index, Tensor & output) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

Tensor total_weight = at::empty({0}, self.options());
return std::get<0>(at::nll_loss2d_forward_out(output, total_weight, self, target, weight, reduction, ignore_index));
return std::get<0>(at::nll_loss2d_forward_out(output, total_weight, self, target, weight_opt, reduction, ignore_index));
}

Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, c10::SymInt ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;

return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, std::move(ignore_index)));
return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight_opt, reduction, std::move(ignore_index)));
}

} // namespace at::native
@ -13,21 +13,62 @@ struct TensorIterator;
|
||||
|
||||
namespace at::native {
|
||||
|
||||
using index_put_with_sort_fn = void(*)(Tensor &, const c10::List<std::optional<Tensor>> &, const Tensor &, bool accumulate, bool unsafe);
|
||||
using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List<std::optional<Tensor>>& indices, const Tensor& value, double scale, int zero_point, bool unsafe);
|
||||
using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
|
||||
using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
|
||||
using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src);
|
||||
using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
|
||||
using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
|
||||
const Tensor& src, const ReductionType& reduce);
|
||||
using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
|
||||
const Scalar& value, const ReductionType& reduce);
|
||||
using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
|
||||
const Tensor& src, const ReductionType& reduce);
|
||||
using index_put_with_sort_fn = void (*)(
|
||||
Tensor&,
|
||||
const c10::List<std::optional<Tensor>>&,
|
||||
const Tensor&,
|
||||
bool accumulate,
|
||||
bool unsafe);
|
||||
using index_put_with_sort_quantized_fn = void (*)(
|
||||
Tensor& self,
|
||||
const c10::List<std::optional<Tensor>>& indices,
|
||||
const Tensor& value,
|
||||
double scale,
|
||||
int zero_point,
|
||||
bool unsafe);
|
||||
using gather_fn = void (*)(
|
||||
const Tensor& result,
|
||||
const Tensor& self,
|
||||
int64_t dim,
|
||||
const Tensor& index);
|
||||
using scatter_fn = void (*)(
|
||||
const Tensor& self,
|
||||
int64_t dim,
|
||||
const Tensor& index,
|
||||
const Tensor& src);
|
||||
using scatter_fill_fn = void (*)(
|
||||
const Tensor& self,
|
||||
int64_t dim,
|
||||
const Tensor& index,
|
||||
const Scalar& src);
|
||||
using scatter_add_fn = void (*)(
|
||||
const Tensor& self,
|
||||
int64_t dim,
|
||||
const Tensor& index,
|
||||
const Tensor& src);
|
||||
using scatter_reduce_fn = void (*)(
|
||||
const Tensor& self,
|
||||
const int64_t dim,
|
||||
const Tensor& index,
|
||||
const Tensor& src,
|
||||
const ReductionType& reduce);
|
||||
using scatter_scalar_reduce_fn = void (*)(
|
||||
const Tensor& self,
|
||||
const int64_t dim,
|
||||
const Tensor& index,
|
||||
const Scalar& value,
|
||||
const ReductionType& reduce);
|
||||
using scatter_reduce_two_fn = void (*)(
|
||||
const Tensor& self,
|
||||
const int64_t dim,
|
||||
const Tensor& index,
|
||||
const Tensor& src,
|
||||
const ReductionType& reduce);
|
||||
|
||||
DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub)
|
||||
DECLARE_DISPATCH(index_put_with_sort_quantized_fn, index_put_with_sort_quantized_stub)
|
||||
DECLARE_DISPATCH(
|
||||
index_put_with_sort_quantized_fn,
|
||||
index_put_with_sort_quantized_stub)
|
||||
DECLARE_DISPATCH(gather_fn, gather_stub)
|
||||
DECLARE_DISPATCH(scatter_fn, scatter_stub)
|
||||
DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub)
|
||||
@ -36,14 +77,26 @@ DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub)
|
||||
DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub)
|
||||
DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub)
|
||||
|
||||
TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List<std::optional<at::Tensor>>& indices);
|
||||
TORCH_API Tensor& index_out(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
const c10::List<std::optional<at::Tensor>>& indices);
|
||||
|
||||
using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&);
|
||||
using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool);
|
||||
using gather_expanded_index_fn = void (*)(const Tensor&, const Tensor&, const Tensor&);
|
||||
using scatter_add_expanded_index_fn =
|
||||
void (*)(const Tensor&, const Tensor&, const Tensor&);
|
||||
using scatter_reduce_expanded_index_fn = void (*)(
|
||||
const Tensor&,
|
||||
const Tensor&,
|
||||
const Tensor&,
|
||||
const ReductionType& reduce,
|
||||
bool);
|
||||
using gather_expanded_index_fn =
|
||||
void (*)(const Tensor&, const Tensor&, const Tensor&);
|
||||
|
||||
DECLARE_DISPATCH(scatter_add_expanded_index_fn, scatter_add_expanded_index_stub)
|
||||
DECLARE_DISPATCH(scatter_reduce_expanded_index_fn, scatter_reduce_expanded_index_stub)
|
||||
DECLARE_DISPATCH(
|
||||
scatter_reduce_expanded_index_fn,
|
||||
scatter_reduce_expanded_index_stub)
|
||||
DECLARE_DISPATCH(gather_expanded_index_fn, gather_expanded_index_stub)
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -23,28 +23,38 @@ inline std::string shapes_as_str(TensorList tensors) {
|
||||
#endif
|
||||
} // anonymous namespace
|
||||
|
||||
inline std::tuple<bool, Tensor> canDispatchToMaskedFill(const Tensor& self, const torch::List<std::optional<at::Tensor>>& indices,
|
||||
const Tensor& value){
|
||||
if (!(value.numel() ==1 && value.device().is_cpu())){
|
||||
return std::make_tuple(false,Tensor());
|
||||
inline std::tuple<bool, Tensor> canDispatchToMaskedFill(
|
||||
const Tensor& self,
|
||||
const torch::List<std::optional<at::Tensor>>& indices,
|
||||
const Tensor& value) {
|
||||
if (!(value.numel() == 1 && value.device().is_cpu())) {
|
||||
return std::make_tuple(false, Tensor());
|
||||
}
|
||||
int64_t num_ind = 0;
|
||||
Tensor mask;
|
||||
auto self_device = self.device();
|
||||
for (const std::optional<Tensor>& i: indices) {
|
||||
if (!i.has_value() || !(*i).defined()){
|
||||
for (const std::optional<Tensor>& i : indices) {
|
||||
if (!i.has_value() || !(*i).defined()) {
|
||||
num_ind++;
|
||||
} else {
|
||||
const Tensor &index = *i;
|
||||
const Tensor& index = *i;
|
||||
if ((index.scalar_type() != kByte && index.scalar_type() != kBool) ||
|
||||
index.device() != self_device || mask.defined()){
|
||||
index.device() != self_device || mask.defined()) {
|
||||
return std::make_tuple(false, Tensor());
|
||||
} else {
|
||||
mask = index;
|
||||
for (const auto j : c10::irange(index.dim())) {
|
||||
int64_t srcIdx = num_ind + j;
|
||||
TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j,
|
||||
" does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx);
|
||||
TORCH_CHECK_INDEX(
|
||||
index.size(j) == self.size(srcIdx),
|
||||
"The shape of the mask ",
|
||||
index.sizes(),
|
||||
" at index ",
|
||||
j,
|
||||
" does not match the shape of the indexed tensor ",
|
||||
self.sizes(),
|
||||
" at index ",
|
||||
srcIdx);
|
||||
}
|
||||
num_ind += mask.ndimension();
|
||||
}
|
||||
@ -59,14 +69,18 @@ const Tensor& value){
|
||||
|
||||
inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
|
||||
checkIndexTensorTypes(orig, /*allow_int*/ true);
|
||||
// first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
|
||||
// first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more
|
||||
// LongTensors
|
||||
auto indices = expandTensors(self, orig);
|
||||
// next broadcast all index tensors together
|
||||
try {
|
||||
indices = expand_outplace(indices);
|
||||
} catch (std::exception& e) {
|
||||
TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together"
|
||||
" with shapes ", shapes_as_str(indices));
|
||||
TORCH_CHECK_INDEX(
|
||||
false,
|
||||
"shape mismatch: indexing tensors could not be broadcast together"
|
||||
" with shapes ",
|
||||
shapes_as_str(indices));
|
||||
}
|
||||
// add missing null Tensors so that it matches self.dim()
|
||||
while (indices.size() < (size_t)self.dim()) {
|
||||
@ -78,12 +92,12 @@ inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
|
||||
std::tie(self, indices) = transposeToFront(self, indices);
|
||||
}
|
||||
// Ensure indices are on the same device as self
|
||||
for (auto & indice : indices) {
|
||||
for (auto& indice : indices) {
|
||||
if (indice.defined() && indice.device() != self.device()) {
|
||||
indice = indice.to(self.device());
|
||||
}
|
||||
}
|
||||
for (auto & indice : indices) {
|
||||
for (auto& indice : indices) {
|
||||
if (indice.defined() && indice.dtype() == at::kInt) {
|
||||
indice = indice.to(at::kLong);
|
||||
}
|
||||
|
||||
@ -10,7 +10,7 @@ namespace at {
|
||||
class Tensor;
|
||||
struct TensorIterator;
|
||||
struct TensorIteratorBase;
|
||||
}
|
||||
} // namespace at
|
||||
|
||||
namespace at::native {
|
||||
|
||||
@ -22,28 +22,35 @@ using structured_reduce_minmax_fn =
|
||||
DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub)
|
||||
DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub)
|
||||
|
||||
using where_fn = void (*)(TensorIterator &);
|
||||
using where_fn = void (*)(TensorIterator&);
|
||||
DECLARE_DISPATCH(where_fn, where_kernel)
|
||||
|
||||
using is_infinity_op_fn = void (*)(TensorIteratorBase &);
|
||||
using is_infinity_op_fn = void (*)(TensorIteratorBase&);
|
||||
DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub)
|
||||
DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub)
|
||||
|
||||
using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool);
|
||||
DECLARE_DISPATCH(mode_fn, mode_stub)
|
||||
|
||||
using clamp_tensor_fn = void (*)(TensorIteratorBase &);
|
||||
using clamp_tensor_fn = void (*)(TensorIteratorBase&);
|
||||
DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub)
|
||||
|
||||
namespace detail {
|
||||
enum class ClampLimits {Min, Max, MinMax};
|
||||
enum class ClampLimits { Min, Max, MinMax };
|
||||
}
|
||||
|
||||
DECLARE_DISPATCH(void (*)(TensorIteratorBase &, const c10::Scalar&, const c10::Scalar&), clamp_scalar_stub)
|
||||
DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_min_scalar_stub)
|
||||
DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_max_scalar_stub)
|
||||
DECLARE_DISPATCH(
|
||||
void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&),
|
||||
clamp_scalar_stub)
|
||||
DECLARE_DISPATCH(
|
||||
void (*)(TensorIteratorBase&, c10::Scalar),
|
||||
clamp_min_scalar_stub)
|
||||
DECLARE_DISPATCH(
|
||||
void (*)(TensorIteratorBase&, c10::Scalar),
|
||||
clamp_max_scalar_stub)
|
||||
|
||||
using isin_default_fn = void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
|
||||
using isin_default_fn =
|
||||
void (*)(const Tensor&, const Tensor&, bool, const Tensor&);
|
||||
DECLARE_DISPATCH(isin_default_fn, isin_default_stub)
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -7,7 +7,7 @@
#include <optional>

namespace at {
class Tensor;
class Tensor;
namespace native {
bool to_will_alias(
const Tensor& self,
@ -20,7 +20,12 @@ bool to_will_alias(
Tensor to_meta(const Tensor& tensor);
std::optional<Tensor> to_meta(const std::optional<Tensor>& tensor);
std::vector<Tensor> to_meta(at::ITensorListRef t_list);
Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, std::optional<c10::Layout> layout, OptionalIntArrayRef blocksize, std::optional<int64_t> dense_dim_opt);
Tensor dense_to_sparse_with_mask(
const Tensor& self,
const Tensor& mask,
std::optional<c10::Layout> layout,
OptionalIntArrayRef blocksize,
std::optional<int64_t> dense_dim_opt);

} // namespace native
} // namespace at
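The `to_meta` overloads declared above produce shape-and-dtype-only copies on the meta device, which is what lets shape reasoning run without allocating storage. A minimal usage sketch of the public analogue (assuming a standard ATen build):

```cpp
#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::randn({2, 3});
  // A tensor on the "meta" device carries sizes/strides/dtype but no data.
  at::Tensor m = x.to(at::kMeta);
  TORCH_CHECK(m.is_meta());
  TORCH_CHECK(m.sizes() == x.sizes());
}
```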
@ -3,10 +3,15 @@
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at::native {
|
||||
//input tensors are non-zero dim and non-empty
|
||||
template<typename T1, typename T2, typename Function>
|
||||
// input tensors are non-zero dim and non-empty
|
||||
template <typename T1, typename T2, typename Function>
|
||||
|
||||
void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) {
|
||||
void tensor_dim_apply3(
|
||||
const Tensor& self,
|
||||
Tensor& values,
|
||||
Tensor& indices,
|
||||
int64_t dim,
|
||||
Function func) {
|
||||
int ndims = self.dim();
|
||||
int tensor_dim_apply_has_finished = 0;
|
||||
std::vector<int64_t> counter(ndims, 0);
|
||||
@ -19,9 +24,16 @@ void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int6
|
||||
int self_dim_size = self.size(dim);
|
||||
|
||||
while (!tensor_dim_apply_has_finished) {
|
||||
func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride);
|
||||
func(
|
||||
self_data,
|
||||
values_data,
|
||||
indices_data,
|
||||
self_dim_size,
|
||||
self_stride,
|
||||
values_stride,
|
||||
indices_stride);
|
||||
if (ndims == 1) {
|
||||
break;
|
||||
break;
|
||||
}
|
||||
for (const auto dim_i : c10::irange(ndims)) {
|
||||
if (dim_i == dim) {
|
||||
@ -37,18 +49,18 @@ void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int6
|
||||
indices_data += indices.stride(dim_i);
|
||||
|
||||
if (counter[dim_i] == self.size(dim_i)) {
|
||||
if (dim_i == ndims-1) {
|
||||
if (dim_i == ndims - 1) {
|
||||
tensor_dim_apply_has_finished = 1;
|
||||
break;
|
||||
} else {
|
||||
self_data -= counter[dim_i]*self.stride(dim_i);
|
||||
values_data -= counter[dim_i]*values.stride(dim_i);
|
||||
indices_data -= counter[dim_i]*indices.stride(dim_i);
|
||||
self_data -= counter[dim_i] * self.stride(dim_i);
|
||||
values_data -= counter[dim_i] * values.stride(dim_i);
|
||||
indices_data -= counter[dim_i] * indices.stride(dim_i);
|
||||
counter[dim_i] = 0;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
@ -41,9 +41,9 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
|
||||
return 0;
|
||||
}
|
||||
// number of elements in the first row of the tril
|
||||
auto m_first_row = offset > 0 ?
|
||||
std::min<int64_t>(col, 1 + offset) : // upper bounded by col
|
||||
row + offset > 0; // either 0 or 1
|
||||
auto m_first_row = offset > 0 ? std::min<int64_t>(col, 1 + offset)
|
||||
: // upper bounded by col
|
||||
row + offset > 0; // either 0 or 1
|
||||
// number of elements in the last row of the tril, bounded by [0, col]
|
||||
auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
|
||||
// number of rows, bounded by [0, row]
|
||||
@ -63,35 +63,49 @@ inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
|
||||
}
|
||||
|
||||
inline void check_args(
|
||||
int64_t row, int64_t col, std::optional<Layout> layout_opt) {
|
||||
int64_t row,
|
||||
int64_t col,
|
||||
std::optional<Layout> layout_opt) {
|
||||
TORCH_CHECK(row >= 0, "row must be non-negative, got", row);
|
||||
TORCH_CHECK(col >= 0, "col must be non-negative, got", col);
|
||||
if (layout_opt.has_value()) {
|
||||
TORCH_CHECK(
|
||||
*layout_opt == at::kStrided,
|
||||
"only support layout=torch.strided, got",
|
||||
*layout_opt)
|
||||
*layout_opt == at::kStrided,
|
||||
"only support layout=torch.strided, got",
|
||||
*layout_opt)
|
||||
}
|
||||
}
|
||||
|
||||
using at::check_size_nonnegative;
|
||||
|
||||
// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n))
|
||||
inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) {
|
||||
inline void check_supported_max_int_with_precision(
|
||||
int64_t n,
|
||||
const Tensor& tensor) {
|
||||
// match defined() to behavior of checks below
|
||||
TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(),
|
||||
"n is too large for result tensor type: '", tensor.toString(), "'");
|
||||
TORCH_CHECK(
|
||||
at::scalar_tensor(n > 0 ? n - 1 : n, tensor.options()).defined(),
|
||||
"n is too large for result tensor type: '",
|
||||
tensor.toString(),
|
||||
"'");
|
||||
|
||||
// Ensure sufficient precision for floating point representation.
|
||||
switch (tensor.scalar_type()) {
|
||||
case at::ScalarType::Half:
|
||||
TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type.");
|
||||
TORCH_CHECK(
|
||||
n <= (int64_t(1) << 11) + 1,
|
||||
"n cannot be greater than 2049 for Half type.");
|
||||
break;
|
||||
case at::ScalarType::Float:
|
||||
TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type.");
|
||||
TORCH_CHECK(
|
||||
n <= (int64_t(1) << 24) + 1,
|
||||
"n cannot be greater than 2^24+1 for Float type.");
|
||||
break;
|
||||
case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to check
|
||||
TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type.");
|
||||
case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to
|
||||
// check
|
||||
TORCH_CHECK(
|
||||
n <= (int64_t(1) << 53) + 1,
|
||||
"n cannot be greater than 2^53+1 for Double type.");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@ -104,14 +118,24 @@ inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tens
|
||||
inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
|
||||
if (tensor.is_floating_point() || tensor.is_complex()) {
|
||||
AT_DISPATCH_V2(
|
||||
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
|
||||
tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
|
||||
}), AT_EXPAND(AT_FLOATING_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), AT_EXPAND(AT_FLOAT8_TYPES), kBFloat16, kHalf, kComplexHalf);
|
||||
tensor.scalar_type(),
|
||||
"fill_empty_deterministic_",
|
||||
AT_WRAP([&]() {
|
||||
tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
|
||||
}),
|
||||
AT_EXPAND(AT_FLOATING_TYPES),
|
||||
AT_EXPAND(AT_COMPLEX_TYPES),
|
||||
AT_EXPAND(AT_FLOAT8_TYPES),
|
||||
kBFloat16,
|
||||
kHalf,
|
||||
kComplexHalf);
|
||||
} else {
|
||||
AT_DISPATCH_V2(
|
||||
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
|
||||
tensor.fill_(std::numeric_limits<scalar_t>::max());
|
||||
}), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2));
|
||||
tensor.scalar_type(),
|
||||
"fill_empty_deterministic_",
|
||||
AT_WRAP([&]() { tensor.fill_(std::numeric_limits<scalar_t>::max()); }),
|
||||
kBool,
|
||||
AT_EXPAND(AT_INTEGRAL_TYPES_V2));
|
||||
}
|
||||
return tensor;
|
||||
}
|
||||
@ -130,7 +154,10 @@ struct ZeroTensorAllocator final : public at::Allocator {
|
||||
DeleterFnPtr raw_deleter() const override {
|
||||
return deleter;
|
||||
}
|
||||
void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const final {}
|
||||
void copy_data(
|
||||
void* dest [[maybe_unused]],
|
||||
const void* src [[maybe_unused]],
|
||||
std::size_t count [[maybe_unused]]) const final {}
|
||||
at::Device device_;
|
||||
};
|
||||
|
||||
|
||||
@ -1,39 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
#include <complex>
|
||||
#include <type_traits>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <ATen/detail/FunctionTraits.h>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <complex>
|
||||
#include <type_traits>
|
||||
|
||||
// This file includes utilities for dynamic_casting done by TensorIterator, see
|
||||
// CUDALoops.cuh and Loops.h.
|
||||
|
||||
// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h.
|
||||
|
||||
// dynamic_casting handles when the types expected by the iterator do not match the types of the arguments
|
||||
// to the function that is being called.
|
||||
// On CUDA, the cast is currently pushed down into the kernel (for performance reasons).
|
||||
// On CPU, there is currently an internal assert that a dynamic_cast is not needed.
|
||||
// dynamic_casting handles when the types expected by the iterator do not match
|
||||
// the types of the arguments to the function that is being called. On CUDA, the
|
||||
// cast is currently pushed down into the kernel (for performance reasons). On
|
||||
// CPU, there is currently an internal assert that a dynamic_cast is not needed.
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// `needs_dynamic_casting` compares the types expected by iterator
|
||||
// (i.e. dtypes of the operands) with the actual type of the arguments
|
||||
// (and returns) of func_t
|
||||
template<typename func_t, int nargs=function_traits<func_t>::arity>
|
||||
template <typename func_t, int nargs = function_traits<func_t>::arity>
|
||||
struct needs_dynamic_casting {
|
||||
static bool check(TensorIteratorBase& iter) {
|
||||
using traits = function_traits<func_t>;
|
||||
using cpp_type = typename traits::template arg<nargs - 1>::type;
|
||||
using cpp_map = c10::CppTypeToScalarType<cpp_type>;
|
||||
|
||||
if (iter.input_dtype(nargs-1) != cpp_map::value) {
|
||||
if (iter.input_dtype(nargs - 1) != cpp_map::value) {
|
||||
return true;
|
||||
}
|
||||
return needs_dynamic_casting<func_t, nargs - 1>::check(iter);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename func_t>
|
||||
template <typename func_t>
|
||||
struct needs_dynamic_casting<func_t, 0> {
|
||||
static bool check(TensorIteratorBase& iter) {
|
||||
using traits = function_traits<func_t>;
|
||||
@ -49,4 +49,4 @@ struct needs_dynamic_casting<func_t, 0> {
|
||||
}
|
||||
};
|
||||
|
||||
} //namespace at::native
|
||||
} // namespace at::native
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/TensorIteratorInternal.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
@ -22,7 +22,9 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop);
|
||||
static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop);
|
||||
|
||||
void TensorIteratorBase::parallel_reduce(loop2d_t loop) {
|
||||
TORCH_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output");
|
||||
TORCH_CHECK(
|
||||
ntensors() == 2,
|
||||
"parallel_reduce only supports one input and one output");
|
||||
int64_t numel = this->numel();
|
||||
if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
|
||||
at::in_parallel_region()) {
|
||||
@ -54,18 +56,24 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) {
|
||||
auto first_reduce = TensorIterator::reduce_op(buffer_0, iter.input(0));
|
||||
TORCH_INTERNAL_ASSERT(first_reduce.output(0).is_alias_of(buffer_0));
|
||||
|
||||
at::parallel_for(0, iter.numel(), internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
|
||||
const auto thread_num = at::get_thread_num();
|
||||
auto shape = first_reduce.shape();
|
||||
auto strides = first_reduce.get_strides();
|
||||
at::parallel_for(
|
||||
0, iter.numel(), internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) {
|
||||
const auto thread_num = at::get_thread_num();
|
||||
auto shape = first_reduce.shape();
|
||||
auto strides = first_reduce.get_strides();
|
||||
|
||||
// Bump output ptr so each thread has its own output slice
|
||||
auto base_ptrs = first_reduce.get_base_ptrs();
|
||||
base_ptrs[0] += buffer_stride * thread_num;
|
||||
// Bump output ptr so each thread has its own output slice
|
||||
auto base_ptrs = first_reduce.get_base_ptrs();
|
||||
base_ptrs[0] += buffer_stride * thread_num;
|
||||
|
||||
at::internal::serial_for_each(shape, strides, base_ptrs.data(),
|
||||
base_ptrs.size(), loop, {begin, end});
|
||||
});
|
||||
at::internal::serial_for_each(
|
||||
shape,
|
||||
strides,
|
||||
base_ptrs.data(),
|
||||
base_ptrs.size(),
|
||||
loop,
|
||||
{begin, end});
|
||||
});
|
||||
|
||||
auto final_reduce = TensorIterator::reduce_op(unsqueezed, buffer);
|
||||
final_reduce.for_each(loop);
|
||||
@ -91,8 +99,12 @@ static int find_split_dim(TensorIteratorBase& iter) {
|
||||
return best_dim;
|
||||
}
|
||||
|
||||
static std::tuple<int64_t, int64_t>
|
||||
round_columns(TensorIteratorBase& iter, int dim, int multiple, int64_t begin, int64_t end) {
|
||||
static std::tuple<int64_t, int64_t> round_columns(
|
||||
TensorIteratorBase& iter,
|
||||
int dim,
|
||||
int multiple,
|
||||
int64_t begin,
|
||||
int64_t end) {
|
||||
begin = begin - (begin % multiple);
|
||||
if (end != iter.shape()[dim]) {
|
||||
// only round the 'end' column down if it's not the final column
|
||||
@ -113,7 +125,8 @@ static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop) {
|
||||
// round columns to multiples of 128 bytes if adjacent columns are
|
||||
// contiguous in memory.
|
||||
int64_t cols_per_128_bytes = 128 / element_size;
|
||||
std::tie(begin, end) = round_columns(iter, dim, cols_per_128_bytes, begin, end);
|
||||
std::tie(begin, end) =
|
||||
round_columns(iter, dim, cols_per_128_bytes, begin, end);
|
||||
}
|
||||
if (begin == end) {
|
||||
return;
|
||||
@ -124,7 +137,9 @@ static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop) {
|
||||
});
|
||||
}
|
||||
|
||||
void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) {
|
||||
void TensorIteratorBase::foreach_reduced_elt(
|
||||
loop_subiter_t loop,
|
||||
bool parallelize) {
|
||||
AT_ASSERT(ninputs() == 1);
|
||||
AT_ASSERT(noutputs() >= 1);
|
||||
|
||||
@ -134,26 +149,26 @@ void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool paralleli
|
||||
}
|
||||
if (output(0).numel() == 1) {
|
||||
loop(*this);
|
||||
}
|
||||
else if (numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
|
||||
} else if (
|
||||
numel() < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
|
||||
at::in_parallel_region() || !parallelize) {
|
||||
auto reduce_dims = num_reduce_dims();
|
||||
|
||||
auto non_reduced_shape = shape.slice(reduce_dims, shape.size() - reduce_dims);
|
||||
auto non_reduced_shape =
|
||||
shape.slice(reduce_dims, shape.size() - reduce_dims);
|
||||
|
||||
int64_t non_reduced_numel = 1;
|
||||
for (const auto i : non_reduced_shape) {
|
||||
non_reduced_numel *= i;
|
||||
}
|
||||
DimCounter dims {non_reduced_shape, {0, non_reduced_numel}};
|
||||
DimCounter dims{non_reduced_shape, {0, non_reduced_numel}};
|
||||
while (!dims.is_done()) {
|
||||
TensorIterator reduced = *this;
|
||||
reduced.select_all_keeping_dim(reduce_dims, dims.values);
|
||||
loop(reduced);
|
||||
dims.increment({1, 1});
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int dim = find_split_dim(*this);
|
||||
int64_t cols = shape[dim];
|
||||
at::parallel_for(0, cols, 1, [&](int64_t begin, int64_t end) {
|
||||
@ -177,4 +192,4 @@ void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool paralleli
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace at
|
||||
} // namespace at
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/detail/CUDAHooksInterface.h>
|
||||
#include <ATen/native/TensorProperties.h>
|
||||
|
||||
@ -36,9 +36,10 @@ bool nested_is_same_size(const Tensor& self, const Tensor& other) {
|
||||
TORCH_CHECK(
|
||||
self.is_nested() && other.is_nested(),
|
||||
"Expected both self and other to be nested tensors. ",
|
||||
"Self ", self.is_nested()? "is " : "is not ",
|
||||
"Self ",
|
||||
self.is_nested() ? "is " : "is not ",
|
||||
"nested. While Other ",
|
||||
other.is_nested()? "is " : "is not ",
|
||||
other.is_nested() ? "is " : "is not ",
|
||||
"nested.")
|
||||
const auto self_nt_size = _nested_tensor_size(self);
|
||||
const auto other_nt_size = _nested_tensor_size(other);
|
||||
@ -79,16 +80,21 @@ int64_t stride(const Tensor& self, Dimname dim) {
|
||||
}
|
||||
|
||||
bool cudnn_is_acceptable(const TensorBase& self) {
|
||||
if (!globalContext().userEnabledCuDNN()) return false;
|
||||
if (!self.is_cuda()) return false;
|
||||
if (!globalContext().userEnabledCuDNN())
|
||||
return false;
|
||||
if (!self.is_cuda())
|
||||
return false;
|
||||
auto st = self.scalar_type();
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf)) return false;
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN()) return false;
|
||||
if (!(st == kDouble || st == kFloat || st == kHalf))
|
||||
return false;
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN())
|
||||
return false;
|
||||
// cuDNN functions like grid_sampler returns CUDNN_STATUS_BAD_PARAM on empty
|
||||
// tensors. Maybe some cuDNN functions actually support empty tensors, but
|
||||
// native/THNN kernels shouldn't be much slower because the output is also
|
||||
// likely empty.
|
||||
if (self.sym_numel() == 0) return false;
|
||||
if (self.sym_numel() == 0)
|
||||
return false;
|
||||
// NB: In the old Python code, there was also a test to see if the
|
||||
// cuDNN library was actually dynamically linked or not. I'm not
|
||||
// sure if we can actually test this.
|
||||
@ -99,9 +105,10 @@ bool cudnn_is_acceptable(const Tensor& self) {
|
||||
return cudnn_is_acceptable(static_cast<const TensorBase&>(self));
|
||||
}
|
||||
|
||||
Tensor & detach_(Tensor & self) {
|
||||
// this just exists to give us a hook in VariableType and an entry in Declarations.yaml
|
||||
//TORCH_CHECK(false, "detach_ is not implemented for Tensor");
|
||||
Tensor& detach_(Tensor& self) {
|
||||
// this just exists to give us a hook in VariableType and an entry in
|
||||
// Declarations.yaml
|
||||
// TORCH_CHECK(false, "detach_ is not implemented for Tensor");
|
||||
return self;
|
||||
}
|
||||
|
||||
@ -117,7 +124,8 @@ Tensor contiguous(const Tensor& self, MemoryFormat memory_format) {
|
||||
}
|
||||
|
||||
bool is_set_to(const Tensor& self, const Tensor& src) {
|
||||
if (self.storage().unsafeGetStorageImpl() == src.storage().unsafeGetStorageImpl() &&
|
||||
if (self.storage().unsafeGetStorageImpl() ==
|
||||
src.storage().unsafeGetStorageImpl() &&
|
||||
self.storage_offset() == src.storage_offset() &&
|
||||
self.dim() == src.dim()) {
|
||||
for (const auto d : c10::irange(self.dim())) {
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#pragma once
|
||||
#include <ATen/core/IListRef.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <ATen/core/IListRef.h>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
@ -11,45 +11,74 @@ inline bool cat_should_skip_tensor(const Tensor& t) {
|
||||
return t.sym_numel() == 0 && t.dim() == 1;
|
||||
}
|
||||
|
||||
// Check to see if the shape of tensors is compatible
|
||||
// for being concatenated along a given dimension.
|
||||
inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) {
|
||||
int64_t first_dims = first.dim();
|
||||
int64_t second_dims = second.dim();
|
||||
TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ",
|
||||
first_dims, " and ", second_dims);
|
||||
for (const auto dim : c10::irange(first_dims)) {
|
||||
if (dim == dimension) {
|
||||
continue;
|
||||
}
|
||||
int64_t first_dim_size = first.sizes()[dim];
|
||||
int64_t second_dim_size = second.sizes()[dim];
|
||||
TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ",
|
||||
dimension, ". Expected size ", static_cast<long long>(first_dim_size), " but got size ", static_cast<long long>(second_dim_size), " for tensor number ", index, " in the list.");
|
||||
}
|
||||
}
|
||||
// Check to see if the shape of tensors is compatible
|
||||
// for being concatenated along a given dimension.
|
||||
inline void check_cat_shape_except_dim(
|
||||
const Tensor& first,
|
||||
const Tensor& second,
|
||||
int64_t dimension,
|
||||
int64_t index) {
|
||||
int64_t first_dims = first.dim();
|
||||
int64_t second_dims = second.dim();
|
||||
TORCH_CHECK(
|
||||
first_dims == second_dims,
|
||||
"Tensors must have same number of dimensions: got ",
|
||||
first_dims,
|
||||
" and ",
|
||||
second_dims);
|
||||
for (const auto dim : c10::irange(first_dims)) {
|
||||
if (dim == dimension) {
|
||||
continue;
|
||||
}
|
||||
int64_t first_dim_size = first.sizes()[dim];
|
||||
int64_t second_dim_size = second.sizes()[dim];
|
||||
TORCH_CHECK(
|
||||
first_dim_size == second_dim_size,
|
||||
"Sizes of tensors must match except in dimension ",
|
||||
dimension,
|
||||
". Expected size ",
|
||||
static_cast<long long>(first_dim_size),
|
||||
" but got size ",
|
||||
static_cast<long long>(second_dim_size),
|
||||
" for tensor number ",
|
||||
index,
|
||||
" in the list.");
|
||||
}
|
||||
}
|
||||
|
||||
inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) {
|
||||
[[maybe_unused]] int64_t i = 0;
|
||||
for(const Tensor& t : tensors) {
|
||||
TORCH_CHECK(t.dim() > 0,
|
||||
"zero-dimensional tensor (at position ", i, ") cannot be concatenated");
|
||||
for (const Tensor& t : tensors) {
|
||||
TORCH_CHECK(
|
||||
t.dim() > 0,
|
||||
"zero-dimensional tensor (at position ",
|
||||
i,
|
||||
") cannot be concatenated");
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) {
|
||||
inline int64_t get_num_splits(
|
||||
const Tensor& self,
|
||||
int64_t split_size,
|
||||
int64_t dim) {
|
||||
TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor");
|
||||
TORCH_CHECK(split_size >= 0, "split expects split_size be non-negative, but got split_size=", split_size);
|
||||
TORCH_CHECK(
|
||||
split_size >= 0,
|
||||
"split expects split_size be non-negative, but got split_size=",
|
||||
split_size);
|
||||
int64_t dim_size = self.size(dim);
|
||||
TORCH_CHECK(split_size > 0 || dim_size == 0,
|
||||
"split_size can only be 0 if dimension size is 0, "
|
||||
"but got dimension size of ", dim_size);
|
||||
TORCH_CHECK(
|
||||
split_size > 0 || dim_size == 0,
|
||||
"split_size can only be 0 if dimension size is 0, "
|
||||
"but got dimension size of ",
|
||||
dim_size);
|
||||
// if split_size is 0 and dimension size is 0, there is 1 split.
|
||||
int64_t num_splits = 1;
|
||||
if (split_size != 0) {
|
||||
// ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size
|
||||
// (returns a single split). We might want to error here, but keep it for BC.
|
||||
// ensuring num_splits is at least 1 makes consistent the case where
|
||||
// split_size > dim_size (returns a single split). We might want to error
|
||||
// here, but keep it for BC.
|
||||
num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
|
||||
}
|
||||
return num_splits;
|
||||
@ -58,7 +87,7 @@ inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t di
|
||||
inline bool have_same_ndims(TensorList tensors) {
|
||||
auto ndim = tensors[0].dim();
|
||||
for (const auto tensor_idx : c10::irange(tensors.size())) {
|
||||
if(tensors[tensor_idx].dim() != ndim) {
|
||||
if (tensors[tensor_idx].dim() != ndim) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -67,35 +96,46 @@ inline bool have_same_ndims(TensorList tensors) {
|
||||
|
||||
inline void leading_dimension_matches(TensorList tensors, int64_t dim) {
|
||||
auto tensor_zero_size = tensors[0].sizes();
|
||||
std::vector<c10::SymInt> leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
|
||||
std::vector<c10::SymInt> leading_dim_sizes(
|
||||
tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
at::Tensor tensor = tensors[i];
|
||||
for(const auto j : c10::irange(dim)) {
|
||||
for (const auto j : c10::irange(dim)) {
|
||||
TORCH_CHECK(
|
||||
tensor.size(j) == leading_dim_sizes[j],
|
||||
"_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors"
|
||||
);
|
||||
tensor.size(j) == leading_dim_sizes[j],
|
||||
"_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) {
|
||||
inline int64_t preprocess_chunk_cat_inputs(
|
||||
TensorList tensors,
|
||||
int64_t dim,
|
||||
int64_t num_chunks) {
|
||||
TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks");
|
||||
TORCH_CHECK(!tensors.empty(),
|
||||
"_chunk_cat expects a non-empty input tensor list");
|
||||
TORCH_CHECK(
|
||||
!tensors.empty(), "_chunk_cat expects a non-empty input tensor list");
|
||||
auto expected_dtype = tensors[0].dtype();
|
||||
auto expected_device = tensors[0].device();
|
||||
for(const auto i : c10::irange(tensors.size())) {
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor");
|
||||
TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype");
|
||||
TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device");
|
||||
TORCH_CHECK(
|
||||
tensors[i].dtype() == expected_dtype,
|
||||
"_chunk_cat expects all input tensors with the same dtype");
|
||||
TORCH_CHECK(
|
||||
tensors[i].device() == expected_device,
|
||||
"_chunk_cat expects all inputs tensors on the same device");
|
||||
}
|
||||
if (have_same_ndims(tensors)) {
|
||||
dim = maybe_wrap_dim(dim, tensors[0].dim());
|
||||
} else {
|
||||
TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims")
|
||||
for(const auto i : c10::irange(tensors.size())) {
|
||||
TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors");
|
||||
TORCH_CHECK(
|
||||
dim >= 0,
|
||||
"_chunk_cat expects non-negative dim when input tensors have different ndims")
|
||||
for (const auto i : c10::irange(tensors.size())) {
|
||||
TORCH_CHECK(
|
||||
dim < tensors[i].ndimension(),
|
||||
"_chunk_cat expects dim < ndim for all input tensors");
|
||||
}
|
||||
}
|
||||
leading_dimension_matches(tensors, dim);
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/IndexKernel.h> // for flip_stub
|
||||
#include <ATen/native/TensorTransformations.h>
|
||||
#include <ATen/native/IndexKernel.h> // for flip_stub
|
||||
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
@ -44,28 +44,30 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
|
||||
int n = 0;
|
||||
auto strides = DimVector(self.strides());
|
||||
for (const auto i : c10::irange(total_dims)) {
|
||||
if(flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) {
|
||||
if (flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) {
|
||||
n++;
|
||||
strides[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Nothing to do, we return fast
|
||||
if (n == 0 || self.numel() <=1) {
|
||||
if (n == 0 || self.numel() <= 1) {
|
||||
out_tensor.copy_(self);
|
||||
return out_tensor;
|
||||
}
|
||||
|
||||
//create dummy output with 0 strides at flipped dimension, to prevent tensorIterator from coalescing flipped dims
|
||||
// create dummy output with 0 strides at flipped dimension, to prevent
|
||||
// tensorIterator from coalescing flipped dims
|
||||
const auto restrided_self = self.as_strided(self.sizes(), strides);
|
||||
auto iter = TensorIteratorConfig()
|
||||
.set_check_mem_overlap(false)
|
||||
.check_all_same_dtype(false)
|
||||
.declare_static_dtype_and_device(self.scalar_type(), self.device())
|
||||
.add_output(out_tensor)
|
||||
.add_const_input(self)
|
||||
.add_const_input(restrided_self)
|
||||
.build();
|
||||
auto iter =
|
||||
TensorIteratorConfig()
|
||||
.set_check_mem_overlap(false)
|
||||
.check_all_same_dtype(false)
|
||||
.declare_static_dtype_and_device(self.scalar_type(), self.device())
|
||||
.add_output(out_tensor)
|
||||
.add_const_input(self)
|
||||
.add_const_input(restrided_self)
|
||||
.build();
|
||||
|
||||
auto* data = reinterpret_cast<char*>(iter.data_ptr(0));
|
||||
const auto sizes = iter.shape();
|
||||
@ -83,11 +85,12 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
|
||||
// - We iterate in the opposite direction (invert the strides)
|
||||
|
||||
for (const auto i : c10::irange(iter.ndim())) {
|
||||
// We know that an dimension has a zero stride and self[i] does not, as we defined above
|
||||
// Note that it may be the case that strides_dummy[i] = 0 not because we set it, but because
|
||||
// strides_self[i] == 0. We do not want to do anything there
|
||||
// We know that an dimension has a zero stride and self[i] does not, as we
|
||||
// defined above Note that it may be the case that strides_dummy[i] = 0 not
|
||||
// because we set it, but because strides_self[i] == 0. We do not want to do
|
||||
// anything there
|
||||
if (strides_dummy[i] == 0 && strides_self[i] != 0) {
|
||||
data += strides_bytes[i] * (sizes[i]-1);
|
||||
data += strides_bytes[i] * (sizes[i] - 1);
|
||||
strides_bytes[i] *= -1;
|
||||
}
|
||||
}
|
||||
@ -99,7 +102,10 @@ Tensor flip(const Tensor& self, IntArrayRef dims) {
|
||||
return out_tensor;
|
||||
}
|
||||
|
||||
Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used by CPU and MPS dispatch.
|
||||
Tensor roll(
|
||||
const Tensor& self,
|
||||
IntArrayRef shifts,
|
||||
IntArrayRef dims) { // Used by CPU and MPS dispatch.
|
||||
if (dims.size() != 1 || shifts.size() != 1) {
|
||||
return roll_common(self, shifts, dims);
|
||||
}
|
||||
@ -115,7 +121,7 @@ Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used
|
||||
if (start < 0) {
|
||||
start = start + size;
|
||||
}
|
||||
auto t0 = self.narrow(dim, start, size-start);
|
||||
auto t0 = self.narrow(dim, start, size - start);
|
||||
auto t1 = self.narrow(dim, 0, start);
|
||||
return at::cat({std::move(t0), std::move(t1)}, dim);
|
||||
}
|
||||
@ -123,27 +129,38 @@ Tensor roll(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { // Used
|
||||
Tensor rot90(const Tensor& self, int64_t k, IntArrayRef dims) {
|
||||
const int64_t total_dims = self.dim(), total_rot_dims = dims.size();
|
||||
|
||||
TORCH_CHECK(total_rot_dims == 2,
|
||||
"expected total rotation dims == 2, but got dims = ", total_rot_dims);
|
||||
TORCH_CHECK(
|
||||
total_rot_dims == 2,
|
||||
"expected total rotation dims == 2, but got dims = ",
|
||||
total_rot_dims);
|
||||
|
||||
TORCH_CHECK(total_dims >= 2,
|
||||
"expected total dims >= 2, but got total dims = ", total_dims);
|
||||
TORCH_CHECK(
|
||||
total_dims >= 2,
|
||||
"expected total dims >= 2, but got total dims = ",
|
||||
total_dims);
|
||||
|
||||
TORCH_CHECK(dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims,
|
||||
"expected rotation dims to be different, but got dim0 = ", dims[0],
|
||||
" and dim1 = ", dims[1]);
|
||||
TORCH_CHECK(
|
||||
dims[0] != dims[1] && std::abs(dims[0] - dims[1]) != total_dims,
|
||||
"expected rotation dims to be different, but got dim0 = ",
|
||||
dims[0],
|
||||
" and dim1 = ",
|
||||
dims[1]);
|
||||
|
||||
// check range of dims
|
||||
TORCH_CHECK(dims[0] < total_dims && dims[0] >= -total_dims,
|
||||
"Rotation dim0 out of range, dim0 = ", dims[0]);
|
||||
TORCH_CHECK(
|
||||
dims[0] < total_dims && dims[0] >= -total_dims,
|
||||
"Rotation dim0 out of range, dim0 = ",
|
||||
dims[0]);
|
||||
|
||||
TORCH_CHECK(dims[1] < total_dims && dims[1] >= -total_dims,
|
||||
"Rotation dim1 out of range, dim1 = ", dims[1]);
|
||||
TORCH_CHECK(
|
||||
dims[1] < total_dims && dims[1] >= -total_dims,
|
||||
"Rotation dim1 out of range, dim1 = ",
|
||||
dims[1]);
|
||||
|
||||
// handle modulo with negative k
|
||||
k = (4 + (k % 4)) % 4;
|
||||
|
||||
switch(k) {
|
||||
switch (k) {
|
||||
case 1:
|
||||
return self.flip({dims[1]}).transpose_(dims[0], dims[1]);
|
||||
case 2:
|
||||
@ -181,7 +198,8 @@ std::vector<Tensor> atleast_1d(TensorList tensors) {
|
||||
auto transform_lambda = [](const Tensor& input) -> Tensor {
|
||||
return at::native::atleast_1d(input);
|
||||
};
|
||||
std::transform(tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
|
||||
std::transform(
|
||||
tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -202,7 +220,8 @@ std::vector<Tensor> atleast_2d(TensorList tensors) {
|
||||
auto transform_lambda = [](const Tensor& input) -> Tensor {
|
||||
return at::native::atleast_2d(input);
|
||||
};
|
||||
std::transform(tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
|
||||
std::transform(
|
||||
tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -226,7 +245,8 @@ std::vector<Tensor> atleast_3d(TensorList tensors) {
|
||||
auto transform_lambda = [](const Tensor& input) -> Tensor {
|
||||
return at::native::atleast_3d(input);
|
||||
};
|
||||
std::transform(tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
|
||||
std::transform(
|
||||
tensors.cbegin(), tensors.cend(), result.begin(), transform_lambda);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@ -10,16 +10,21 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) {
|
||||
static inline Tensor roll_common(
|
||||
const Tensor& self,
|
||||
IntArrayRef shifts,
|
||||
IntArrayRef dims) {
|
||||
TORCH_CHECK(!shifts.empty(), "`shifts` required");
|
||||
if (dims.empty() && shifts.size() == 1) {
|
||||
auto flattened = self.contiguous().view(self.numel());
|
||||
return roll(flattened, shifts[0], 0).view(self.sizes());
|
||||
}
|
||||
TORCH_CHECK(
|
||||
shifts.size() == dims.size(),
|
||||
"shifts and dimensions must align. shifts: ", shifts.size(), ", dims:", dims.size()
|
||||
);
|
||||
shifts.size() == dims.size(),
|
||||
"shifts and dimensions must align. shifts: ",
|
||||
shifts.size(),
|
||||
", dims:",
|
||||
dims.size());
|
||||
AT_ASSERT(dims.size() > 1);
|
||||
auto tail_shifts = shifts.slice(1);
|
||||
auto tail_dims = dims.slice(1);
|
||||
@ -27,4 +32,4 @@ static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArra
|
||||
return at::roll(first_dim_rolled, tail_shifts, tail_dims);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
} // namespace at::native
|
||||
|
||||
@ -63,13 +63,9 @@ void binary_cross_entropy_backward_out_kernel(Tensor& grad_input, const Tensor&
|
||||
namespace at::native {
|
||||
|
||||
Tensor binary_cross_entropy_cuda(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
|
||||
// See [Note: hacky wrapper removal for optional tensor]
|
||||
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
|
||||
const Tensor& weight = *weight_maybe_owned;
|
||||
|
||||
Tensor loss = at::empty_like(input);
|
||||
return at::native::binary_cross_entropy_out_cuda(
|
||||
input, target, weight, reduction, loss);
|
||||
input, target, weight_opt, reduction, loss);
|
||||
}
|
||||
|
||||
Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& loss) {
|
||||
@ -122,13 +118,9 @@ Tensor& binary_cross_entropy_out_cuda(const Tensor& input, const Tensor& target,
|
||||
}
|
||||
|
||||
Tensor binary_cross_entropy_backward_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction) {
|
||||
// See [Note: hacky wrapper removal for optional tensor]
|
||||
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
|
||||
const Tensor& weight = *weight_maybe_owned;
|
||||
|
||||
Tensor grad_input = at::empty_like(input);
|
||||
return at::native::binary_cross_entropy_backward_out_cuda(
|
||||
grad, input, target, weight, reduction, grad_input);
|
||||
grad, input, target, weight_opt, reduction, grad_input);
|
||||
}
|
||||
|
||||
Tensor& binary_cross_entropy_backward_out_cuda(const Tensor& grad, const Tensor& input, const Tensor& target, const std::optional<Tensor>& weight_opt, int64_t reduction, Tensor& grad_input) {
|
||||
|
||||
@ -75,8 +75,6 @@ struct ReduceConfig {
|
||||
static constexpr int BLOCK_Y = 1;
|
||||
static constexpr int CTA = 2;
|
||||
|
||||
static constexpr int input_vec_size = 4;
|
||||
|
||||
ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs)
|
||||
: element_size_bytes(element_size_bytes)
|
||||
, num_inputs(num_inputs)
|
||||
@ -286,7 +284,6 @@ struct ReduceJitOp {
|
||||
//TODO for now arg_t is always opmath_t of the input, later we'll need to change it
|
||||
using arg_t = at::opmath_type<scalar_t>;
|
||||
|
||||
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
|
||||
//TODO - ReduceJitOp will probably need to be changed for reductions that need full functor,
|
||||
//not just wrapper
|
||||
arg_t ident;
|
||||
@ -336,7 +333,7 @@ struct ReduceJitOp {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename scalar_t, typename ops_t, typename index_t, typename out_scalar_t=scalar_t, int vt0=4>
|
||||
template <typename scalar_t, typename ops_t, typename index_t, typename out_scalar_t=scalar_t, int vt0=4, int input_vec_size=vt0>
|
||||
struct ReduceOp {
|
||||
using traits = function_traits<decltype(&ops_t::reduce)>;
|
||||
using arg_t = typename std::decay<typename traits::template arg<0>::type>::type;
|
||||
@ -348,8 +345,6 @@ struct ReduceOp {
|
||||
std::is_convertible_v<arg_t, out_scalar_t>
|
||||
&& std::is_convertible_v<out_scalar_t, arg_t>;
|
||||
|
||||
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
|
||||
|
||||
ops_t ops;
|
||||
arg_t ident;
|
||||
ReduceConfig config;
|
||||
@ -996,7 +991,7 @@ int get_output_vec_size(const TensorIterator &iter) {
|
||||
return vec_size;
|
||||
}
|
||||
|
||||
template<typename arg_t, typename scalar_t, int vt0>
|
||||
template<typename arg_t, typename scalar_t, int vt0, int input_vec_size=vt0>
|
||||
ReduceConfig setReduceConfig(const TensorIterator& iter){
|
||||
// Start by assuming that each thread handles a single output and all
|
||||
// the inputs for that output.
|
||||
@ -1063,12 +1058,16 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
|
||||
// threads with different threadIdx.x are independent and will produce results for different outputs.
|
||||
// In such case, values in each loaded vector always correspond to different outputs.
|
||||
if (fastest_moving_stride == sizeof(scalar_t)) {
|
||||
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) {
|
||||
#ifdef USE_ROCM
|
||||
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1) {
|
||||
#else
|
||||
if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) {
|
||||
#endif
|
||||
// Case 1: "vectorize along input"
|
||||
// Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
|
||||
// we should avoid vectorization.
|
||||
config.vectorize_input = true;
|
||||
dim0 /= config.input_vec_size;
|
||||
dim0 /= input_vec_size;
|
||||
} else if (!reduction_on_fastest_striding_dimension) {
|
||||
// Case 2: "vectorize along output"
|
||||
config.output_vec_size = get_output_vec_size<scalar_t>(iter);
|
||||
@ -1123,7 +1122,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
|
||||
// Control the number of threadblocks by adjusting the maximum number of
|
||||
// threads per multi-processor. These numbers better reflect the maximum
|
||||
// theoretical achievable threads per MP for the reduction operation.
|
||||
if (iter.ndim() == 1)
|
||||
if (iter.ndim() == 1 || iter.ndim() == 3)
|
||||
max_threads_per_mp = 512;
|
||||
if (iter.ndim() == 2)
|
||||
max_threads_per_mp = 256;
|
||||
@ -1169,7 +1168,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){
|
||||
return config;
|
||||
};
|
||||
|
||||
template <typename scalar_t, typename out_scalar_t, int vt0=4, typename ops_t, typename ident_t=double>
|
||||
template <typename scalar_t, typename out_scalar_t, int vt0=4, int input_vec_size=vt0, typename ops_t, typename ident_t=double>
|
||||
inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0,
|
||||
AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) {
|
||||
AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1);
|
||||
@ -1221,7 +1220,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
|
||||
for (auto& sub_iter : iter.with_32bit_indexing()) {
|
||||
int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
|
||||
|
||||
gpu_reduce_kernel<scalar_t, out_scalar_t, vt0>(sub_iter, ops, ident,
|
||||
gpu_reduce_kernel<scalar_t, out_scalar_t, vt0, input_vec_size>(sub_iter, ops, ident,
|
||||
acc_buf_ptr, sub_iter_base_idx);
|
||||
}
|
||||
return;
|
||||
@ -1238,7 +1237,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
|
||||
}
|
||||
char* acc_data = acc_buf_ptr->get_acc_slice(out_data);
|
||||
|
||||
ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0>(iter);
|
||||
ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0, input_vec_size>(iter);
|
||||
at::DataPtr buffer;
|
||||
at::DataPtr semaphores;
|
||||
if (config.should_global_reduce()) {
|
||||
@ -1253,7 +1252,7 @@ inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t id
|
||||
AT_ASSERT(can_use_32bit_indexing);
|
||||
auto output_calc = make_output_calculator<uint32_t>(iter);
|
||||
auto input_calc = make_input_calculator<uint32_t>(iter);
|
||||
auto reduce = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0>(
|
||||
auto reduce = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0, input_vec_size>(
|
||||
ops,
|
||||
config,
|
||||
input_calc,
|
||||
|
||||
@ -13,6 +13,20 @@ namespace at::native {
|
||||
template <typename scalar_t, typename acc_t = scalar_t, typename out_t = scalar_t>
|
||||
struct sum_functor {
|
||||
void operator()(TensorIterator& iter) {
|
||||
#ifdef USE_ROCM
|
||||
// Half and BFloat16 can be packed in groups of up to 8 elements and
|
||||
// can use *_DWORDX4 instructions to achieve that.
|
||||
const bool is_16_bits =
|
||||
( (std::is_same<at::Half, scalar_t>::value) ||
|
||||
(std::is_same<at::BFloat16, scalar_t>::value) );
|
||||
if (is_16_bits) {
|
||||
gpu_reduce_kernel<scalar_t, out_t, /*vt0=*/4, /*input_vec_size=*/8>(
|
||||
iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
|
||||
return a + b;
|
||||
}));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
gpu_reduce_kernel<scalar_t, out_t>(
|
||||
iter, func_wrapper<out_t>([] GPU_LAMBDA(acc_t a, acc_t b) -> acc_t {
|
||||
return a + b;
|
||||
|
||||
@ -190,13 +190,7 @@ Tensor layer_norm_symint(
|
||||
c10::SymIntArrayRef normalized_shape, const std::optional<Tensor>& weight_opt /* optional */, const std::optional<Tensor>& bias_opt /* optional */,
|
||||
double eps,
|
||||
bool /* cudnn_enable, deprecated */) {
|
||||
// See [Note: hacky wrapper removal for optional tensor]
|
||||
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
|
||||
const Tensor& weight = *weight_maybe_owned;
|
||||
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
|
||||
const Tensor& bias = *bias_maybe_owned;
|
||||
|
||||
return std::get<0>(at::native_layer_norm_symint(input, normalized_shape, weight, bias, eps));
|
||||
return std::get<0>(at::native_layer_norm_symint(input, normalized_shape, weight_opt, bias_opt, eps));
|
||||
}
|
||||
|
||||
DEFINE_DISPATCH(LayerNormKernel);
|
||||
|
||||
@ -54,7 +54,7 @@ at::Tensor quantized_convolution(
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
c10::optional<at::Tensor> bias,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
@ -63,15 +63,15 @@ at::Tensor quantized_convolution(
|
||||
at::Tensor output,
|
||||
double inv_output_scale,
|
||||
int64_t output_zero_point,
|
||||
c10::optional<at::Tensor> accum,
|
||||
std::optional<at::Tensor> accum,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
c10::optional<c10::ScalarType> output_dtype,
|
||||
c10::optional<std::string_view> binary_attr,
|
||||
c10::optional<at::Scalar> binary_alpha,
|
||||
c10::optional<std::string_view> unary_attr,
|
||||
torch::List<c10::optional<at::Scalar>> unary_scalars,
|
||||
c10::optional<std::string_view> unary_algorithm) {
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::optional<std::string_view> binary_attr,
|
||||
std::optional<at::Scalar> binary_alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm) {
|
||||
Attr attr =
|
||||
Attr(/*q_scale=*/1.0 / inv_output_scale, /*zp=*/output_zero_point);
|
||||
|
||||
|
||||
@ -114,7 +114,7 @@ at::Tensor quantized_convolution(
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
c10::optional<at::Tensor> bias,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
@ -123,14 +123,14 @@ at::Tensor quantized_convolution(
|
||||
at::Tensor output,
|
||||
double inv_output_scale,
|
||||
int64_t output_zero_point,
|
||||
c10::optional<at::Tensor> accum,
|
||||
std::optional<at::Tensor> accum,
|
||||
double accum_scale,
|
||||
int64_t accum_zero_point,
|
||||
c10::optional<c10::ScalarType> output_dtype,
|
||||
c10::optional<std::string_view> binary_attr,
|
||||
c10::optional<at::Scalar> binary_alpha,
|
||||
c10::optional<std::string_view> unary_attr,
|
||||
torch::List<c10::optional<at::Scalar>> unary_scalars,
|
||||
c10::optional<std::string_view> unary_algorithm);
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::optional<std::string_view> binary_attr,
|
||||
std::optional<at::Scalar> binary_alpha,
|
||||
std::optional<std::string_view> unary_attr,
|
||||
torch::List<std::optional<at::Scalar>> unary_scalars,
|
||||
std::optional<std::string_view> unary_algorithm);
|
||||
|
||||
} // namespace at::native::onednn
|
||||
|
||||
@ -31,17 +31,17 @@ class QConvoneDNNXPU final {
|
||||
at::Tensor weight,
|
||||
at::Tensor weight_scales,
|
||||
at::Tensor weight_zero_points,
|
||||
c10::optional<at::Tensor> bias,
|
||||
std::optional<at::Tensor> bias,
|
||||
torch::List<int64_t> stride,
|
||||
torch::List<int64_t> padding,
|
||||
torch::List<int64_t> dilation,
|
||||
int64_t groups,
|
||||
double inv_output_scale,
|
||||
int64_t output_zero_point,
|
||||
c10::optional<c10::ScalarType> output_dtype,
|
||||
std::optional<c10::ScalarType> output_dtype,
|
||||
std::string_view attr,
|
||||
torch::List<c10::optional<at::Scalar>> scalars,
|
||||
c10::optional<std::string_view> algorithm) {
|
||||
torch::List<std::optional<at::Scalar>> scalars,
|
||||
std::optional<std::string_view> algorithm) {
|
||||
if (act.dim() == 3 || act.dim() == 5) {
|
||||
TORCH_CHECK(
|
||||
attr == "none",
|
||||
|
||||
@ -372,7 +372,6 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
using namespace at::native::mps;
|
||||
using namespace mps;
|
||||
bool is3DConv = grad_output_t.dim() == 5;
|
||||
|
||||
if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_1_PLUS)) {
|
||||
// On macOS < 15.1, MPS convolution kernel does not support output channels > 2^16
|
||||
for (auto elem : grad_output_t.sizes()) {
|
||||
@ -417,36 +416,29 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
assert(0 && "Check should have been done earlier\n");
|
||||
}
|
||||
|
||||
MPSShape* gradOutputShape = getMPSShape(grad_output_t, memory_format);
|
||||
MPSShape* mps_input_shape = getMPSShape(input_size);
|
||||
NSString* ns_shape_key = [[gradOutputShape valueForKey:@"description"] componentsJoinedByString:@","];
|
||||
string key;
|
||||
if (is3DConv) {
|
||||
key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
|
||||
":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
|
||||
std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
|
||||
std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
|
||||
getTensorsStringKey({grad_output_t, weight_t}) + ":" + string([ns_shape_key UTF8String]);
|
||||
getTensorsStringKey({grad_output_t, weight_t});
|
||||
|
||||
} else {
|
||||
key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
|
||||
std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
|
||||
std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
|
||||
getTensorsStringKey({grad_output_t, weight_t}) + ":" + string([ns_shape_key UTF8String]);
|
||||
getTensorsStringKey({grad_output_t, weight_t});
|
||||
}
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
MPSGraphTensor* gradOutputTensor =
|
||||
mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(grad_output_t), gradOutputShape);
|
||||
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
|
||||
auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
|
||||
auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
|
||||
|
||||
MPSGraphTensor* gradOutputTensorTranspose = gradOutputTensor;
|
||||
if (is_channels_last) {
|
||||
gradOutputTensorTranspose = mps::convertNHWCtoNCHW(mpsGraph, gradOutputTensorTranspose);
|
||||
}
|
||||
MPSGraphTensor* gradInputTensor;
|
||||
MPSShape* weightOutputShape = mps::getMPSShape(weight_t);
|
||||
// Depthwise conv is input feature channels = groups. So I in OIHW has to be 1.
|
||||
bool isDepthwiseConv = ((groups > 1 && (weightOutputShape[1].intValue == 1)) && gradOutputShape.count >= 4 &&
|
||||
bool isDepthwiseConv = ((groups > 1 && (weightOutputShape[1].intValue == 1)) && grad_output_t.ndimension() >= 4 &&
|
||||
weightOutputShape.count >= 4 && !is_channels_last);
|
||||
|
||||
if (is3DConv) {
|
||||
@ -462,7 +454,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
padding[1],
|
||||
padding[0],
|
||||
groups);
|
||||
gradInputTensor = [mpsGraph convolution3DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
|
||||
gradInputTensor = [mpsGraph convolution3DDataGradientWithIncomingGradientTensor:gradOutputTensor
|
||||
weightsTensor:weightTensor
|
||||
outputShape:mps_input_shape
|
||||
forwardConvolutionDescriptor:conv3dDescriptor_
|
||||
@ -484,7 +476,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
withDimension:-4
|
||||
name:nil];
|
||||
gradInputTensor =
|
||||
[mpsGraph depthwiseConvolution3DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
|
||||
[mpsGraph depthwiseConvolution3DDataGradientWithIncomingGradientTensor:gradOutputTensor
|
||||
weightsTensor:weightTransposeTensor
|
||||
outputShape:mps_input_shape
|
||||
descriptor:depthWiseConv3dDescriptor_
|
||||
@ -501,7 +493,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
at::MemoryFormat::Contiguous,
|
||||
groups);
|
||||
|
||||
gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensorTranspose
|
||||
gradInputTensor = [mpsGraph convolution2DDataGradientWithIncomingGradientTensor:gradOutputTensor
|
||||
weightsTensor:weightTensor
|
||||
outputShape:mps_input_shape
|
||||
forwardConvolutionDescriptor:conv2dDescriptor_
|
||||
@ -513,7 +505,7 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
|
||||
newCachedGraph->gradInputTensor_ = gradInputTensor;
|
||||
});
|
||||
|
||||
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t, gradOutputShape);
|
||||
auto gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output_t);
|
||||
auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t);
|
||||
auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input);
|
||||
|
||||
|
||||
@ -385,19 +385,15 @@ Tensor quantized_batch_norm(
|
||||
double eps,
|
||||
double output_scale,
|
||||
int64_t output_zero_point) {
|
||||
// See [Note: hacky wrapper removal for optional tensor]
|
||||
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
|
||||
const Tensor& weight = *weight_maybe_owned;
|
||||
const Tensor& bias = bias_opt.value_or(Tensor());
|
||||
|
||||
Tensor qy;
|
||||
// TODO: this should arguably support 3d as well
|
||||
qy = q_batch_norm2d_impl<false>(
|
||||
return q_batch_norm_impl<false>(
|
||||
qx,
|
||||
weight.defined() ? std::make_optional(weight) : std::nullopt,
|
||||
bias.defined() ? std::make_optional(bias) : std::nullopt,
|
||||
mean, var, eps, output_scale, output_zero_point);
|
||||
return qy;
|
||||
weight_opt,
|
||||
bias_opt,
|
||||
mean,
|
||||
var,
|
||||
eps,
|
||||
output_scale,
|
||||
output_zero_point);
|
||||
}
|
||||
|
||||
TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) {
|
||||
|
||||
@ -931,8 +931,8 @@ static at::Tensor linear_int8_with_onednn_weight(
|
||||
std::string_view& unary_post_op_algorithm) {
|
||||
using ideep::tensor;
|
||||
const int64_t dim = input.dim();
|
||||
TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte,
|
||||
"qlinear with mkldnn tensor: data type of input should be uint8 (unsigned char).");
|
||||
TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte || input.scalar_type() == c10::ScalarType::Char,
|
||||
"qlinear with mkldnn tensor: data type of input should be uint8 or int8 (unsigned char or char).");
|
||||
TORCH_CHECK(onednn_weight.scalar_type() == c10::ScalarType::Char,
|
||||
"qlinear with mkldnn tensor: data type of weight should be int8 (char).");
|
||||
TORCH_CHECK(
|
||||
@ -1021,7 +1021,8 @@ static at::Tensor linear_int8_with_onednn_weight(
|
||||
empty_tensor;
|
||||
|
||||
// Create onednn primitive
|
||||
auto src_desc = tensor::desc(src_dims, ideep::data_type::u8, ideep::format_tag::any);
|
||||
auto src_dtype = input.scalar_type() == c10::kByte ? ideep::data_type::u8 : ideep::data_type::s8;
|
||||
auto src_desc = tensor::desc(src_dims, src_dtype, ideep::format_tag::any);
|
||||
auto weights_desc = packed_weight.get_desc();
|
||||
auto dst_dtype = dst.get_data_type();
|
||||
auto dst_desc = tensor::desc(dst_dims, dst_dtype, ideep::format_tag::any);
|
||||
@ -1118,12 +1119,14 @@ namespace at::native {
|
||||
torch::List<std::optional<at::Scalar>> post_op_args,
|
||||
std::string_view post_op_algorithm) {
|
||||
#if AT_MKLDNN_ENABLED()
|
||||
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1,
|
||||
"onednn int8 linear: act scale/zp size should be 1");
|
||||
// act_zero_point.numel() == 0 for symmetric quantization
|
||||
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1,
|
||||
"onednn int8 linear: act scale/zp size should be 1/<=1");
|
||||
static std::optional<at::Tensor> other = std::nullopt;
|
||||
static const std::string_view binary_post_op = "none";
|
||||
int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0;
|
||||
return linear_int8_with_onednn_weight(
|
||||
act, act_scale.item().toDouble(), act_zero_point.item().toLong(),
|
||||
act, act_scale.item().toDouble(), act_zp,
|
||||
onednn_weight, weight_scales, weight_zero_points,
|
||||
bias, output_scale, output_zero_point, output_dtype,
|
||||
other, /*other scale*/1.0, /*other zp*/0,
|
||||
@ -1154,10 +1157,12 @@ namespace at::native {
|
||||
torch::List<std::optional<at::Scalar>> unary_post_op_args,
|
||||
std::string_view unary_post_op_algorithm) {
|
||||
#if AT_MKLDNN_ENABLED()
|
||||
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1,
|
||||
"onednn int8 linear: act scale/zp size should be 1");
|
||||
// act_zero_point.numel() == 0 for symmetric quantization
|
||||
TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() <= 1,
|
||||
"onednn int8 linear: act scale/zp size should be 1/<=1");
|
||||
int64_t act_zp = act_zero_point.numel() == 1 ? act_zero_point.item().toLong() : 0;
|
||||
return linear_int8_with_onednn_weight(
|
||||
act, act_scale.item().toDouble(), act_zero_point.item().toLong(),
|
||||
act, act_scale.item().toDouble(), act_zp,
|
||||
onednn_weight, weight_scales, weight_zero_points,
|
||||
bias, output_scale, output_zero_point, output_dtype,
|
||||
other, other_scale, other_zero_point,
|
||||
|
||||
@ -561,8 +561,8 @@ namespace {
|
||||
bool expected = std::isnan(val);
|
||||
CACHE_ALIGN c10::Half actual_vals[vHalf::size()];
|
||||
vHalf(val).isnan().store(actual_vals);
|
||||
for (int jj = 0; jj < vHalf::size(); ++jj) {
|
||||
EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_vals[jj]) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
|
||||
for (auto actual_val : actual_vals) {
|
||||
EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_val) != 0) << "fp16 isnan failure for bit pattern " << std::hex << ii << std::dec;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1046,7 +1046,7 @@ namespace {
|
||||
mask[idx] = (VT)0;
|
||||
}
|
||||
else {
|
||||
int64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
|
||||
uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
|
||||
std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
|
||||
}
|
||||
if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
|
||||
@ -1315,8 +1315,8 @@ namespace {
|
||||
ValueGen<float> generator_sc(1.f, 15.f, seed.add(2));
|
||||
for ([[maybe_unused]] const auto i : c10::irange(trials)) {
|
||||
float scale = generator_sc.get();
|
||||
int32_t zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * zero_point_val);
|
||||
auto zero_point_val = generator.get();
|
||||
float scale_zp_premul = -(scale * static_cast<float>(zero_point_val));
|
||||
vfloat vf_scale = vfloat{scale};
|
||||
vfloat vf_zp = vfloat{static_cast<float>(zero_point_val)};
|
||||
vfloat vf_scale_zp = vfloat{scale_zp_premul};
|
||||
@ -1657,18 +1657,16 @@ namespace {
|
||||
TEST(HalfConversionTest, HalfFloat) {
|
||||
float f32s[100];
|
||||
for (const auto i : c10::irange(100)) {
|
||||
f32s[i] = i + 0.3;
|
||||
f32s[i] = static_cast<float>(i + 0.3);
|
||||
}
|
||||
uint16_t u16;
|
||||
float x;
|
||||
for (const auto i : c10::irange(100)) {
|
||||
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \
|
||||
!defined(__APPLE__)
|
||||
u16 = at::vec::float2half_scalar(f32s[i]);
|
||||
x = at::vec::half2float_scalar(u16);
|
||||
uint16_t u16 = at::vec::float2half_scalar(f32s[i]);
|
||||
float x = at::vec::half2float_scalar(u16);
|
||||
#else
|
||||
u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
|
||||
x = c10::detail::fp16_ieee_to_fp32_value(u16);
|
||||
uint16_t u16 = c10::detail::fp16_ieee_from_fp32_value(f32s[i]);
|
||||
float x = c10::detail::fp16_ieee_to_fp32_value(u16);
|
||||
#endif
|
||||
|
||||
EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
|
||||
@ -1697,7 +1695,7 @@ namespace {
|
||||
VT v_pinf = static_cast<VT>(*(float *)&infBits);
|
||||
values[index] = v_pinf;
|
||||
auto vec_pinf = vec::loadu(values);
|
||||
int negInfBits = 0xFF800000;
|
||||
unsigned int negInfBits = 0xFF800000;
|
||||
VT v_ninf = static_cast<VT>(*(float *)&negInfBits);
|
||||
values[index] = v_ninf;
|
||||
auto vec_ninf = vec::loadu(values);
|
||||
@ -1779,8 +1777,8 @@ namespace {
|
||||
const auto expected = static_cast<float>(val);
|
||||
CACHE_ALIGN float actual_vals[vfloat::size()];
|
||||
at::vec::convert<float>(vBFloat16(val)).store(actual_vals);
|
||||
for (int jj = 0; jj < vfloat::size(); ++jj) {
|
||||
EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_vals[jj]))
|
||||
for (auto actual_val : actual_vals) {
|
||||
EXPECT_EQ(c10::bit_cast<uint32_t>(expected), c10::bit_cast<uint32_t>(actual_val))
|
||||
<< "convert-to-float failure for bf16 bit pattern "
|
||||
<< std::hex << ii << std::dec;
|
||||
}
|
||||
@ -1794,20 +1792,20 @@ namespace {
|
||||
|
||||
#define TEST_MASK_LOAD(dst_t, mask_t, mask_n) \
|
||||
do { \
|
||||
CACHE_ALIGN dst_t x[mask_n * size]; \
|
||||
CACHE_ALIGN dst_t y[mask_n * size]; \
|
||||
CACHE_ALIGN dst_t ref[mask_n * size]; \
|
||||
auto seed = TestSeed(); \
|
||||
dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
|
||||
ValueGen<dst_t> generator(generator_min, dst_t(100), seed); \
|
||||
for (const auto i : c10::irange(mask_n * size)) { \
|
||||
x[i] = generator.get(); \
|
||||
} \
|
||||
auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
|
||||
constexpr int dst_size = at::vec::Vectorized<dst_t>::size(); \
|
||||
constexpr int dst_n = mask_n * size / dst_size; \
|
||||
constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size; \
|
||||
if constexpr(dst_n * dst_size >= mask_n * size) { \
|
||||
CACHE_ALIGN dst_t x[mask_n * size]; \
|
||||
CACHE_ALIGN dst_t y[mask_n * size]; \
|
||||
CACHE_ALIGN dst_t ref[mask_n * size]; \
|
||||
auto seed = TestSeed(); \
|
||||
dst_t generator_min = std::numeric_limits<dst_t>::is_signed ? dst_t(-100) : dst_t(0); \
|
||||
ValueGen<dst_t> generator(generator_min, dst_t(100), seed); \
|
||||
for (const auto i : c10::irange(mask_n * size)) { \
|
||||
x[i] = generator.get(); \
|
||||
} \
|
||||
auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
|
||||
constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
|
||||
auto x_vec = vec_mask.template loadu<dst_t, rnd_n>(x); \
|
||||
x_vec.store(y); \
|
||||
for (const auto i : c10::irange(mask_n * size)) { \
|
||||
|
||||
9
benchmarks/dynamo/pr_time_benchmarks/README.md
Normal file
@@ -0,0 +1,9 @@
# Instructions on how to make a new compile time benchmark

1. Make a new benchmark file in /benchmarks/dynamo/pr_time_benchmarks/benchmarks/, e.g. https://github.com/pytorch/pytorch/blob/0b75b7ff2b8ab8f40e433a52b06a671d6377997f/benchmarks/dynamo/pr_time_benchmarks/benchmarks/add_loop.py
2. cd into the pr_time_benchmarks directory: `cd benchmarks/dynamo/pr_time_benchmarks`
3. Run `PYTHONPATH=./ python benchmarks/[YOUR_BENCHMARK].py a.txt`
4. (Optional) flip a flag that you know will change the benchmark and run again with b.txt: `PYTHONPATH=./ python benchmarks/[YOUR_BENCHMARK].py b.txt`
5. Compare `a.txt` and `b.txt` located within the `benchmarks/dynamo/pr_time_benchmarks` folder to make sure things look as you expect (see the shell sketch below)
6. Check in your new benchmark file and submit a new PR
7. In a few days, if your benchmark is stable, bug Laith Sakka to enable running your benchmark on all PRs. If you're a Meta employee, you can find the dashboard here: internalfb.com/intern/unidash/dashboard/pt2_diff_time_metrics
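A minimal shell sketch of steps 2-5 above, assuming the new benchmark shown in the next file is saved as `benchmarks/float_args.py` (the filename is an assumption; it is not part of this diff):

```bash
# run the benchmark twice and compare the collected instruction counts
cd benchmarks/dynamo/pr_time_benchmarks
PYTHONPATH=./ python benchmarks/float_args.py a.txt
# ...flip the flag or code path you want to measure, then:
PYTHONPATH=./ python benchmarks/float_args.py b.txt
diff a.txt b.txt
```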
@@ -0,0 +1,47 @@
import sys

from benchmark_base import BenchmarkBase

import torch
from torch._inductor.utils import fresh_inductor_cache


class Benchmark(BenchmarkBase):
    def __init__(self):
        super().__init__(
            category="float_args",
            backend="inductor",
            device="cpu",
        )

    def name(self):
        return f"{self.category()}"

    def description(self):
        return "Benchmark to measure recompilations with float arguments."

    def _prepare_once(self):
        torch.manual_seed(0)

    def _prepare(self):
        torch._dynamo.reset()

    def _work(self):
        @torch.compile(backend="inductor")
        def f(x, y):
            return x + y

        with fresh_inductor_cache():
            for i in range(8):
                f(torch.arange(3), i * 2.5)


def main():
    result_path = sys.argv[1]
    Benchmark().enable_compile_time_instruction_count().collect_all().append_results(
        result_path
    )


if __name__ == "__main__":
    main()
@ -6,27 +6,27 @@ add_loop_eager_dynamic,compile_time_instruction_count,5703000000,0.025
|
||||
|
||||
|
||||
|
||||
add_loop_inductor,compile_time_instruction_count,29510000000,0.015
|
||||
add_loop_inductor,compile_time_instruction_count,32220000000,0.015
|
||||
|
||||
|
||||
|
||||
add_loop_inductor_dynamic_gpu,compile_time_instruction_count,43280000000,0.025
|
||||
add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44500000000,0.025
|
||||
|
||||
|
||||
|
||||
add_loop_inductor_gpu,compile_time_instruction_count,25690000000,0.015
|
||||
add_loop_inductor_gpu,compile_time_instruction_count,27320000000,0.015
|
||||
|
||||
|
||||
|
||||
basic_modules_ListOfLinears_eager,compile_time_instruction_count,1033000000,0.015
|
||||
basic_modules_ListOfLinears_eager,compile_time_instruction_count,1018000000,0.015
|
||||
|
||||
|
||||
|
||||
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,20810000000,0.015
|
||||
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,21760000000,0.015
|
||||
|
||||
|
||||
|
||||
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.015
|
||||
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17810000000,0.015
|
||||
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ update_hint_regression,compile_time_instruction_count,1669000000,0.02
|
||||
|
||||
|
||||
|
||||
sum_floordiv_regression,compile_time_instruction_count,1113000000,0.015
|
||||
sum_floordiv_regression,compile_time_instruction_count,1033000000,0.015
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2018000000
|
||||
|
||||
|
||||
|
||||
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5843000000,0.015
|
||||
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5796000000,0.015
|
||||
|
||||
|
||||
|
||||
@ -62,4 +62,4 @@ aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3863000000,
|
||||
|
||||
|
||||
|
||||
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10410000000,0.015
|
||||
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10330000000,0.015
|
||||
|
||||
|
@ -353,7 +353,7 @@ void testStaticRuntime(
|
||||
|
||||
size_t new_managed_bytes =
|
||||
memory_planner ? memory_planner->total_managed() : 0;
|
||||
if (check_resize && new_managed_bytes >= 0) {
|
||||
if (check_resize) {
|
||||
EXPECT_GE(new_managed_bytes, managed_bytes);
|
||||
}
|
||||
|
||||
|
||||
@ -123,7 +123,7 @@ inline constexpr crc64_t crc64(const char* str, size_t size) {
|
||||
return crc64_t{detail::crc64impl(0, str, size)};
|
||||
}
|
||||
|
||||
inline constexpr crc64_t crc64(c10::string_view str) {
|
||||
inline constexpr crc64_t crc64(std::string_view str) {
|
||||
return crc64(str.data(), str.size());
|
||||
}
|
||||
} // namespace c10::util
|
||||
|
||||
@ -92,7 +92,7 @@ size_t ReplaceAll(std::string& s, std::string_view from, std::string_view to) {
|
||||
std::string::size_type last_pos = 0u;
|
||||
std::string::size_type cur_pos = 0u;
|
||||
std::string::size_type write_pos = 0u;
|
||||
const c10::string_view input(s);
|
||||
const std::string_view input(s);
|
||||
|
||||
if (from.size() >= to.size()) {
|
||||
// If the replacement string is not larger than the original, we
|
||||
|
||||
@ -188,7 +188,6 @@ class BlockingCounter {
|
||||
// returns false.
|
||||
bool DecrementCount() {
|
||||
const auto count_value = count_.fetch_sub(1, std::memory_order_relaxed) - 1;
|
||||
TORCH_DCHECK_GE(count_value, 0);
|
||||
if (count_value == 0) {
|
||||
std::lock_guard<std::mutex> g(mutex_);
|
||||
cond_.notify_one();
|
||||
|
||||
@ -414,6 +414,9 @@ function(torch_compile_options libname)
|
||||
$<$<COMPILE_LANGUAGE:CXX>:${private_compile_options}>)
|
||||
if(USE_CUDA)
|
||||
foreach(option IN LISTS private_compile_options)
|
||||
if("${option}" STREQUAL "-Wextra-semi")
|
||||
continue()
|
||||
endif()
|
||||
target_compile_options(${libname} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler ${option}>)
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
/* styles needed for the Google Search button */
|
||||
|
||||
.pytorch-left-menu-search input[type=text] {
|
||||
background-image: none;
|
||||
.gsc-overflow-hidden {
|
||||
overflow: visible !important;
|
||||
}
|
||||
|
||||
.gsc-control-cse {
|
||||
|
||||
@ -10,7 +10,9 @@ torch.accelerator
|
||||
device_count
|
||||
is_available
|
||||
current_accelerator
|
||||
set_device_index
|
||||
set_device_idx
|
||||
current_device_index
|
||||
current_device_idx
|
||||
set_stream
|
||||
current_stream
|
||||
|
||||
@ -305,6 +305,7 @@ coverage_ignore_functions = [
|
||||
"node_arg_is_weight",
|
||||
"return_arg_list",
|
||||
# torch.ao.quantization.pt2e.graph_utils
|
||||
"bfs_trace_with_node_process",
|
||||
"find_sequential_partitions",
|
||||
"get_equivalent_types",
|
||||
"update_equivalent_types_dict",
|
||||
|
||||
@@ -199,15 +199,8 @@ the model. For example:
       stage_index,
       num_stages,
       device,
       input_args=example_input_microbatch,
   )

The ``PipelineStage`` requires an example argument ``input_args`` representing
the runtime input to the stage, which would be one microbatch worth of input
data. This argument is passed through the forward method of the stage module to
determine the input and output shapes required for communication.

When composing with other Data or Model parallelism techniques, ``output_args``
may also be required, if the output shape/dtype of the model chunk will be
affected.
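This hunk drops the paragraph that required passing ``input_args`` when constructing a ``PipelineStage``. For context, a minimal sketch of the construction that paragraph describes is below; the module, sizes, and device are illustrative placeholders, a real run needs ``torch.distributed`` initialized with one process per stage, and the keyword is only accepted on versions that still support it:

```python
import torch
import torch.nn as nn
from torch.distributed.pipelining import PipelineStage

stage_module = nn.Linear(16, 16)               # placeholder: this rank's chunk of the model
example_input_microbatch = torch.randn(4, 16)  # one microbatch worth of input

stage = PipelineStage(
    stage_module,
    stage_index=0,                 # which pipeline stage this rank owns
    num_stages=2,
    device=torch.device("cpu"),
    input_args=example_input_microbatch,  # used to infer send/recv shapes
)
```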
@@ -421,7 +414,7 @@ are subclasses of ``PipelineScheduleMulti``.
Logging
*******

You can turn on additional logging using the `TORCH_LOGS` environment variable from [`torch._logging`](https://pytorch.org/docs/main/logging.html#module-torch._logging):
You can turn on additional logging using the `TORCH_LOGS` environment variable from `torch._logging <https://pytorch.org/docs/main/logging.html#module-torch._logging>`_:

* `TORCH_LOGS=+pp` will display `logging.DEBUG` messages and all levels above it.
* `TORCH_LOGS=pp` will display `logging.INFO` messages and above.
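As a small illustration (the script name and rank count are hypothetical), the variable is set on the launch command:

```bash
TORCH_LOGS=+pp torchrun --nproc-per-node=2 train_pipeline.py
```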
@@ -508,7 +508,7 @@ API Example::

    import torch
    from torch.ao.quantization.quantize_pt2e import prepare_pt2e
    from torch._export import capture_pre_autograd_graph
    from torch.export import export_for_training
    from torch.ao.quantization.quantizer import (
        XNNPACKQuantizer,
        get_symmetric_quantization_config,
@@ -535,7 +535,7 @@ API Example::
    # Step 1. program capture
    # NOTE: this API will be updated to torch.export API in the future, but the captured
    # result should mostly stay the same
    m = capture_pre_autograd_graph(m, *example_inputs)
    m = export_for_training(m, *example_inputs).module()
    # we get a model with aten ops

    # Step 2. quantization
Some files were not shown because too many files have changed in this diff.