Update on "[2/N] Dynamic Shape: Enable dynamic shape support for aoti_eager"

cc voznesenskym penguinwu jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 ColinPeppler amjames desertfire chauhang

[ghstack-poisoned]
This commit is contained in:
Wang, Eikan
2024-06-16 02:16:42 +00:00
195 changed files with 5437 additions and 1125 deletions

View File

@ -373,6 +373,13 @@ case "$image" in
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -490,6 +497,7 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \

View File

@ -0,0 +1 @@
340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d

View File

@ -0,0 +1,46 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
COMMIT=$(get_pinned_commit halide)
test -n "$COMMIT"
# activate conda to populate CONDA_PREFIX
test -n "$ANACONDA_PYTHON_VERSION"
eval "$(conda shell.bash hook)"
conda activate py_$ANACONDA_PYTHON_VERSION
if [ -n "${UBUNTU_VERSION}" ]; then
apt update
apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi
conda_install numpy scipy imageio cmake ninja
# Build a local LLVM 16 toolchain (X86 + NVPTX backends) for Halide to link against
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-S llvm-project/llvm -B llvm-build -G Ninja
cmake --build llvm-build
cmake --install llvm-build --prefix llvm-install
export LLVM_ROOT=`pwd`/llvm-install
export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
# Build Halide at the pinned commit against the local LLVM and install it into the conda env
git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}
chown -R jenkins ${CONDA_PREFIX}
popd
rm -rf Halide llvm-build llvm-project llvm-install
python -c "import halide" # check for errors
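As an aside, a slightly richer smoke test than the bare import above could exercise the freshly installed bindings. This is an illustrative sketch only (not part of the script) and assumes the standard Halide Python API:

import halide as hl

x = hl.Var("x")
f = hl.Func("f")
f[x] = x + 1          # trivial 1-D pipeline: f(x) = x + 1
out = f.realize([8])  # JIT-compile and evaluate over 8 elements
assert out[3] == 4    # spot-check one element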

View File

@ -33,7 +33,9 @@ pip_install coloredlogs packaging
pip_install onnxruntime==1.18
pip_install onnx==1.16.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
pip_install onnxscript==0.1.0.dev20240523 --no-deps
pip_install onnxscript==0.1.0.dev20240613 --no-deps
# required by onnxscript
pip_install ml_dtypes
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -103,6 +103,14 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -155,6 +155,14 @@ COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./

View File

@ -188,28 +188,6 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchdeploy() {
local commit
commit=$(get_pinned_commit multipy)
pushd ..
git clone --recurse-submodules https://github.com/pytorch/multipy.git
pushd multipy
git checkout "${commit}"
python multipy/runtime/example/generate_examples.py
BUILD_CUDA_TESTS=1 pip install -e .
popd
popd
}
function test_torch_deploy(){
pushd ..
pushd multipy
./multipy/runtime/build/test_deploy
./multipy/runtime/build/test_deploy_gpu
popd
popd
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)

View File

@ -550,6 +550,11 @@ test_inductor_micro_benchmark() {
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}
test_inductor_halide() {
python test/run_test.py --include inductor/test_halide.py --verbose
assert_git_not_dirty
}
test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -1237,11 +1242,10 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_rpc
fi
elif [[ "$TEST_CONFIG" == deploy ]]; then
checkout_install_torchdeploy
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then

View File

@ -180,6 +180,9 @@ def mock_gh_get_info() -> Any:
return {
"closed": False,
"isCrossRepository": False,
"headRefName": "foo",
"baseRefName": "bar",
"baseRepository": {"defaultBranchRef": {"name": "bar"}},
"files": {"nodes": [], "pageInfo": {"hasNextPage": False}},
"changedFiles": 0,
}

View File

@ -2330,6 +2330,15 @@ def main() -> None:
dry_run=args.dry_run,
)
return
if not pr.is_ghstack_pr() and pr.base_ref() != pr.default_branch():
gh_post_pr_comment(
org,
project,
args.pr_num,
f"PR targets {pr.base_ref()} rather than {pr.default_branch()}, refusing merge request",
dry_run=args.dry_run,
)
return
if args.check_mergeability:
if pr.is_ghstack_pr():

View File

@ -54,6 +54,7 @@ jobs:
pytorch-linux-focal-py3-clang9-android-ndk-r21e,
pytorch-linux-jammy-py3.8-gcc11,
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-2024.0-py3,
pytorch-linux-jammy-py3-clang15-asan,
pytorch-linux-focal-py3-clang10-onnx,

View File

@ -102,6 +102,26 @@ jobs:
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image-name: pytorch-linux-jammy-py3.12-halide
test-matrix: |
{ include: [
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
name: cuda12.4-py3.10-gcc9-sm86

View File

@ -270,7 +270,6 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:

View File

@ -68,6 +68,7 @@ include_patterns = [
'aten/src/ATen/native/cudnn/*.cpp',
'c10/**/*.h',
'c10/**/*.cpp',
'distributed/c10d/*SymmetricMemory.*',
'torch/csrc/**/*.h',
'torch/csrc/**/*.hpp',
'torch/csrc/**/*.cpp',
@ -216,7 +217,6 @@ exclude_patterns = [
'c10/util/complex_math.h',
'c10/util/complex_utils.h',
'c10/util/flat_hash_map.h',
'c10/util/Float8*.h',
'c10/util/logging*.h',
'c10/util/hash.h',
'c10/util/strong_type.h',
@ -1756,9 +1756,7 @@ exclude_patterns = [
'torch/testing/_internal/codegen/__init__.py',
'torch/testing/_internal/codegen/random_topo_test.py',
'torch/testing/_internal/common_cuda.py',
'torch/testing/_internal/common_device_type.py',
'torch/testing/_internal/common_distributed.py',
'torch/testing/_internal/common_dtype.py',
'torch/testing/_internal/common_jit.py',
'torch/testing/_internal/common_methods_invocations.py',
'torch/testing/_internal/common_modules.py',

View File

@ -744,6 +744,7 @@ cc_library(
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],

View File

@ -6,7 +6,7 @@
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Using distributed features](#using-distributed-features)
- [**CI/CD security principles**](#cicd-security-principles)
## Reporting Security Issues
Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
@ -61,3 +61,27 @@ If applicable, prepare your model against bad inputs and prompt injections. Some
PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.
For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
## CI/CD security principles
_Audience_: Contributors and reviewers, especially if modifying the workflow files/build system.
The PyTorch CI/CD security philosophy is based on finding a balance between keeping CI pipelines open and transparent and keeping the environment efficient and safe.
PyTorch testing requirements are complex, and a large part of the code base can only be tested on specialized, powerful hardware such as GPUs, making it a lucrative target for resource misuse. To prevent this, we require workflow run approval for PRs from non-member contributors. To keep the volume of those approvals relatively low, we readily extend repository write permissions to regular contributors.
More widespread write access to the repo presents challenges when it comes to reviewing changes, merging code into trunk, and creating releases. [Protected branches](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/managing-protected-branches/about-protected-branches) are used to restrict the ability to merge into the trunk/release branches to the repository administrators and the merge bot. The merge bot is responsible for mechanistically merging the change and validating reviews against the path-based rules defined in [merge_rules.yaml](https://github.com/pytorch/pytorch/blob/main/.github/merge_rules.yaml). Once a PR has been reviewed by the person(s) mentioned in these rules, leaving a `@pytorchbot merge` comment on the PR will initiate the merge process. To protect merge bot credentials from leaking, merge actions must be executed only on ephemeral runners (see definition below) using a specialized deployment environment.
To speed up the CI system, build steps of the workflow rely on a distributed caching mechanism backed by [sccache](https://github.com/mozilla/sccache), making them susceptible to cache corruption compromises. For that reason, binary artifacts generated during CI should not be executed in an environment that has access to any sensitive/non-public information and should not be published for use by a general audience. One should not have any expectation about the lifetime of those artifacts, although in practice they likely remain accessible for about two weeks after the PR has been closed.
To speed up CI system setup, PyTorch relies heavily on Docker to pre-build and pre-install the dependencies. To prevent a potentially malicious PR from altering images that were published in the past, ECR has been configured to use immutable tags.
To improve runner availability and resource utilization, some of the CI runners are non-ephemeral, i.e., workflow steps from completely unrelated PRs could be scheduled sequentially on the same runner, making them susceptible to reverse shell attacks. For that reason, PyTorch does not rely on the repository secrets mechanism, as these secrets can easily be compromised in such attacks.
### Release pipelines security
To ensure safe binary releases, PyTorch release pipelines are built on the following principles:
- All binary builds/upload jobs must be run on ephemeral runners, i.e., on a machine that is allocated from the cloud to do the build and released back to the cloud after the build is finished. This protects those builds from interference from external actors, who potentially can get reverse shell access to a non-ephemeral runner and wait there for a binary build.
- All binary builds are cold-start builds, i.e., distributed caching/incremental builds are not permitted. This renders builds much slower than incremental CI builds but isolates them from potential compromises of the intermediate artifacts caching systems.
- All upload jobs are executed in [deployment environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) that are restricted to protected branches.
- Security credentials needed to upload binaries to PyPI/conda or the stable indexes at `download.pytorch.org/whl` are never uploaded to repo secrets storage/environment. This requires an extra manual step to publish the release, but ensures that those credentials cannot be compromised by deliberate or accidental leaks of secrets stored in the cloud.
- No binary artifacts should be published to GitHub releases pages, as these are overwritable by anyone with write permission to the repo.

View File

@ -35,6 +35,12 @@ void SavedTensorDefaultHooks::enable() {
tls.disabled_error_message = c10::nullopt;
}
/* static */ bool SavedTensorDefaultHooks::set_tracing(bool is_tracing) {
bool prior = tls.is_tracing;
tls.is_tracing = is_tracing;
return prior;
}
const std::optional<std::string>& SavedTensorDefaultHooks::get_disabled_error_message() {
return tls.disabled_error_message;
}
@ -59,25 +65,20 @@ void SavedTensorDefaultHooks::push_hooks(PyObject* pack_hook, PyObject* unpack_h
tls.stack.emplace(pack_hook, unpack_hook);
}
void SavedTensorDefaultHooks::pop_hooks() {
std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::pop_hooks() {
// Reference counting is handled by the caller of `pop_hooks`
TORCH_INTERNAL_ASSERT(is_initialized && !tls.stack.empty());
std::pair<PyObject*, PyObject*> hooks = tls.stack.top();
tls.stack.pop();
return hooks;
}
std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::get_hooks() {
if (!is_initialized || tls.stack.empty()) {
// For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime]
if (!is_initialized || tls.stack.empty() || tls.is_tracing) {
return std::make_pair(nullptr, nullptr);
}
return tls.stack.top();
}
std::stack<std::pair<PyObject*, PyObject*>> SavedTensorDefaultHooks::get_stack() {
return tls.stack;
}
void SavedTensorDefaultHooks::set_stack(std::stack<std::pair<PyObject*, PyObject*>> stack_) {
tls.stack = std::move(stack_);
}
}

View File

@ -22,17 +22,18 @@ struct TORCH_API SavedTensorDefaultHooksTLS {
// We did this for efficiency (so we didn't have to keep a separate bool
// around)
std::optional<std::string> disabled_error_message;
// See NOTE: [Deferring tensor pack/unpack hooks until runtime]
bool is_tracing = false;
};
} // namespace impl
struct TORCH_API SavedTensorDefaultHooks {
static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook);
static void pop_hooks();
static std::pair<PyObject*, PyObject*> pop_hooks();
static std::pair<PyObject*, PyObject*> get_hooks();
static void lazy_initialize();
static std::stack<std::pair<PyObject*, PyObject*>> get_stack();
static void set_stack(std::stack<std::pair<PyObject*, PyObject*>>);
static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls);
@ -42,11 +43,20 @@ struct TORCH_API SavedTensorDefaultHooks {
// hooks, especially if their feature does not work with it. If they are
// disabled, then the following will raise an error:
// - Attempting to push_hooks
// - calling disable(message) with a non-zero stack (from get_stack) size
// - calling disable(message) with a non-zero stack (hooks) size
static void disable(const std::string& error_message);
static void enable();
static bool is_enabled();
static const std::optional<std::string>& get_disabled_error_message();
// NOTE: [Deferring tensor pack/unpack hooks until runtime]
// To preserve eager semantics of pack/unpack hooks firing only once per saved
// variable, Dynamo/AOTAutograd need to defer hook firing until runtime. Using
// disable() would raise a loud error at trace time, and pushing a no-op hook would
// fail when the traced code is wrapped in a disable_saved_tensors_hooks ctx.
// To do so, we disable these hooks during tracing. See
// https://github.com/pytorch/pytorch/issues/113263.
static bool set_tracing(bool is_tracing);
};
} // namespace at
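For readers less familiar with this machinery, below is a minimal eager-mode sketch (not part of this diff) of the semantics the note above aims to preserve: pack fires once per saved tensor during the forward pass, and unpack fires once when that tensor is consumed in backward. The deferral described in the note is what keeps this behavior intact when the code is traced by Dynamo/AOTAutograd instead of run eagerly.

import torch

counts = {"pack": 0, "unpack": 0}

def pack(t):
    counts["pack"] += 1
    return t  # a real hook could offload or compress the saved tensor here

def unpack(t):
    counts["unpack"] += 1
    return t

x = torch.randn(4, requires_grad=True)
with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
    y = x.exp().sum()  # exp() saves its result for backward, firing pack once
y.backward()           # backward consumes the saved tensor, firing unpack once
print(counts)          # e.g. {'pack': 1, 'unpack': 1}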

View File

@ -794,12 +794,16 @@ Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vecto
template <>
inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
@ -992,12 +996,16 @@ Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Ha
template <>
inline void convert(const Half* src, Half* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
auto vsrc = _mm256_loadu_si256(reinterpret_cast<__m256i*>((void*)(src + i)));
_mm256_storeu_si256(reinterpret_cast<__m256i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -416,11 +416,15 @@ inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other
template <>
inline void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
_mm256_storeu_pd(dst + i, _mm256_loadu_pd(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -512,11 +512,15 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
_mm256_storeu_ps(dst + i, _mm256_loadu_ps(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -823,12 +823,16 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, int32_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
vst1q_s32(dst + i, vcvtq_s32_f32(vld1q_f32(src + i)));
vst1q_s32(dst + i + 4, vcvtq_s32_f32(vld1q_f32(src + i + 4)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<int32_t>(src[i]);
}
@ -837,12 +841,16 @@ inline void convert(const float* src, int32_t* dst, int64_t n) {
template <>
inline void convert(const int32_t* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
vst1q_f32(dst + i, vcvtq_f32_s32(vld1q_s32(src + i)));
vst1q_f32(dst + i + 4, vcvtq_f32_s32(vld1q_s32(src + i + 4)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float>(src[i]);
}

View File

@ -765,13 +765,17 @@ inline Vectorized<c10::Half> Vectorized<c10::Half>::le(
template <>
inline void convert(const float16_t* src, int16_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<c10::Half>::size());
i += Vectorized<c10::Half>::size()) {
vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i)));
vst1q_s16(dst + i + 8, vcvtq_s16_f16(vld1q_f16(src + i + 8)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<int16_t>(src[i]);
}
@ -780,13 +784,17 @@ inline void convert(const float16_t* src, int16_t* dst, int64_t n) {
template <>
inline void convert(const int16_t* src, float16_t* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<c10::Half>::size());
i += Vectorized<c10::Half>::size()) {
vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i)));
vst1q_f16(dst + i + 8, vcvtq_f16_s16(vld1q_s16(src + i + 8)));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = static_cast<float16_t>(src[i]);
}

View File

@ -914,12 +914,16 @@ Vectorized<BFloat16> inline clamp_min(const Vectorized<BFloat16>& a, const Vecto
template <>
inline void convert(const BFloat16* src, BFloat16* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<BFloat16>::size()); i += Vectorized<BFloat16>::size()) {
auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
_mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
@ -986,7 +990,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// j0-j15 n0-n15
// k0-k15 o0-o15
// l0-l15 p0-p15
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; i++) {
r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01);
r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01);
@ -998,7 +1004,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15
// i j m n
// k l o p
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 8; i += 2) {
u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]);
u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]);
@ -1061,7 +1069,9 @@ static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) {
// 12-- 13--
// 6-- 7--
// 14-- 15--
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; i++) {
u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]);
u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]);
@ -1095,7 +1105,9 @@ inline void transpose_mxn<BFloat16, 16, 16>(
// n: n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15
// o: o0 o1 o2 o3 o4 o5 o6 o7 o8 o9 o10 o11 o12 o13 o14 o15
// p: p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; i++) {
t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
}
@ -1103,7 +1115,9 @@ inline void transpose_mxn<BFloat16, 16, 16>(
__m512i u[8];
_transpose_mxn_half_16_16(t, u);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; i++) {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
@ -1125,7 +1139,9 @@ inline void transpose_mxn<Half, 16, 16>(
__m256i t[16];
// load from src to registers
// Same matrix indices as above transpose_mxn<BFloat16, 16, 16>
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; i++) {
t[i] = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + i * ld_src));
}
@ -1133,7 +1149,9 @@ inline void transpose_mxn<Half, 16, 16>(
__m512i u[8];
_transpose_mxn_half_16_16(t, u);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; i++) {
_mm256_storeu_si256(
reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst),
@ -1164,7 +1182,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571
// ...
// t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; ++i) {
d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]);
d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]);
@ -1189,7 +1209,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633
// ...
// t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; ++i) {
r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]);
r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]);
@ -1216,7 +1238,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
// t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760
// ...
// t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023
#ifndef __msvc_cl__
#pragma unroll(4)
#endif
for (int i = 0; i < 4; ++i) {
d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]);
d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]);
@ -1265,7 +1289,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
0x000000000000000a,
0x0000000000000003,
0x0000000000000002);
#ifndef __msvc_cl__
#pragma unroll(8)
#endif
for (int i = 0; i < 8; ++i) {
r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]);
r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]);
@ -1310,7 +1336,9 @@ static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) {
0x0000000000000006,
0x0000000000000005,
0x0000000000000004);
#ifndef __msvc_cl__
#pragma unroll(16)
#endif
for (int i = 0; i < 16; ++i) {
d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]);
d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]);
@ -1327,7 +1355,9 @@ inline void transpose_mxn<BFloat16, 32, 32>(
int64_t ld_dst) {
// Load from memory
__m512i r[32];
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
}
@ -1336,7 +1366,9 @@ inline void transpose_mxn<BFloat16, 32, 32>(
_transpose_mxn_half_32_32(r, d);
// Store to dst
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
_mm512_storeu_si512(dst + i* ld_dst, d[i]);
}
@ -1350,7 +1382,9 @@ inline void transpose_mxn<Half, 32, 32>(
int64_t ld_dst) {
// Load from memory
__m512i r[32];
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
r[i] = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(src + i* ld_src));
}
@ -1359,7 +1393,9 @@ inline void transpose_mxn<Half, 32, 32>(
_transpose_mxn_half_32_32(r, d);
// Store to dst
#ifndef __msvc_cl__
#pragma unroll(32)
#endif
for (int i = 0; i < 32; ++i) {
_mm512_storeu_si512(dst + i* ld_dst, d[i]);
}
@ -1514,12 +1550,16 @@ Vectorized<Half> inline clamp_min(const Vectorized<Half>& a, const Vectorized<Ha
template <>
inline void convert(const Half* src, Half* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<Half>::size()); i += Vectorized<Half>::size()) {
auto vsrc = _mm512_loadu_si512(reinterpret_cast<__m512i*>((void*)(src + i)));
_mm512_storeu_si512(reinterpret_cast<__m512i*>((void*)(dst + i)), vsrc);
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -443,11 +443,15 @@ inline Vectorized<double> Vectorized<double>::le(const Vectorized<double>& other
template <>
inline void convert(const double* src, double* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<double>::size()); i += Vectorized<double>::size()) {
_mm512_storeu_pd(dst + i, _mm512_loadu_pd(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -552,11 +552,15 @@ inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) c
template <>
inline void convert(const float* src, float* dst, int64_t n) {
int64_t i;
#ifndef __msvc_cl__
#pragma unroll
#endif
for (i = 0; i <= (n - Vectorized<float>::size()); i += Vectorized<float>::size()) {
_mm512_storeu_ps(dst + i, _mm512_loadu_ps(src + i));
}
#ifndef __msvc_cl__
#pragma unroll
#endif
for (; i < n; i++) {
dst[i] = src[i];
}

View File

@ -42,6 +42,15 @@
#define __FORCE_INLINE __forceinline
#endif
#if defined(_MSC_FULL_VER)
/*
https://learn.microsoft.com/en-us/cpp/overview/compiler-versions?view=msvc-170
Use _MSC_FULL_VER to identify whether the current compiler is MSVC;
LLVM on Windows will not have this definition.
*/
#define __msvc_cl__
#endif
// These macros helped us unify vec_base.h
#ifdef CPU_CAPABILITY_AVX512
#if defined(__GNUC__)

View File

@ -127,7 +127,9 @@ class VecMask {
static VecMask<T, N> from(U* b) {
using int_t = int_same_size_t<T>;
__at_align__ T mask[size()];
#ifndef __msvc_cl__
#pragma unroll
#endif
for (int i = 0; i < size(); i++) {
*(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0;
}

View File

@ -103,10 +103,10 @@ inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tens
// with max value if it is integer type
inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
if (tensor.is_floating_point() || tensor.is_complex()) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
kBFloat16, kHalf, tensor.scalar_type(), "fill_empty_deterministic_", [&]() {
AT_DISPATCH_V2(
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
});
}), AT_EXPAND(AT_FLOATING_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), AT_EXPAND(AT_FLOAT8_TYPES), kBFloat16, kHalf);
} else {
AT_DISPATCH_V2(
tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {

View File

@ -43,6 +43,14 @@ void fill_kernel(TensorIterator& iter, const Scalar& value_scalar) {
fill_non_native_type<at::BFloat16>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::ComplexHalf) {
fill_non_native_type<c10::complex<at::Half>>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e4m3fn) {
fill_non_native_type<at::Float8_e4m3fn>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e5m2) {
fill_non_native_type<at::Float8_e5m2>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e4m3fnuz) {
fill_non_native_type<at::Float8_e4m3fnuz>(iter, value_scalar);
} else if (iter.dtype() == ScalarType::Float8_e5m2fnuz) {
fill_non_native_type<at::Float8_e5m2fnuz>(iter, value_scalar);
} else {
AT_DISPATCH_V2(
iter.dtype(), "fill_cpu", AT_WRAP([&]() {

View File

@ -685,7 +685,6 @@ SDPBackend select_sdp_backend(sdp_params const& kernel_params) {
switch (backend) {
case SDPBackend::cudnn_attention:
if (sdp::can_use_cudnn_attention(kernel_params, print_debug)) {
TORCH_WARN("USING CUDNN SDPA");
return SDPBackend::cudnn_attention;
}
break;

View File

@ -6,6 +6,14 @@ import textwrap
import pandas as pd
# Hack to have something similar to DISABLED_TEST. These models are flaky.
flaky_models = {
"yolov3",
"gluon_inception_v3",
}
def get_field(csv, model_name: str, field: str):
try:
return csv.loc[csv["name"] == model_name][field].item()
@ -25,6 +33,13 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
status = "PASS" if expected_accuracy == "pass" else "XFAIL"
print(f"{model:34} {status}")
continue
elif model in flaky_models:
if accuracy == "pass":
# model passed but marked xfailed
status = "PASS_BUT_FLAKY:"
else:
# model failed but marked pass
status = "FAIL_BUT_FLAKY:"
elif accuracy != "pass":
status = "FAIL:"
failed.append(model)

View File

@ -378,4 +378,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -286,4 +286,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -350,4 +350,4 @@ vision_maskrcnn,fail_to_run,0
yolov3,fail_to_run,0
yolov3,pass,0


View File

@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
DALLE2_pytorch,model_fail_to_load,0
DALLE2_pytorch,eager_fail_to_run,0
@ -338,4 +338,4 @@ vision_maskrcnn,pass,28
yolov3,pass,2
yolov3,pass,0


View File

@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
DALLE2_pytorch,model_fail_to_load,0
DALLE2_pytorch,eager_fail_to_run,0
@ -338,4 +338,4 @@ vision_maskrcnn,pass,28
yolov3,pass,2
yolov3,pass,0


View File

@ -350,4 +350,4 @@ vision_maskrcnn,fail_to_run,0
yolov3,fail_to_run,0
yolov3,pass,0


View File

@ -98,7 +98,7 @@ hf_Bert_large,pass,6
hf_BigBird,pass,52
hf_BigBird,pass,49


View File

@ -374,4 +374,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -282,4 +282,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -10,7 +10,7 @@ Background_Matting,pass_due_to_skip,0
DALLE2_pytorch,model_fail_to_load,0
DALLE2_pytorch,eager_fail_to_run,0
@ -298,4 +298,4 @@ vision_maskrcnn,pass,28
yolov3,pass,2
yolov3,pass,0


View File

@ -374,4 +374,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -282,4 +282,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -378,4 +378,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -286,4 +286,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -378,4 +378,4 @@ vision_maskrcnn,pass,17
yolov3,pass,2
yolov3,pass,0


View File

@ -286,4 +286,4 @@ vision_maskrcnn,pass,34
yolov3,pass,9
yolov3,pass,8


View File

@ -2,12 +2,17 @@ import argparse
import csv
import dataclasses
import os
import time
from generate import run_llama2_7b_bf16, run_llama2_7b_int8, run_mixtral_8x7b_int8
from triton.testing import do_bench
import torch
import torch.nn as nn
from torch.utils.flop_counter import FlopCounterMode
WARMUP_ITER = 5
A100_80G_BF16_TFLOPS = 312
@dataclasses.dataclass
@ -16,59 +21,191 @@ class Experiment:
metric: str
target: float
actual: float
dtype: str
device: str
def do_inference(mod, x, num_samples: int = 5):
total_time = 0
start = -1
for i in range(start, num_samples):
torch.cuda.synchronize("cuda")
t0 = time.perf_counter()
mod(x)
if i == -1:
print(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")
continue
torch.cuda.synchronize("cuda")
total_time += time.perf_counter() - t0
total_time = total_time / num_samples
return total_time
def run_multi_layer_norm():
class MultiLayerNorm(nn.Module):
def __init__(self, num_layers, normalized_shape, eps=1e-5, bias=True):
super().__init__()
self.num_layers = num_layers
self.norm_layers = nn.ModuleList(
[
nn.LayerNorm(normalized_shape, eps=eps, bias=bias)
for _ in range(num_layers)
]
)
def forward(self, x):
for layer_norm in self.norm_layers:
x = layer_norm(x)
return x
mod = MultiLayerNorm(num_layers=8, normalized_shape=4096).to("cuda")
mod = torch.compile(mod)
input = torch.randn([512, 1024, 4096], dtype=torch.bfloat16, device="cuda")
inference_time = do_inference(mod, input)
memory_bandwidth = input.numel() * input.dtype.itemsize / inference_time / 1e9
return [
Experiment(
"multi_layer_norm", "memory_bandwidth(GB/s)", 92, f"{memory_bandwidth:.02f}"
class SimpleMLP(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim, dtype):
super().__init__()
self.layers = nn.ModuleList(
[
nn.Linear(input_dim, hidden_dim, dtype=dtype),
nn.LayerNorm(hidden_dim, dtype=dtype),
nn.Linear(hidden_dim, output_dim, dtype=dtype),
nn.LayerNorm(output_dim, dtype=dtype),
]
)
]
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x
def run_mlp_layer_norm_gelu(device: str = "cuda"):
dtype_flops_utilization_map = {
torch.bfloat16: "0.71",
}
input_shapes = [1024, 4096, 8192, 16384]
intermediate_size = 14336
results = []
for dtype, expected_flops_utilization in dtype_flops_utilization_map.items():
flops_utilization = 0
for D in input_shapes:
mod = SimpleMLP(
input_dim=D, hidden_dim=intermediate_size, output_dim=D, dtype=dtype
).to(device)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
with FlopCounterMode(display=False) as mode:
mod(x)
flops = mode.get_total_flops()
compiled_mod = torch.compile(mod, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_mod(x)
us_per_iter = do_bench(lambda: compiled_mod(x)) * 1000
flops_utilization += us_per_iter * flops / 1e9 / A100_80G_BF16_TFLOPS
flops_utilization = flops_utilization / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"mlp_layer_norm_gelu_{dtype_str}",
"flops_utilization",
expected_flops_utilization,
f"{flops_utilization:.02f}",
dtype_str,
device,
)
)
return results
def run_layer_norm(device: str = "cuda"):
dtype_memory_bandwidth_map = {
torch.bfloat16: "1017",
}
input_shapes = [1024, 4096, 8192, 16384]
BS = 4096
results = []
for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
memory_bandwidth = 0
for D in input_shapes:
mod = nn.LayerNorm(D).to(device)
x = torch.randn(BS, D, device=device, dtype=dtype)
compiled_mod = torch.compile(mod, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_mod(x)
us_per_iter = do_bench(lambda: compiled_mod(x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * BS * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"layer_norm_{dtype_str}",
"memory_bandwidth(GB/s)",
expected_memory_bandwidth,
f"{memory_bandwidth:.02f}",
dtype_str,
device,
)
)
return results
@torch._inductor.config.patch(coordinate_descent_tuning=True)
def run_gather_gemv(device: str = "cuda"):
E = 8
dtype_memory_bandwidth_map = {
torch.int8: "1113",
torch.bfloat16: "1249",
}
input_shapes = [1024, 4096, 8192, 16384]
results = []
for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
memory_bandwidth = 0
for D in input_shapes:
def gather_gemv(W, score_idxs, x):
return W[score_idxs].to(x.dtype) @ x
W = torch.randn(E, D, D, device=device).to(dtype=dtype)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
score_idxs = torch.tensor([3, 5], device=device)
compiled_fn = torch.compile(gather_gemv, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_fn(W, score_idxs, x)
us_per_iter = do_bench(lambda: compiled_fn(W, score_idxs, x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * 2 * D * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"gather_gemv_{dtype_str}",
"memory_bandwidth(GB/s)",
expected_memory_bandwidth,
f"{memory_bandwidth:.02f}",
dtype_str,
device,
)
)
return results
@torch._inductor.config.patch(coordinate_descent_tuning=True)
def run_gemv(device: str = "cuda"):
dtype_memory_bandwidth_map = {
torch.int8: "990",
torch.bfloat16: "1137",
}
input_shapes = [1024, 4096, 8192, 16384]
results = []
for dtype, expected_memory_bandwidth in dtype_memory_bandwidth_map.items():
memory_bandwidth = 0
for D in input_shapes:
def gemv(W, x):
return W.to(x.dtype) @ x
W = torch.randn(D, D, device=device).to(dtype=dtype)
x = torch.randn(D, device=device, dtype=torch.bfloat16)
compiled_fn = torch.compile(gemv, dynamic=False)
for _ in range(WARMUP_ITER):
compiled_fn(W, x)
us_per_iter = do_bench(lambda: compiled_fn(W, x)) * 1000
memory_bandwidth += (1e6 / us_per_iter) * D * D * dtype.itemsize / 1e9
memory_bandwidth = memory_bandwidth / len(input_shapes)
dtype_str = str(dtype).replace("torch.", "")
results.append(
Experiment(
f"gemv_{dtype_str}",
"memory_bandwidth(GB/s)",
expected_memory_bandwidth,
f"{memory_bandwidth:.02f}",
dtype_str,
device,
)
)
return results
def output_csv(output_file, headers, row):
@ -100,7 +237,10 @@ all_experiments = {
run_llama2_7b_int8,
run_mixtral_8x7b_int8,
# A list of micro-benchmarks.
run_multi_layer_norm,
run_mlp_layer_norm_gelu,
run_layer_norm,
run_gather_gemv,
run_gemv,
}

View File

@ -172,8 +172,8 @@ def run_experiment(
max_new_tokens: int = 200,
top_k: int = 200,
temperature: float = 0.8,
device: str = "cuda",
) -> None:
device = "cuda"
print(f"Loading model {x.name}")
t0 = time.time()
model = _load_model(x)
@ -221,7 +221,7 @@ def run_experiment(
# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
def run_llama2_7b_bf16():
def run_llama2_7b_bf16(device: str = "cuda"):
from benchmark import Experiment
model = GPTModelConfig(
@ -235,22 +235,26 @@ def run_llama2_7b_bf16():
token_per_sec, memory_bandwidth = run_experiment(model)
return [
Experiment(
"llama2_7b_bf16",
model.name,
"token_per_sec",
model.token_per_sec,
f"{token_per_sec:.02f}",
model.mode,
device,
),
Experiment(
"llama2_7b_bf16",
model.name,
"memory_bandwidth(GB/s)",
model.memory_bandwidth,
f"{memory_bandwidth:.02f}",
model.mode,
device,
),
]
# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
def run_llama2_7b_int8():
def run_llama2_7b_int8(device: str = "cuda"):
from benchmark import Experiment
model = GPTModelConfig(
@ -264,22 +268,26 @@ def run_llama2_7b_int8():
token_per_sec, memory_bandwidth = run_experiment(model)
return [
Experiment(
"llama2_7b_int8",
model.name,
"token_per_sec",
model.token_per_sec,
f"{token_per_sec:.02f}",
model.mode,
device,
),
Experiment(
"llama2_7b_int8",
model.name,
"memory_bandwidth(GB/s)",
model.memory_bandwidth,
f"{memory_bandwidth:.02f}",
model.mode,
device,
),
]
# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
def run_mixtral_8x7b_int8():
def run_mixtral_8x7b_int8(device: str = "cuda"):
from benchmark import Experiment
# We reduced the original number of layers from 32 to 16 to fit within the CI memory limitation.
@ -294,15 +302,19 @@ def run_mixtral_8x7b_int8():
token_per_sec, memory_bandwidth = run_experiment(model)
return [
Experiment(
"mixtral_8x7b_int8",
model.name,
"token_per_sec",
model.token_per_sec,
f"{token_per_sec:.02f}",
model.mode,
device,
),
Experiment(
"mixtral_8x7b_int8",
model.name,
"memory_bandwidth(GB/s)",
model.memory_bandwidth,
f"{memory_bandwidth:.02f}",
model.mode,
device,
),
]

View File

@ -501,6 +501,7 @@ libtorch_distributed_base_sources = [
"torch/csrc/distributed/c10d/ProcessGroupMPI.cpp",
"torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp",
"torch/csrc/distributed/c10d/Store.cpp",
"torch/csrc/distributed/c10d/SymmetricMemory.cpp",
"torch/csrc/distributed/c10d/TCPStore.cpp",
"torch/csrc/distributed/c10d/TCPStoreBackend.cpp",
"torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp",
@ -684,6 +685,7 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/c10d/UCCUtils.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",

View File

@ -18,14 +18,17 @@
} \
} while (0)
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
#define C10_LIBCUDA_DRIVER_API(_) \
_(cuMemAddressReserve) \
_(cuMemRelease) \
_(cuMemMap) \
_(cuMemAddressFree) \
_(cuMemSetAccess) \
_(cuMemUnmap) \
_(cuMemCreate) \
_(cuMemGetAllocationGranularity) \
_(cuMemExportToShareableHandle) \
_(cuMemImportFromShareableHandle) \
_(cuGetErrorString)
#define C10_NVML_DRIVER_API(_) \

View File

@ -64,6 +64,25 @@
#define C10_ASAN_ENABLED 0
#endif
// Detect undefined-behavior sanitizer (UBSAN)
#undef C10_UBSAN_ENABLED
// for clang or gcc >= 14
// NB: gcc 14 adds support for Clang's __has_feature
// https://gcc.gnu.org/gcc-14/changes.html
// gcc < 14 doesn't have a macro for UBSAN
// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc)
// https://github.com/google/sanitizers/issues/765
#if defined(__has_feature)
#if ((__has_feature(undefined_behavior_sanitizer)))
#define C10_UBSAN_ENABLED 1
#endif
#endif
#if !defined(C10_UBSAN_ENABLED)
#define C10_UBSAN_ENABLED 0
#endif
// Disable the copy and assignment operator for a class. Note that this will
// disable the usage of the class in std containers.
#define C10_DISABLE_COPY_AND_ASSIGN(classname) \

View File

@ -2,6 +2,7 @@
#include <c10/core/SymInt.h>
#include <c10/core/SymNodeImpl.h>
#include <c10/macros/Macros.h>
using namespace c10;
#ifndef C10_MOBILE
@ -22,6 +23,8 @@ TEST(SymIntTest, CheckRange) {
EXPECT_FALSE(SymInt::check_range(INT64_MIN));
}
#if !C10_UBSAN_ENABLED
// This test fails signed-integer-overflow UBSAN check
TEST(SymIntTest, Overflows) {
const auto x = SymInt(INT64_MAX);
EXPECT_NE(-(x + 1), 0);
@ -30,5 +33,6 @@ TEST(SymIntTest, Overflows) {
EXPECT_NE(-y, 0);
EXPECT_NE(0 - y, 0);
}
#endif
#endif

View File

@ -1,7 +1,7 @@
#pragma once
#include <c10/macros/Macros.h>
#include <cstring>
#include <cstdint>
#include <limits>
C10_CLANG_DIAGNOSTIC_PUSH()

View File

@ -15,9 +15,7 @@
/// and inspired by Half implementation from pytorch/c10/util/Half.h
#include <c10/macros/Macros.h>
#include <c10/util/TypeSafeSignMath.h>
#include <c10/util/floating_point_utils.h>
#include <type_traits>
#if defined(__cplusplus)
#include <cmath>
@ -32,16 +30,7 @@
#endif
#include <climits>
#include <cstdint>
#include <cstring>
#include <iosfwd>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <typeinfo> // operator typeid
#include <iostream>
namespace c10 {

View File

@ -1,3 +1,4 @@
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e4m3fnuz.h>
namespace c10 {

View File

@ -17,8 +17,8 @@
/// Implementation based on the paper https://arxiv.org/pdf/2206.02915.pdf and
/// the existing Float8_e4m3fn implementation.
#include <c10/macros/Export.h>
#include <c10/macros/Macros.h>
#include <c10/util/TypeSafeSignMath.h>
#include <c10/util/floating_point_utils.h>
#include <type_traits>

View File

@ -235,7 +235,7 @@ class numeric_limits<c10::Float8_e5m2> {
static constexpr bool is_specialized = true;
static constexpr bool is_exact = false;
static constexpr bool has_infinity = true;
static constexpr bool has_quiet_NaN = false;
static constexpr bool has_quiet_NaN = true;
static constexpr bool has_signaling_NaN = false;
static constexpr auto has_denorm = true;
static constexpr auto has_denorm_loss = true;
@ -273,6 +273,9 @@ class numeric_limits<c10::Float8_e5m2> {
static constexpr c10::Float8_e5m2 infinity() {
return c10::Float8_e5m2(0x7C, c10::Float8_e5m2::from_bits());
}
static constexpr c10::Float8_e5m2 quiet_NaN() {
return c10::Float8_e5m2(0x7F, c10::Float8_e5m2::from_bits());
}
static constexpr c10::Float8_e5m2 denorm_min() {
return c10::Float8_e5m2(0x01, c10::Float8_e5m2::from_bits());
}

View File

@ -3,7 +3,7 @@
namespace c10 {
static_assert(
std::is_standard_layout<Float8_e5m2>::value,
std::is_standard_layout_v<Float8_e5m2>,
"c10::Float8_e5m2 must be standard layout.");
} // namespace c10

View File

@ -270,6 +270,11 @@ class numeric_limits<c10::Float8_e5m2fnuz> {
static constexpr c10::Float8_e5m2fnuz infinity() {
return c10::Float8_e5m2fnuz(0x80, c10::Float8_e5m2fnuz::from_bits());
}
// TODO(future): we are mapping neg_zero to both inf and NaN; this is
// surprising, and we should figure out what to do about it.
static constexpr c10::Float8_e5m2fnuz quiet_NaN() {
return c10::Float8_e5m2fnuz(0x80, c10::Float8_e5m2fnuz::from_bits());
}
static constexpr c10::Float8_e5m2fnuz denorm_min() {
return c10::Float8_e5m2fnuz(0x01, c10::Float8_e5m2fnuz::from_bits());
}

View File

@ -1,3 +1,4 @@
#include <c10/macros/Macros.h>
#include <c10/util/Float8_e5m2fnuz.h>
namespace c10 {

View File

@ -560,6 +560,7 @@ if(USE_CUDA)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
set_source_files_properties(
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
)
endif()

View File

@ -28,8 +28,6 @@
#include <c10/macros/Macros.h>
#include "c10/util/string_utils.h"
namespace caffe2 {
// Using statements for common classes that we refer to in caffe2 very often.

View File

@ -213,9 +213,9 @@ void PyTorchStreamReader::init() {
if (version_ < static_cast<decltype(version_)>(kMinSupportedFileFormatVersion)) {
CAFFE_THROW(
"Attempted to read a PyTorch file with version ",
c10::to_string(version_),
std::to_string(version_),
", but the minimum supported version for reading is ",
c10::to_string(kMinSupportedFileFormatVersion),
std::to_string(kMinSupportedFileFormatVersion),
". Your PyTorch script module file is too old. Please regenerate it",
" with latest version of PyTorch to mitigate this issue.");
}
@ -733,7 +733,7 @@ void PyTorchStreamWriter::writeEndOfFile() {
auto allRecords = getAllWrittenRecords();
// If no ".data/version" or "version" record in the output model, rewrites version info
if(allRecords.find(".data/version") == allRecords.end() && allRecords.find("version") == allRecords.end()) {
std::string version = c10::to_string(version_);
std::string version = std::to_string(version_);
version.push_back('\n');
if (version_ >= 0x6L) {
writeRecord(".data/version", version.c_str(), version.size());

View File

@ -154,6 +154,19 @@ should now merrily print the tensor (exact output subject to randomness):
Also, make sure you specify the correct configuration in the ``cmake --build .``
line above.
System Requirements
-------------------
To ensure smooth installation and usage of LibTorch, please make sure your system meets the following requirements (a quick way to check them is shown after the list):
1. **GLIBC Version**:
- GLIBC 2.29 or newer for cxx11 ABI version
- GLIBC 2.17 or newer for pre-cxx11 ABI version
2. **GCC Version**:
- GCC 9 or newer for cxx11 and pre-cxx11 ABI versions
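A quick way to check these versions on a Linux host; this is an optional, illustrative snippet rather than part of the official instructions:

.. code-block:: python

import platform
import subprocess

# GLIBC version of the running interpreter's C library, e.g. ('glibc', '2.31')
print(platform.libc_ver())

# First line of `gcc --version`, e.g. "gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0"
print(subprocess.run(["gcc", "--version"], capture_output=True, text=True).stdout.splitlines()[0])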
Visual Studio Extension
-----------------------

View File

@ -0,0 +1,339 @@
PyTorch 2.4: Getting Started on Intel GPU
=========================================
Support for Intel GPUs is released alongside PyTorch v2.4.
This release only supports building PyTorch from source for Intel GPUs.
Hardware Prerequisites
----------------------
.. list-table::
:header-rows: 1
* - Supported Hardware
- Intel® Data Center GPU Max Series
* - Supported OS
- Linux
PyTorch for Intel GPUs is compatible with the Intel® Data Center GPU Max Series and, in release 2.4, supports Linux only.
Software Prerequisites
----------------------
As a prerequisite, install the driver and required packages by following the `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_.
Set up Environment
------------------
Before you begin, you need to set up the environment. This can be done by sourcing the ``setvars.sh`` script provided by the ``intel-for-pytorch-gpu-dev`` and ``intel-pti-dev`` packages.
.. code-block::
source ${ONEAPI_ROOT}/setvars.sh
.. note::
``ONEAPI_ROOT`` is the folder where you installed the ``intel-for-pytorch-gpu-dev`` and ``intel-pti-dev`` packages. Typically, it is located at ``/opt/intel/oneapi/`` or ``~/intel/oneapi/``.
Build from source
-----------------
Now that all the required packages are installed and the environment is activated, use the following commands to install ``pytorch``, ``torchvision``, and ``torchaudio`` by building from source. For more details, refer to the official guides in `PyTorch from source <https://github.com/pytorch/pytorch?tab=readme-ov-file#intel-gpu-support>`_, `Vision from source <https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation>`_ and `Audio from source <https://pytorch.org/audio/main/build.linux.html>`_.
.. code-block::
# Get PyTorch Source Code
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
git checkout main # or checkout the specific release version >= v2.4
git submodule sync
git submodule update --init --recursive
# Get required packages for compilation
conda install cmake ninja
pip install -r requirements.txt
# PyTorch for Intel GPUs only supports the Linux platform for now.
# Install the required packages for pytorch compilation.
conda install intel::mkl-static intel::mkl-include
# (optional) If using torch.compile with inductor/triton, install the matching version of triton
# Run from the pytorch directory after cloning
# For Intel GPU support, please explicitly `export USE_XPU=1` before running the command.
USE_XPU=1 make triton
# If you would like to compile PyTorch with new C++ ABI enabled, then first run this command:
export _GLIBCXX_USE_CXX11_ABI=1
# pytorch build from source
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
python setup.py develop
cd ..
# (optional) If using torchvision.
# Get torchvision Code
git clone https://github.com/pytorch/vision.git
cd vision
git checkout main # or specific version
python setup.py develop
cd ..
# (optional) If using torchaudio.
# Get torchaudio Code
git clone https://github.com/pytorch/audio.git
cd audio
pip install -r requirements.txt
git checkout main # or specific version
git submodule sync
git submodule update --init --recursive
python setup.py develop
cd ..
Check availability for Intel GPU
--------------------------------
.. note::
Make sure the environment is properly set up by following `Environment Set up <#set-up-environment>`_ before running the code.
To check if your Intel GPU is available, you would typically use the following code:
.. code-block::
import torch
torch.xpu.is_available() # torch.xpu is the API for Intel GPU support
If the output is ``False``, ensure that you have an Intel GPU in your system and have correctly followed the `PyTorch Installation Prerequisites for Intel GPUs <https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html>`_. Then, check that the PyTorch build completed correctly.
Minimum Code Change
-------------------
If you are migrating code from ``cuda``, you would change references from ``cuda`` to ``xpu``. For example:
.. code-block::
# CUDA CODE
tensor = torch.tensor([1.0, 2.0]).to("cuda")
# CODE for Intel GPU
tensor = torch.tensor([1.0, 2.0]).to("xpu")
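For code that needs to run on either backend, a common pattern is to select the device at runtime instead of hard-coding the string. Below is a minimal sketch; the ``get_device`` helper is illustrative, not a PyTorch API:
.. code-block::
import torch
def get_device() -> torch.device:
    # Prefer an Intel GPU, then CUDA, then fall back to CPU.
    if torch.xpu.is_available():
        return torch.device("xpu")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")
device = get_device()
tensor = torch.tensor([1.0, 2.0]).to(device)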
The following points outline the support and limitations for PyTorch with Intel GPU:
#. Both training and inference workflows are supported.
#. Both eager mode and ``torch.compile`` are supported.
#. The FP32, BF16, and FP16 data types are supported, as is Automatic Mixed Precision (AMP).
#. Models that depend on third-party components will not be supported until PyTorch v2.5 or later.
Examples
--------
This section contains usage examples for both inference and training workflows.
Inference Examples
^^^^^^^^^^^^^^^^^^
Here are a few inference workflow examples.
Inference with FP32
"""""""""""""""""""
.. code-block::
import torch
import torchvision.models as models
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
data = torch.rand(1, 3, 224, 224)
######## code changes #######
model = model.to("xpu")
data = data.to("xpu")
######## code changes #######
with torch.no_grad():
model(data)
print("Execution finished")
Inference with AMP
""""""""""""""""""
.. code-block::
import torch
import torchvision.models as models
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
data = torch.rand(1, 3, 224, 224)
#################### code changes #################
model = model.to("xpu")
data = data.to("xpu")
#################### code changes #################
with torch.no_grad():
############################# code changes #####################
# set dtype=torch.bfloat16 for BF16
with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=True):
############################# code changes #####################
model(data)
print("Execution finished")
Inference with ``torch.compile``
""""""""""""""""""""""""""""""""
.. code-block::
import torch
import torchvision.models as models
model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
model.eval()
data = torch.rand(1, 3, 224, 224)
ITERS = 10
######## code changes #######
model = model.to("xpu")
data = data.to("xpu")
######## code changes #######
model = torch.compile(model)
for i in range(ITERS):
with torch.no_grad():
model(data)
print("Execution finished")
Training Examples
^^^^^^^^^^^^^^^^^
Here are a few training workflow examples.
Train with FP32
"""""""""""""""
.. code-block::
import torch
import torchvision
LR = 0.001
DOWNLOAD = True
DATA = "datasets/cifar10/"
transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize((224, 224)),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_dataset = torchvision.datasets.CIFAR10(
root=DATA,
train=True,
transform=transform,
download=DOWNLOAD,
)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
model.train()
######################## code changes #######################
model = model.to("xpu")
criterion = criterion.to("xpu")
######################## code changes #######################
for batch_idx, (data, target) in enumerate(train_loader):
########## code changes ##########
data = data.to("xpu")
target = target.to("xpu")
########## code changes ##########
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
print(batch_idx)
torch.save(
{
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
"checkpoint.pth",
)
print("Execution finished")
Train with AMP
""""""""""""""
.. code-block::
import torch
import torchvision
LR = 0.001
DOWNLOAD = True
DATA = "datasets/cifar10/"
use_amp=True
transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize((224, 224)),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
)
train_dataset = torchvision.datasets.CIFAR10(
root=DATA,
train=True,
transform=transform,
download=DOWNLOAD,
)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128)
model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9)
scaler = torch.amp.GradScaler(enabled=use_amp)
model.train()
######################## code changes #######################
model = model.to("xpu")
criterion = criterion.to("xpu")
######################## code changes #######################
for batch_idx, (data, target) in enumerate(train_loader):
########## code changes ##########
data = data.to("xpu")
target = target.to("xpu")
########## code changes ##########
# set dtype=torch.bfloat16 for BF16
with torch.autocast(device_type="xpu", dtype=torch.float16, enabled=use_amp):
output = model(data)
loss = criterion(output, target)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
print(batch_idx)
torch.save(
{
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
"checkpoint.pth",
)
print("Execution finished")

View File

@ -140,7 +140,7 @@ struct TensorQueue : torch::CustomClassHolder {
for (const auto index : c10::irange(queue_size)) {
at::Tensor val;
queue_[index] = dict.at(key + "/" + c10::to_string(index));
queue_[index] = dict.at(key + "/" + std::to_string(index));
queue_.push_back(val);
}
}
@ -152,7 +152,7 @@ struct TensorQueue : torch::CustomClassHolder {
dict.insert(
key + "/size", torch::tensor(static_cast<int64_t>(queue_.size())));
for (const auto index : c10::irange(queue_.size())) {
dict.insert(key + "/" + c10::to_string(index), queue_[index]);
dict.insert(key + "/" + std::to_string(index), queue_[index]);
}
return dict;
}

View File

@ -6,6 +6,7 @@
#include <ATen/core/interned_strings.h>
#include <ATen/core/ivalue.h>
#include <ATen/core/jit_type_base.h>
#include <c10/macros/Macros.h>
#include <test/cpp/jit/test_utils.h>
#include <torch/csrc/jit/passes/remove_mutation.h>
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
@ -491,13 +492,7 @@ TEST(ControlFlowTest, Basic) {
ASSERT_EQ(256, run_binary("while_test", 2, 0));
}
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define HAS_ASANUBSAN 1
#endif
#endif
#ifndef HAS_ASANUBSAN
#if !(C10_ASAN_ENABLED || C10_UBSAN_ENABLED)
// This test fails vptr UBSAN checks
TEST(ProtoTest, Basic) {

View File

@ -2,8 +2,9 @@
import unittest
from collections import deque, OrderedDict
from contextlib import ContextDecorator
from contextlib import ContextDecorator, contextmanager, nullcontext
from copy import deepcopy
from functools import partial
from typing import Tuple
import torch
@ -11,6 +12,7 @@ import torch.nn as nn
from torch.distributed._composable import checkpoint
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.utils.checkpoint import CheckpointError
class MemoryDelta(ContextDecorator):
@ -68,7 +70,7 @@ class MultiOutputModel(nn.Module):
self.w1 = nn.Parameter(torch.randn((100, 100), device=device))
self.w2 = nn.Parameter(torch.randn((100, 100), device=device))
def forward(self, x: torch.Tensor) -> torch.Tensor:
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
z = x @ self.w1
z = nn.functional.relu(z)
z = z @ self.w2
@ -219,6 +221,116 @@ class TestCheckpoint(TestCase):
self.assertEqual(None, checkpoint.state(m)._ac_generator)
def test_checkpoint_kwargs(self):
class MyModel(torch.nn.Module):
def __init__(self, raise_exp: bool, change_shape_in_recomp: bool):
super().__init__()
self.fwd_count = 0
self.raise_exp = raise_exp
self.change_shape_in_recomp = change_shape_in_recomp
self.a = torch.nn.Linear(2, 2)
def forward(self, x):
if self.raise_exp and self.fwd_count == 0:
raise RuntimeError("foo")
if self.raise_exp and self.fwd_count == 1:
raise RuntimeError("bar")
if self.change_shape_in_recomp and self.fwd_count == 1:
x.relu_()
random_tensor = torch.randn(1, 2)
x = self.a(x + random_tensor)
self.fwd_count += 1
return x
m = MyModel(True, False)
m0, m1, m2, m3 = (deepcopy(m) for _ in range(4))
# composable checkpoint does not support use_reentrant=True
with self.assertRaisesRegex(
NotImplementedError,
"use_reentrant=True is not supported in composable checkpoint. "
"Please use torch.utils.checkpoint.checkpoint instead.",
):
checkpoint(m, use_reentrant=True)
# check giving an unsupported kwarg
with self.assertRaisesRegex(ValueError, "Unexpected keyword arguments: foo"):
checkpoint(m0, foo="bar")
handled_fwd_exp = False
handled_recomp_exp = False
@contextmanager
def fwd_ctx(mod: MyModel):
try:
mod.raise_exp = False
yield
finally:
nonlocal handled_fwd_exp
handled_fwd_exp = True
mod.raise_exp = True
@contextmanager
def recomp_ctx(mod: MyModel):
try:
mod.raise_exp = False
yield
finally:
nonlocal handled_recomp_exp
handled_recomp_exp = True
mod.raise_exp = True
# Test different context functions
x = torch.randn(1, 2, requires_grad=True)
checkpoint(
m1, context_fn=lambda: (partial(fwd_ctx, m1)(), partial(recomp_ctx, m1)())
)
m1(x.clone()).sum().backward()
self.assertEqual((handled_fwd_exp, handled_recomp_exp), (True, True))
checkpoint(m2, context_fn=lambda: (nullcontext(), partial(recomp_ctx, m2)()))
with self.assertRaisesRegex(RuntimeError, "foo"):
m2(x.clone())
handled_fwd_exp = False # Reset flag
checkpoint(m3, context_fn=lambda: (partial(fwd_ctx, m3)(), nullcontext()))
with self.assertRaisesRegex(RuntimeError, "bar"):
m3(x.clone()).sum().backward()
self.assertEqual(handled_fwd_exp, True)
# Test determinism check failure
m4 = MyModel(False, True)
m5 = deepcopy(m4)
# Determinism check should not throw an error,
# but autograd should throw a RuntimeError
checkpoint(m4, determinism_check="none")
with self.assertRaises(RuntimeError):
m4(x.clone()).sum().backward()
# Determinism check should throw a CheckpointError
checkpoint(m5, determinism_check="default")
with self.assertRaises(CheckpointError):
m5(x.clone()).sum().backward()
# Test preserving random state
m6 = MyModel(False, False)
m7, m8 = (deepcopy(m6) for _ in range(2))
checkpoint(m7, preserve_rng_state=False)
checkpoint(m8, preserve_rng_state=True)
for mi in (m6, m7, m8):
torch.manual_seed(42)
loss = mi(x.clone()).sum()
torch.manual_seed(41)
loss.backward()
# check that m6 and m7 have at least one different grad
self.assertNotEqual(
(p1.grad for p1 in m6.parameters()), (p2.grad for p2 in m7.parameters())
)
# check that m6 and m8 have identical grads
for p1, p2 in zip(m6.parameters(), m8.parameters()):
self.assertEqual(p1.grad, p2.grad)
if __name__ == "__main__":
run_tests()

View File

@ -0,0 +1,140 @@
# Owner(s): ["module: unknown"]
from copy import copy
import torch
from torch.distributed._tools.mod_tracker import ModTracker
from torch.testing._internal.common_utils import run_tests, TestCase, xfailIfTorchDynamo
class TestModTracker(TestCase):
# "https://github.com/pytorch/pytorch/issues/127112
@xfailIfTorchDynamo
def test_module_hierarchy(self):
seen_fw = []
seen_bw = []
class Foo(torch.nn.Module):
def forward(self, x):
x = x["a"].relu_()
seen_fw.append((copy(tracker.parents), tracker.is_bw))
x.register_hook(
lambda grad: seen_bw.append((copy(tracker.parents), tracker.is_bw))
)
return {"a": torch.mm(x, x)}
class Mod(torch.nn.Module):
def __init__(self):
super().__init__()
self.a = Foo()
self.b = torch.nn.ModuleDict({"nest": Foo()})
self.c = torch.nn.ModuleList([Foo()])
def forward(self, x):
x = self.c[0](x)
return self.b["nest"](self.a(x))
mod = Mod()
with ModTracker() as tracker:
mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[
"a"
].sum().backward()
mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[
"a"
].sum().backward()
self.assertEqual(
seen_fw,
[
({"Global", "Mod", "Mod.c.0"}, False),
({"Global", "Mod", "Mod.a"}, False),
({"Global", "Mod", "Mod.b.nest"}, False),
({"Global", "Mod", "Mod.c.0"}, False),
({"Global", "Mod", "Mod.a"}, False),
({"Global", "Mod", "Mod.b.nest"}, False),
],
)
self.assertEqual(
seen_bw,
[
({"Global", "Mod", "Mod.b.nest"}, True),
({"Global", "Mod", "Mod.a"}, True),
({"Global", "Mod", "Mod.c.0"}, True),
({"Global", "Mod", "Mod.b.nest"}, True),
({"Global", "Mod", "Mod.a"}, True),
({"Global", "Mod", "Mod.c.0"}, True),
],
)
def test_bw_detection(self):
mod = torch.nn.Linear(2, 2)
with ModTracker() as tracker:
mod(torch.rand(2, requires_grad=True)).sum().backward()
self.assertFalse(tracker.is_bw)
self.assertEqual(tracker.parents, {"Global"})
@xfailIfTorchDynamo
def test_user_hooks(self):
class Bar(torch.nn.Module):
def __init__(self):
super().__init__()
self.foo = torch.nn.Linear(10, 10)
def forward(self, x):
return self.foo(x).relu_()
mt = ModTracker()
test_op = []
def hook(mod, hook_name):
mfqn = mt.get_known_fqn(mod) if mod is not None else None
test_op.append((hook_name, mfqn, mfqn in mt.parents, mt.is_bw))
mod = Bar()
mt.register_user_hooks(
lambda m, inp: hook(m, "pre_fw"),
lambda m, inp, op: hook(m, "post_fw"),
lambda m, gop: hook(m, "pre_bw"),
lambda m, ginp: hook(m, "post_bw"),
)
with mt:
mod(torch.rand(10, 10, requires_grad=True)).sum().backward()
expected_op = [
("pre_fw", "Bar", True, False),
("pre_fw", "Bar.foo", True, False),
("post_fw", "Bar.foo", True, False),
("post_fw", "Bar", True, False),
("pre_bw", "Bar", True, True),
("pre_bw", "Bar.foo", True, True),
("post_bw", "Bar", True, True),
("post_bw", "Bar.foo", True, True),
]
self.assertEqual(test_op, expected_op)
with self.assertRaises(AssertionError):
mt.register_user_hooks(lambda x, y: x, None, None, None)
test_op.clear()
with mt:
loss = mod(torch.rand(10, 10, requires_grad=True)).sum()
del mod
loss.backward()
expected_op = [
("pre_fw", "Bar", True, False),
("pre_fw", "Bar.foo", True, False),
("post_fw", "Bar.foo", True, False),
("post_fw", "Bar", True, False),
("pre_bw", None, False, True),
("pre_bw", None, False, True),
("post_bw", None, False, True),
("post_bw", None, False, True),
]
self.assertEqual(test_op, expected_op)
if __name__ == "__main__":
run_tests()

View File

@ -16,10 +16,7 @@ import torch.distributed.distributed_c10d as c10d
import torch.distributed.rpc as rpc
from torch.distributed import DistError, DistNetworkError, DistStoreError
from torch.testing._internal.common_distributed import MultiThreadedTestCase
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.common_utils import instantiate_parametrized_tests
if not dist.is_available():
print("torch.distributed not available, skipping tests", file=sys.stderr)
@ -841,19 +838,11 @@ class TestPythonStore(TestCase):
class TestMultiThreadedWait(MultiThreadedTestCase):
# TODO (xilunwu): Use less hacky means of instantiating stores.
# Note, stores accumulate values per test.
stores = [
dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1),
dist.HashStore(),
dist.PrefixStore(
"pre", dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1)
),
create_tcp_store(use_libuv=False),
create_tcp_store(use_libuv=True),
dist.PrefixStore("pre", create_tcp_store(use_libuv=False)),
dist.PrefixStore("pre", create_tcp_store(use_libuv=True)),
]
file_store = dist.FileStore(tempfile.NamedTemporaryFile(delete=False).name, 1)
hash_store = dist.HashStore()
tcp_store = create_tcp_store(use_libuv=False)
tcp_store_uv = create_tcp_store(use_libuv=True)
@property
def world_size(self):
@ -863,10 +852,7 @@ class TestMultiThreadedWait(MultiThreadedTestCase):
super().setUp()
self._spawn_threads()
# Iterates over self.stores, keep 7 in sync with len(self.stores).
@parametrize("i", range(7))
def test_wait(self, i):
store = self.stores[i]
def _test_wait(self, store):
store.set_timeout(timedelta(seconds=2))
if dist.get_rank() == 0:
store.wait(["key1"])
@ -874,6 +860,39 @@ class TestMultiThreadedWait(MultiThreadedTestCase):
if dist.get_rank() == 1:
store.set("key1", "value1")
def test_wait_hash_store(self):
self._test_wait(self.hash_store)
def test_wait_file_store(self):
self._test_wait(self.file_store)
def test_wait_prefix_file_store(self):
store = dist.PrefixStore("pre", self.file_store)
self._test_wait(store)
def _test_wait_tcp_store(self, master_store):
store = (
master_store
if dist.get_rank() == 0
else dist.TCPStore(
host_name=master_store.host,
port=master_store.port,
is_master=False,
wait_for_workers=False,
use_libuv=False,
)
)
self._test_wait(store)
prefix_store = dist.PrefixStore("pre", store)
self._test_wait(prefix_store)
def test_wait_tcp_store(self):
self._test_wait_tcp_store(self.tcp_store)
def test_wait_tcp_store_uv(self):
self._test_wait_tcp_store(self.tcp_store_uv)
instantiate_parametrized_tests(TestMultiThreadedWait)

View File

@ -0,0 +1,156 @@
# Owner(s): ["module: c10d"]
import torch
import torch.distributed as dist
from torch._C._distributed_c10d import _SymmetricMemory
from torch.distributed.distributed_c10d import _get_process_group_store
from torch.testing._internal.common_distributed import (
MultiProcessTestCase,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
run_tests,
skip_but_pass_in_sandcastle_if,
skipIfRocm,
)
def requires_cuda_p2p_access():
cuda_p2p_access_available = (
torch.cuda.is_available() and torch.cuda.device_count() >= 2
)
num_devices = torch.cuda.device_count()
for i in range(num_devices - 1):
for j in range(i + 1, num_devices):
if not torch.cuda.can_device_access_peer(i, j):
cuda_p2p_access_available = False
break
if not cuda_p2p_access_available:
break
return skip_but_pass_in_sandcastle_if(
not cuda_p2p_access_available,
"cuda p2p access is not available",
)
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class SymmetricMemoryTest(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
self._spawn_processes()
@property
def world_size(self) -> int:
return 2
@property
def device(self) -> torch.device:
return torch.device(f"cuda:{self.rank}")
def _init_process(self):
torch.cuda.set_device(self.device)
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend="nccl",
world_size=self.world_size,
rank=self.rank,
store=store,
)
_SymmetricMemory.set_group_info(
"0",
self.rank,
self.world_size,
_get_process_group_store(dist.GroupMember.WORLD),
)
def _verify_symmetric_memory(self, symm_mem):
self.assertEqual(symm_mem.world_size, 2)
buf = symm_mem.get_buffer(0, (64, 64), torch.float32)
if symm_mem.rank == 0:
symm_mem.wait_signal(src_rank=1)
self.assertTrue(buf.eq(42).all())
else:
buf.fill_(42)
symm_mem.put_signal(dst_rank=0)
symm_mem.barrier()
if symm_mem.rank == 0:
symm_mem.barrier()
self.assertTrue(buf.eq(43).all())
else:
buf.fill_(43)
symm_mem.barrier()
symm_mem.barrier()
@skipIfRocm
@skip_if_lt_x_gpu(2)
def test_empty_strided_p2p(self) -> None:
self._init_process()
shape = (64, 64)
stride = (64, 1)
dtype = torch.float32
device = self.device
group_name = "0"
alloc_args = (shape, stride, dtype, device, group_name)
t = torch.empty(shape, dtype=dtype, device=device)
with self.assertRaises(RuntimeError):
_SymmetricMemory.rendezvous(t)
t = _SymmetricMemory.empty_strided_p2p(*alloc_args)
symm_mem = _SymmetricMemory.rendezvous(t)
del t
self._verify_symmetric_memory(symm_mem)
@skipIfRocm
@skip_if_lt_x_gpu(2)
def test_empty_strided_p2p_persistent(self) -> None:
self._init_process()
shape = (64, 64)
stride = (64, 1)
dtype = torch.float32
device = self.device
alloc_id = 42 # Persistent allocation
group_name = "0"
alloc_args = (shape, stride, dtype, device, group_name, alloc_id)
t = _SymmetricMemory.empty_strided_p2p(*alloc_args)
data_ptr = t.data_ptr()
# Verify that persistent allocation would fail if there's an active
# allocation with the same alloc_id.
with self.assertRaises(RuntimeError):
_SymmetricMemory.empty_strided_p2p(*alloc_args)
# Verify that the persistent allocation succeeds once the previously active
# allocation with the same alloc_id is freed, and that the returned tensor
# has the same data pointer.
del t
t = _SymmetricMemory.empty_strided_p2p(*alloc_args)
self.assertEqual(t.data_ptr(), data_ptr)
# Verify that get_symmetric_memory would fail if called before
# rendezvous.
with self.assertRaises(RuntimeError):
_SymmetricMemory.get_symmetric_memory(t)
symm_mem_0 = _SymmetricMemory.rendezvous(t)
symm_mem_1 = _SymmetricMemory.get_symmetric_memory(t)
self.assertEqual(id(symm_mem_0), id(symm_mem_1))
self._verify_symmetric_memory(symm_mem_0)
if __name__ == "__main__":
run_tests()

View File

@ -27,6 +27,8 @@ from torch._dynamo.testing import (
normalize_gm,
)
from torch._dynamo.utils import ifdynstaticdefault, same
from torch._dynamo.variables import ConstantVariable
from torch._dynamo.variables.lists import RangeVariable
from torch.nn import functional as F
from torch.testing._internal.common_utils import (
@ -2369,6 +2371,157 @@ class GraphModule(torch.nn.Module):
opt_fn = torch._dynamo.optimize(nopython=True)(fn)
self.assertEqual(opt_fn(), fn())
def gen_random_range_args(self):
args_count = random.randint(1, 3)
args = [random.randint(-10, 10) for _ in range(args_count)]
if args_count == 3 and args[2] == 0:
args[2] = 1
return args
def test_range_length(self):
def test(*args, expected=None):
r = range(*args)
range_variable = RangeVariable([ConstantVariable.create(v) for v in args])
self.assertEqual(len(r), range_variable.range_length())
if expected is not None:
self.assertEqual(len(r), expected)
test(1, 1, 1, expected=0)
test(1, 0, expected=0)
test(-10, expected=0)
test(4, expected=4)
test(10, expected=10)
# step >1
test(1, 10, 2, expected=5)
# negative step
test(10, 1, -1, expected=9)
test(10, 1, -3)
# Fuzz testing
for i in range(100):
args = self.gen_random_range_args()
print("testing :", args)
test(*args)
def test_indexed_range(self):
def test(range, index, expected=None):
range_variable = RangeVariable(
[
ConstantVariable.create(v)
for v in [range.start, range.stop, range.step]
]
)
self.assertEqual(
range[index],
range_variable.apply_index(index).as_python_constant(),
)
if expected is not None:
self.assertEqual(range[index], expected)
test(range(10), 1, expected=1)
test(range(10, 20, 2), 1, expected=12)
# Fuzz testing
for i in range(100):
range_args = self.gen_random_range_args()
r = range(*range_args)
if len(r) == 0:
continue
index = random.randint(0, len(r) - 1)
print("testing:", r, index)
test(r, index)
def test_sliced_range(self):
def test(range, slice, expected=None):
range_variable = RangeVariable(
[
ConstantVariable.create(v)
for v in [range.start, range.stop, range.step]
]
)
self.assertEqual(
range[slice],
range_variable.apply_slice(slice).as_python_constant(),
)
if expected is not None:
self.assertEqual(
range[slice],
expected,
)
test(range(10), slice(1, 10, 2), expected=range(1, 10, 2))
test(range(10), slice(None, 10, None), expected=range(0, 10))
test(range(10), slice(-1, 7, None), expected=range(9, 7))
test(range(10), slice(-1, 7, 2), expected=range(9, 7, 2))
test(range(1, 10, 2), slice(3, 7, 2), expected=range(7, 11, 4))
test(range(1, 10, 2), slice(-3, 7, 2), expected=range(5, 11, 4))
test(range(-1, -5, -3), slice(5, None, -3), expected=range(-4, 2, 9))
def rand_slice():
def flip_coin():
# 1 out of 10
return random.randint(1, 10) == 5
def r_item(allow_zero=True):
i = random.randint(-10, 10)
if not allow_zero and i == 0:
i = 1
if flip_coin():
i = None
return i
arg_count = random.randint(1, 3)
if arg_count == 1:
return slice(r_item())
elif arg_count == 2:
return slice(r_item(), r_item())
else:
return slice(r_item(), r_item(), r_item(False))
# Fuzz testing
for i in range(100):
range_args = self.gen_random_range_args()
r = range(*range_args)
# generate random slice
s = rand_slice()
print("testing:", r, s)
test(r, s)
def test_range_with_slice_index(self):
def fn(x):
acc = 1
for k in range(2)[1::2]:
acc *= acc * k
return x * acc
opt_fn = torch.compile(fullgraph=True)(fn)
x = torch.ones(1)
self.assertEqual(opt_fn(x), fn(x))
def test_range_with_index(self):
def fn(x):
acc = 1
acc *= acc * range(10, 20, 2)[2]
return x * acc
opt_fn = torch.compile(fullgraph=True)(fn)
x = torch.ones(1)
self.assertEqual(opt_fn(x), fn(x))
def test_rand_inlined(self):
@torch.compile(backend="eager", dynamic=True)
def fn():

View File

@ -45,7 +45,8 @@ def check_dynamic_shape_capture():
def count_ops(gm, args, freq, op):
assert [node.target for node in gm.graph.nodes].count(op) == freq
actual = [node.target for node in gm.graph.nodes].count(op)
assert actual == freq, f"expected={freq}, actual={actual}"
return gm
@ -6049,9 +6050,7 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
y = torch.randn(4, 4, requires_grad=True)
fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
bw_compiler = functools.partial(
count_ops, freq=3, op=torch.ops.aten.mm.default
) # mm recomputed in the bwd
bw_compiler = functools.partial(count_ops, freq=2, op=torch.ops.aten.mm.default)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(fn, backend, x, y)
@ -6074,9 +6073,7 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
y = torch.randn(4, 4, requires_grad=True)
fw_compiler = functools.partial(count_ops, freq=1, op=torch.ops.aten.mm.default)
bw_compiler = functools.partial(
count_ops, freq=3, op=torch.ops.aten.mm.default
) # mm recomputed in the bwd
bw_compiler = functools.partial(count_ops, freq=2, op=torch.ops.aten.mm.default)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(fn, backend, x, y)
@ -6097,8 +6094,9 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
fw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.rngprims.philox_rand.default
)
# philox_rand is passed from fwd
bw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.rngprims.philox_rand.default
count_ops, freq=0, op=torch.ops.rngprims.philox_rand.default
)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(
@ -6178,8 +6176,9 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
fw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.aten.sigmoid.default
)
# sigmoid passed from fwd
bw_compiler = functools.partial(
count_ops, freq=1, op=torch.ops.aten.sigmoid.default
count_ops, freq=0, op=torch.ops.aten.sigmoid.default
)
backend = aot_autograd(fw_compiler=fw_compiler, bw_compiler=bw_compiler)
self._validate(fn, backend, x)

View File

@ -574,6 +574,23 @@ class MiscTests(torch._inductor.test_case.TestCase):
cleanup_op("mylib::foo")
del lib
def test_auto_functionalize_can_with_none_return(self):
with torch.library._scoped_library("mylib", "FRAGMENT") as lib:
lib.define("foo(Tensor x, Tensor(a!) out) -> None")
def foo_impl(x, out):
out.copy_(x)
lib.impl("foo", foo_impl, "CompositeExplicitAutograd")
x = torch.randn(3)
out = torch.zeros(3)
@torch.compile
def f(x, out):
torch.ops.mylib.foo(x, out)
f(x, out)
def test_user_defined_setattr1(self):
@torch.compile(backend="eager", fullgraph=True)
def fn(obj):
@ -10446,6 +10463,14 @@ fn
res = opt_fn(x)
self.assertEqual(ref, res)
def test_assert_size_stride(self):
x = torch.randn(2, 3, 4)
with self.assertRaisesRegex(
AssertionError,
"expected size 2==5, stride 12==9 at dim=0; expected size 3==6, stride 4==9 at dim=1; expected size 4==7, stride 1==10 at dim=2",
):
torch._C._dynamo.guards.assert_size_stride(x, (5, 6, 7), (9, 9, 10))
def test_module_dunder_dict(self):
class MyModule(torch.nn.Module):
def __init__(self):

View File

@ -2512,6 +2512,19 @@ class OptimizedModuleTest(torch._dynamo.test_case.TestCase):
self.assertEqual(eager_res, optim_res)
self.assertEqual(cnt.frame_count, 1)
def test_module_setattr(self):
models = torch.nn.Sequential(torch.nn.Linear(3, 3))
models[0].abc = False
def run():
models[0].abc = True
x = torch.randn(1, 3)
return models(x)
run = torch.compile(run, fullgraph=True)
run()
self.assertTrue(models[0].abc)
def test_assign_does_not_exist(self):
class MyModule(torch.nn.Module):
def forward(self, x):

View File

@ -8,6 +8,7 @@ import collections
import contextlib
import copy
import functools
import gc
import inspect
import itertools
import random
@ -1079,6 +1080,67 @@ class ReproTests(torch._dynamo.test_case.TestCase):
out_test.sum().backward()
self.assertEqual(leaf.grad, leaf_test.grad)
# https://github.com/pytorch/pytorch/issues/113263
def test_unpack_hooks_dont_run_during_tracing(self):
def f(x, y):
return x * y
f_compiled = torch.compile(f, backend="aot_eager")
pack_count = 0
unpack_count = 0
def pack_hook(x):
nonlocal pack_count
pack_count += 1
return x
# unpack hook shouldn't run during compilation, while we trace the forward
def unpack_hook(x):
nonlocal unpack_count
unpack_count += 1
return x
x = torch.ones(4, requires_grad=True)
y = torch.ones(4, requires_grad=False)
with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook):
out_test = f_compiled(x, y)
self.assertEqual(pack_count, 1)
self.assertEqual(unpack_count, 0)
out_test.sum().backward()
self.assertEqual(pack_count, 1)
self.assertEqual(unpack_count, 1)
# https://github.com/pytorch/pytorch/issues/113263
def test_unpack_hooks_can_be_disabled(self):
def f(x, y):
return x * y
f_compiled = torch.compile(f, backend="aot_eager")
x = torch.ones(4, requires_grad=True)
y = torch.ones(4, requires_grad=False)
with torch.autograd.graph.disable_saved_tensors_hooks("hooks are disabled"):
out_test = f_compiled(x, y)
out_test.sum().backward()
# https://github.com/pytorch/pytorch/issues/113263
def test_disabling_unpack_hooks_within_compiled_region(self):
def g(z):
with torch.autograd.graph.disable_saved_tensors_hooks("hooks are disabled"):
return z + 5
def f(x, y):
z = x * y
return g(z)
f_compiled = torch.compile(f, backend="aot_eager")
x = torch.ones(4, requires_grad=True)
y = torch.ones(4, requires_grad=False)
out_test = f_compiled(x, y)
out_test.sum().backward()
# See https://github.com/pytorch/pytorch/issues/97745
def test_gan_repro_trying_to_backward_through_the_graph_a_second_time(self):
def f(a, b):
@ -4659,6 +4721,66 @@ def forward(self, s0 : torch.SymInt, s1 : torch.SymInt, L_x_ : torch.Tensor):
self.assertEqual(type(actual), type(expected))
self.assertEqual(actual.__dict__, expected.__dict__)
def test_weakref(self):
def fn(x_weak, weight, y):
if x_weak is not None and x_weak() is not weight:
return torch.sin(y)
return torch.cos(y)
weight = torch.randn(4)
y = torch.randn(4)
x_weak = weakref.ref(weight)
ref = fn(x_weak, weight, y)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
res = opt_fn(x_weak, weight, y)
self.assertEqual(ref, res)
def test_weakref_reconstruct(self):
def fn(x_weak, weight, y):
y = torch.sin(y)
referent = x_weak()
torch._dynamo.graph_break()
if referent is not weight:
return torch.sin(y)
return torch.cos(y)
weight = torch.randn(4)
y = torch.randn(4)
x_weak = weakref.ref(weight)
ref = fn(x_weak, weight, y)
cnt = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(fn, backend=cnt)
res = opt_fn(x_weak, weight, y)
self.assertEqual(ref, res)
self.assertEqual(cnt.frame_count, 2)
def test_weakref_del(self):
def fn(x_weak, y):
x = x_weak()
if x is not None:
return torch.sin(y)
return torch.cos(y)
weight = torch.randn(4)
x_weak = weakref.ref(weight)
y = torch.randn(4)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
ref = fn(x_weak, y)
res = opt_fn(x_weak, y)
self.assertEqual(ref, res)
del weight
gc.collect()
ref = fn(x_weak, y)
res = opt_fn(x_weak, y)
self.assertEqual(ref, res)
def test_storage_resize_forward_full_graph(self):
class TestModule(torch.nn.Module):
def __init__(self):

View File

@ -1392,6 +1392,7 @@ class GraphModule(torch.nn.Module):
yield t.select(-1, 6), False
# https://github.com/pytorch/pytorch/issues/128649
yield t[2:3, 5:9], dynamic
yield t.view(-1, 15), False
def f(x):
return x * 2

View File

@ -182,10 +182,12 @@ def forward(self, x, n):
self.assertExpectedInline(
ep.graph_module.code.strip(),
"""\
def forward(self, obj_attr, x, n):
call_torchbind = torch.ops.higher_order.call_torchbind(obj_attr, 'add_tensor', x); obj_attr = None
add = torch.ops.aten.add.Tensor(x, call_torchbind); x = call_torchbind = None
return (add,)""",
def forward(self, token, obj_attr, x, n):
with_effects = torch._higher_order_ops.effects.with_effects(token, torch.ops.higher_order.call_torchbind, obj_attr, 'add_tensor', x); token = obj_attr = None
getitem = with_effects[0]
getitem_1 = with_effects[1]; with_effects = None
add = torch.ops.aten.add.Tensor(x, getitem_1); x = getitem_1 = None
return (getitem, add)""", # noqa: B950
)
def test_method_schema(self):
@ -227,10 +229,12 @@ def forward(self, x):
self.assertExpectedInline(
ep.graph_module.code.strip(),
"""\
def forward(self, obj_attr, x):
call_torchbind = torch.ops.higher_order.call_torchbind(obj_attr, 'add_tensor', x); obj_attr = None
add = torch.ops.aten.add.Tensor(x, call_torchbind); x = call_torchbind = None
return (add,)""",
def forward(self, token, obj_attr, x):
with_effects = torch._higher_order_ops.effects.with_effects(token, torch.ops.higher_order.call_torchbind, obj_attr, 'add_tensor', x); token = obj_attr = None
getitem = with_effects[0]
getitem_1 = with_effects[1]; with_effects = None
add = torch.ops.aten.add.Tensor(x, getitem_1); x = getitem_1 = None
return (getitem, add)""", # noqa: B950
)
@parametrize("pre_dispatch", [True, False])
@ -293,10 +297,12 @@ def forward(self, x, cc):
self.assertExpectedInline(
ep.graph_module.code.strip(),
"""\
def forward(self, x, cc):
call_torchbind = torch.ops.higher_order.call_torchbind(cc, 'add_tensor', x); cc = None
add = torch.ops.aten.add.Tensor(x, call_torchbind); x = call_torchbind = None
return (add,)""",
def forward(self, token, x, cc):
with_effects = torch._higher_order_ops.effects.with_effects(token, torch.ops.higher_order.call_torchbind, cc, 'add_tensor', x); token = cc = None
getitem = with_effects[0]
getitem_1 = with_effects[1]; with_effects = None
add = torch.ops.aten.add.Tensor(x, getitem_1); x = getitem_1 = None
return (getitem, add)""", # noqa: B950
)
# aot_export_function runs the program twice
# in run_functionalized_fw_and_collect_metadata and create_aot_dispatcher_function

View File

@ -198,6 +198,33 @@ def forward(self, arg0_1, arg1_1, arg2_1):
res = torch.compile(f, backend="inductor")(*inputs)
self.assertTrue(torch.allclose(res, f(*inputs)))
@unittest.skipIf(IS_WINDOWS, "Skipped on Windows!")
@skipIfNoDynamoSupport
def test_compile_inductor_external_op_return_none(self):
with torch.library._scoped_library("mylib", "FRAGMENT") as lib:
torch.library.define(
"mylib::inplace_add",
"(Tensor input, Tensor(a!) output) -> ()",
lib=lib,
)
def inplace_add(input: torch.Tensor, output: torch.Tensor) -> None:
assert input.device == output.device
output.add_(input)
lib.impl("inplace_add", inplace_add, "CompositeExplicitAutograd")
def f(x):
out = torch.empty(3)
out = torch.zeros_like(out)
torch.ops.mylib.inplace_add(x, out)
return out
inputs = (torch.randn(3),)
res = torch.compile(f, backend="inductor")(*inputs)
self.assertTrue(torch.allclose(res, f(*inputs)))
def test_compile_aot_eager_requires_grad(self):
def f(x):
torch.ops.aten._print("moo")

View File

@ -109,6 +109,7 @@ class TestFxGraphCache(TestCase):
@requires_triton()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
@parametrize("dtype", (torch.float32, torch.bfloat16))
@parametrize("dynamic", (False, True))
@ -216,6 +217,7 @@ class TestFxGraphCache(TestCase):
@requires_triton()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
@parametrize("dtype", (torch.float32, torch.float64))
@parametrize("dynamic", (False, True))
@ -255,6 +257,7 @@ class TestFxGraphCache(TestCase):
@largeTensorTest("64GB", device=GPU_TYPE)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE,))
@parametrize("dtype", (torch.float16, torch.bfloat16))
def test_cache_load_with_guards_int32_bounds(self, device, dtype):
@ -303,6 +306,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(res1, res2)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
@parametrize("dtype", (torch.float32, torch.bfloat16))
def test_cache_load_with_guards_static_bounds(self, device, dtype):
@ -346,6 +350,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(res1, res2)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@parametrize("device", (GPU_TYPE, "cpu"))
def test_constant_handling(self, device):
"""
@ -378,6 +383,7 @@ class TestFxGraphCache(TestCase):
@requires_gpu()
@requires_triton()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_higher_order_op_bypass(self):
"""
Verify that we bypass the cache when we have higher order ops.
@ -403,6 +409,7 @@ class TestFxGraphCache(TestCase):
self.assertGreater(counters["inductor"]["fxgraph_cache_bypass"], 0)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_generated_kernel_count(self):
"""
Test that we bump the generated_kernel_count metric on a cache hit.
@ -431,6 +438,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(metrics.generated_kernel_count, 2)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_clear(self):
"""
Test clearing the cache.
@ -465,6 +473,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_with_nt(self):
def gen_nt(r):
values = torch.randn(r, 16)
@ -493,6 +502,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_with_symint_non_arg_guard(self):
def fn(x, ref_id):
self_id = 22
@ -516,6 +526,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
def test_cache_guard(self):
def f(x, val):
if val > 5:
@ -740,6 +751,7 @@ class TestFxGraphCacheHashing(TestCase):
class TestUtils(TestCase):
@config.patch({"fx_graph_remote_cache": False})
def test_fresh_inductor_cache(self):
def fn(x, y):
return x + y

View File

@ -1,5 +1,6 @@
# Owner(s): ["module: inductor"]
import operator
import os
from torch._inductor.compile_worker.subproc_pool import (
raise_testexc,
@ -31,6 +32,21 @@ class TestCompileWorker(TestCase):
finally:
pool.shutdown()
def test_crash(self):
pool = SubprocPool(2)
try:
with self.assertRaises(Exception):
a = pool.submit(os._exit, 1)
a.result()
# Pool should still be usable after a crash
b = pool.submit(operator.add, 100, 1)
c = pool.submit(operator.sub, 100, 1)
self.assertEqual(b.result(), 101)
self.assertEqual(c.result(), 99)
finally:
pool.shutdown()
if __name__ == "__main__":
from torch._inductor.test_case import run_tests

View File

@ -2223,6 +2223,7 @@ known_failing_tests = {
"test_save_for_backward_inputs_are_namedtuple", # torch._dynamo.exc.Unsupported: 'skip function
"test_setitem", # AssertionError: Tensor-likes are not close!
"test_grad_nonleaf_register_hook", # IndexError: list index out of range (NB: x.grad = y where both x and y are input tensors)
"test_unpack_hooks_exec_count", # pack/unpack saved tensor hooks firing more than once
"test_scalar_grad_mixed_device", # Fake Tensors aren't propagating device properly for 0-dim grads
}

View File

@ -18,7 +18,10 @@ from torch._inductor.runtime.hints import DeviceProperties
from torch._inductor.utils import run_and_get_code
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_FLASH_ATTENTION,
SM80OrLater,
)
from torch.testing._internal.common_utils import (
DeterministicGuard,
freeze_rng_state,
@ -27,6 +30,8 @@ from torch.testing._internal.common_utils import (
TEST_WITH_ASAN,
)
from torch.testing._internal.inductor_utils import skipCUDAIf
try:
try:
import triton
@ -1239,6 +1244,47 @@ def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
tl.store(out_ptr0 + (x3), tmp2, xmask)""", # noqa: B950
)
@skipCUDAIf(not SM80OrLater, "uses bfloat16 which requires SM >= 80")
def test_int64_index_intermediate(self):
def foo(inp):
view_23 = torch.ops.aten.view.default(inp, [-1, 8192, 8192])
split_1 = torch.ops.aten.split.Tensor(view_23, 1024, 1)
view_23 = None
getitem_17 = split_1[0]
getitem_18 = split_1[1]
getitem_19 = split_1[2]
getitem_20 = split_1[3]
getitem_21 = split_1[4]
getitem_22 = split_1[5]
getitem_23 = split_1[6]
getitem_24 = split_1[7]
split_1 = None
cat_1 = torch.ops.aten.cat.default(
[
getitem_17,
getitem_18,
getitem_19,
getitem_20,
getitem_21,
getitem_22,
getitem_23,
getitem_24,
]
)
getitem_17 = (
getitem_18
) = (
getitem_19
) = getitem_20 = getitem_21 = getitem_22 = getitem_23 = getitem_24 = None
return cat_1
for mark_dynamic in [False, True]:
inp = torch.rand((65536, 8192), dtype=torch.bfloat16, device="cuda")
if mark_dynamic:
torch._dynamo.mark_dynamic(inp, 0)
foo_c = torch.compile(foo)
torch.testing.assert_allclose(foo(inp), foo_c(inp))
if __name__ == "__main__":
from torch._inductor.test_case import run_tests

View File

@ -2526,6 +2526,7 @@ class TestPatternMatcher(TestPatternMatcherBase):
om(*example_inputs)
om(*example_inputs)
@torch._dynamo.config.patch("inline_inbuilt_nn_modules", True)
def test_reproduce_121253_issue(self):
class Mod(torch.nn.Module):
def __init__(self, weight, bias, beta, alpha):
@ -2550,8 +2551,8 @@ class TestPatternMatcher(TestPatternMatcherBase):
else "mkldnn._linear_pointwise"
)
for beta, alpha in zip([1.0, 0.1, 0.0], [1.0, 0.1, 1.0]):
weight = torch.randn(64, 64, dtype=dtype)
bias = torch.randn(64, dtype=dtype)
weight = torch.nn.Parameter(torch.randn(64, 64, dtype=dtype))
bias = torch.nn.Parameter(torch.randn(64, dtype=dtype))
mod = Mod(weight, bias, beta, alpha).to(dtype).eval()
with torch.no_grad():
x = torch.randn(1, 64, dtype=dtype)

View File

@ -1459,6 +1459,26 @@ class CommonTemplate:
actual = _run_and_assert_no_indirect_indexing(self, copy_opt, x)
self.assertEqual(expect, actual)
@dynamo_config.patch({"capture_dynamic_output_shape_ops": True})
@config.patch(implicit_fallbacks=True)
def test_index_propagation_nested_indirect_indexing(self):
def nested(x, repeats):
rank = torch.arange(repeats.numel(), device=x.device)
index = rank.repeat_interleave(repeats, dim=0)
return torch.index_select(x, index=index, dim=0)
example_inputs = (
torch.randn((32, 64), device=self.device),
repeats := torch.tensor([5, 10, 15], device=self.device),
)
torch._dynamo.mark_dynamic(repeats, 0) # create backed symint
nested_opt = torch._dynamo.optimize("inductor")(nested)
expect = nested(*example_inputs)
actual = nested_opt(*example_inputs)
self.assertEqual(expect, actual)
def test_index_propagation_flip(self):
def flip(x):
i = torch.arange(x.size(0) - 1, -1, -1, device=x.device)
@ -10843,6 +10863,7 @@ if HAS_GPU and not TEST_WITH_ASAN:
self.assertEqual(fn_opt(*inps), fn(*inps))
@config.patch({"fx_graph_remote_cache": False})
def test_optimize_indexing_dtype_with_constraint(self):
def fn1(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
x = torch.arange(0, b.shape[0], device=GPU_TYPE)

View File

@ -406,44 +406,132 @@ inductor_override_kwargs = {
}
# Always test with all sample for following ops
inductor_all_samples = {
"arange",
"diagonal",
"diagonal_copy",
"diagonal_scatter",
"softmax.with_dtype",
"index_add",
"index_copy",
"index_reduce.prod",
"index_reduce.mean",
"index_reduce.amax",
"index_reduce.amin",
"scatter_reduce.sum",
"select_scatter",
"squeeze",
"unfold",
"unsqueeze",
"sum",
"amax",
"amin",
"all",
"T",
"H",
"isinf",
"isposinf",
"isneginf",
"nan_to_num",
"mT",
"mH",
"rsub",
"triu",
"cummax",
"cummin",
"nextafter",
"gather",
"_chunk_cat",
"constant_pad_nd",
# Test with one sample only for following ops
inductor_one_sample = {
"_segment_reduce.lengths": {f16},
"_segment_reduce.offsets": {f16},
"addmv": {f16},
"argsort": {b8, f16, f32, f64, i32, i64},
"as_strided.partial_views": {f16},
"clamp_max": {b8},
"clamp_min": {b8},
"corrcoef": {f16},
"diff": {f16},
"einsum": {f16, i32},
"gradient": {f16},
"histogram": {f32, f64},
"histogramdd": {f32, f64},
"index_put": {f16, f32, f64},
"linalg.eig": {f32, f64},
"linspace": {f16, i32, i64},
"linspace.tensor_overload": {f16, f32, f64, i32, i64},
"logspace": {f16},
"logspace.tensor_overload": {f16, f32, f64, i32, i64},
"masked_logsumexp": {i64},
"max.binary": {b8},
"max_pool2d_with_indices_backward": {f16, f32, f64},
"maximum": {b8},
"min.binary": {b8},
"minimum": {b8},
"ne": {b8},
"new_empty_strided": {f16},
"nn.functional.adaptive_avg_pool3d": {f16},
"nn.functional.adaptive_max_pool1d": {f16, f32},
"nn.functional.adaptive_max_pool2d": {f16, f32},
"nn.functional.bilinear": {f16},
"nn.functional.conv_transpose1d": {f16},
"nn.functional.conv_transpose2d": {f16},
"nn.functional.conv_transpose3d": {f16},
"nn.functional.cosine_similarity": {f16},
"nn.functional.cross_entropy": {f16, f32, f64},
"nn.functional.gaussian_nll_loss": {f16},
"nn.functional.grid_sample": {f32, f64},
"nn.functional.interpolate.area": {f16},
"nn.functional.max_pool2d": {f16, f32, f64, i32, i64},
"nn.functional.nll_loss": {f16, f32, f64},
"normal": {f16, f32, f64},
"put": {f16, f32, f64},
"rot90": {b8, f16, f32, f64, i32, i64},
"scatter": {b8, i64},
"take": {b8, f16, f32, f64, i32, i64},
("__rdiv__", "cuda"): {f16},
("__rmod__", "cuda"): {f16, i64},
("__rmul__", "cuda"): {f16},
("__rpow__", "cuda"): {f16},
("addcdiv", "cuda"): {f16},
("addcmul", "cuda"): {f16},
("atan2", "cuda"): {f16},
("cumsum", "cuda"): {f16},
("cumulative_trapezoid", "cuda"): {f16},
("dist", "cuda"): {f16},
("div.no_rounding_mode", "cuda"): {f16},
("fmod", "cuda"): {f16},
("grid_sampler_2d", "cuda"): {f16},
("index_fill", "cuda"): {f16, f32, f64},
("ldexp", "cuda"): {f16},
("lerp", "cuda"): {f16},
("linalg.householder_product", "cuda"): {f32},
("linalg.matrix_norm", "cuda"): {f16},
("linalg.vector_norm", "cuda"): {f16},
("logspace", "cuda"): {i32, i64},
("masked.cumsum", "cuda"): {f16},
("masked.logsumexp", "cuda"): {f16},
("masked.mean", "cuda"): {b8},
("masked.normalize", "cuda"): {f16},
("masked.prod", "cuda"): {f16},
("masked.std", "cuda"): {f16},
("masked.var", "cuda"): {f16},
("mul", "cuda"): {f16},
("nn.functional.alpha_dropout", "cuda"): {f16, f32, f64},
("nn.functional.avg_pool1d", "cuda"): {f16, f32, f64},
("nn.functional.avg_pool2d", "cuda"): {f16, f32, f64},
("nn.functional.avg_pool3d", "cuda"): {f16, f32, f64},
("nn.functional.binary_cross_entropy", "cuda"): {f16},
("nn.functional.binary_cross_entropy_with_logits", "cuda"): {f16},
("nn.functional.conv2d", "cuda"): {f16},
("nn.functional.cosine_embedding_loss", "cuda"): {f16},
("nn.functional.dropout2d", "cuda"): {f16, f32, f64},
("nn.functional.dropout3d", "cuda"): {f16, f32, f64},
("nn.functional.dropout", "cuda"): {f16, f32, f64},
("nn.functional.feature_alpha_dropout.with_train", "cuda"): {f16, f32, f64},
("nn.functional.fractional_max_pool2d", "cuda"): {f16, f32, f64},
("nn.functional.fractional_max_pool3d", "cuda"): {f16, f32, f64},
("nn.functional.grid_sample", "cuda"): {f16},
("nn.functional.group_norm", "cuda"): {f16},
("nn.functional.hinge_embedding_loss", "cuda"): {f16},
("nn.functional.interpolate.bicubic", "cuda"): {f16},
("nn.functional.interpolate.bilinear", "cuda"): {f16},
("nn.functional.interpolate.trilinear", "cuda"): {f16},
("nn.functional.kl_div", "cuda"): {f16},
("nn.functional.margin_ranking_loss", "cuda"): {f16},
("nn.functional.max_pool1d", "cuda"): {f16, f32, f64},
("nn.functional.max_pool3d", "cuda"): {f16},
("nn.functional.mse_loss", "cuda"): {f16},
("nn.functional.multi_margin_loss", "cuda"): {f16},
("nn.functional.multilabel_margin_loss", "cuda"): {f16},
("nn.functional.multilabel_soft_margin_loss", "cuda"): {f16},
("nn.functional.normalize", "cuda"): {f16},
("nn.functional.pad.replicate", "cuda"): {f16, f32, f64},
("nn.functional.pad.reflect", "cuda"): {f16},
("nn.functional.pairwise_distance", "cuda"): {f16},
("nn.functional.poisson_nll_loss", "cuda"): {f16},
("nn.functional.rms_norm", "cuda"): {f16},
("norm", "cuda"): {f16},
("pow", "cuda"): {f16},
("prod", "cuda"): {f16},
("scatter_reduce.amax", "cuda"): {f16, f32, f64},
("scatter_reduce.amin", "cuda"): {f16, f32, f64},
("scatter_reduce.mean", "cuda"): {f16, f32, f64},
("special.xlog1py", "cuda"): {f16},
("std", "cuda"): {f16},
("std_mean", "cuda"): {f16},
("svd_lowrank", "cuda"): {f32, f64},
("trapezoid", "cuda"): {f16},
("trapz", "cuda"): {f16},
("true_divide", "cuda"): {f16},
("var", "cuda"): {f16},
("var_mean", "cuda"): {f16},
("xlogy", "cuda"): {f16},
}
@ -489,10 +577,14 @@ class TestInductorOpInfo(TestCase):
)
@collection_decorator
def test_comprehensive(self, device, dtype, op):
device_type = torch.device(device).type
assert device_type in (GPU_TYPE, "cpu")
torch._dynamo.reset()
with torch.no_grad():
# TODO: should we move empty_cache to the common device interface
if device == "cuda":
if device_type == "cuda":
torch.cuda.empty_cache()
op_name = op.name
if op.variant_test_name:
@ -509,10 +601,6 @@ class TestInductorOpInfo(TestCase):
if dtype not in allowed_dtypes:
raise unittest.SkipTest("Skipped!")
device_type = torch.device(device).type
assert device_type in (GPU_TYPE, "cpu")
# with open("test_output.txt", "a") as f:
# print(f"CONSIDERING OP {op_name} on {device_type} with {dtype} |
# {inductor_skips[device_type].get(op_name, set())}", flush=True, file=f)
@ -557,7 +645,10 @@ class TestInductorOpInfo(TestCase):
)
samples = op.sample_inputs(device, dtype, requires_grad=requires_grad)
if op_name not in inductor_all_samples and not ALL_SAMPLES:
if (
dtype in inductor_one_sample.get(op_name, {})
or dtype in inductor_one_sample.get((op_name, device_type), {})
) and not ALL_SAMPLES:
if isinstance(samples, (list, tuple)):
samples = [samples[0]]
else:

View File

@ -1,16 +1,16 @@
# Owner(s): ["module: inductor"]
import functools
import unittest
import torch
from torch._dynamo import config as dynamo_config
from torch._inductor import config as inductor_config
from torch._inductor.test_case import TestCase as InductorTestCase
from torch._inductor.utils import is_big_gpu
from torch.testing import make_tensor
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import IS_LINUX
from torch.testing._internal.common_utils import IS_LINUX, parametrize
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CUDA, skipCUDAIf
@ -214,6 +214,44 @@ class TestUnbackedSymints(InductorTestCase):
torch.testing.assert_close(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@dynamo_config.patch({"capture_scalar_outputs": True})
@parametrize(
"torch_fn", [torch.mm, torch.bmm, torch.addmm], name_fn=lambda fn: fn.__name__
)
@parametrize("coordinate_descent_tuning", [True, False], name_fn=str)
def test_mm_and_friends(self, device, torch_fn, coordinate_descent_tuning):
if torch_fn == torch.addmm:
torch_fn = functools.partial(torch_fn, torch.ones(1, device=device))
def fn(x, w, repeats, is_bmm):
u0 = repeats.item()
torch._check_is_size(u0)
x_unbacked = x.expand(u0, 32)
w_unbacked = w.expand(32, u0)
if is_bmm:
# Make sure inputs are batched.
x_unbacked = x_unbacked.expand(10, *x_unbacked.shape)
w_unbacked = w_unbacked.expand(10, *w_unbacked.shape)
return torch_fn(x_unbacked, w_unbacked)
example_inputs = (
torch.randn(1, 32, device=device),
torch.randn(32, 1, device=device),
torch.tensor(100, device=device),
torch_fn == torch.bmm,
)
with inductor_config.patch(
{
# coordinate_descent_tuning has its own path during decomp
"coordinate_descent_tuning": coordinate_descent_tuning,
}
):
actual = torch.compile(fn, fullgraph=True)(*example_inputs)
expected = fn(*example_inputs)
torch.testing.assert_close(actual, expected)
instantiate_device_type_tests(
TestUnbackedSymints, globals(), only_for=(GPU_TYPE, "cpu")

View File

@ -471,7 +471,12 @@ class TestDynamoWithONNXRuntime(onnx_test_common._TestONNXRuntime):
if test_local_backend:
assert local_ort is not None
number_of_captured_graphs = 2 if test_backward else 1
if torch._dynamo.config.inline_inbuilt_nn_modules:
# with inlining and dynamic=True, we have more graph captures
number_of_captured_graphs = 3 if test_backward else 2
else:
number_of_captured_graphs = 2 if test_backward else 1
execution_count = len(example_args_collection) * number_of_captured_graphs
self._assert_counting_information(
local_ort,
@ -564,8 +569,14 @@ class TestDynamoWithONNXRuntime(onnx_test_common._TestONNXRuntime):
if test_local_backend:
assert local_ort is not None
number_of_captured_graphs = 2 if test_backward else 1
if torch._dynamo.config.inline_inbuilt_nn_modules:
# with inlining and dynamic=True, we have more graph captures
number_of_captured_graphs = 3 if test_backward else 2
else:
number_of_captured_graphs = 2 if test_backward else 1
execution_count = len(example_args_collection) * number_of_captured_graphs
self._assert_counting_information(
local_ort,
expected_execution_count=execution_count,
@ -649,7 +660,11 @@ class TestDynamoWithONNXRuntime(onnx_test_common._TestONNXRuntime):
if test_local_backend:
assert local_ort is not None
number_of_captured_graphs = 2 if test_backward else 1
if torch._dynamo.config.inline_inbuilt_nn_modules:
# with inlining and dynamic=True, we have more graph captures
number_of_captured_graphs = 3 if test_backward else 2
else:
number_of_captured_graphs = 2 if test_backward else 1
execution_count = len(example_args_collection) * number_of_captured_graphs
self._assert_counting_information(
local_ort,

View File

@ -33,6 +33,11 @@ class SampleModelTwoInputs(torch.nn.Module):
return (y, z)
class SampleModelForDynamicShapes(torch.nn.Module):
def forward(self, x, b):
return x.relu(), b.sigmoid()
class _LargeModel(torch.nn.Module):
def __init__(self):
super().__init__()
@ -230,8 +235,15 @@ class TestLargeProtobufONNXProgramSerializerAPI(common_utils.TestCase):
class TestONNXExportWithDynamo(common_utils.TestCase):
def test_args_normalization_with_no_kwargs(self):
exported_program = torch.export.export(
SampleModelTwoInputs(),
(
torch.randn(1, 1, 2),
torch.randn(1, 1, 2),
),
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(), torch.randn(1, 1, 2), torch.randn(1, 1, 2)
exported_program, torch.randn(1, 1, 2), torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
@ -243,9 +255,25 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
onnx_program_from_old_exporter.model_proto,
)
def test_args_normalization_with_kwargs(self):
def test_args_is_tensor_not_tuple(self):
exported_program = torch.export.export(SampleModel(), (torch.randn(1, 1, 2),))
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(), torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
exported_program, torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModel(), torch.randn(1, 1, 2), dynamo=True
)
self.assertEqual(
onnx_program_from_new_exporter.model_proto,
onnx_program_from_old_exporter.model_proto,
)
def test_args_normalization_with_kwargs(self):
exported_program = torch.export.export(
SampleModelTwoInputs(), (torch.randn(1, 1, 2),), {"b": torch.randn(1, 1, 2)}
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
exported_program, torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
@ -258,8 +286,11 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
)
def test_args_normalization_with_empty_dict_at_the_tail(self):
exported_program = torch.export.export(
SampleModelTwoInputs(), (torch.randn(1, 1, 2),), {"b": torch.randn(1, 1, 2)}
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(), torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
exported_program, torch.randn(1, 1, 2), b=torch.randn(1, 1, 2)
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
@ -271,17 +302,111 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
onnx_program_from_old_exporter.model_proto,
)
def test_dynamic_axes_enable_dynamic_shape(self):
def test_dynamic_axes_enable_dynamic_shapes_with_fully_specified_axes(self):
exported_program = torch.export.export(
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
dynamic_shapes={
"x": {
0: torch.export.Dim("customx_dim_0"),
1: torch.export.Dim("customx_dim_1"),
2: torch.export.Dim("customx_dim_2"),
},
"b": {
0: torch.export.Dim("customb_dim_0"),
1: torch.export.Dim("customb_dim_1"),
2: torch.export.Dim("customb_dim_2"),
},
},
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
SampleModelTwoInputs(),
torch.randn(1, 1, 2),
b=torch.randn(1, 1, 2),
export_options=ExportOptions(dynamic_shapes=True),
exported_program,
torch.randn(2, 2, 3),
b=torch.randn(2, 2, 3),
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelTwoInputs(),
(torch.randn(1, 1, 2), {"b": torch.randn(1, 1, 2)}, {}),
dynamic_axes={"b": [0, 1, 2]},
SampleModelForDynamicShapes(),
(torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}, {}),
dynamic_axes={
"x": {0: "customx_dim_0", 1: "customx_dim_1", 2: "customx_dim_2"},
"b": {0: "customb_dim_0", 1: "customb_dim_1", 2: "customb_dim_2"},
},
dynamo=True,
)
self.assertEqual(
onnx_program_from_new_exporter.model_proto,
onnx_program_from_old_exporter.model_proto,
)
def test_dynamic_axes_enable_dynamic_shapes_with_default_axe_names(self):
exported_program = torch.export.export(
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
dynamic_shapes={
"x": {
0: torch.export.Dim("customx_dim_0"),
1: torch.export.Dim("customx_dim_1"),
2: torch.export.Dim("customx_dim_2"),
},
"b": {
0: torch.export.Dim("customb_dim_0"),
1: torch.export.Dim("customb_dim_1"),
2: torch.export.Dim("customb_dim_2"),
},
},
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
exported_program,
torch.randn(2, 2, 3),
b=torch.randn(2, 2, 3),
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelForDynamicShapes(),
(torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}, {}),
dynamic_axes={
"x": [0, 1, 2],
"b": [0, 1, 2],
},
dynamo=True,
)
self.assertEqual(
onnx_program_from_new_exporter.model_proto,
onnx_program_from_old_exporter.model_proto,
)
def test_dynamic_axes_supports_partial_dynamic_shapes(self):
exported_program = torch.export.export(
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
dynamic_shapes={
"x": None,
"b": {
0: torch.export.Dim("customb_dim_0"),
1: torch.export.Dim("customb_dim_1"),
2: torch.export.Dim("customb_dim_2"),
},
},
)
onnx_program_from_new_exporter = torch.onnx.dynamo_export(
exported_program,
torch.randn(2, 2, 3),
b=torch.randn(2, 2, 3),
)
onnx_program_from_old_exporter = torch.onnx.export(
SampleModelForDynamicShapes(),
(torch.randn(2, 2, 3), {"b": torch.randn(2, 2, 3)}, {}),
dynamic_axes={
"b": [0, 1, 2],
},
dynamo=True,
)
self.assertEqual(
@ -303,16 +428,37 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
dynamo=True,
)
def test_raises_unsupported_specific_dynamic_axes_warning(self):
message = (
"Specified dynamic axes is not supported for dynamo export at the moment."
)
with self.assertWarnsOnceRegex(UserWarning, message):
def test_input_names_are_not_yet_supported_in_dynamic_axes(self):
with self.assertRaisesRegex(
ValueError,
"Assinging new input names is not supported yet. Please use model forward signature "
"to specify input names in dynamix_axes.",
):
_ = torch.onnx.export(
SampleModel(),
(torch.randn(1, 1, 2),),
dynamic_axes={"input": [0, 1, 2]},
SampleModelForDynamicShapes(),
(
torch.randn(2, 2, 3),
torch.randn(2, 2, 3),
),
input_names=["input"],
dynamic_axes={"input": [0, 1]},
dynamo=True,
)
def test_dynamic_shapes_hit_constraints_in_dynamo(self):
# SampleModelTwoInputs has constraints because it adds its two inputs,
# so the two input shapes are related.
with self.assertRaisesRegex(
torch._dynamo.exc.UserError,
"Constraints violated",
):
_ = torch.onnx.export(
SampleModelTwoInputs(),
(torch.randn(2, 2, 3), torch.randn(2, 2, 3)),
dynamic_axes={
"x": {0: "x_dim_0", 1: "x_dim_1", 2: "x_dim_2"},
"b": {0: "b_dim_0", 1: "b_dim_1", 2: "b_dim_2"},
},
dynamo=True,
)
@ -323,6 +469,17 @@ class TestONNXExportWithDynamo(common_utils.TestCase):
)
self.assertTrue(os.path.exists(path))
def test_raises_error_when_input_is_script_module(self):
class ScriptModule(torch.jit.ScriptModule):
def forward(self, x):
return x
with self.assertRaisesRegex(
TypeError,
"Dynamo export does not support ScriptModule or ScriptFunction.",
):
_ = torch.onnx.export(ScriptModule(), torch.randn(1, 1, 2), dynamo=True)
if __name__ == "__main__":
common_utils.run_tests()
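The equivalence tests above compare two routes to the same ONNX graph: exporting an ExportedProgram produced by torch.export.export (with dynamic_shapes) through torch.onnx.dynamo_export, versus calling torch.onnx.export with dynamic_axes and dynamo=True. A minimal sketch of the two paths, using a placeholder TwoInputModel rather than the test fixtures:

import torch


class TwoInputModel(torch.nn.Module):  # placeholder model, not a test fixture
    def forward(self, x, b):
        return x.relu(), b.sigmoid()


x, b = torch.randn(2, 2, 3), torch.randn(2, 2, 3)

# Path 1: torch.export with dynamic_shapes, then the dynamo-based ONNX exporter.
exported = torch.export.export(
    TwoInputModel(),
    (x, b),
    dynamic_shapes={
        "x": {0: torch.export.Dim("x_dim_0")},
        "b": {0: torch.export.Dim("b_dim_0")},
    },
)
onnx_from_exported_program = torch.onnx.dynamo_export(exported, x, b=b)

# Path 2: the classic torch.onnx.export signature with dynamic_axes, routed
# through the same exporter via dynamo=True.
onnx_from_export = torch.onnx.export(
    TwoInputModel(),
    (x, {"b": b}, {}),
    dynamic_axes={"x": [0], "b": [0]},
    dynamo=True,
)
# The tests compare the resulting model_proto graphs for equality.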

View File

@ -527,8 +527,7 @@ EXPECTED_SKIPS_OR_FAILS_WITH_DTYPES: Tuple[onnx_test_common.DecorateMeta, ...] =
),
xfail(
"gather",
reason="HandleNegativeAxis(int64_t, int64_t) IsAxisInRange(axis, tensor_rank) was \
false. axis 0 is not in valid range [-0,-1]"
reason="GatherElements op: Rank of input 'data' needs to be equal to rank of input 'indices'"
),
xfail(
"geometric",
@ -1517,7 +1516,6 @@ SKIP_XFAIL_SUBTESTS_WITH_MATCHER_AND_MODEL_TYPE: tuple[
"nn.functional.batch_norm",
matcher=lambda sample: sample.kwargs.get("training") is True
and any(arg is not None for arg in sample.args[2:4]),
model_type=pytorch_test_common.TorchModelType.TORCH_EXPORT_EXPORTEDPROGRAM,
reason="Flaky failure: https://github.com/pytorch/pytorch/issues/115106",
),
xfail(
@ -1998,7 +1996,7 @@ class TestOnnxModelOutputConsistency(onnx_test_common._TestONNXRuntime):
"nn.functional.hardsigmoid": [1e-3, 5e-3],
"nn.functional.hardswish": [1e-3, 5e-3],
"nn.functional.hinge_embedding_loss": [4e-1, 3e-3],
"nn.functional.huber_loss": [1e-3, 1e-2],
"nn.functional.huber_loss": [1e-2, 1e-1],
"nn.functional.instance_norm": [1e-2, 1e-3],
"nn.functional.interpolate": [1e-2, 1e-3],
"nn.functional.kl_div": [2e-3, 2e-4],

View File

@ -171,13 +171,9 @@ class TestFxToOnnx(pytorch_test_common.ExportTestCase):
torch.argmax(input, dim=1, keepdim=True),
)
# NOTE: KeyError: dim raised in optimizer
with self.assertWarnsOnceRegex(
UserWarning, "ONNXScript optimizer failed. Skipping optimization."
):
_ = dynamo_export(
ArgminArgmaxModel(), model_input, export_options=self.export_options
)
_ = dynamo_export(
ArgminArgmaxModel(), model_input, export_options=self.export_options
)
def test_multiple_outputs_op_with_evaluator(self):
class TopKModel(torch.nn.Module):

Some files were not shown because too many files have changed in this diff.