Update
[ghstack-poisoned]
@@ -1 +1 @@
e03a63be43e33596f7f0a43b0f530353785e4a59
22bc29b4d503fc895ff73bc720ff396e9723465f
@@ -111,7 +111,10 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

numba==0.60.0 ; python_version <= "3.12" and platform_machine != "s390x"
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@@ -130,7 +133,8 @@ numba==0.60.0 ; python_version <= "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
numpy==1.26.2; python_version < "3.13"
numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"

pandas==2.0.3; python_version < "3.13"
@@ -259,11 +263,6 @@ scipy==1.14.1 ; python_version >= "3.12"
#Pinned versions:
#test that import:

tb-nightly==2.13.0a20230426
#Description: TensorBoard
#Pinned versions:
#test that import:

# needed by torchgen utils
typing-extensions>=4.10.0
#Description: type hints for python
@@ -55,6 +55,9 @@ python -m pip install pulp==2.9.0
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0

# Install intel-openmp
python -m pip install intel-openmp==2025.1.1

run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
@@ -1,12 +1,22 @@
set ADDITIONAL_OPTIONS=""
set PYTHON_EXEC="python"

if "%DESIRED_PYTHON%" == "3.13t" (
echo Python version is set to 3.13t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.13t"
) else if "%DESIRED_PYTHON%"=="3.14" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
) else if "%DESIRED_PYTHON%"=="3.14t" (
echo Python version is set to 3.14 or 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.14t"
) else (
echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON%
echo Python version is set to %DESIRED_PYTHON%
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =%
)
@@ -7,6 +7,8 @@ call "internal\install_python.bat"

%PYTHON_EXEC% --version
set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake

1 .flake8
@@ -48,6 +48,7 @@ per-file-ignores =
torch/__init__.py: F401,TOR901
torch/_custom_op/impl.py: TOR901
torch/_export/serde/upgrade.py: TOR901
torch/_functorch/predispatch.py: TOR901
torch/_functorch/vmap.py: TOR901
torch/_inductor/test_operators.py: TOR901
torch/_library/abstract_impl.py: TOR901

2 .github/ci_commit_pins/audio.txt vendored
@@ -1 +1 @@
dfa5a3a85849f59af5438c7c2811235d52d93a95
a645da617ed8836727cf9c28944d87154700d360

2 .github/ci_commit_pins/vllm.txt vendored
@@ -1 +1 @@
c9b38be8aafb02b69ccb704b33d2bb4329fbb0e6
bbea1cefdd1a29b53355b1655f5d2ae343921f85

4 .github/dependabot.yml vendored
@@ -8,6 +8,9 @@ updates:
target-branch: "main"
allow:
- dependency-name: "transformers"
ignore:
- dependency-name: "*"
update-types: ["version-update:semver-patch"]
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
@@ -18,3 +21,4 @@ updates:
- "topic: not user facing"
- "module: ci"
- "module: inductor"
- "ciflow/inductor"

27 .github/scripts/generate_binary_build_matrix.py vendored
@@ -314,8 +314,8 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os not in ["linux", "linux-aarch64", "macos-arm64"] and (
# TODO: Enable python 3.14 for rest
if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and (
python_version == "3.14" or python_version == "3.14t"
):
continue
@@ -356,29 +356,6 @@ def generate_wheels_matrix(
), # include special case for aarch64 build, remove the -aarch64 postfix
}
)
# Special build building to use on Colab. Python 3.11 for 12.6 CUDA
if python_version == "3.11" and arch_version == CUDA_STABLE:
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"container_image": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[0],
"container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[1],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{

65 .github/workflows/generated-linux-binary-manywheel-nightly.yml generated vendored
@@ -1425,71 +1425,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_8-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_8-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_8-full-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

2370 .github/workflows/generated-windows-binary-wheel-nightly.yml generated vendored
File diff suppressed because it is too large
@@ -58,9 +58,14 @@ on:
required: false
type: string
default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100
pull_request:
# Changing these files guarantees that this workflow needs to be run
paths:
- .github/workflows/inductor-perf-test-nightly-h100.yml
- .ci/docker/ci_commit_pins/huggingface-requirements.txt

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions:
@@ -160,10 +165,9 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720

4 .github/workflows/test-h100.yml vendored
@@ -4,6 +4,10 @@ on:
pull_request:
paths:
- .github/workflows/test-h100.yml
- test/inductor/test_max_autotune.py
- torch/_inductor/kernel/mm.py
- torch/_inductor/kernel/mm_grouped.py

workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
@@ -1454,7 +1454,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.2', # sync with RUFF
'ruff==0.12.9', # sync with RUFF
]
is_formatter = true

@@ -1589,7 +1589,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.12.2', # sync with PYFMT
'ruff==0.12.9', # sync with PYFMT
]
is_formatter = true
@@ -279,6 +279,7 @@ header_template_rule(
"@AT_BLAS_F2C@": "0",
"@AT_BLAS_USE_CBLAS_DOT@": "1",
"@AT_KLEIDIAI_ENABLED@": "0",
"@AT_USE_EIGEN_SPARSE@": "0",
},
)
@@ -289,6 +289,7 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build."
option(USE_PROF "Use profiling" OFF)
option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF)
option(USE_SYSTEM_EIGEN_INSTALL
"Use system Eigen instead of the one under third_party" OFF)
cmake_dependent_option(
@@ -242,7 +242,6 @@ git submodule update --init --recursive
**Common**

```bash
conda install cmake ninja
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
pip install -r requirements.txt
```
@@ -96,6 +96,8 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
file(GLOB vulkan_cpp "vulkan/*.cpp")
file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp")

file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp")

# Metal
file(GLOB metal_h "metal/*.h")
file(GLOB metal_cpp "metal/*.cpp")
@@ -341,6 +343,9 @@ if(USE_VULKAN)
else()
set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp})
endif()
if(USE_EIGEN_SPARSE)
set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp})
endif()

if(USE_MTIA)
set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h})
@@ -20,3 +20,4 @@
#define AT_BLAS_F2C() @AT_BLAS_F2C@
#define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
#define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@
#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@
@@ -698,6 +698,14 @@ bool Context::hasLAPACK() {
#endif
}

bool Context::hasEigenSparse() {
#if AT_USE_EIGEN_SPARSE()
return true;
#else
return false;
#endif
}

at::QEngine Context::qEngine() const {
static auto _quantized_engine = []() {
at::QEngine qengine = at::kNoQEngine;
@@ -133,6 +133,7 @@ class TORCH_API Context {
static bool hasLAPACK();
static bool hasMKLDNN();
static bool ckSupported();
static bool hasEigenSparse();
static bool hasMAGMA() {
return detail::getCUDAHooks().hasMAGMA();
}
@@ -615,6 +616,10 @@ inline bool hasLAPACK() {
return globalContext().hasLAPACK();
}

inline bool hasEigenSparse() {
return globalContext().hasEigenSparse();
}

inline bool hasMAGMA() {
return globalContext().hasMAGMA();
}
@@ -97,8 +97,8 @@ class TORCH_API KernelFunction final {
KernelFunction();
~KernelFunction();

KernelFunction(const KernelFunction&) = default;
KernelFunction& operator=(const KernelFunction&) = default;
KernelFunction(const KernelFunction& other);
KernelFunction& operator=(const KernelFunction& other);

KernelFunction(KernelFunction&&) noexcept = default;

@@ -276,10 +276,6 @@ class TORCH_API KernelFunction final {
// Register a token to be invalidated when this KernelFunction is destroyed
void registerToken(std::weak_ptr<KernelToken> token) const;

// List of tokens that need to be invalidated when this KernelFunction is
// destroyed
mutable std::vector<std::weak_ptr<KernelToken>> tokens_;

private:
explicit KernelFunction(
std::unique_ptr<OperatorKernel> functor,
@@ -294,6 +290,9 @@ class TORCH_API KernelFunction final {
BoxedKernel boxed_kernel_func_;
void* unboxed_kernel_func_;
void* sym_unboxed_kernel_func_;
// List of tokens that need to be invalidated when this KernelFunction is
// destroyed (lazy allocation to save memory when empty)
mutable std::unique_ptr<std::vector<std::weak_ptr<KernelToken>>> tokens_;
};

// Token held by SafeKernelFunction that gets invalidated when KernelFunction is
@@ -25,13 +25,35 @@ inline KernelFunction::KernelFunction()
sym_unboxed_kernel_func_(nullptr) {}

inline KernelFunction::~KernelFunction() {
for (auto& weak_token : tokens_) {
if (auto token = weak_token.lock()) {
token->invalidate();
if (tokens_) {
for (auto& weak_token : *tokens_) {
if (auto token = weak_token.lock()) {
token->invalidate();
}
}
}
}

inline KernelFunction::KernelFunction(const KernelFunction& other)
: boxed_kernel_func_(other.boxed_kernel_func_),
unboxed_kernel_func_(other.unboxed_kernel_func_),
sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) {
// tokens_ is intentionally not copied as we only care about invalidating
// tokens if the original KernelFunction is destroyed
}

inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) {
if (this != &other) {
boxed_kernel_func_ = other.boxed_kernel_func_;
unboxed_kernel_func_ = other.unboxed_kernel_func_;
sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_;

// tokens_ is intentionally not copied as we only care about invalidating
// tokens if the original KernelFunction is destroyed
}
return *this;
}

inline KernelFunction::KernelFunction(
std::unique_ptr<OperatorKernel> functor,
InternalBoxedKernelFunction* boxed_kernel_func,
@@ -167,7 +189,10 @@ C10_ALWAYS_INLINE Return KernelFunction::call(

inline void KernelFunction::registerToken(
std::weak_ptr<KernelToken> token) const {
tokens_.push_back(std::move(token));
if (!tokens_) {
tokens_ = std::make_unique<std::vector<std::weak_ptr<KernelToken>>>();
}
tokens_->push_back(std::move(token));
}

inline KernelFunction KernelFunction::makeFromBoxedKernel(
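The hunks above replace KernelFunction's eagerly held std::vector of weak tokens with a lazily allocated std::unique_ptr to that vector, so kernels that never hand out tokens pay only a null pointer. A minimal, self-contained sketch of the same lazy-allocation pattern follows; Token and Registry are illustrative names, not PyTorch types.

```cpp
// Minimal sketch of lazily allocating a container that most instances never use.
#include <memory>
#include <vector>

struct Token {
  bool valid = true;
  void invalidate() { valid = false; }
};

class Registry {
 public:
  ~Registry() {
    if (tokens_) {  // the vector exists only if someone registered a token
      for (auto& weak : *tokens_) {
        if (auto t = weak.lock()) {
          t->invalidate();
        }
      }
    }
  }

  void registerToken(std::weak_ptr<Token> token) const {
    if (!tokens_) {  // allocate on first use; empty registries stay one pointer wide
      tokens_ = std::make_unique<std::vector<std::weak_ptr<Token>>>();
    }
    tokens_->push_back(std::move(token));
  }

 private:
  // nullptr until the first registration
  mutable std::unique_ptr<std::vector<std::weak_ptr<Token>>> tokens_;
};
```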
@@ -161,6 +161,11 @@ struct CUDACachingHostAllocatorImpl
return true;
}

bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}

EventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
static auto* event_pool = new EventPool();
@@ -19,10 +19,6 @@
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/irange.h>

#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#endif

#if AT_CUDNN_ENABLED()
#include <ATen/cudnn/cudnn-wrapper.h>
#endif
@@ -93,29 +89,6 @@ void CUDAHooks::init() const {
// have a chance to enable vitals.
at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true);

// Sets the CUDA_MODULE_LOADING environment variable
// if it's not set by the user.
// CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+.
// Check the driver version and only set the env variable if needed.
bool set_lazy_module_loading = true;
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
auto driver_api = c10::cuda::DriverAPI::get();
// Initialize NVML
if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) {
// Get the driver version
int version = -1;
auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version);
if (res == NVML_SUCCESS) {
// Check if driver is sufficiently new
if (version >= 12020) {
set_lazy_module_loading = false;
}
}
}
#endif
if (set_lazy_module_loading) {
c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false);
}
const auto num_devices = c10::cuda::device_count_ensure_non_zero();
c10::cuda::CUDACachingAllocator::init(num_devices);
at::cuda::detail::init_p2p_access_cache(num_devices);
@@ -22,7 +22,7 @@ namespace {
// Check if tensor list has either a boolean tensor or a integer tensor
inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
return std::any_of(
tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
tensors.begin(), tensors.end(), [includeBool](const auto& t) {
return at::isIntegralType(t.scalar_type(), includeBool);
});
}
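The change above captures includeBool by value instead of by reference in the std::any_of predicate; copying a bool is free and removes any dependence on the enclosing scope's lifetime. A tiny illustrative sketch of the same choice, with hypothetical names:

```cpp
// Illustrative only: capturing a small flag by value in a predicate lambda.
#include <algorithm>
#include <vector>

bool any_even(const std::vector<int>& xs, const bool include_zero) {
  // By value: the lambda owns its own copy of include_zero, so it stays valid
  // even if the lambda is stored and outlives this scope.
  return std::any_of(xs.begin(), xs.end(), [include_zero](int x) {
    return (x == 0) ? include_zero : (x % 2 == 0);
  });
}
```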
@ -1360,6 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
|
||||
static inline int64_t get_mkldnn_matmul_min_dim() {
|
||||
static auto value = [&] {
|
||||
const int64_t default_min_dim = [&] {
|
||||
@ -1393,6 +1394,7 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
|
||||
const int64_t min_size = get_mkldnn_matmul_min_size();
|
||||
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static void addmm_impl_cpu_(
|
||||
@ -1771,6 +1773,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
|
||||
};
|
||||
|
||||
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
|
||||
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
|
||||
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
|
||||
try {
|
||||
@ -1781,6 +1784,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
at::globalContext().setUserEnabledMkldnn(false);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (contraction_size * res_rows * res_cols < 400) {
|
||||
if (is_bmm_out) {
|
||||
|
||||
@@ -7,6 +7,7 @@
#include <algorithm>
#include <iterator>
#include <numeric>
#include <vector>

#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
@@ -647,10 +648,10 @@ _vec_softmax(
parallel_for(
0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) {
int64_t idx = begin;
auto temp_vec_input = std::make_unique<float[]>(dim_size * vectorized_step);
auto temp_vec_output = std::make_unique<float[]>(dim_size * vectorized_step);
float* temp_vec_input_data = temp_vec_input.get();
float* temp_vec_output_data = temp_vec_output.get();
std::vector<float> temp_vec_input(dim_size * vectorized_step);
std::vector<float> temp_vec_output(dim_size * vectorized_step);
float* temp_vec_input_data = temp_vec_input.data();
float* temp_vec_output_data = temp_vec_output.data();
while (idx < end) {
int64_t outer_idx = idx / inner_size;
int64_t inner_idx = idx % inner_size;
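This hunk, like later ones in the upsampling and sparse bmm files, swaps a scratch buffer from std::make_unique<T[]>(n) to std::vector<T>(n). A minimal sketch of the two spellings side by side, using made-up function names; both heap-allocate and zero-initialize, and the vector exposes .data() for the raw pointer the kernel loop needs:

```cpp
// Sketch of the scratch-buffer swap made in several hunks of this commit.
#include <cstdint>
#include <memory>
#include <vector>

void fill_old(int64_t n) {
  auto buf = std::make_unique<float[]>(n);  // old style: unique_ptr to a raw array
  float* p = buf.get();
  for (int64_t i = 0; i < n; ++i) p[i] = 1.f;
}

void fill_new(int64_t n) {
  std::vector<float> buf(n);                // new style: vector with size bookkeeping
  float* p = buf.data();
  for (int64_t i = 0; i < n; ++i) p[i] = 1.f;
}
```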
@ -1347,7 +1347,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
// We are doing row-wise scaling
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
|
||||
&& (dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)) {
|
||||
&& ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
|
||||
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|
||||
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
|
||||
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
|
||||
at::cuda::detail::f8f8bf16_rowwise(
|
||||
mat1,
|
||||
|
||||
@ -436,7 +436,6 @@ static inline void launch_vectorized_templated_kernel(
|
||||
loader_t l,
|
||||
storer_t s) {
|
||||
TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
|
||||
using traits = function_traits<func_t>;
|
||||
int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) /
|
||||
vectorized_templated_config::block_work_size();
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
@ -14,7 +14,7 @@ namespace {
|
||||
// pow(double, int)
|
||||
// pow(float, float)
|
||||
// pow(double, double)
|
||||
#ifdef _MSC_VER
|
||||
#if defined(_MSC_VER) || defined(_LIBCPP_VERSION)
|
||||
// Functions for pow
|
||||
// pow for at::Half
|
||||
static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) {
|
||||
|
||||
@@ -225,8 +225,9 @@ void launch_stable_sort_kernel(
return;
}

int64_t numel_or_intmax =
std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
const int64_t intmax = static_cast<int64_t>(std::numeric_limits<int>::max());
// On ROCm, std::min -> ::min did not work as expected on when input values >= 2147483648
int64_t numel_or_intmax = numel < intmax ? numel : intmax;
int64_t nsort = self.size(dim);
int64_t nbatch = (numel_or_intmax / nsort) * nsort;
TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort);
@@ -238,7 +239,8 @@ void launch_stable_sort_kernel(
scalar_t* values_ptr = values.mutable_data_ptr<scalar_t>();
int64_t remaining = numel;
while (remaining > 0) {
int64_t n = std::min(remaining, nbatch);
// On ROCm, std::min -> ::min did not work as expected on when input values >= 2147483648
int64_t n = remaining < nbatch ? remaining : nbatch;
int64_t nsegments = n / nsort;

if (nsegments == 1 ||
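Both hunks above spell the minimum out with a ternary because, per the in-diff comment, std::min resolved to ::min and misbehaved on ROCm once values reached 2^31. A sketch of the same guard wrapped in a helper; the helper name is hypothetical and not part of the change:

```cpp
// Sketch only: explicit 64-bit minimum that does not rely on overload resolution.
#include <cstdint>

inline int64_t min_i64(int64_t a, int64_t b) {
  return a < b ? a : b;  // same result as std::min<int64_t>, spelled out explicitly
}

// Usage mirroring the hunks above:
//   int64_t numel_or_intmax = min_i64(numel, intmax);
//   int64_t n = min_i64(remaining, nbatch);
```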
@ -285,7 +285,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
|
||||
sizeof(algos) / sizeof(algos[0]) == num_algos,
|
||||
"Missing cuDNN convolution forward algorithms");
|
||||
int perf_count;
|
||||
auto perf_results = std::make_unique<perf_t[]>(num_algos);
|
||||
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_FWD_ALGO_COUNT> perf_results;
|
||||
if (!benchmark) {
|
||||
AT_CUDNN_CHECK_WITH_SHAPES(
|
||||
cudnnGetConvolutionForwardAlgorithm_v7(
|
||||
@ -296,7 +296,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
|
||||
args.odesc.desc(),
|
||||
num_algos,
|
||||
&perf_count,
|
||||
perf_results.get()),
|
||||
perf_results.data()),
|
||||
args);
|
||||
} else {
|
||||
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
|
||||
@ -314,7 +314,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
|
||||
args.output.data_ptr(),
|
||||
num_algos,
|
||||
&perf_count,
|
||||
perf_results.get(),
|
||||
perf_results.data(),
|
||||
ws.data,
|
||||
ws.size),
|
||||
args);
|
||||
@ -324,7 +324,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
|
||||
// memory, e.g. a few GBs.
|
||||
c10::cuda::CUDACachingAllocator::emptyCache();
|
||||
}
|
||||
return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
|
||||
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
|
||||
}
|
||||
|
||||
static void getWorkspaceSize(
|
||||
@ -369,7 +369,8 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
|
||||
sizeof(algos) / sizeof(algos[0]) == num_algos,
|
||||
"Missing cuDNN convolution backward data algorithms.");
|
||||
int perf_count;
|
||||
auto perf_results = std::make_unique<perf_t[]>(num_algos);
|
||||
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT>
|
||||
perf_results;
|
||||
if (!benchmark) {
|
||||
AT_CUDNN_CHECK_WITH_SHAPES(
|
||||
cudnnGetConvolutionBackwardDataAlgorithm_v7(
|
||||
@ -380,7 +381,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
|
||||
args.idesc.desc(),
|
||||
num_algos,
|
||||
&perf_count,
|
||||
perf_results.get()),
|
||||
perf_results.data()),
|
||||
args);
|
||||
} else {
|
||||
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
|
||||
@ -398,7 +399,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
|
||||
args.input.data_ptr(),
|
||||
num_algos,
|
||||
&perf_count,
|
||||
perf_results.get(),
|
||||
perf_results.data(),
|
||||
ws.data,
|
||||
ws.size),
|
||||
args);
|
||||
@ -408,7 +409,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
|
||||
// memory, e.g. a few GBs.
|
||||
c10::cuda::CUDACachingAllocator::emptyCache();
|
||||
}
|
||||
return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
|
||||
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
|
||||
}
|
||||
|
||||
static void getWorkspaceSize(
|
||||
@ -456,7 +457,8 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
|
||||
static_assert(
|
||||
sizeof(algos) / sizeof(algos[0]) == num_algos,
|
||||
"Missing cuDNN convolution backward filter algorithms.");
|
||||
auto perf_results = std::make_unique<perf_t[]>(num_algos);
|
||||
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT>
|
||||
perf_results;
|
||||
int perf_count;
|
||||
if (!benchmark) {
|
||||
AT_CUDNN_CHECK_WITH_SHAPES(
|
||||
@ -468,7 +470,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
|
||||
args.wdesc.desc(),
|
||||
num_algos,
|
||||
&perf_count,
|
||||
perf_results.get()),
|
||||
perf_results.data()),
|
||||
args);
|
||||
} else {
|
||||
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
|
||||
@ -486,7 +488,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
|
||||
args.weight.data_ptr(),
|
||||
num_algos,
|
||||
&perf_count,
|
||||
perf_results.get(),
|
||||
perf_results.data(),
|
||||
ws.data,
|
||||
ws.size),
|
||||
args);
|
||||
@ -496,7 +498,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
|
||||
// memory, e.g. a few GBs.
|
||||
c10::cuda::CUDACachingAllocator::emptyCache();
|
||||
}
|
||||
return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
|
||||
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
|
||||
}
|
||||
|
||||
static void getWorkspaceSize(
|
||||
|
||||
@@ -512,7 +512,28 @@ TORCH_IMPL_FUNC(index_add_mps_out)
return;
}

TORCH_CHECK(source.scalar_type() != ScalarType::Long, "index_add(): Expected non int64 dtype for source.");
bool use_deterministic_algorithm = globalContext().deterministicAlgorithms();

// TODO: Do not use deterministic algorithm for long/complex but rather implement it as Metal shader
use_deterministic_algorithm |= source.scalar_type() == ScalarType::Long;
use_deterministic_algorithm |= c10::isComplexType(source.scalar_type());

if (use_deterministic_algorithm) {
if (!result.is_same(self)) {
result.copy_(self);
}
torch::List<std::optional<Tensor>> indices;
indices.reserve(dim + 1);
for (const auto i : c10::irange(dim)) {
indices.emplace_back();
}
indices.emplace_back(index.to(at::kLong));
const Tensor result_ = (result.dim() == 0) ? result.view(1) : result;
const Tensor source_ = (source.dim() == 0) ? source.view(1) : source;
result_.index_put_(indices, source_.mul(alpha), true);
return;
}

auto casted_type = isFloatingType(source.scalar_type()) ? ScalarType::Float : ScalarType::Int;

struct CachedGraph : public MPSCachedGraph {
@@ -921,6 +942,8 @@ Tensor& index_fill_mps_(Tensor& self, int64_t dim, const Tensor& index, const Te
TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int,
"index_fill_(): Expected dtype int32 or int64 for index");
TORCH_CHECK(dim == 0 || dim < self.dim(), "index_fill_(): Indexing dim ", dim, " is out of bounds of tensor");
// MPS.scatter crashes if used with complex dtypes
TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "index_fill_(): Complex types are yet not supported");

// Empty index
if (num_indices == 0) {
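The new deterministic branch above reduces index_add on MPS to an accumulating index_put_ along dim. A rough ATen-level sketch of that equivalence for the simplest case (dim == 0), modeled on the lines in the hunk; it illustrates the fallback's semantics, not the MPS kernel itself:

```cpp
// Sketch: index_add along dim 0 expressed as an accumulating index_put_.
#include <ATen/ATen.h>

at::Tensor index_add_dim0_sketch(const at::Tensor& self,
                                 const at::Tensor& index,
                                 const at::Tensor& source,
                                 const at::Scalar& alpha) {
  auto result = self.clone();
  // For dim == 0 the advanced-indexing list holds just the gather index.
  c10::List<std::optional<at::Tensor>> indices;
  indices.push_back(index.to(at::kLong));
  // accumulate=true makes repeated indices add instead of overwrite,
  // which is exactly index_add's semantics (and is deterministic here).
  result.index_put_(indices, source.mul(alpha), /*accumulate=*/true);
  return result;
}
```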
@@ -17,6 +17,7 @@
#include <c10/util/irange.h>

#include <cstring>
#include <vector>

namespace at::native {
@@ -53,8 +54,8 @@ static void upsample_nearest2d_out_frame(
return;
}

auto input_offset_arr = std::make_unique<int64_t[]>(output_width);
int64_t* input_offset = input_offset_arr.get();
std::vector<int64_t> input_offset_arr(output_width);
int64_t* input_offset = input_offset_arr.data();

for (const auto w2 : c10::irange(output_width)) {
const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width);
@ -23,6 +23,9 @@
|
||||
#include <ATen/Parallel.h>
|
||||
#endif
|
||||
|
||||
#if AT_USE_EIGEN_SPARSE()
|
||||
#include <ATen/native/sparse/eigen/SparseBlasImpl.h>
|
||||
#endif
|
||||
|
||||
namespace at::native::sparse::impl {
|
||||
|
||||
@ -442,13 +445,15 @@ void add_out_sparse_csr(
|
||||
const Tensor& mat2,
|
||||
const Scalar& alpha,
|
||||
const Tensor& result) {
|
||||
#if !AT_MKL_ENABLED()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ",
|
||||
"Please use PyTorch built MKL support.");
|
||||
#else
|
||||
#if AT_USE_MKL_SPARSE()
|
||||
sparse::impl::mkl::add_out_sparse_csr(mat1, mat2, alpha, result);
|
||||
#elif AT_USE_EIGEN_SPARSE()
|
||||
sparse::impl::eigen::add_out_sparse(mat1, mat2, alpha, result);
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ",
|
||||
"Please use PyTorch built MKL support.");
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -459,7 +464,7 @@ void triangular_solve_out_sparse_csr(
|
||||
bool upper,
|
||||
bool transpose,
|
||||
bool unitriangular) {
|
||||
#if !AT_MKL_ENABLED()
|
||||
#if !AT_USE_MKL_SPARSE()
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Calling triangular_solve on a sparse CPU tensor requires compiling PyTorch with MKL. ",
|
||||
|
||||
@ -127,6 +127,10 @@
|
||||
#include <ATen/ops/zeros_like.h>
|
||||
#endif
|
||||
|
||||
#if AT_USE_EIGEN_SPARSE()
|
||||
#include <ATen/native/sparse/eigen/SparseBlasImpl.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace at {
|
||||
@ -536,7 +540,12 @@ static void addmm_out_sparse_csr_native_cpu(
|
||||
auto values = sparse.values();
|
||||
|
||||
scalar_t cast_alpha = alpha.to<scalar_t>();
|
||||
r.mul_(beta);
|
||||
// If beta is zero NaN and Inf should not be propagated to the result
|
||||
if (beta.toComplexDouble() == 0.) {
|
||||
r.zero_();
|
||||
} else {
|
||||
r.mul_(beta);
|
||||
}
|
||||
AT_DISPATCH_INDEX_TYPES(
|
||||
col_indices.scalar_type(), "csr_mm_crow_indices", [&]() {
|
||||
auto csr_accessor = csr.accessor<index_t, 1>();
|
||||
@ -648,6 +657,15 @@ Tensor& addmm_out_sparse_compressed_cpu(
|
||||
return result;
|
||||
}
|
||||
|
||||
#if AT_USE_EIGEN_SPARSE()
|
||||
if ((result.layout() == kSparseCsr || result.layout() == kSparseCsc) &&
|
||||
(mat1.layout() == kSparseCsr || mat1.layout() == kSparseCsc) &&
|
||||
(mat2.layout() == kSparseCsr || mat2.layout() == kSparseCsc)) {
|
||||
sparse::impl::eigen::addmm_out_sparse(mat1, mat2, result, alpha, beta);
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !AT_USE_MKL_SPARSE()
|
||||
// The custom impl addmm_out_sparse_csr_native_cpu only supports CSR @
|
||||
// strided -> strided
|
||||
|
||||
@@ -800,7 +800,7 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
Tensor indices_dim1 = indices[1].to(ScalarType::Int);
Tensor indices_dim2 = indices[2].to(ScalarType::Int);

auto mat_el_end_indices_host = std::make_unique<int64_t[]>(num_matrices);
std::vector<int64_t> mat_el_end_indices_host(num_matrices);

{
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
@@ -809,14 +809,14 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor

search_end_matrix_indices(mat_el_end_indices_device, num_matrices, indices_dim0);
AT_CUDA_CHECK(cudaMemcpy(
mat_el_end_indices_host.get(),
mat_el_end_indices_host.data(),
mat_el_end_indices_device,
num_matrices*sizeof(int64_t),
cudaMemcpyDeviceToHost
));
}
// Need a pointer to an array to access within a lambda
int64_t* mat_el_end_indices = &mat_el_end_indices_host[0];
int64_t* mat_el_end_indices = mat_el_end_indices_host.data();

Scalar beta = 0;
Scalar alpha = 1;
329
aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp
Normal file
329
aten/src/ATen/native/sparse/eigen/SparseBlasImpl.cpp
Normal file
@ -0,0 +1,329 @@
|
||||
#include <ATen/native/sparse/eigen/SparseBlasImpl.h>
|
||||
|
||||
#if AT_USE_EIGEN_SPARSE()
|
||||
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/SparseCsrTensorUtils.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#else
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#endif
|
||||
|
||||
#include <c10/core/ScalarType.h>
|
||||
|
||||
#include <Eigen/SparseCore>
|
||||
|
||||
namespace at::native::sparse::impl::eigen {
|
||||
|
||||
namespace {
|
||||
|
||||
void inline sparse_indices_to_result_dtype_inplace(
|
||||
const c10::ScalarType& dtype,
|
||||
const at::Tensor& input) {
|
||||
auto [compressed_indices, plain_indices] =
|
||||
at::sparse_csr::getCompressedPlainIndices(input);
|
||||
static_cast<at::SparseCsrTensorImpl*>(input.unsafeGetTensorImpl())
|
||||
->set_member_tensors(
|
||||
compressed_indices.to(dtype),
|
||||
plain_indices.to(dtype),
|
||||
input.values(),
|
||||
input.sizes());
|
||||
}
|
||||
|
||||
void inline sparse_indices_and_values_resize(
|
||||
const at::Tensor& input,
|
||||
int64_t nnz) {
|
||||
auto [compressed_indices, plain_indices] =
|
||||
at::sparse_csr::getCompressedPlainIndices(input);
|
||||
static_cast<SparseCsrTensorImpl*>(input.unsafeGetTensorImpl())
|
||||
->set_member_tensors(
|
||||
compressed_indices,
|
||||
plain_indices.resize_({nnz}),
|
||||
input.values().resize_({nnz}),
|
||||
input.sizes());
|
||||
}
|
||||
|
||||
template <typename scalar_t, int eigen_options, typename index_t>
|
||||
const Eigen::Map<Eigen::SparseMatrix<scalar_t, eigen_options, index_t>>
|
||||
Tensor_to_Eigen(const at::Tensor& tensor) {
|
||||
int64_t rows = tensor.size(0);
|
||||
int64_t cols = tensor.size(1);
|
||||
int64_t nnz = tensor._nnz();
|
||||
TORCH_CHECK(tensor.values().is_contiguous(), "eigen accepts only contiguous tensor values");
|
||||
auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor);
|
||||
index_t* c_indices_ptr = compressed_indices.data_ptr<index_t>();
|
||||
index_t* p_indices_ptr = plain_indices.data_ptr<index_t>();
|
||||
scalar_t* values_ptr = tensor.values().data_ptr<scalar_t>();
|
||||
Eigen::Map<Eigen::SparseMatrix<scalar_t, eigen_options, index_t>> map(
|
||||
rows, cols, nnz, c_indices_ptr, p_indices_ptr, values_ptr);
|
||||
return map;
|
||||
}
|
||||
|
||||
template <typename scalar_t, int eigen_options, typename index_t>
|
||||
void Eigen_to_Tensor(
|
||||
const at::Tensor& tensor,
|
||||
const Eigen::SparseMatrix<scalar_t, eigen_options, index_t>& matrix) {
|
||||
const Layout eigen_layout = (eigen_options == Eigen::RowMajor ? kSparseCsr : kSparseCsc);
|
||||
TORCH_CHECK(
|
||||
tensor.layout() == eigen_layout,
|
||||
"Eigen_to_Tensor, expected tensor be ", eigen_layout, ", but got ",
|
||||
tensor.layout());
|
||||
int64_t nnz = matrix.nonZeros();
|
||||
int64_t csize = matrix.outerSize();
|
||||
sparse_indices_and_values_resize(tensor, nnz);
|
||||
auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor);
|
||||
if (nnz > 0) {
|
||||
std::memcpy(
|
||||
tensor.values().mutable_data_ptr<scalar_t>(),
|
||||
matrix.valuePtr(),
|
||||
nnz * sizeof(scalar_t));
|
||||
std::memcpy(
|
||||
plain_indices.mutable_data_ptr<index_t>(),
|
||||
matrix.innerIndexPtr(),
|
||||
nnz * sizeof(index_t));
|
||||
}
|
||||
if (csize > 0) {
|
||||
std::memcpy(
|
||||
compressed_indices.mutable_data_ptr<index_t>(),
|
||||
matrix.outerIndexPtr(),
|
||||
csize * sizeof(index_t));
|
||||
}
|
||||
compressed_indices.mutable_data_ptr<index_t>()[csize] = nnz;
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void add_out_sparse_eigen(
|
||||
const at::Tensor& mat1,
|
||||
const at::Tensor& mat2,
|
||||
const at::Scalar& alpha,
|
||||
const at::Tensor& result) {
|
||||
// empty matrices
|
||||
if (mat1._nnz() == 0 && mat2._nnz() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (mat2._nnz() == 0 || alpha.toComplexDouble() == 0.) {
|
||||
sparse_indices_and_values_resize(result, mat1._nnz());
|
||||
result.copy_(mat1);
|
||||
return;
|
||||
} else if (mat1._nnz() == 0) {
|
||||
sparse_indices_and_values_resize(result, mat2._nnz());
|
||||
result.copy_(mat2);
|
||||
result.values().mul_(alpha);
|
||||
return;
|
||||
}
|
||||
|
||||
c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result);
|
||||
|
||||
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1);
|
||||
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2);
|
||||
|
||||
AT_DISPATCH_INDEX_TYPES(
|
||||
result_index_dtype, "eigen_sparse_add", [&]() {
|
||||
scalar_t _alpha = alpha.to<scalar_t>();
|
||||
|
||||
if (result.layout() == kSparseCsr) {
|
||||
auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat1);
|
||||
auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
|
||||
auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(result, mat1_mat2_eigen);
|
||||
} else {
|
||||
auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat1);
|
||||
auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
|
||||
auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(result, mat1_mat2_eigen);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
void addmm_out_sparse_eigen(
|
||||
const at::Tensor& mat1,
|
||||
const at::Tensor& mat2,
|
||||
const at::Tensor& result,
|
||||
const at::Scalar& alpha,
|
||||
const at::Scalar& beta) {
|
||||
// empty matrices
|
||||
if (mat1._nnz() == 0 || mat2._nnz() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// If beta is zero NaN and Inf should not be propagated to the result
|
||||
// In addition, beta = 0 lets us enable a fast-path for result = alpha * A @ B
|
||||
bool is_beta_zero = false;
|
||||
if (beta.toComplexDouble() == 0.) {
|
||||
is_beta_zero = true;
|
||||
result.values().zero_();
|
||||
} else {
|
||||
result.values().mul_(beta);
|
||||
}
|
||||
|
||||
c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result);
|
||||
|
||||
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1);
|
||||
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2);
|
||||
|
||||
AT_DISPATCH_INDEX_TYPES(
|
||||
result_index_dtype, "eigen_sparse_mm", [&]() {
|
||||
typedef Eigen::SparseMatrix<scalar_t, Eigen::RowMajor, index_t> EigenCsrMatrix;
|
||||
typedef Eigen::SparseMatrix<scalar_t, Eigen::ColMajor, index_t> EigenCscMatrix;
|
||||
|
||||
at::Tensor mat1_mat2;
|
||||
if (is_beta_zero) {
|
||||
mat1_mat2 = result;
|
||||
} else {
|
||||
mat1_mat2 = at::empty_like(result, result.options());
|
||||
}
|
||||
|
||||
if (mat1_mat2.layout() == kSparseCsr) {
|
||||
if (mat1.layout() == kSparseCsr) {
|
||||
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat1);
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
// Out_csr = M1_csr * M2_csr
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
|
||||
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
} else {
|
||||
// Out_csr = M1_csr * M2_csc
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
|
||||
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
}
|
||||
} else {
|
||||
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat1);
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
// Out_csr = M1_csc * M2_csr
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
|
||||
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
} else {
|
||||
// Out_csr = M1_csc * M2_csc
|
||||
// This multiplication will be computationally inefficient, as it will require
|
||||
// additional conversion of the output matrix from CSC to CSR format.
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
|
||||
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (mat1.layout() == kSparseCsr) {
|
||||
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat1);
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
// Out_csc = M1_csr * M2_csr
|
||||
// This multiplication will be computationally inefficient, as it will require
|
||||
// additional conversion of the output matrix from CSR to CSC format.
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
|
||||
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
} else {
|
||||
// Out_csc = M1_csr * M2_csc
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
|
||||
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
}
|
||||
} else {
|
||||
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat1);
|
||||
if (mat2.layout() == kSparseCsr) {
|
||||
// Out_csc = M1_csc * M2_csr
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
|
||||
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
} else {
|
||||
// Out_csc = M1_csc * M2_csc
|
||||
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
|
||||
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
|
||||
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (is_beta_zero) {
|
||||
result.mul_(alpha.to<scalar_t>());
|
||||
} else {
|
||||
result.add_(mat1_mat2, alpha.to<scalar_t>());
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
void addmm_out_sparse(
|
||||
const at::Tensor& mat1,
|
||||
const at::Tensor& mat2,
|
||||
const at::Tensor& result,
|
||||
const at::Scalar& alpha,
|
||||
const at::Scalar& beta) {
|
||||
AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat1.layout(), "eigen::addmm_out_sparse:mat1", [&]{});
|
||||
AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat2.layout(), "eigen::addmm_out_sparse:mat2", [&]{});
|
||||
AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(result.layout(), "eigen::addmm_out_sparse:result", [&]{});
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
result.scalar_type(), "addmm_out_sparse_eigen", [&] {
|
||||
addmm_out_sparse_eigen<scalar_t>(mat1, mat2, result, alpha, beta);
|
||||
});
|
||||
}
|
||||
|
||||
void add_out_sparse(
|
||||
const at::Tensor& mat1,
|
||||
const at::Tensor& mat2,
|
||||
const at::Scalar& alpha,
|
||||
const at::Tensor& result) {
|
||||
TORCH_CHECK(
|
||||
(result.layout() == kSparseCsr && mat1.layout() == kSparseCsr && mat2.layout() == kSparseCsr) ||
|
||||
(result.layout() == kSparseCsc && mat1.layout() == kSparseCsc && mat2.layout() == kSparseCsc),
|
||||
"eigen::add_out_sparse: expected the same layout for all operands but got ",
|
||||
mat1.layout(),
|
||||
" + ",
|
||||
mat2.layout(),
|
||||
" -> ",
|
||||
result.layout());
|
||||
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
|
||||
result.scalar_type(), "add_out_sparse_eigen", [&] {
|
||||
add_out_sparse_eigen<scalar_t>(mat1, mat2, alpha, result);
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace at::native::sparse::impl::eigen
|
||||
|
||||
#else
|
||||
|
||||
namespace at::native::sparse::impl::eigen {
|
||||
|
||||
void addmm_out_sparse(
|
||||
const at::Tensor& mat1,
|
||||
const at::Tensor& mat2,
|
||||
const at::Tensor& result,
|
||||
const at::Scalar& alpha,
|
||||
const at::Scalar& beta) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"eigen::addmm_out_sparse: Eigen was not enabled for ",
|
||||
result.layout(),
|
||||
" + ",
|
||||
mat1.layout(),
|
||||
" @ ",
|
||||
mat2.layout());
|
||||
}
|
||||
|
||||
void add_out_sparse(
|
||||
const at::Tensor& mat1,
|
||||
const at::Tensor& mat2,
|
||||
const at::Scalar& alpha,
|
||||
const at::Tensor& result) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"eigen::add_out_sparse: Eigen was not enabled for ",
|
||||
mat1.layout(),
|
||||
" + ",
|
||||
mat2.layout(),
|
||||
" -> ",
|
||||
result.layout());
|
||||
}
|
||||
|
||||
} // namespace at::native::sparse::impl::eigen
|
||||
|
||||
#endif // AT_USE_EIGEN_SPARSE()

29 aten/src/ATen/native/sparse/eigen/SparseBlasImpl.h Normal file
@@ -0,0 +1,29 @@
#pragma once

#include <ATen/Config.h>

#if AT_USE_EIGEN_SPARSE()
#ifndef EIGEN_MPL2_ONLY
#define EIGEN_MPL2_ONLY
#endif

#include <ATen/Tensor.h>

namespace at::native::sparse::impl::eigen {

void addmm_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& result,
const at::Scalar& alpha,
const at::Scalar& beta);

void add_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Scalar& alpha,
const at::Tensor& result);

} // namespace at::native::sparse::impl::eigen

#endif
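The Eigen backend declared here (and implemented in SparseBlasImpl.cpp above) wraps PyTorch's CSR/CSC buffers in Eigen::Map<Eigen::SparseMatrix<...>> so Eigen can operate on them without copying, as Tensor_to_Eigen does in the new file. A standalone, Eigen-only sketch of that zero-copy mapping over hand-written CSR arrays, with no PyTorch involved:

```cpp
// Sketch: wrap existing CSR arrays in an Eigen::Map and multiply by a dense vector.
#include <Eigen/SparseCore>
#include <iostream>

int main() {
  // 2x3 CSR matrix: [[1, 0, 2], [0, 3, 0]]
  int outer[] = {0, 2, 3};       // compressed row pointers (rows + 1 entries)
  int inner[] = {0, 2, 1};       // column indices of the stored values
  double values[] = {1.0, 2.0, 3.0};

  Eigen::Map<Eigen::SparseMatrix<double, Eigen::RowMajor, int>> m(
      /*rows=*/2, /*cols=*/3, /*nnz=*/3, outer, inner, values);

  Eigen::VectorXd x(3);
  x << 1.0, 1.0, 1.0;
  std::cout << (m * x).transpose() << std::endl;  // prints "3 3"
  return 0;
}
```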
@ -58,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
DistillGPT2,pass,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -58,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0
|
||||
|
||||
|
||||
|
||||
DistillGPT2,pass,2
|
||||
DistillGPT2,pass,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
dla102,timeout,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -346,7 +346,7 @@ vgg16,pass,0
|
||||
|
||||
|
||||
|
||||
vision_maskrcnn,fail_accuracy,30
|
||||
vision_maskrcnn,fail_accuracy,29
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0
|
||||
|
||||
|
||||
|
||||
dla102,pass,0
|
||||
dla102,timeout,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -146,7 +146,7 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,fail_to_run,0
|
||||
hf_BigBird,pass,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,fail_to_run,19
|
||||
hf_Reformer,fail_to_run,21
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -34,7 +34,7 @@ basic_gnn_gin,pass,0
|
||||
|
||||
|
||||
|
||||
basic_gnn_sage,fail_to_run,0
|
||||
basic_gnn_sage,pass,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -146,7 +146,7 @@ hf_Bert_large,pass,0
|
||||
|
||||
|
||||
|
||||
hf_BigBird,fail_to_run,0
|
||||
hf_BigBird,fail_accuracy,0
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
|
||||
|
||||
|
||||
|
||||
hf_Reformer,fail_to_run,19
|
||||
hf_Reformer,fail_to_run,21
|
||||
|
||||
|
||||
|
||||
|
||||
|
@@ -634,6 +634,7 @@ libtorch_nativert_sources = [
"torch/nativert/graph/passes/SubgraphRewriter.cpp",
"torch/nativert/graph/passes/pass_manager/GraphPasses.cpp",
"torch/nativert/graph/passes/pass_manager/PassManager.cpp",
"torch/nativert/kernels/KernelHandlerRegistry.cpp",
]

torch_mobile_tracer_sources = [
@ -3,7 +3,6 @@
|
||||
#include <c10/core/AllocatorConfig.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAMacros.h>
|
||||
#include <c10/util/Deprecated.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/env.h>
|
||||
|
||||
@ -18,13 +17,9 @@ enum class Expandable_Segments_Handle_Type : int {
|
||||
// Environment config parser
|
||||
class C10_CUDA_API CUDAAllocatorConfig {
|
||||
public:
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
|
||||
static size_t max_split_size() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
|
||||
}
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
|
||||
static double garbage_collection_threshold() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
garbage_collection_threshold();
|
||||
@ -65,8 +60,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
return instance().m_pinned_num_register_threads;
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
|
||||
static bool pinned_use_background_threads() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
pinned_use_background_threads();
|
||||
@ -79,29 +72,25 @@ class C10_CUDA_API CUDAAllocatorConfig {
|
||||
return 128;
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
|
||||
// This is used to round-up allocation size to nearest power of 2 divisions.
|
||||
// More description below in function roundup_power2_next_division
|
||||
// As an example, if we want 4 divisions between 2's power, this can be done
|
||||
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
|
||||
static size_t roundup_power2_divisions(size_t size) {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
roundup_power2_divisions(size);
|
||||
}
|
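
To make the env-variable example in the comment above concrete, here is a minimal illustrative sketch of the rounding behaviour that `roundup_power2_divisions:4` describes. This helper is not part of the header, and the allocator's actual implementation may differ in details.

```cpp
#include <cstddef>

// Illustrative sketch only: round `size` up to the next of `divisions` equal
// steps between neighbouring powers of two, as PYTORCH_CUDA_ALLOC_CONF's
// roundup_power2_divisions knob is described in the comment above.
inline std::size_t roundup_to_power2_division(std::size_t size, std::size_t divisions) {
  if (size <= 1 || divisions == 0) {
    return size;
  }
  std::size_t low = 1;  // largest power of two <= size
  while (low * 2 <= size) {
    low *= 2;
  }
  if (low == size) {
    return size;  // already on a power-of-two boundary
  }
  const std::size_t step = low / divisions;
  if (step == 0) {
    return low * 2;  // divisions too fine for this size; fall back to next power of two
  }
  std::size_t rounded = low;
  while (rounded < size) {
    rounded += step;
  }
  return rounded;
}
// Example: roundup_to_power2_division(1363149 /* ~1.3 MiB */, 4) == 1572864 (1.5 MiB),
// instead of rounding all the way up to 2 MiB.
```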
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
|
||||
static std::vector<size_t> roundup_power2_divisions() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
roundup_power2_divisions();
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.")
|
||||
static size_t max_non_split_rounding_size() {
|
||||
return c10::CachingAllocator::AcceleratorAllocatorConfig::
|
||||
max_non_split_rounding_size();
|
||||
}
|
||||
|
||||
C10_DEPRECATED_MESSAGE(
|
||||
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
|
||||
static std::string last_allocator_settings() {
|
||||
return c10::CachingAllocator::getAllocatorSettings();
|
||||
}
|
||||
|
||||
@ -1218,7 +1218,7 @@ class DeviceCachingAllocator {
|
||||
DeviceCachingAllocator()
|
||||
: large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
|
||||
stats.max_split_size =
|
||||
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
|
||||
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
|
||||
context_recorder_.store(nullptr);
|
||||
}
|
||||
|
||||
@ -1343,8 +1343,7 @@ class DeviceCachingAllocator {
|
||||
// Do garbage collection if the flag is set.
|
||||
if (C10_UNLIKELY(
|
||||
set_fraction &&
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() >
|
||||
0.0)) {
|
||||
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
garbage_collect_cached_blocks(context);
|
||||
}
|
||||
// Attempt allocate
|
||||
@ -1596,7 +1595,7 @@ class DeviceCachingAllocator {
|
||||
stats.active_bytes[stat_type].increase(block->size);
|
||||
stats.requested_bytes[stat_type].increase(block->requested_size);
|
||||
});
|
||||
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_allocations.increase(1);
|
||||
|
||||
auto allocated_bytes_gauge =
|
||||
@ -1647,7 +1646,7 @@ class DeviceCachingAllocator {
|
||||
block->pool->owner_MempoolId(),
|
||||
context ? context : block->context_when_allocated);
|
||||
|
||||
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_allocations.decrease(1);
|
||||
|
||||
if (!block->stream_uses.empty()) {
|
||||
@ -2196,8 +2195,7 @@ class DeviceCachingAllocator {
|
||||
if (size < kMinBlockSize) {
|
||||
return kMinBlockSize;
|
||||
} else {
|
||||
auto divisions =
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
|
||||
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
|
||||
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
|
||||
return roundup_power2_next_division(size, divisions);
|
||||
} else {
|
||||
@ -2676,7 +2674,7 @@ class DeviceCachingAllocator {
|
||||
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
|
||||
return remaining >= kMinBlockSize;
|
||||
} else {
|
||||
return (size < AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
return (size < CUDAAllocatorConfig::max_split_size()) &&
|
||||
(remaining > kSmallSize);
|
||||
}
|
||||
}
|
||||
@ -2696,7 +2694,7 @@ class DeviceCachingAllocator {
|
||||
|
||||
if (C10_UNLIKELY(
|
||||
set_fraction &&
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
|
||||
// Track block reuse interval only when garbage collection is enabled.
|
||||
++pool.get_free_blocks_call_count;
|
||||
}
|
||||
@ -2738,13 +2736,13 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
|
||||
// Do not return an oversized block for a large request
|
||||
if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
|
||||
if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >= CUDAAllocatorConfig::max_split_size()))
|
||||
return false;
|
||||
// Allow oversized block size to be rounded up but within a limit
|
||||
if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >=
|
||||
p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
|
||||
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
|
||||
return false;
|
||||
p.block = *it;
|
||||
pool.blocks.erase(it);
|
||||
@ -2767,7 +2765,7 @@ class DeviceCachingAllocator {
|
||||
// therefore should be of less overheads.
|
||||
|
||||
size_t gc_threshold = static_cast<size_t>(
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold() *
|
||||
CUDAAllocatorConfig::garbage_collection_threshold() *
|
||||
static_cast<double>(allowed_memory_maximum));
|
||||
// No need to trigger GC yet
|
||||
if (total_allocated_memory <= gc_threshold) {
|
||||
@ -2915,7 +2913,7 @@ class DeviceCachingAllocator {
|
||||
stats.segment[stat_type].increase(1);
|
||||
stats.reserved_bytes[stat_type].increase(size);
|
||||
});
|
||||
if (size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_segments.increase(1);
|
||||
auto reserved_bytes_gauge =
|
||||
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
|
||||
@ -2944,7 +2942,7 @@ class DeviceCachingAllocator {
|
||||
bool release_available_cached_blocks(
|
||||
const AllocParams& p,
|
||||
const std::shared_ptr<GatheredContext>& context) {
|
||||
if (AcceleratorAllocatorConfig::max_split_size() ==
|
||||
if (CUDAAllocatorConfig::max_split_size() ==
|
||||
std::numeric_limits<size_t>::max())
|
||||
return false;
|
||||
BlockPool& pool = *p.pool;
|
||||
@ -2952,8 +2950,8 @@ class DeviceCachingAllocator {
|
||||
// because of std::unique_ptr, block cannot be trivially copied
|
||||
// Use constructor for search key.
|
||||
Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
|
||||
key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
|
||||
? AcceleratorAllocatorConfig::max_split_size()
|
||||
key.size = (key.size < CUDAAllocatorConfig::max_split_size())
|
||||
? CUDAAllocatorConfig::max_split_size()
|
||||
: key.size;
|
||||
auto it = pool.blocks.lower_bound(&key);
|
||||
if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
|
||||
@ -2966,7 +2964,7 @@ class DeviceCachingAllocator {
|
||||
--it; // Back up one item. Now on the largest block for the correct
|
||||
// stream
|
||||
while ((totalReleased < key.size) &&
|
||||
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
|
||||
((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
|
||||
((*it)->stream == p.stream())) {
|
||||
auto cur = it;
|
||||
bool is_first = cur == pool.blocks.begin();
|
||||
@ -3091,7 +3089,7 @@ class DeviceCachingAllocator {
|
||||
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
|
||||
.current);
|
||||
|
||||
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
|
||||
if (block->size >= CUDAAllocatorConfig::max_split_size())
|
||||
stats.oversize_segments.decrease(1);
|
||||
pool->blocks.erase(block);
|
||||
delete block;
|
||||
@ -3718,8 +3716,8 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
|
||||
auto& md = result.config_metadata;
|
||||
md.garbage_collection_threshold =
|
||||
AcceleratorAllocatorConfig::garbage_collection_threshold();
|
||||
md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
|
||||
CUDAAllocatorConfig::garbage_collection_threshold();
|
||||
md.max_split_size = CUDAAllocatorConfig::max_split_size();
|
||||
md.pinned_num_register_threads =
|
||||
CUDAAllocatorConfig::pinned_num_register_threads();
|
||||
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
|
||||
@ -3727,10 +3725,9 @@ class NativeCachingAllocator : public CUDAAllocator {
|
||||
CUDAAllocatorConfig::release_lock_on_cudamalloc();
|
||||
md.pinned_use_host_register =
|
||||
CUDAAllocatorConfig::pinned_use_cuda_host_register();
|
||||
md.last_allocator_settings =
|
||||
AcceleratorAllocatorConfig::last_allocator_settings();
|
||||
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
|
||||
md.roundup_power2_divisions =
|
||||
AcceleratorAllocatorConfig::roundup_power2_divisions();
|
||||
CUDAAllocatorConfig::roundup_power2_divisions();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -67,7 +67,11 @@
  _(nvmlDeviceGetComputeRunningProcesses) \
  _(nvmlSystemGetCudaDriverVersion_v2)

#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12040)
#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
#else
#define C10_NVML_DRIVER_API_OPTIONAL(_)
#endif

namespace c10::cuda {

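
For background (an illustrative sketch, not from this diff): X-macro lists such as `C10_NVML_DRIVER_API_OPTIONAL` above are typically consumed by expanding each listed symbol into a member that is resolved lazily at runtime and may legitimately stay null. All names below are invented.

```cpp
// Invented names throughout; this only demonstrates the X-macro expansion pattern.
#define MY_OPTIONAL_NVML_APIS(_) _(nvmlDeviceGetGpuFabricInfoV)

struct OptionalNvmlApi {
#define DECLARE_SLOT(name) void* name##_ = nullptr;  // resolved via dlsym when present
  MY_OPTIONAL_NVML_APIS(DECLARE_SLOT)
#undef DECLARE_SLOT
};
```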
@ -1,4 +1,3 @@
|
||||
#include <c10/core/AllocatorConfig.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/xpu/XPUCachingAllocator.h>
|
||||
@ -21,6 +20,8 @@ constexpr size_t kMinBlockSize = 512;
|
||||
constexpr size_t kSmallSize = 1048576;
|
||||
// "small" allocations are packed in 2 MiB blocks
|
||||
constexpr size_t kSmallBuffer = 2097152;
|
||||
// "large" allocations may be packed in 20 MiB blocks
|
||||
constexpr size_t kLargeBuffer = 20971520;
|
||||
// allocations between 1 and 10 MiB may use kLargeBuffer
|
||||
constexpr size_t kMinLargeAlloc = 10485760;
|
||||
// round up large allocations to 2 MiB
|
||||
|
||||
@ -153,6 +153,7 @@ set(AT_MKLDNN_ACL_ENABLED 0)
|
||||
set(AT_MKLDNN_ENABLED 0)
|
||||
set(AT_MKL_ENABLED 0)
|
||||
set(AT_KLEIDIAI_ENABLED 0)
|
||||
set(AT_USE_EIGEN_SPARSE 0)
|
||||
# setting default preferred BLAS options if not already present.
|
||||
if(NOT INTERN_BUILD_MOBILE)
|
||||
set(BLAS "MKL" CACHE STRING "Selected BLAS library")
|
||||
@ -262,6 +263,15 @@ if(BLAS_LIBRARIES AND BLAS_CHECK_F2C)
|
||||
include(cmake/BLAS_ABI.cmake)
|
||||
endif()
|
||||
|
||||
if(USE_EIGEN_SPARSE AND BLAS_INFO STREQUAL "mkl")
|
||||
message(WARNING "Disabling USE_EIGEN_SPARSE because MKL is enabled")
|
||||
set(USE_EIGEN_SPARSE OFF)
|
||||
endif()
|
||||
|
||||
if(USE_EIGEN_SPARSE)
|
||||
set(AT_USE_EIGEN_SPARSE 1)
|
||||
endif()
|
||||
|
||||
if(NOT INTERN_BUILD_MOBILE)
|
||||
set(AT_MKL_SEQUENTIAL 0)
|
||||
set(USE_BLAS 1)
|
||||
|
||||
@ -135,6 +135,7 @@ function(caffe2_print_configuration_summary)
|
||||
endif()
|
||||
message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}")
|
||||
message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
|
||||
message(STATUS " USE_EIGEN_FOR_SPARSE : ${USE_EIGEN_SPARSE}")
|
||||
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
|
||||
message(STATUS " USE_KINETO : ${USE_KINETO}")
|
||||
message(STATUS " USE_GFLAGS : ${USE_GFLAGS}")
|
||||
|
||||
@ -268,10 +268,6 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
|
||||
.. py:module:: torch.cuda.comm
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. py:module:: torch.cuda.error
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. py:module:: torch.cuda.gds
|
||||
```
|
||||
|
||||
@ -9,8 +9,9 @@ This note will eventually contain more details on how to use the APIs in torch/c
|
||||
| type in custom extension | StableIValue representation | type in libtorch | Schema Type |
|
||||
| -------- | ------- | ------- | ------- |
|
||||
| std::optional\<S> | if there is a value, raw bitwise copy into leading bytes of uint64_t of pointer to a new StableIValue representing S. if there is no value, nullptr. | std::optional\<T> | Type? |
|
||||
| RAIIATH | raw bitwise copy of underlying AtenTensorHandle into leading bytes of uint64_t | at::Tensor | Tensor |
|
||||
| int32_t | raw bitwise copy into leading bytes of uint64_t | at::ScalarType | ScalarType |
|
||||
| torch::stable::Tensor | raw bitwise copy of underlying AtenTensorHandle into leading bytes of uint64_t | at::Tensor | Tensor |
|
||||
| RAIIATH (outdated) | raw bitwise copy of underlying AtenTensorHandle into leading bytes of uint64_t | at::Tensor | Tensor |
|
||||
| torch::headeronly::ScalarType | raw bitwise copy of the translated underlying enum into leading bytes of uint64_t | torch::headeronly::ScalarType | ScalarType |
|
||||
| int32_t | raw bitwise copy into leading bytes of uint64_t | at::Layout | Layout |
|
||||
| int32_t | raw bitwise copy into leading bytes of uint64_t | at::MemoryFormat | MemoryFormat |
|
||||
| bool | raw bitwise copy into leading bytes of uint64_t | bool | bool |
|
||||
|
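
As a rough, hypothetical illustration of the convention in the table above (not part of this diff): the `from`/`to` helpers used by the libtorch_agnostic extension later in this PR pack each argument and return value into a uint64_t-sized `StableIValue` slot. The operator below is invented, and the exact include paths are an assumption.

```cpp
#include <torch/csrc/stable/library.h>  // assumed location of from/to and StableIValue
#include <torch/csrc/stable/tensor.h>

using torch::stable::Tensor;

// Invented example operator.
Tensor my_identity(Tensor t) {
  return t;
}

// Boxed wrapper: each argument/return occupies one StableIValue slot on the stack.
void boxed_my_identity(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
  Tensor t = to<Tensor>(stack[0]);   // unbox: bitwise copy of the handle out of the slot
  stack[0] = from(my_identity(t));   // box the result back into slot 0
}
```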
||||
@ -110,7 +110,6 @@ and supported quantized modules and functions.
|
||||
.. py:module:: torch.ao.quantization.backend_config.executorch
|
||||
.. py:module:: torch.ao.quantization.backend_config.fbgemm
|
||||
.. py:module:: torch.ao.quantization.backend_config.native
|
||||
.. py:module:: torch.ao.quantization.backend_config.observation_type
|
||||
.. py:module:: torch.ao.quantization.backend_config.onednn
|
||||
.. py:module:: torch.ao.quantization.backend_config.qnnpack
|
||||
.. py:module:: torch.ao.quantization.backend_config.tensorrt
|
||||
|
||||
setup.py
@ -1588,7 +1588,6 @@ def main() -> None:
|
||||
"networkx>=2.5.1",
|
||||
"jinja2",
|
||||
"fsspec>=0.8.5",
|
||||
'intel-openmp==2025.1.1 ;platform_system == "Windows" ', # for Windows inductor
|
||||
]
|
||||
if BUILD_PYTHON_ONLY:
|
||||
install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]
|
||||
|
||||
@ -39,6 +39,7 @@ set(NATIVERT_TEST_SRCS
|
||||
${TORCH_ROOT}/torch/nativert/graph/passes/SubgraphRewriter.cpp
|
||||
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
|
||||
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
|
||||
${TORCH_ROOT}/torch/nativert/kernels/KernelHandlerRegistry.cpp
|
||||
)
|
||||
|
||||
add_executable(test_nativert
|
||||
|
||||
test/cpp/nativert/static_kernel_test_utils.h (new file, 158 lines)
@ -0,0 +1,158 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <torch/nativert/executor/Executor.h>
|
||||
#include <torch/nativert/graph/Graph.h>
|
||||
#include <torch/torch.h>
|
||||
|
||||
#include <torch/nativert/kernels/KernelHandlerRegistry.h>
|
||||
|
||||
namespace torch::nativert {
|
||||
|
||||
/*
|
||||
* This is a lightweight version of ModelRunner that executes a model in
|
||||
* interpreter mode given a string graph with no weights/attributes
|
||||
*/
|
||||
class SimpleTestModelRunner {
|
||||
public:
|
||||
SimpleTestModelRunner(
|
||||
const std::string_view source,
|
||||
const ExecutorConfig& config) {
|
||||
register_kernel_handlers();
|
||||
graph_ = stringToGraph(source);
|
||||
weights_ = std::make_shared<Weights>(graph_.get());
|
||||
|
||||
executor_ = std::make_unique<Executor>(config, graph_, weights_);
|
||||
}
|
||||
|
||||
std::vector<c10::IValue> run(const std::vector<c10::IValue>& inputs) const {
|
||||
return executor_->execute(inputs);
|
||||
}
|
||||
|
||||
ProfileMetrics benchmarkIndividualNodes(
|
||||
const std::vector<c10::IValue>& inputs) const {
|
||||
return executor_->benchmarkIndividualNodes({inputs}, 10, 10);
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<Graph> graph_;
|
||||
std::unique_ptr<Executor> executor_;
|
||||
std::shared_ptr<Weights> weights_;
|
||||
};
|
||||
|
||||
inline void compareIValue(
|
||||
const c10::IValue& expected,
|
||||
const c10::IValue& actual,
|
||||
bool native = false) {
|
||||
if (expected.isTensor()) {
|
||||
EXPECT_TRUE(actual.isTensor());
|
||||
EXPECT_TRUE(torch::allclose(
|
||||
expected.toTensor(),
|
||||
actual.toTensor(),
|
||||
1e-5,
|
||||
1e-8,
|
||||
/*equal_nan*/ true));
|
||||
if (!native) {
|
||||
EXPECT_TRUE(expected.toTensor().strides() == actual.toTensor().strides());
|
||||
}
|
||||
} else if (expected.isTuple()) {
|
||||
EXPECT_TRUE(actual.isTuple());
|
||||
auto expected_tuple = expected.toTupleRef().elements();
|
||||
auto actual_tuple = actual.toTupleRef().elements();
|
||||
ASSERT_TRUE(expected_tuple.size() == actual_tuple.size());
|
||||
for (size_t i = 0; i < expected_tuple.size(); i++) {
|
||||
compareIValue(expected_tuple[i], actual_tuple[i], native);
|
||||
}
|
||||
} else if (expected.isList()) {
|
||||
EXPECT_TRUE(actual.isList());
|
||||
auto expected_list = expected.toList();
|
||||
auto actual_list = actual.toList();
|
||||
ASSERT_TRUE(expected_list.size() == actual_list.size());
|
||||
for (size_t i = 0; i < expected_list.size(); i++) {
|
||||
compareIValue(expected_list[i], actual_list[i], native);
|
||||
}
|
||||
} else if (expected.isGenericDict()) {
|
||||
EXPECT_TRUE(actual.isGenericDict());
|
||||
auto expected_dict = expected.toGenericDict();
|
||||
auto actual_dict = actual.toGenericDict();
|
||||
EXPECT_TRUE(expected_dict.size() == actual_dict.size());
|
||||
for (auto& expected_kv : expected_dict) {
|
||||
auto actual_kv = actual_dict.find(expected_kv.key());
|
||||
ASSERT_FALSE(actual_kv == actual_dict.end());
|
||||
compareIValue(expected_kv.value(), actual_kv->value(), native);
|
||||
}
|
||||
} else {
|
||||
// Fall back to default comparison from IValue
|
||||
EXPECT_TRUE(expected == actual);
|
||||
}
|
||||
}
|
||||
|
||||
void compareIValues(
|
||||
std::vector<c10::IValue> expected,
|
||||
std::vector<c10::IValue> actual,
|
||||
bool native = false) {
|
||||
ASSERT_TRUE(expected.size() == actual.size());
|
||||
for (size_t i = 0; i < expected.size(); i++) {
|
||||
compareIValue(expected[i], actual[i], native);
|
||||
}
|
||||
}
|
||||
|
||||
inline void testStaticKernelEqualityInternal(
|
||||
const SimpleTestModelRunner& modelRunner,
|
||||
const SimpleTestModelRunner& staticModelRunner,
|
||||
const std::vector<c10::IValue>& args,
|
||||
bool native = false) {
|
||||
auto expected = modelRunner.run(args);
|
||||
|
||||
auto output = staticModelRunner.run(args);
|
||||
compareIValues(expected, output, native);
|
||||
|
||||
// Run again to test the static kernel when outputs IValue are cached in the
|
||||
// execution frame
|
||||
auto output2 = staticModelRunner.run(args);
|
||||
compareIValues(expected, output2, native);
|
||||
}
|
||||
|
||||
void testStaticKernelEquality(
|
||||
const std::string_view source,
|
||||
const std::vector<c10::IValue>& args,
|
||||
bool native = false) {
|
||||
ExecutorConfig config;
|
||||
config.enableStaticCPUKernels = false;
|
||||
SimpleTestModelRunner model(source, config);
|
||||
|
||||
config.enableStaticCPUKernels = true;
|
||||
SimpleTestModelRunner staticKernelModel(source, config);
|
||||
|
||||
testStaticKernelEqualityInternal(model, staticKernelModel, args, native);
|
||||
}
|
||||
|
||||
inline void testGraphABEquality(
|
||||
const std::string_view graph_a,
|
||||
const std::string_view graph_b,
|
||||
const std::vector<c10::IValue>& args,
|
||||
const ExecutorConfig& config = {},
|
||||
bool native = false) {
|
||||
SimpleTestModelRunner model_a(graph_a, config);
|
||||
auto expected = model_a.run(args);
|
||||
|
||||
SimpleTestModelRunner model_b(graph_b, config);
|
||||
auto output = model_b.run(args);
|
||||
|
||||
compareIValues(expected, output, native);
|
||||
}
|
||||
|
||||
inline void testGraphABPerf(
|
||||
const std::string_view graph_a,
|
||||
const std::string_view graph_b,
|
||||
const std::vector<c10::IValue>& args,
|
||||
const ExecutorConfig& config = {}) {
|
||||
SimpleTestModelRunner model_a(graph_a, config);
|
||||
auto resultA = model_a.benchmarkIndividualNodes(args);
|
||||
|
||||
SimpleTestModelRunner model_b(graph_b, config);
|
||||
auto resultB = model_b.benchmarkIndividualNodes(args);
|
||||
ASSERT_TRUE(resultA.totalTime > resultB.totalTime);
|
||||
}
|
||||
|
||||
} // namespace torch::nativert
|
||||
@ -0,0 +1,15 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <torch/nativert/kernels/KernelFactory.h>
|
||||
#include <torch/nativert/kernels/KernelHandlerRegistry.h>
|
||||
|
||||
using namespace ::testing;
|
||||
using namespace torch::nativert;
|
||||
|
||||
TEST(StaticDispatchKernelRegistrationTests, TestRegistration) {
|
||||
EXPECT_FALSE(KernelFactory::isHandlerRegistered("static_cpu"));
|
||||
register_kernel_handlers();
|
||||
EXPECT_TRUE(KernelFactory::isHandlerRegistered("static_cpu"));
|
||||
// try to re-register, which should be a no-op
|
||||
register_kernel_handlers();
|
||||
}
|
||||
@ -4,6 +4,7 @@
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/headeronly/util/Exception.h>
|
||||
#include <torch/headeronly/core/ScalarType.h>
|
||||
|
||||
#ifdef LAE_USE_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
@ -139,12 +140,10 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
|
||||
const auto num_args = 6;
|
||||
StableIValue stack[num_args];
|
||||
|
||||
int32_t t_dtype;
|
||||
aoti_torch_get_dtype(t.get(), &t_dtype);
|
||||
auto mf = aoti_torch_memory_format_contiguous_format();
|
||||
|
||||
stack[0] = from(t);
|
||||
stack[1] = from(std::optional(t_dtype)); // dtype
|
||||
stack[1] = from(std::optional(t.scalar_type())); // dtype
|
||||
stack[2] = from(std::nullopt); // layout
|
||||
stack[3] = from(std::optional(device)); // device
|
||||
stack[4] = from(std::optional(false)); // pin_memory
|
||||
@ -342,12 +341,24 @@ void boxed_my_narrow(
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
Tensor my_new_empty_dtype_variant(Tensor t) {
|
||||
std::vector<int64_t> sizes = {2, 5};
|
||||
auto dtype = std::make_optional(at::ScalarType::BFloat16);
|
||||
return new_empty(t, sizes, dtype);
|
||||
}
|
||||
|
||||
void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_new_empty_dtype_variant(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("my_transpose(Tensor t, int dim0, int dim1) -> Tensor");
|
||||
m.def("my_empty_like(Tensor t) -> Tensor");
|
||||
m.def("fill_infinity(Tensor(a!) t) -> Tensor(a!)");
|
||||
m.def("my_pad(Tensor t) -> Tensor");
|
||||
m.def("my_narrow(Tensor t, int dim, int start, int length) -> Tensor");
|
||||
m.def("my_new_empty_dtype_variant(Tensor t) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
@ -355,6 +366,7 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_empty_like", &boxed_empty_like);
|
||||
m.impl("fill_infinity", &boxed_fill_infinity);
|
||||
m.impl("my_is_cpu", &boxed_my_is_cpu);
|
||||
m.impl("my_new_empty_dtype_variant", &boxed_my_new_empty_dtype_variant);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
|
||||
@ -371,10 +383,31 @@ void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
Tensor my_amax(Tensor t) {
|
||||
return amax(t, 0, false);
|
||||
}
|
||||
|
||||
void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_amax(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
Tensor my_amax_vec(Tensor t) {
|
||||
std::vector<int64_t> v = {0,1};
|
||||
return amax(t, v, false);
|
||||
}
|
||||
|
||||
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_amax_vec(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
|
||||
m.def("my_amax(Tensor a) -> Tensor");
|
||||
m.def("my_amax_vec(Tensor a) -> Tensor");
|
||||
m.def("my_is_cpu(Tensor t) -> bool");
|
||||
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
|
||||
@ -414,6 +447,8 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_default_constructor", &boxed_test_default_constructor);
|
||||
m.impl("my_amax", &boxed_my_amax);
|
||||
m.impl("my_amax_vec", &boxed_my_amax_vec);
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::accelerator APIs
|
||||
|
||||
@ -167,6 +167,30 @@ def my_zero_(t) -> Tensor:
|
||||
return torch.ops.libtorch_agnostic.my_zero_.default(t)
|
||||
|
||||
|
||||
def my_amax(t) -> Tensor:
|
||||
"""
|
||||
Returns t.amax(dim=0)
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: amax(t, dim=0)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_amax.default(t)
|
||||
|
||||
|
||||
def my_amax_vec(t) -> Tensor:
|
||||
"""
|
||||
Returns t.amax(dim=(0, 1))
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: amax(t, dim=(0, 1))
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_amax_vec.default(t)
|
||||
|
||||
|
||||
def fill_infinity(t) -> Tensor:
|
||||
"""
|
||||
Fills the tensor with inf.
|
||||
@ -259,3 +283,15 @@ def test_get_current_device_index() -> int:
|
||||
Returns: Current device index as an integer
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_get_current_device_index.default()
|
||||
|
||||
|
||||
def my_new_empty_dtype_variant(t) -> Tensor:
|
||||
"""
|
||||
Returns a new empty tensor with shape [2, 5] and dtype bfloat16
|
||||
|
||||
Args:
|
||||
t: Input tensor used as a reference for device and other properties
|
||||
|
||||
Returns: New empty tensor with shape [2, 5] and dtype bfloat16
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_new_empty_dtype_variant.default(t)
|
||||
|
||||
@ -190,7 +190,7 @@ if not IS_WINDOWS:
|
||||
|
||||
deterministic = torch.are_deterministic_algorithms_enabled()
|
||||
try:
|
||||
# set use_deterministic_algorithms to fill unintialized memory
|
||||
# set use_deterministic_algorithms to fill uninitialized memory
|
||||
torch.use_deterministic_algorithms(True)
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
@ -209,6 +209,20 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(id(out), id(t))
|
||||
self.assertEqual(out, torch.zeros_like(t))
|
||||
|
||||
def test_my_amax(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
out = libtorch_agnostic.ops.my_amax(t)
|
||||
self.assertEqual(out, torch.amax(t, 0))
|
||||
|
||||
def test_my_amax_vec(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, 5, device=device)
|
||||
out = libtorch_agnostic.ops.my_amax_vec(t)
|
||||
self.assertEqual(out, torch.amax(t, (0, 1)))
|
||||
|
||||
def test_my_is_cpu(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
@ -308,6 +322,21 @@ if not IS_WINDOWS:
|
||||
finally:
|
||||
torch.cuda.set_device(prev_device)
|
||||
|
||||
def test_my_new_empty_dtype_variant(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
deterministic = torch.are_deterministic_algorithms_enabled()
|
||||
try:
|
||||
# set use_deterministic_algorithms to fill uninitialized memory
|
||||
torch.use_deterministic_algorithms(True)
|
||||
t = torch.randn(3, 4, device=device)
|
||||
out = libtorch_agnostic.ops.my_new_empty_dtype_variant(t)
|
||||
ref_out = t.new_empty((2, 5), dtype=torch.bfloat16)
|
||||
|
||||
self.assertEqual(out, ref_out, exact_device=True)
|
||||
finally:
|
||||
torch.use_deterministic_algorithms(deterministic)
|
||||
|
||||
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1486,8 +1486,8 @@ class TestFullyShardWorldSize1(FSDPTest):
|
||||
@skip_if_lt_x_gpu(1)
|
||||
def test_train_parity_single_worldsize1(self):
|
||||
"""
|
||||
Tests train parity with DDP for a single FSDP group
|
||||
when sharding parameters on dim-0.
|
||||
Tests train parity with DDP for a single FSDP group when sharding
|
||||
parameters on dim-0.
|
||||
"""
|
||||
self.run_subtests(
|
||||
{
|
||||
@ -1535,7 +1535,9 @@ class TestFullyShardWorldSize1(FSDPTest):
|
||||
losses.append(model(*inp).sum())
|
||||
losses[-1].backward()
|
||||
|
||||
self.assertEqual(comm_mode.get_total_counts(), 0)
|
||||
# Before there was 1 all-gather and 1 reduce-scatter
|
||||
# Now therre is 1 reduce-scatter
|
||||
self.assertEqual(comm_mode.get_total_counts(), 1)
|
||||
optim.step()
|
||||
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
|
||||
@@ -286,11 +286,11 @@ class TestFullyShard2DTraining(FSDPTest):
         with CommDebugMode() as bwd_comm_mode:
             loss.backward()
         bwd_comm_counts = bwd_comm_mode.get_comm_counts()
-        self.assertEqual(len(bwd_comm_counts), 1)
+        self.assertEqual(len(bwd_comm_counts), 2)
         # First MLP's input gradient does not need to be all-reduced
         self.assertEqual(bwd_comm_counts[funcol.all_reduce], num_mlps - 1)
         self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], 0)
-        self.assertEqual(bwd_comm_counts[c10d_ops._reduce_scatter_base_], 0)
+        self.assertEqual(bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_mlps)
         ref_loss.backward()

         optim.step()

@@ -1,10 +1,10 @@
 # Owner(s): ["oncall: distributed"]

-from torch.testing._internal.common_distributed import MultiProcContinousTest
+from torch.testing._internal.common_distributed import MultiProcContinuousTest
 from torch.testing._internal.common_utils import run_tests


-class TestTemplate(MultiProcContinousTest):
+class TestTemplate(MultiProcContinuousTest):
     def testABC(self):
         print(f"rank {self.rank} of {self.world_size} testing ABC")

@ -55,7 +55,7 @@ class TestMakeCheckpointer(TestCase):
|
||||
|
||||
# Test that it works for sync operations
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_factory_sync")
|
||||
result = checkpointer.save(self.state_dict, checkpoint_path)
|
||||
result = checkpointer.save(checkpoint_path, self.state_dict)
|
||||
self.assertIsNone(result) # Sync mode returns None
|
||||
|
||||
# Verify checkpoint was created
|
||||
@ -81,7 +81,7 @@ class TestMakeCheckpointer(TestCase):
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, "checkpoint_factory_sync_config_first"
|
||||
)
|
||||
result = checkpointer.save(self.state_dict, checkpoint_path)
|
||||
result = checkpointer.save(checkpoint_path, self.state_dict)
|
||||
self.assertIsNone(result) # Sync mode returns None
|
||||
|
||||
# Verify checkpoint was created
|
||||
@ -105,7 +105,7 @@ class TestMakeCheckpointer(TestCase):
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, "checkpoint_factory_sync_custom_config"
|
||||
)
|
||||
result = checkpointer.save(self.state_dict, checkpoint_path)
|
||||
result = checkpointer.save(checkpoint_path, self.state_dict)
|
||||
self.assertIsNone(result) # Sync mode returns None
|
||||
|
||||
# Verify checkpoint was created
|
||||
@ -135,7 +135,7 @@ class TestMakeCheckpointer(TestCase):
|
||||
# Test that it works for async operations
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_factory_async")
|
||||
stage_future, write_future = checkpointer.save(
|
||||
self.state_dict, checkpoint_path
|
||||
checkpoint_path, self.state_dict
|
||||
)
|
||||
|
||||
# Verify futures are returned
|
||||
|
||||
@ -90,7 +90,7 @@ class TestCheckpointWriter(TestCase):
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint")
|
||||
|
||||
# Call write
|
||||
self.writer.write(self.state_dict, checkpoint_path)
|
||||
self.writer.write(checkpoint_path, self.state_dict)
|
||||
|
||||
# Verify that the checkpoint file exists
|
||||
expected_file_path = os.path.join(
|
||||
@ -111,7 +111,7 @@ class TestCheckpointWriter(TestCase):
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint")
|
||||
|
||||
# Call write
|
||||
self.writer.write(self.state_dict, checkpoint_path)
|
||||
self.writer.write(checkpoint_path, self.state_dict)
|
||||
|
||||
# Verify that the barrier was called
|
||||
self.mock_barrier.execute_barrier.assert_called_once()
|
||||
@ -123,7 +123,7 @@ class TestCheckpointWriter(TestCase):
|
||||
|
||||
# Call write with additional kwargs
|
||||
kwargs = {"extra": "value"}
|
||||
self.writer.write(self.state_dict, checkpoint_path, **kwargs)
|
||||
self.writer.write(checkpoint_path, self.state_dict, **kwargs)
|
||||
|
||||
# Verify that the pre_commit hook was called with the correct parameters
|
||||
self.assertTrue(self.mock_hook.pre_commit_called)
|
||||
@ -157,7 +157,7 @@ class TestCheckpointWriter(TestCase):
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_no_barrier")
|
||||
|
||||
# Call write
|
||||
writer.write(self.state_dict, checkpoint_path)
|
||||
writer.write(checkpoint_path, self.state_dict)
|
||||
|
||||
# Verify that the checkpoint file exists
|
||||
expected_file_path = os.path.join(
|
||||
@ -179,7 +179,7 @@ class TestCheckpointWriter(TestCase):
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_no_hook")
|
||||
|
||||
# Call write
|
||||
writer.write(self.state_dict, checkpoint_path)
|
||||
writer.write(checkpoint_path, self.state_dict)
|
||||
|
||||
# Verify that the checkpoint file exists
|
||||
expected_file_path = os.path.join(
|
||||
|
||||
@ -3,8 +3,14 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from concurrent.futures import Future
|
||||
from unittest.mock import Mock
|
||||
|
||||
import torch
|
||||
from torch.distributed.checkpoint._experimental.checkpoint_process import (
|
||||
CheckpointProcess,
|
||||
CheckpointProcessConfig,
|
||||
)
|
||||
from torch.distributed.checkpoint._experimental.checkpoint_reader import (
|
||||
CheckpointReader,
|
||||
)
|
||||
@ -12,12 +18,39 @@ from torch.distributed.checkpoint._experimental.checkpoint_writer import (
|
||||
CheckpointWriter,
|
||||
CheckpointWriterConfig,
|
||||
)
|
||||
from torch.distributed.checkpoint._experimental.checkpointer import SyncCheckpointer
|
||||
from torch.distributed.checkpoint._experimental.checkpointer import (
|
||||
AsyncCheckpointer,
|
||||
Checkpointer,
|
||||
SyncCheckpointer,
|
||||
)
|
||||
from torch.distributed.checkpoint._experimental.staging import (
|
||||
CheckpointStagerConfig,
|
||||
DefaultStager,
|
||||
)
|
||||
from torch.distributed.checkpoint._experimental.types import RankInfo
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
|
||||
class TestSyncCheckpointer(TestCase):
|
||||
def subprocess_init_fn(name: str, parent_pid: int) -> None:
|
||||
"""Initialize the subprocess for async checkpointer tests."""
|
||||
assert name == "test-async-checkpointer", f"Unexpected subprocess name: {name}"
|
||||
assert os.getpid() != parent_pid, "This was supposed to run in a different process"
|
||||
assert os.getppid() == parent_pid, (
|
||||
"This was supposed to run as a child to main process"
|
||||
)
|
||||
|
||||
|
||||
def ckpt_writer_init_fn(**kwargs) -> CheckpointWriter:
|
||||
"""Initialize a CheckpointWriter in the subprocess."""
|
||||
return CheckpointWriter(
|
||||
config=kwargs.get("config"),
|
||||
rank_info=kwargs.get("rank_info"),
|
||||
)
|
||||
|
||||
|
||||
class TestCheckpointer(TestCase):
|
||||
"""Parameterized tests that work with both sync and async checkpointers."""
|
||||
|
||||
def setUp(self):
|
||||
# Create a temporary directory for checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
@ -28,20 +61,13 @@ class TestSyncCheckpointer(TestCase):
|
||||
global_rank=0,
|
||||
)
|
||||
self.writer_config = CheckpointWriterConfig()
|
||||
self.writer = CheckpointWriter(
|
||||
config=self.writer_config,
|
||||
rank_info=self.rank_info,
|
||||
)
|
||||
|
||||
# Create reader for testing
|
||||
self.reader = CheckpointReader(
|
||||
rank_info=self.rank_info,
|
||||
)
|
||||
|
||||
# Create sync checkpointer
|
||||
self.checkpointer = SyncCheckpointer(self.writer, self.reader)
|
||||
|
||||
# Create a test state dictionary
|
||||
# Create test state dictionary
|
||||
self.state_dict = {
|
||||
"model": torch.nn.Linear(10, 5).state_dict(),
|
||||
"optimizer": {"param_groups": [{"lr": 0.01}]},
|
||||
@ -53,129 +79,562 @@ class TestSyncCheckpointer(TestCase):
|
||||
# Clean up the temporary directory
|
||||
shutil.rmtree(self.temp_dir)
|
||||
|
||||
def test_sync_save_and_read(self):
|
||||
"""Test saving and reading a checkpoint synchronously."""
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_sync")
|
||||
|
||||
# Save the checkpoint synchronously
|
||||
result = self.checkpointer.save(self.state_dict, checkpoint_path)
|
||||
self.assertIsNone(result) # Sync mode returns None
|
||||
|
||||
# Verify that the checkpoint file exists
|
||||
checkpoint_file = os.path.join(
|
||||
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
|
||||
)
|
||||
self.assertTrue(os.path.exists(checkpoint_file))
|
||||
|
||||
# Load the checkpoint using the checkpointer
|
||||
loaded_state_dict = self.checkpointer.load(checkpoint_path)
|
||||
|
||||
# Verify the loaded state dictionary
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("optimizer", loaded_state_dict)
|
||||
self.assertEqual(loaded_state_dict["epoch"], 5)
|
||||
self.assertEqual(loaded_state_dict["step"], 1000)
|
||||
|
||||
def test_read_with_map_location(self):
|
||||
"""Test reading a checkpoint with a specific map_location."""
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_map_location")
|
||||
|
||||
# Save the checkpoint
|
||||
self.checkpointer.save(self.state_dict, checkpoint_path)
|
||||
|
||||
# Load the checkpoint with map_location='cpu'
|
||||
loaded_state_dict = self.checkpointer.load(
|
||||
checkpoint_path, default_map_location="cpu"
|
||||
)
|
||||
|
||||
# Verify the loaded state dictionary
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("optimizer", loaded_state_dict)
|
||||
self.assertEqual(loaded_state_dict["epoch"], 5)
|
||||
self.assertEqual(loaded_state_dict["step"], 1000)
|
||||
|
||||
def test_partial_load(self):
|
||||
"""Test loading only specific keys from a checkpoint."""
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_partial")
|
||||
|
||||
# Save the full checkpoint
|
||||
self.checkpointer.save(self.state_dict, checkpoint_path)
|
||||
|
||||
# Create a partial state dictionary with only some keys
|
||||
partial_state_dict = {
|
||||
"model": torch.nn.Linear(10, 5).state_dict(),
|
||||
"epoch": None, # Will be loaded from checkpoint
|
||||
}
|
||||
|
||||
# Load only the keys in partial_state_dict
|
||||
loaded_state_dict = self.checkpointer.load(
|
||||
checkpoint_path, state_dict=partial_state_dict, default_map_location="cpu"
|
||||
)
|
||||
|
||||
# Verify that the loaded state dictionary contains values from the checkpoint
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("epoch", loaded_state_dict)
|
||||
self.assertEqual(loaded_state_dict["epoch"], 5) # From checkpoint
|
||||
|
||||
# Verify that keys not in the partial_state_dict are not loaded
|
||||
self.assertNotIn("step", loaded_state_dict)
|
||||
self.assertNotIn("optimizer", loaded_state_dict)
|
||||
|
||||
# Verify that the loaded state dictionary is the same object as the input
|
||||
self.assertIs(loaded_state_dict, partial_state_dict)
|
||||
|
||||
def test_partial_load_with_nested_dict(self):
|
||||
"""Test loading only specific nested keys from a checkpoint."""
|
||||
# Create a checkpoint with nested dictionaries
|
||||
nested_state_dict = {
|
||||
"model": {
|
||||
"layer1": {"weight": torch.randn(5, 10), "bias": torch.randn(5)},
|
||||
"layer2": {"weight": torch.randn(2, 5), "bias": torch.randn(2)},
|
||||
},
|
||||
"metadata": {"epoch": 10, "step": 2000},
|
||||
}
|
||||
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_nested")
|
||||
|
||||
# Create a writer and save the nested state dict
|
||||
def _create_sync_checkpointer(self) -> SyncCheckpointer:
|
||||
"""Create a synchronous checkpointer."""
|
||||
writer = CheckpointWriter(
|
||||
config=self.writer_config,
|
||||
rank_info=self.rank_info,
|
||||
)
|
||||
writer.write(nested_state_dict, checkpoint_path)
|
||||
return SyncCheckpointer(writer, self.reader)
|
||||
|
||||
# Create a partial state dictionary with nested structure
|
||||
partial_state_dict = {
|
||||
"model": {
|
||||
"layer1": {"weight": None}, # Only request layer1.weight
|
||||
def _create_async_checkpointer(self) -> AsyncCheckpointer:
|
||||
"""Create an asynchronous checkpointer."""
|
||||
# Create staging config for async operations
|
||||
# Use conservative settings to avoid CUDA issues in test environment
|
||||
stager_config = CheckpointStagerConfig(
|
||||
use_async_staging=True,
|
||||
use_pinned_memory=False, # Disable to avoid CUDA memory issues
|
||||
use_shared_memory=True,
|
||||
use_non_blocking_copy=False, # Disable to avoid CUDA issues
|
||||
)
|
||||
|
||||
# Create process config
|
||||
process_config = CheckpointProcessConfig(
|
||||
subprocess_init_timeout_secs=30,
|
||||
subprocess_shutdown_timeout_secs=60,
|
||||
)
|
||||
|
||||
# Create stager
|
||||
checkpoint_stager = DefaultStager(stager_config)
|
||||
|
||||
# Create checkpoint process
|
||||
checkpoint_process = CheckpointProcess(
|
||||
rank_info=self.rank_info,
|
||||
config=process_config,
|
||||
subprocess_init_fn=subprocess_init_fn,
|
||||
subprocess_init_args=(
|
||||
"test-async-checkpointer",
|
||||
os.getpid(),
|
||||
),
|
||||
checkpoint_writer_init_fn=ckpt_writer_init_fn,
|
||||
checkpoint_writer_init_args={
|
||||
"config": self.writer_config,
|
||||
"rank_info": self.rank_info,
|
||||
},
|
||||
"metadata": {"epoch": None}, # Only request metadata.epoch
|
||||
)
|
||||
|
||||
# Wait for process initialization
|
||||
checkpoint_process.process_creation_future.result()
|
||||
|
||||
return AsyncCheckpointer(
|
||||
checkpoint_stager=checkpoint_stager,
|
||||
checkpoint_process=checkpoint_process,
|
||||
reader=self.reader,
|
||||
)
|
||||
|
||||
def _get_checkpointers(self):
|
||||
"""Get both sync and async checkpointers for parameterized testing."""
|
||||
return [
|
||||
("sync", self._create_sync_checkpointer()),
|
||||
("async", self._create_async_checkpointer()),
|
||||
]
|
||||
|
||||
def _save_checkpoint(self, checkpointer: Checkpointer, path, state_dict, **kwargs):
|
||||
"""Save checkpoint and handle both sync/async return values."""
|
||||
result = checkpointer.save(path, state_dict, **kwargs)
|
||||
return (None, None) if result is None else result
|
||||
|
||||
def _wait_for_save(self, stage_future, write_future):
|
||||
"""Wait for save operation to complete."""
|
||||
if write_future is not None:
|
||||
write_future.result()
|
||||
if stage_future is not None:
|
||||
stage_future.result()
|
||||
|
||||
def test_save_and_load_basic(self):
|
||||
"""Test basic save and load functionality for both sync and async."""
|
||||
for checkpointer_type, checkpointer in self._get_checkpointers():
|
||||
with self.subTest(checkpointer_type=checkpointer_type):
|
||||
try:
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, f"checkpoint_{checkpointer_type}"
|
||||
)
|
||||
|
||||
# Save the checkpoint
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer, checkpoint_path, self.state_dict
|
||||
)
|
||||
self._wait_for_save(stage_future, write_future)
|
||||
|
||||
# Verify that the checkpoint file exists
|
||||
checkpoint_file = os.path.join(
|
||||
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
|
||||
)
|
||||
self.assertTrue(os.path.exists(checkpoint_file))
|
||||
|
||||
# Load the checkpoint using the checkpointer
|
||||
loaded_state_dict = checkpointer.load(checkpoint_path)
|
||||
|
||||
# Verify the loaded state dictionary
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("optimizer", loaded_state_dict)
|
||||
self.assertEqual(loaded_state_dict["epoch"], 5)
|
||||
self.assertEqual(loaded_state_dict["step"], 1000)
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
def test_load_with_map_location(self):
|
||||
"""Test loading with map_location for both sync and async."""
|
||||
for checkpointer_type, checkpointer in self._get_checkpointers():
|
||||
with self.subTest(checkpointer_type=checkpointer_type):
|
||||
try:
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, f"checkpoint_map_{checkpointer_type}"
|
||||
)
|
||||
|
||||
# Save the checkpoint
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer, checkpoint_path, self.state_dict
|
||||
)
|
||||
self._wait_for_save(stage_future, write_future)
|
||||
|
||||
# Load with map_location
|
||||
loaded_state_dict = checkpointer.load(
|
||||
checkpoint_path, default_map_location="cpu"
|
||||
)
|
||||
|
||||
# Verify the loaded state dictionary
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertEqual(loaded_state_dict["epoch"], 5)
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
def test_partial_load(self):
|
||||
"""Test partial loading for both sync and async."""
|
||||
for checkpointer_type, checkpointer in self._get_checkpointers():
|
||||
with self.subTest(checkpointer_type=checkpointer_type):
|
||||
try:
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, f"checkpoint_partial_{checkpointer_type}"
|
||||
)
|
||||
|
||||
# Save the full checkpoint
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer, checkpoint_path, self.state_dict
|
||||
)
|
||||
self._wait_for_save(stage_future, write_future)
|
||||
|
||||
# Create a partial state dictionary
|
||||
partial_state_dict = {
|
||||
"model": torch.nn.Linear(10, 5).state_dict(),
|
||||
"epoch": None,
|
||||
}
|
||||
|
||||
# Load only the keys in partial_state_dict
|
||||
loaded_state_dict = checkpointer.load(
|
||||
checkpoint_path, state_dict=partial_state_dict
|
||||
)
|
||||
|
||||
# Verify partial loading worked
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("epoch", loaded_state_dict)
|
||||
self.assertEqual(loaded_state_dict["epoch"], 5)
|
||||
self.assertNotIn("step", loaded_state_dict)
|
||||
self.assertNotIn("optimizer", loaded_state_dict)
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
def test_load_strict_mode(self):
|
||||
"""Test strict mode loading for both sync and async."""
|
||||
for checkpointer_type, checkpointer in self._get_checkpointers():
|
||||
with self.subTest(checkpointer_type=checkpointer_type):
|
||||
try:
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, f"checkpoint_strict_{checkpointer_type}"
|
||||
)
|
||||
|
||||
# Save a checkpoint with limited keys
|
||||
limited_state_dict = {"model": torch.nn.Linear(10, 5).state_dict()}
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer, checkpoint_path, limited_state_dict
|
||||
)
|
||||
self._wait_for_save(stage_future, write_future)
|
||||
|
||||
# Try to load with more keys than exist in checkpoint
|
||||
partial_state_dict = {
|
||||
"model": torch.nn.Linear(10, 5).state_dict(),
|
||||
"missing_key": None,
|
||||
}
|
||||
|
||||
# Should raise error in strict mode
|
||||
with self.assertRaises(RuntimeError) as cm:
|
||||
checkpointer.load(
|
||||
checkpoint_path, state_dict=partial_state_dict, strict=True
|
||||
)
|
||||
|
||||
self.assertIn("missing keys", str(cm.exception))
|
||||
|
||||
# Should work without strict mode
|
||||
loaded_state_dict = checkpointer.load(
|
||||
checkpoint_path, state_dict=partial_state_dict, strict=False
|
||||
)
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
def test_save_with_kwargs(self):
|
||||
"""Test save with additional kwargs for both sync and async."""
|
||||
for checkpointer_type, checkpointer in self._get_checkpointers():
|
||||
with self.subTest(checkpointer_type=checkpointer_type):
|
||||
try:
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, f"checkpoint_kwargs_{checkpointer_type}"
|
||||
)
|
||||
|
||||
# For sync checkpointer, we can pass arbitrary kwargs to the writer
|
||||
# For async checkpointer, we test without kwargs to avoid conflicts
|
||||
if checkpointer_type == "sync":
|
||||
# Sync checkpointer passes kwargs directly to writer, so arbitrary kwargs are OK
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer,
|
||||
checkpoint_path,
|
||||
self.state_dict,
|
||||
custom_arg="test_value",
|
||||
another_arg=42,
|
||||
)
|
||||
else:
|
||||
# Async checkpointer has complex kwargs handling between stager and writer
|
||||
# Just test basic save without kwargs to avoid conflicts
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer,
|
||||
checkpoint_path,
|
||||
self.state_dict,
|
||||
)
|
||||
|
||||
self._wait_for_save(stage_future, write_future)
|
||||
|
||||
# Verify checkpoint was created
|
||||
checkpoint_file = os.path.join(
|
||||
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
|
||||
)
|
||||
self.assertTrue(os.path.exists(checkpoint_file))
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
def test_nested_dict_partial_load(self):
|
||||
"""Test loading nested dictionaries partially for both sync and async."""
|
||||
for checkpointer_type, checkpointer in self._get_checkpointers():
|
||||
with self.subTest(checkpointer_type=checkpointer_type):
|
||||
try:
|
||||
# Create a checkpoint with nested dictionaries
|
||||
nested_state_dict = {
|
||||
"model": {
|
||||
"layer1": {
|
||||
"weight": torch.randn(5, 10),
|
||||
"bias": torch.randn(5),
|
||||
},
|
||||
"layer2": {
|
||||
"weight": torch.randn(2, 5),
|
||||
"bias": torch.randn(2),
|
||||
},
|
||||
},
|
||||
"metadata": {"epoch": 10, "step": 2000},
|
||||
}
|
||||
|
||||
checkpoint_path = os.path.join(
|
||||
self.temp_dir, f"checkpoint_nested_{checkpointer_type}"
|
||||
)
|
||||
|
||||
# Save the nested state dict
|
||||
stage_future, write_future = self._save_checkpoint(
|
||||
checkpointer, checkpoint_path, nested_state_dict
|
||||
)
|
||||
self._wait_for_save(stage_future, write_future)
|
||||
|
||||
# Create a partial state dictionary with nested structure
|
||||
partial_state_dict = {
|
||||
"model": {
|
||||
"layer1": {"weight": None}, # Only request layer1.weight
|
||||
},
|
||||
"metadata": {"epoch": None}, # Only request metadata.epoch
|
||||
}
|
||||
|
||||
# Load only the keys in partial_state_dict
|
||||
loaded_state_dict = checkpointer.load(
|
||||
checkpoint_path, state_dict=partial_state_dict
|
||||
)
|
||||
|
||||
# Verify that the nested keys were correctly loaded
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("layer1", loaded_state_dict["model"])
|
||||
self.assertIn("weight", loaded_state_dict["model"]["layer1"])
|
||||
self.assertIn("metadata", loaded_state_dict)
|
||||
self.assertIn("epoch", loaded_state_dict["metadata"])
|
||||
|
||||
# Verify values were loaded correctly
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
loaded_state_dict["model"]["layer1"]["weight"],
|
||||
nested_state_dict["model"]["layer1"]["weight"],
|
||||
)
|
||||
)
|
||||
self.assertEqual(loaded_state_dict["metadata"]["epoch"], 10)
|
||||
|
||||
# Verify that keys not in the partial_state_dict are not loaded
|
||||
self.assertNotIn("layer2", loaded_state_dict["model"])
|
||||
self.assertNotIn("step", loaded_state_dict["metadata"])
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
|
||||
class TestAsyncCheckpointerSpecific(TestCase):
|
||||
"""Tests specific to AsyncCheckpointer functionality."""
|
||||
|
||||
def setUp(self):
|
||||
# Create a temporary directory for checkpoints
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
# Create real objects for testing
|
||||
self.rank_info = RankInfo(
|
||||
global_world_size=1,
|
||||
global_rank=0,
|
||||
)
|
||||
self.writer_config = CheckpointWriterConfig()
|
||||
|
||||
# Create reader for testing
|
||||
self.reader = CheckpointReader(
|
||||
rank_info=self.rank_info,
|
||||
)
|
||||
|
||||
# Create test state dictionary
|
||||
self.state_dict = {
|
||||
"model": torch.nn.Linear(10, 5).state_dict(),
|
||||
"optimizer": {"param_groups": [{"lr": 0.01}]},
|
||||
"epoch": 5,
|
||||
"step": 1000,
|
||||
}
|
||||
|
||||
# Load only the keys in partial_state_dict
|
||||
loaded_state_dict = self.checkpointer.load(
|
||||
checkpoint_path, state_dict=partial_state_dict, default_map_location="cpu"
|
||||
def tearDown(self):
|
||||
# Clean up the temporary directory
|
||||
shutil.rmtree(self.temp_dir)
|
||||
|
||||
def _create_async_checkpointer(self) -> AsyncCheckpointer:
|
||||
"""Helper method to create AsyncCheckpointer with real components."""
|
||||
# Create staging config for async operations
|
||||
# Use conservative settings to avoid CUDA issues in test environment
|
||||
stager_config = CheckpointStagerConfig(
|
||||
use_async_staging=True,
|
||||
use_pinned_memory=False, # Disable to avoid CUDA memory issues
|
||||
use_shared_memory=True,
|
||||
use_non_blocking_copy=False, # Disable to avoid CUDA issues
|
||||
)
|
||||
|
||||
# Verify that the nested keys were correctly loaded
|
||||
self.assertIn("model", loaded_state_dict)
|
||||
self.assertIn("layer1", loaded_state_dict["model"])
|
||||
self.assertIn("weight", loaded_state_dict["model"]["layer1"])
|
||||
self.assertIn("metadata", loaded_state_dict)
|
||||
self.assertIn("epoch", loaded_state_dict["metadata"])
|
||||
# Create process config
|
||||
process_config = CheckpointProcessConfig(
|
||||
subprocess_init_timeout_secs=30,
|
||||
subprocess_shutdown_timeout_secs=60,
|
||||
)
|
||||
|
||||
# Verify values were loaded correctly
|
||||
self.assertTrue(
|
||||
torch.allclose(
|
||||
loaded_state_dict["model"]["layer1"]["weight"],
|
||||
nested_state_dict["model"]["layer1"]["weight"],
|
||||
# Create stager
|
||||
checkpoint_stager = DefaultStager(stager_config)
|
||||
|
||||
# Create checkpoint process
|
||||
checkpoint_process = CheckpointProcess(
|
||||
rank_info=self.rank_info,
|
||||
config=process_config,
|
||||
subprocess_init_fn=subprocess_init_fn,
|
||||
subprocess_init_args=(
|
||||
"test-async-checkpointer",
|
||||
os.getpid(),
|
||||
),
|
||||
checkpoint_writer_init_fn=ckpt_writer_init_fn,
|
||||
checkpoint_writer_init_args={
|
||||
"config": self.writer_config,
|
||||
"rank_info": self.rank_info,
|
||||
},
|
||||
)
|
||||
|
||||
# Wait for process initialization
|
||||
checkpoint_process.process_creation_future.result()
|
||||
|
||||
return AsyncCheckpointer(
|
||||
checkpoint_stager=checkpoint_stager,
|
||||
checkpoint_process=checkpoint_process,
|
||||
reader=self.reader,
|
||||
)
|
||||
|
||||
def test_async_returns_futures(self):
|
||||
"""Test that async save returns futures."""
|
||||
checkpointer = self._create_async_checkpointer()
|
||||
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_futures")
|
||||
|
||||
try:
|
||||
# Save the checkpoint asynchronously
|
||||
result = checkpointer.save(checkpoint_path, self.state_dict)
|
||||
|
||||
# Verify that futures are returned
|
||||
self.assertIsInstance(result, tuple)
|
||||
self.assertEqual(len(result), 2)
|
||||
stage_future, write_future = result
|
||||
self.assertIsInstance(stage_future, Future)
|
||||
self.assertIsInstance(write_future, Future)
|
||||
|
||||
# Wait for completion
|
||||
stage_future.result()
|
||||
write_future.result()
|
||||
|
||||
finally:
|
||||
checkpointer.close()
|
||||
|
||||
    def test_async_sequential_saves_wait(self):
        """Test that sequential async saves wait for previous operations."""
        checkpointer = self._create_async_checkpointer()

        try:
            # First save
            checkpoint_path1 = os.path.join(self.temp_dir, "checkpoint_seq_1")
            stage_future1, write_future1 = checkpointer.save(
                checkpoint_path1, self.state_dict
            )
            )
            self.assertEqual(loaded_state_dict["metadata"]["epoch"], 10)

            # Verify that keys not in the partial_state_dict are not loaded
            self.assertNotIn("layer2", loaded_state_dict["model"])
            self.assertNotIn("step", loaded_state_dict["metadata"])
            # Second save (should wait for first to complete)
            checkpoint_path2 = os.path.join(self.temp_dir, "checkpoint_seq_2")
            modified_state_dict = self.state_dict.copy()
            modified_state_dict["epoch"] = 10
            stage_future2, write_future2 = checkpointer.save(
                checkpoint_path2, modified_state_dict
            )

            # Wait for both to complete
            write_future1.result()
            write_future2.result()

            # Verify both checkpoints were created with correct content
            checkpoint_file1 = os.path.join(
                checkpoint_path1, f"checkpoint_{self.rank_info.global_rank}.pt"
            )
            checkpoint_file2 = os.path.join(
                checkpoint_path2, f"checkpoint_{self.rank_info.global_rank}.pt"
            )

            self.assertTrue(os.path.exists(checkpoint_file1))
            self.assertTrue(os.path.exists(checkpoint_file2))

            loaded1 = torch.load(checkpoint_file1)
            loaded2 = torch.load(checkpoint_file2)

            self.assertEqual(loaded1["epoch"], 5)
            self.assertEqual(loaded2["epoch"], 10)

        finally:
            checkpointer.close()

    def test_async_multiple_saves_ordering(self):
        """Test that multiple async saves maintain proper ordering."""
        checkpointer = self._create_async_checkpointer()

        try:
            # Create multiple state dicts
            state_dicts = [
                {"epoch": 1, "model": torch.nn.Linear(5, 3).state_dict()},
                {"epoch": 2, "model": torch.nn.Linear(5, 3).state_dict()},
                {"epoch": 3, "model": torch.nn.Linear(5, 3).state_dict()},
            ]

            # Save multiple checkpoints
            futures = []
            checkpoint_paths = []
            for i, state_dict in enumerate(state_dicts, 1):
                checkpoint_path = os.path.join(self.temp_dir, f"multi_{i}")
                checkpoint_paths.append(checkpoint_path)
                stage_future, write_future = checkpointer.save(
                    checkpoint_path, state_dict
                )
                futures.append((stage_future, write_future))

            # Wait for all to complete
            for stage_future, write_future in futures:
                write_future.result()

            # Verify all checkpoints exist and have correct content
            for i, checkpoint_path in enumerate(checkpoint_paths, 1):
                checkpoint_file = os.path.join(
                    checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
                )
                self.assertTrue(os.path.exists(checkpoint_file))

                loaded = torch.load(checkpoint_file)
                self.assertEqual(loaded["epoch"], i)

        finally:
            checkpointer.close()

    def test_async_error_handling(self):
        """Test error handling in async operations."""
        # Create checkpointer with mocked components to simulate errors
        mock_stager = Mock()
        mock_process = Mock()
        mock_reader = Mock()

        # Mock staging to return a completed future
        mock_staging_future = Future()
        mock_staging_future.set_result({"staged": "data"})
        mock_stager.stage.return_value = mock_staging_future

        # Mock process write to raise an error
        mock_write_future = Future()
        mock_write_future.set_exception(RuntimeError("Write failed"))
        mock_process.write.return_value = mock_write_future

        checkpointer = AsyncCheckpointer(
            checkpoint_stager=mock_stager,
            checkpoint_process=mock_process,
            reader=mock_reader,
        )

        try:
            # This should not raise immediately
            stage_future, write_future = checkpointer.save("/tmp/test", self.state_dict)

            # But waiting for the write future should raise the error
            with self.assertRaises(RuntimeError) as cm:
                write_future.result()

            self.assertIn("Write failed", str(cm.exception))

        finally:
            checkpointer.close()

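    # Because write errors only surface when the future is resolved, a caller that
    # never wants to block can attach a callback instead.  A hedged sketch (the
    # logger name is illustrative, not part of this test):
    #
    #     stage_future, write_future = checkpointer.save(path, state_dict)
    #
    #     def _log_failure(fut):
    #         exc = fut.exception()
    #         if exc is not None:
    #             logger.error("async checkpoint write failed: %s", exc)
    #
    #     write_future.add_done_callback(_log_failure)
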
    def test_async_future_results(self):
        """Test the results returned by async futures."""
        checkpointer = self._create_async_checkpointer()
        checkpoint_path = os.path.join(self.temp_dir, "checkpoint_results")

        try:
            # Save checkpoint
            stage_future, write_future = checkpointer.save(
                checkpoint_path, self.state_dict
            )

            # Both futures should complete successfully
            stage_result = stage_future.result()
            write_result = write_future.result()

            # Stage result is wrapped by wrap_future() so it returns None on success
            # This is intentional - the stage_future indicates completion, not data access
            self.assertIsNone(stage_result)

            # Write result should be None (success indicator)
            self.assertIsNone(write_result)

        finally:
            checkpointer.close()


if __name__ == "__main__":
@ -25,7 +25,7 @@ from torch.distributed.distributed_c10d import _get_default_group
|
||||
from torch.distributed.tensor import DTensor
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
requires_nccl,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
@ -201,7 +201,7 @@ def _test_pg_transport_with_sharded_tensor(self, device) -> None:
|
||||
torch.testing.assert_close(expected_local_tensor, received_local_tensor)
|
||||
|
||||
|
||||
class PgTransportCPU(MultiProcContinousTest):
|
||||
class PgTransportCPU(MultiProcContinuousTest):
|
||||
world_size = 8
|
||||
timeout: timedelta = timedelta(seconds=20)
|
||||
|
||||
@ -227,7 +227,7 @@ class PgTransportCPU(MultiProcContinousTest):
|
||||
_test_pg_transport_with_sharded_tensor(self, self.device)
|
||||
|
||||
|
||||
class PgTransportCUDA(MultiProcContinousTest):
|
||||
class PgTransportCUDA(MultiProcContinuousTest):
|
||||
world_size = 2
|
||||
timeout: timedelta = timedelta(seconds=20)
|
||||
|
||||
|
||||
@ -31,7 +31,7 @@ from torch.distributed.pipelining.schedules import _PipelineScheduleRuntime
|
||||
from torch.nn.modules.loss import MSELoss
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
requires_nccl,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
@ -199,7 +199,7 @@ def zero_gradients(stage_modules):
|
||||
stage_module.zero_grad()
|
||||
|
||||
|
||||
class ScheduleTest(MultiProcContinousTest):
|
||||
class ScheduleTest(MultiProcContinuousTest):
|
||||
world_size = 4
|
||||
|
||||
@classmethod
|
||||
@ -802,7 +802,7 @@ class ScheduleTest(MultiProcContinousTest):
|
||||
instantiate_parametrized_tests(ScheduleTest)
|
||||
|
||||
|
||||
class CustomSchedulesTest(MultiProcContinousTest):
|
||||
class CustomSchedulesTest(MultiProcContinuousTest):
|
||||
"""
|
||||
These schedules are from the ScheduleRegistry and require world_size == 2
|
||||
The schedules test weird and unconventional schedules for edge cases
|
||||
|
||||
@ -16,7 +16,7 @@ from torch.distributed.pipelining import (
|
||||
from torch.distributed.pipelining._utils import PipeliningShapeError
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
MultiProcessTestCase,
|
||||
requires_nccl,
|
||||
)
|
||||
@ -63,7 +63,7 @@ def get_flatten_hook():
|
||||
return flatten_hook
|
||||
|
||||
|
||||
class StageTest(MultiProcContinousTest):
|
||||
class StageTest(MultiProcContinuousTest):
|
||||
@classmethod
|
||||
def backend_str(cls) -> str:
|
||||
# Testing with NCCL backend
|
||||
|
||||
@ -1,11 +1,15 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
import functools
|
||||
import itertools
|
||||
import random
|
||||
import unittest
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch import nn, Tensor
|
||||
from torch.distributed.device_mesh import init_device_mesh
|
||||
from torch.distributed.tensor import DeviceMesh
|
||||
from torch.distributed.tensor.debug import CommDebugMode
|
||||
@ -22,7 +26,11 @@ from torch.distributed.tensor.experimental._attention import (
|
||||
)
|
||||
from torch.distributed.tensor.parallel import parallelize_module
|
||||
from torch.nn.attention import sdpa_kernel, SDPBackend
|
||||
from torch.nn.attention.flex_attention import create_block_mask, flex_attention
|
||||
from torch.nn.attention.flex_attention import (
|
||||
_mask_mod_signature,
|
||||
create_block_mask,
|
||||
flex_attention,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import (
|
||||
PLATFORM_SUPPORTS_CUDNN_ATTENTION,
|
||||
PLATFORM_SUPPORTS_FLASH_ATTENTION,
|
||||
@ -446,18 +454,94 @@ compiled_create_block_mask = torch.compile(
|
||||
)
|
||||
|
||||
|
||||
def causal_mask(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx


# copied from https://github.com/meta-pytorch/attention-gym/blob/main/attn_gym/masks/document_mask.py
def generate_random_lengths(total_length, num_documents):
    # Initialize all lengths to 1 to ensure each document has at least one token
    lengths = [1] * num_documents
    remaining_length = total_length - num_documents

    # Randomly distribute the remaining length
    for _ in range(remaining_length):
        index = random.randint(0, num_documents - 1)
        lengths[index] += 1

    return lengths


def length_to_offsets(
    lengths: list[list[int]], device: Union[str, torch.device]
) -> Tensor:
    """Converts a list of lengths to a list of offsets.

    Args:
        lengths: A list of lengths.

    """
    offsets = [[0] + lengths_in_batch for lengths_in_batch in lengths]
    offsets = torch.tensor(offsets, device=device, dtype=torch.int32)
    offsets = torch.cumsum(offsets, dim=-1)
    return offsets


def _offsets_to_doc_ids_tensor(offsets):
    doc_ids = []
    device = offsets.device
    for batch_idx in range(offsets.size(0)):
        counts = offsets[batch_idx][1:] - offsets[batch_idx][:-1]
        doc_id = torch.repeat_interleave(
            torch.arange(len(counts), device=device, dtype=torch.int32), counts
        )
        doc_ids.append(doc_id)

    return torch.stack(doc_ids)


def generate_doc_mask_mod(
    mask_mod: _mask_mod_signature, offsets: Tensor
) -> _mask_mod_signature:
    """Generates mask mods that apply to inputs to flex attention in the sequence stacked
    format.

    Args:
        mask_mod: The mask mod to apply to the documents
        offsets: This tensor should be of shape (num_documents + 1);
            it should contain the cumulative counts of document tokens.
            e.g. if you have 3 documents of length 2, 4, 3 then
            offsets = [0, 2, 6, 9]

    Note:
        What is the sequence stacked format? When assembling batches of inputs, we
        take multiple sequences and stack them together to form 1 large sequence. We then
        use masking to ensure that the attention scores are only applied to tokens within
        the same document.
    """
    document_id = _offsets_to_doc_ids_tensor(offsets)

    def doc_mask_mod(b, h, q_idx, kv_idx):
        same_doc = document_id[b][q_idx] == document_id[b][kv_idx]
        q_logical = q_idx - offsets[b, document_id[b, q_idx]]
        kv_logical = kv_idx - offsets[b, document_id[b, kv_idx]]
        inner_mask = mask_mod(b, h, q_logical, kv_logical)
        return same_doc & inner_mask

    return doc_mask_mod
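# A minimal sketch of how these helpers compose, with illustrative values only
# (q, k, v are assumed to already be in the stacked [B, H, SEQ, D] layout):
#
#     lengths = [[2, 4, 3]]                                   # one batch, three documents
#     offsets = length_to_offsets(lengths, "cpu")             # tensor([[0, 2, 6, 9]])
#     doc_mask = generate_doc_mask_mod(causal_mask, offsets)  # causal within each document
#     block_mask = create_block_mask(doc_mask, B=1, H=1, Q_LEN=9, KV_LEN=9, device="cpu")
#     out = flex_attention(q, k, v, block_mask=block_mask)

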
class RingFlexAttentionTest(DTensorTestBase):
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return 2
|
||||
|
||||
def _test_ring_flex_attention(self, qkv_size) -> None:
|
||||
def causal_mask(b, h, q_idx, kv_idx):
|
||||
return q_idx >= kv_idx
|
||||
|
||||
def _test_ring_flex_attention(
|
||||
self, qkv_size, B=1, mask_func=causal_mask, atol=1e-6, rtol=1e-2
|
||||
) -> None:
|
||||
torch.cuda.manual_seed(10)
|
||||
dtype = torch.float32
|
||||
bs = 8
|
||||
bs = B if B > 1 else 8
|
||||
query_tokens = context_tokens = qkv_size
|
||||
dim = 32
|
||||
nheads = 8
|
||||
@ -482,8 +566,8 @@ class RingFlexAttentionTest(DTensorTestBase):
|
||||
)
|
||||
|
||||
block_mask = compiled_create_block_mask(
|
||||
causal_mask,
|
||||
B=1,
|
||||
mask_func,
|
||||
B=B,
|
||||
H=1,
|
||||
Q_LEN=query_tokens,
|
||||
KV_LEN=context_tokens,
|
||||
@ -531,8 +615,8 @@ class RingFlexAttentionTest(DTensorTestBase):
|
||||
|
||||
# NOTE: call create_block_mask() within TorchFunctionMode would cause error in create_fw_bw_graph
|
||||
cp_block_mask = create_cp_block_mask(
|
||||
causal_mask,
|
||||
B=1,
|
||||
mask_func,
|
||||
B=B,
|
||||
H=1,
|
||||
Q_LEN=query_tokens,
|
||||
KV_LEN=context_tokens,
|
||||
@ -574,8 +658,8 @@ class RingFlexAttentionTest(DTensorTestBase):
|
||||
|
||||
# unshard the output
|
||||
cp_out, cp_lse = context_parallel_unshard(device_mesh, [cp_out, cp_lse], [2, 2])
|
||||
torch.testing.assert_close(cp_out, expect_out, atol=1e-6, rtol=1e-2)
|
||||
torch.testing.assert_close(cp_lse, expect_lse, atol=1e-6, rtol=1e-2)
|
||||
torch.testing.assert_close(cp_out, expect_out, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(cp_lse, expect_lse, atol=atol, rtol=rtol)
|
||||
|
||||
# unshard the gradient
|
||||
cp_q_grad, cp_k_grad, cp_v_grad = context_parallel_unshard(
|
||||
@ -583,9 +667,9 @@ class RingFlexAttentionTest(DTensorTestBase):
|
||||
[cp_q.grad, cp_k.grad, cp_v.grad],
|
||||
[2, 2, 2],
|
||||
)
|
||||
torch.testing.assert_close(cp_q_grad, q.grad, atol=1e-6, rtol=1e-2)
|
||||
torch.testing.assert_close(cp_k_grad, k.grad, atol=1e-6, rtol=1e-2)
|
||||
torch.testing.assert_close(cp_v_grad, v.grad, atol=1e-6, rtol=1e-2)
|
||||
torch.testing.assert_close(cp_q_grad, q.grad, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(cp_k_grad, k.grad, atol=atol, rtol=rtol)
|
||||
torch.testing.assert_close(cp_v_grad, v.grad, atol=atol, rtol=rtol)
|
||||
|
||||
# reset CP context dispatch mode to default
|
||||
torch.distributed.tensor.experimental._attention._dispatch_mode = (
|
||||
@ -607,6 +691,53 @@ class RingFlexAttentionTest(DTensorTestBase):
|
||||
self._test_ring_flex_attention,
|
||||
)
|
||||
|
||||
    # TODO: merge with the above test
    @skip_if_lt_x_gpu(2)
    @with_comms
    def test_ring_flex_attention_document_mask(self) -> None:
        random.seed(10)

        # NOTE: Each (batch_size, seq_len) tuple introduces 2 create_block_mask
        # compilations: 1 for single-rank flex_attention and 1 for CP flex_attention.
        # In order to avoid the "exceeds_recompile_limit" error, we need to increase
        # the cache_size_limit to 12, which is the total number of compilations in our
        # test case (3 batch sizes x 2 sequence lengths x 2 compilations each).
        torch._dynamo.config.cache_size_limit = 12

        # initialize document mask
        doc_count = 28
        batch_size_list = [2, 4, 8]
        max_seq_len_list = [
            256 * self.world_size,
            2048,
            # 128 * self.world_size # NOTE: Mismatched elements: 8 / 131072 (0.0%),
        ]

        # TODO: change this for-loop to run_subtests
        # Use a for-loop instead of run_subtests because we need to initialize the mask
        # for each subtest. This can be baked into self._test_ring_flex_attention as
        # a str argument denoting mask type.
        for batch_size, max_seq_len in itertools.product(
            batch_size_list, max_seq_len_list
        ):
            lengths = [
                generate_random_lengths(max_seq_len, doc_count)
                for _ in range(batch_size)
            ]
            offsets = length_to_offsets(lengths, self.device_type)
            document_causal_mask = generate_doc_mask_mod(causal_mask, offsets)

            # construct testing function
            test_func = functools.partial(
                self._test_ring_flex_attention,
                qkv_size=max_seq_len,
                B=batch_size,
                mask_func=document_causal_mask,
                atol=1e-6,
            )

            test_func()


if __name__ == "__main__":
    run_tests()

@ -25,7 +25,7 @@ import torch.distributed as dist
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU
|
||||
from torch.testing._internal.common_distributed import (
|
||||
init_multigpu_helper,
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
requires_nccl,
|
||||
requires_nccl_version,
|
||||
sm_is_or_higher_than,
|
||||
@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN:
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
class ProcessGroupNCCLOpTest(MultiProcContinousTest):
|
||||
class ProcessGroupNCCLOpTest(MultiProcContinuousTest):
|
||||
@classmethod
|
||||
def backend_str(cls) -> str:
|
||||
return "nccl"
|
||||
|
||||
@ -19,7 +19,7 @@ from torch.distributed.tensor import DTensor
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
requires_nccl,
|
||||
skip_if_lt_x_gpu,
|
||||
)
|
||||
@ -91,7 +91,7 @@ def loss_fn(y, target, scale=1e-4):
|
||||
return torch.nn.functional.cross_entropy(y, target) * scale
|
||||
|
||||
|
||||
class ComposabilityTest(MultiProcContinousTest):
|
||||
class ComposabilityTest(MultiProcContinuousTest):
|
||||
@classmethod
|
||||
def backend_str(cls) -> str:
|
||||
# Testing with NCCL backend
|
||||
|
||||
@ -7,7 +7,7 @@ from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
from torch.multiprocessing.reductions import reduce_tensor
|
||||
from torch.testing._internal.common_distributed import MultiProcContinousTest
|
||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||
from torch.testing._internal.common_utils import (
|
||||
requires_cuda_p2p_access,
|
||||
run_tests,
|
||||
@ -46,7 +46,7 @@ def from_buffer(
|
||||
|
||||
|
||||
@requires_cuda_p2p_access()
|
||||
class CupyAsTensorTest(MultiProcContinousTest):
|
||||
class CupyAsTensorTest(MultiProcContinuousTest):
|
||||
@classmethod
|
||||
def backend_str(cls):
|
||||
return "gloo"
|
||||
|
||||
@ -1580,14 +1580,65 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
|
||||
# We want to make sure no unnecessary copy is made.
|
||||
(
|
||||
FileCheck()
|
||||
.check("= torch.ops._c10d_functional.all_gather_into_tensor")
|
||||
.check("torch.ops._c10d_functional.all_gather_into_tensor_out.default(")
|
||||
.check("= torch.ops._c10d_functional.all_gather_into_tensor")
|
||||
.check_count(".all_gather_into_tensor_out.default(", 2, exactly=True)
|
||||
.run(code)
|
||||
)
|
||||
out = compiled(*inputs, **self.get_world_trs())
|
||||
assert same(out, correct), f"{out} va {correct}"
|
||||
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@unittest.skipIf(not SM80OrLater, "bfloat16")
|
||||
def test_all_gather_bucket_path(self):
|
||||
def func(x, w, ag_0, ag_1, *, tag, ranks, group_size):
|
||||
# do some unrelated matmuls
|
||||
y = torch.mm(x, w)
|
||||
|
||||
# cast the inputs
|
||||
ag_0_cast = ag_0.to(torch.bfloat16)
|
||||
ag_1_cast = ag_1.to(torch.bfloat16)
|
||||
|
||||
# first allgather
|
||||
group_name = (
|
||||
torch.distributed.distributed_c10d._get_default_group().group_name
|
||||
)
|
||||
ag_0_out = torch.ops._c10d_functional.all_gather_into_tensor(
|
||||
ag_0_cast, group_size, group_name
|
||||
)
|
||||
ag_0_out = torch.ops.c10d_functional.wait_tensor(ag_0_out)
|
||||
ag_0_out = ag_0_out * 2
|
||||
|
||||
# Create dependency: second allgather input depends on first allgather output
|
||||
# This prevents fusion of the two allgather operations
|
||||
ag_1_modified = (
|
||||
ag_1_cast + ag_0_out[: ag_1_cast.shape[0]]
|
||||
) # Use part of ag_0_out
|
||||
|
||||
# second allgather (now depends on the first one)
|
||||
ag_1_out = torch.ops._c10d_functional.all_gather_into_tensor(
|
||||
ag_1_modified, group_size, group_name
|
||||
)
|
||||
ag_1_out = torch.ops.c10d_functional.wait_tensor(ag_1_out)
|
||||
|
||||
return y, ag_0_out, ag_1_out
|
||||
|
||||
x = torch.ones(4, 384, device="cuda", dtype=torch.float32)
|
||||
w = torch.ones(384, 512, device="cuda", dtype=torch.float32)
|
||||
ag_0 = torch.ones(384, 512, device="cuda", dtype=torch.float32)
|
||||
ag_1 = torch.ones(384, 512, device="cuda", dtype=torch.float32)
|
||||
inputs = [x, w, ag_0, ag_1]
|
||||
|
||||
with torch._inductor.config.patch(
|
||||
{
|
||||
"bucket_all_gathers_fx": "all",
|
||||
"reorder_for_compute_comm_overlap": False,
|
||||
}
|
||||
):
|
||||
compiled = torch.compile(func)
|
||||
code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
|
||||
|
||||
# shouldnt have bucketed
|
||||
FileCheck().check_count("wait_tensor.default(", 2, exactly=True).run(code)
|
||||
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@unittest.skipIf(not SM80OrLater, "bfloat16")
|
||||
def test_reduce_scatter_bucket(self):
|
||||
|
||||
@ -14,7 +14,7 @@ from torch.testing._internal.common_device_type import (
|
||||
instantiate_device_type_tests,
|
||||
)
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
skip_if_lt_x_gpu,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
@ -246,7 +246,7 @@ class TestNCCL(TestCase):
|
||||
|
||||
|
||||
@requires_cuda_p2p_access()
|
||||
class NCCLSymmetricMemoryTest(MultiProcContinousTest):
|
||||
class NCCLSymmetricMemoryTest(MultiProcContinuousTest):
|
||||
@property
|
||||
def device(self) -> torch.device:
|
||||
return torch.device("cuda", self.rank)
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.distributed._symmetric_memory as symm_mem
|
||||
from torch.testing._internal.common_distributed import MultiProcContinousTest
|
||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
@ -33,7 +33,7 @@ device_module = torch.get_device_module(device_type)
|
||||
|
||||
@requires_nvshmem()
|
||||
@requires_cuda_p2p_access()
|
||||
class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
|
||||
class NVSHMEMSymmetricMemoryTest(MultiProcContinuousTest):
|
||||
def _init_device(self) -> None:
|
||||
# TODO: relieve this (seems to hang if without)
|
||||
device_module.set_device(self.device)
|
||||
@ -65,6 +65,48 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
|
||||
out = symm_mem.empty(numel, dtype=dtype, device=self.device)
|
||||
symm_mem.rendezvous(out, group=group_name)
|
||||
|
||||
@skipIfRocm
|
||||
def test_rendezvous_slice(self) -> None:
|
||||
# Rendezvous a slice of a tensor
|
||||
self._init_device()
|
||||
group_name = dist.group.WORLD.group_name
|
||||
symm_mem.enable_symm_mem_for_group(group_name)
|
||||
|
||||
x = symm_mem.empty((2, 1024), device=self.device)
|
||||
# Directly rendezvousing a slice should not fail
|
||||
hdls = [symm_mem.rendezvous(y, group=group_name) for y in torch.chunk(x, 2)]
|
||||
# Assert that handles are not the same
|
||||
self.assertIsNot(hdls[0], hdls[1])
|
||||
|
||||
@skipIfRocm
|
||||
def test_rendezvous_view(self) -> None:
|
||||
# Rendezvous a view of a tensor
|
||||
self._init_device()
|
||||
group_name = dist.group.WORLD.group_name
|
||||
symm_mem.enable_symm_mem_for_group(group_name)
|
||||
|
||||
x = symm_mem.empty(1024, device=self.device)
|
||||
y = x.view(32, 32)
|
||||
# Directly rendezvousing a view should not fail
|
||||
hdl_y = symm_mem.rendezvous(y, group=group_name)
|
||||
|
||||
# Assert that view's handle is not the same as the original tensor's handle
|
||||
hdl_x = symm_mem.rendezvous(x, group=group_name)
|
||||
self.assertIsNot(hdl_x, hdl_y)
|
||||
|
||||
@skipIfRocm
|
||||
def test_rendezvous_same(self) -> None:
|
||||
# Rendezvous same tensor multiple times
|
||||
self._init_device()
|
||||
group_name = dist.group.WORLD.group_name
|
||||
symm_mem.enable_symm_mem_for_group(group_name)
|
||||
|
||||
x = symm_mem.empty(1024, device=self.device)
|
||||
hdl_0 = symm_mem.rendezvous(x, group=group_name)
|
||||
hdl_1 = symm_mem.rendezvous(x, group=group_name)
|
||||
# The handle should point to the same object
|
||||
self.assertIs(hdl_0, hdl_1)
|
||||
|
||||
@skipIfRocm
|
||||
def test_nvshmem_put(self) -> None:
|
||||
self._init_device()
|
||||
@ -117,7 +159,7 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
|
||||
@instantiate_parametrized_tests
|
||||
@requires_nvshmem()
|
||||
@requires_cuda_p2p_access()
|
||||
class NVSHMEMAll2AllTest(MultiProcContinousTest):
|
||||
class NVSHMEMAll2AllTest(MultiProcContinuousTest):
|
||||
def _init_device(self) -> None:
|
||||
# TODO: relieve this (seems to hang if without)
|
||||
device_module.set_device(self.device)
|
||||
|
||||
@ -9,7 +9,7 @@ import torch.distributed as dist
|
||||
import torch.distributed._symmetric_memory as symm_mem
|
||||
import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
|
||||
from torch._inductor.runtime.triton_compat import triton
|
||||
from torch.testing._internal.common_distributed import MultiProcContinousTest
|
||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
@ -246,7 +246,7 @@ def nvshmem_reduce_kernel(
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
@requires_nvshmem()
|
||||
class NVSHMEMTritonTest(MultiProcContinousTest):
|
||||
class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||
def _init_device(self) -> None:
|
||||
# TODO: relieve this (seems to hang if without)
|
||||
device_module.set_device(self.device)
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
|
||||
import torch
|
||||
from torch.multiprocessing.reductions import reduce_tensor
|
||||
from torch.testing._internal.common_distributed import MultiProcContinousTest
|
||||
from torch.testing._internal.common_distributed import MultiProcContinuousTest
|
||||
from torch.testing._internal.common_utils import (
|
||||
requires_cuda_p2p_access,
|
||||
run_tests,
|
||||
@ -20,7 +20,7 @@ device_module = torch.get_device_module(device_type)
|
||||
|
||||
|
||||
@requires_cuda_p2p_access()
|
||||
class P2PIpcTest(MultiProcContinousTest):
|
||||
class P2PIpcTest(MultiProcContinuousTest):
|
||||
@classmethod
|
||||
def backend_str(cls):
|
||||
return "gloo"
|
||||
|
||||
@ -24,7 +24,7 @@ from torch.distributed._symmetric_memory import (
|
||||
from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater
|
||||
from torch.testing._internal.common_device_type import e4m3_type
|
||||
from torch.testing._internal.common_distributed import (
|
||||
MultiProcContinousTest,
|
||||
MultiProcContinuousTest,
|
||||
MultiProcessTestCase,
|
||||
requires_multicast_support,
|
||||
skip_if_lt_x_gpu,
|
||||
@ -52,7 +52,7 @@ device_module = torch.get_device_module(device_type)
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
@requires_cuda_p2p_access()
|
||||
class SymmetricMemoryTest(MultiProcContinousTest):
|
||||
class SymmetricMemoryTest(MultiProcContinuousTest):
|
||||
@property
|
||||
def device(self) -> torch.device:
|
||||
return torch.device(device_type, self.rank)
|
||||
@ -636,7 +636,7 @@ class SymmetricMemoryTest(MultiProcContinousTest):
|
||||
|
||||
# This Test class is used to test the error handling of SymmetricMemory APIs.
|
||||
# Since a process restart is often needed after each test, we use the
|
||||
# MultiProcessTestCase instead of MultiProcContinousTest.
|
||||
# MultiProcessTestCase instead of MultiProcContinuousTest.
|
||||
@requires_cuda_p2p_access()
|
||||
class SymmMemNegativeTest(MultiProcessTestCase):
|
||||
def setUp(self) -> None:
|
||||
@ -746,7 +746,7 @@ class SymmMemNegativeTest(MultiProcessTestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
@requires_cuda_p2p_access()
|
||||
class SymmMemCollectiveTest(MultiProcContinousTest):
|
||||
class SymmMemCollectiveTest(MultiProcContinuousTest):
|
||||
@property
|
||||
def device(self) -> torch.device:
|
||||
return torch.device(device_type, self.rank)
|
||||
@ -993,7 +993,7 @@ class SymmMemCollectiveTest(MultiProcContinousTest):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
@requires_cuda_p2p_access()
|
||||
class LoweringTest(MultiProcContinousTest):
|
||||
class LoweringTest(MultiProcContinuousTest):
|
||||
def _init_process(self) -> None:
|
||||
torch.cuda.set_device(self.device)
|
||||
enable_symm_mem_for_group(dist.group.WORLD.group_name)
|
||||
|
||||
@ -292,6 +292,56 @@ class AOTAutogradCacheTests(InductorTestCase):
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
|
||||
|
||||
@inductor_config.patch("fx_graph_remote_cache", False)
|
||||
@inductor_config.patch("fx_graph_cache", True)
|
||||
@functorch_config.patch({"enable_autograd_cache": True})
|
||||
def test_vmap(self):
    """
    Make sure functions that use torch.vmap compile, save to the AOTAutograd
    cache on the first call, and hit the cache on a subsequent call.
    """
|
||||
def fn(x, y):
|
||||
f = lambda x, y: (x * y + 1).sum(dim=0) # noqa: E731
|
||||
vmapped = torch.vmap(f)(x, y)
|
||||
return vmapped.sum(dim=0)
|
||||
|
||||
x = torch.randn(25, requires_grad=True)
|
||||
y = torch.randn(25, requires_grad=True)
|
||||
x2 = x.detach().clone().requires_grad_(True)
|
||||
y2 = y.detach().clone().requires_grad_(True)
|
||||
|
||||
compiled_fn = torch.compile(fn, backend="inductor")
|
||||
|
||||
# A first call should miss in the cache.
|
||||
self.assertEqual(fn(x, y), compiled_fn(x2, y2))
|
||||
fn(x, y).sum().backward()
|
||||
compiled_fn(x2, y2).sum().backward()
|
||||
self.assertEqual(x.grad, x2.grad)
|
||||
self.assertEqual(y.grad, y2.grad)
|
||||
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
|
||||
|
||||
# Reset all tensors
|
||||
x = torch.randn(25, requires_grad=True)
|
||||
y = torch.randn(25, requires_grad=True)
|
||||
x2 = x.detach().clone().requires_grad_(True)
|
||||
y2 = y.detach().clone().requires_grad_(True)
|
||||
|
||||
# A second call should hit. (First reset so in-memory guards
|
||||
# don't prevent compilation).
|
||||
self._clear_dynamo_and_codecache()
|
||||
self.assertEqual(fn(x, y), compiled_fn(x2, y2))
|
||||
fn(x, y).sum().backward()
|
||||
compiled_fn(x2, y2).sum().backward()
|
||||
self.assertEqual(x.grad, x2.grad)
|
||||
self.assertEqual(y.grad, y2.grad)
|
||||
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
|
||||
|
||||
@inductor_config.patch("fx_graph_remote_cache", False)
|
||||
@inductor_config.patch("fx_graph_cache", True)
|
||||
@functorch_config.patch({"enable_autograd_cache": True})
|
||||
|
||||
@ -284,7 +284,7 @@ def fn():
|
||||
def nothing(*args):
|
||||
pass
|
||||
|
||||
code = bytecode_transformation.transform_code_object(fn.__code__, nothing)
|
||||
code, _ = bytecode_transformation.transform_code_object(fn.__code__, nothing)
|
||||
self.assertEqual(code.co_exceptiontable, fn.__code__.co_exceptiontable)
|
||||
|
||||
@skipIfNotPy311
|
||||
@ -300,7 +300,7 @@ def fn():
|
||||
def nothing(*args):
|
||||
pass
|
||||
|
||||
code = bytecode_transformation.transform_code_object(fn.__code__, nothing)
|
||||
code, _ = bytecode_transformation.transform_code_object(fn.__code__, nothing)
|
||||
self.assertEqual(code.co_exceptiontable, fn.__code__.co_exceptiontable)
|
||||
|
||||
@skipIfNotPy311
|
||||
|
||||
@ -13,6 +13,7 @@ if dist.is_available():
|
||||
all_to_all_single_autograd,
|
||||
wait_tensor,
|
||||
)
|
||||
from torch.distributed.device_mesh import init_device_mesh
|
||||
from torch.testing._internal.distributed.fake_pg import FakeStore
|
||||
|
||||
|
||||
@ -26,6 +27,8 @@ class TestFakeDistributed(DynamoTestCase):
|
||||
# Use FakeProcessGroup to run tests on a single process
|
||||
self.store = FakeStore()
|
||||
dist.init_process_group(backend="fake", rank=0, world_size=2, store=self.store)
|
||||
self.local_rank = 0
|
||||
self.world_size = 2
|
||||
|
||||
def tearDown(self):
|
||||
dist.destroy_process_group()
|
||||
@ -115,6 +118,23 @@ class GraphModule(torch.nn.Module):
|
||||
""", # noqa: B950
|
||||
)
|
||||
|
||||
def test_device_mesh_get_local_rank(self):
|
||||
device_mesh = init_device_mesh(
|
||||
device_type="cpu",
|
||||
mesh_shape=(self.world_size,),
|
||||
mesh_dim_names=("dp",), # data parallel dimension
|
||||
)
|
||||
|
||||
@torch.compile(backend="eager", fullgraph=True)
|
||||
def fn(x):
|
||||
local_rank = device_mesh.get_local_rank()
|
||||
global_rank = device_mesh.get_rank()
|
||||
return x + local_rank + global_rank
|
||||
|
||||
x = torch.ones(10)
|
||||
res = fn(x)
|
||||
self.assertEqual(res, x)
|
||||
|
||||
|
||||
instantiate_parametrized_tests(TestFakeDistributed)
|
||||
|
||||
|
||||
@ -3084,29 +3084,29 @@ def forward(self, L_a_ : torch.SymInt, L_b_ : torch.SymInt, L_c_ : torch.SymInt,
|
||||
b = torch.arange(l_b_)
|
||||
c = torch.arange(l_c_)
|
||||
d = torch.arange(l_d_)
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(l_d_, 'error'); _vmap_increment_nesting = None
|
||||
child = torch._C._functorch._add_batch_dim(d, 0, 1); d = None
|
||||
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(l_c_, 'error'); _vmap_increment_nesting_1 = None
|
||||
child_1 = torch._C._functorch._add_batch_dim(c, 0, 2); c = None
|
||||
lazy_load_decompositions_2 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_2 = None
|
||||
_vmap_increment_nesting_2 = torch._C._functorch._vmap_increment_nesting(l_b_, 'error'); _vmap_increment_nesting_2 = None
|
||||
child_2 = torch._C._functorch._add_batch_dim(b, 0, 3); b = None
|
||||
lazy_load_decompositions_3 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_3 = None
|
||||
_vmap_increment_nesting_3 = torch._C._functorch._vmap_increment_nesting(l_a_, 'error'); _vmap_increment_nesting_3 = None
|
||||
_add_batch_dim_3 = torch._C._functorch._add_batch_dim(a, 0, 4); a = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(l_d_, 'error'); _vmap_increment_nesting = None
|
||||
child = torch._functorch.predispatch._add_batch_dim(d, 0, 1); d = None
|
||||
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(l_c_, 'error'); _vmap_increment_nesting_1 = None
|
||||
child_1 = torch._functorch.predispatch._add_batch_dim(c, 0, 2); c = None
|
||||
lazy_load_decompositions_2 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_2 = None
|
||||
_vmap_increment_nesting_2 = torch._functorch.predispatch._vmap_increment_nesting(l_b_, 'error'); _vmap_increment_nesting_2 = None
|
||||
child_2 = torch._functorch.predispatch._add_batch_dim(b, 0, 3); b = None
|
||||
lazy_load_decompositions_3 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_3 = None
|
||||
_vmap_increment_nesting_3 = torch._functorch.predispatch._vmap_increment_nesting(l_a_, 'error'); _vmap_increment_nesting_3 = None
|
||||
_add_batch_dim_3 = torch._functorch.predispatch._add_batch_dim(a, 0, 4); a = None
|
||||
add = _add_batch_dim_3 + child_2; _add_batch_dim_3 = child_2 = None
|
||||
add_1 = add + child_1; add = child_1 = None
|
||||
batched_outputs = add_1 + child; add_1 = child = None
|
||||
batched_outputs_1 = torch._C._functorch._remove_batch_dim(batched_outputs, 4, l_a_, 0); batched_outputs = l_a_ = None
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
batched_outputs_2 = torch._C._functorch._remove_batch_dim(batched_outputs_1, 3, l_b_, 0); batched_outputs_1 = l_b_ = None
|
||||
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
batched_outputs_3 = torch._C._functorch._remove_batch_dim(batched_outputs_2, 2, l_c_, 0); batched_outputs_2 = l_c_ = None
|
||||
_vmap_decrement_nesting_2 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_2 = None
|
||||
_remove_batch_dim_3 = torch._C._functorch._remove_batch_dim(batched_outputs_3, 1, l_d_, 0); batched_outputs_3 = l_d_ = None
|
||||
_vmap_decrement_nesting_3 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_3 = None
|
||||
batched_outputs_1 = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 4, l_a_, 0); batched_outputs = l_a_ = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
batched_outputs_2 = torch._functorch.predispatch._remove_batch_dim(batched_outputs_1, 3, l_b_, 0); batched_outputs_1 = l_b_ = None
|
||||
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
batched_outputs_3 = torch._functorch.predispatch._remove_batch_dim(batched_outputs_2, 2, l_c_, 0); batched_outputs_2 = l_c_ = None
|
||||
_vmap_decrement_nesting_2 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_2 = None
|
||||
_remove_batch_dim_3 = torch._functorch.predispatch._remove_batch_dim(batched_outputs_3, 1, l_d_, 0); batched_outputs_3 = l_d_ = None
|
||||
_vmap_decrement_nesting_3 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_3 = None
|
||||
return (_remove_batch_dim_3,)""", # noqa: B950
|
||||
)
|
||||
|
||||
@ -3739,11 +3739,11 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
child: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
|
||||
child_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
|
||||
|
||||
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
|
||||
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
|
||||
@ -3786,18 +3786,18 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
basis: "f32[12, 4, 3]" = chunk_1.view(12, 4, 3); chunk_1 = None
|
||||
|
||||
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
|
||||
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
|
||||
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
|
||||
|
||||
_add_batch_dim_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(basis, 0, 3); basis = None
|
||||
_add_batch_dim_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 3); basis = None
|
||||
|
||||
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim_1], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim_1 = None
|
||||
batched_outputs: "f32[4, 3]" = _autograd_grad[0]; _autograd_grad = None
|
||||
|
||||
chunked_result: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 3, 12, 0); batched_outputs = None
|
||||
chunked_result: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 3, 12, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
split = chunked_result.split((12,), dim = 0); chunked_result = None
|
||||
split_1: "f32[12, 4, 3]" = split[0]; split = None
|
||||
@ -3816,9 +3816,9 @@ class GraphModule(torch.nn.Module):
|
||||
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
|
||||
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
|
||||
|
||||
results_1: "f32[12, 4, 3, 4, 3]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
results_1: "f32[12, 4, 3, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
|
||||
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
|
||||
movedim: "f32[4, 3, 4, 3, 12]" = results_1.movedim(0, -1); results_1 = None
|
||||
split_2 = movedim.split((12,), dim = -1); movedim = None
|
||||
@ -3867,11 +3867,11 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
child: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child_1: "f32[3, 4]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
|
||||
child_1: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
|
||||
|
||||
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
|
||||
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
|
||||
@ -3916,18 +3916,18 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
basis: "f32[12, 4, 3]" = chunk_1.view(12, 4, 3); chunk_1 = None
|
||||
|
||||
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
|
||||
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
|
||||
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
|
||||
|
||||
_add_batch_dim_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(basis, 0, 3); basis = None
|
||||
_add_batch_dim_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 3); basis = None
|
||||
|
||||
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [child_4], [_add_batch_dim_1], retain_graph = True, create_graph = True); primals_out = child_4 = _add_batch_dim_1 = None
|
||||
child_5: "f32[3, 4]" = _autograd_grad[0]; _autograd_grad = None
|
||||
|
||||
child_6: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(child_5, 3, 12, 0); child_5 = None
|
||||
child_6: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(child_5, 3, 12, 0); child_5 = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
split = child_6.split((12,), dim = 0); child_6 = None
|
||||
split_1: "f32[12, 3, 4]" = split[0]; split = None
|
||||
@ -3947,9 +3947,9 @@ class GraphModule(torch.nn.Module):
|
||||
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
|
||||
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
|
||||
|
||||
child_10: "f32[12, 4, 3, 3, 4]" = torch._C._functorch._remove_batch_dim(child_9, 1, 12, 0); child_9 = None
|
||||
child_10: "f32[12, 4, 3, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(child_9, 1, 12, 0); child_9 = None
|
||||
|
||||
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
|
||||
movedim: "f32[4, 3, 3, 4, 12]" = child_10.movedim(0, -1); child_10 = None
|
||||
split_2 = movedim.split((12,), dim = -1); movedim = None
|
||||
@ -4014,18 +4014,18 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
basis: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(basis, 0, 1); basis = None
|
||||
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 1); basis = None
|
||||
|
||||
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim = None
|
||||
batched_outputs: "f32[4, 3]" = _autograd_grad[0]; _autograd_grad = None
|
||||
|
||||
chunked_result: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
|
||||
chunked_result: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
split = chunked_result.split((12,), dim = 0); chunked_result = None
|
||||
split_1: "f32[12, 4, 3]" = split[0]; split = None
|
||||
@ -4092,18 +4092,18 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
basis: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 4]" = torch._C._functorch._add_batch_dim(basis, 0, 1); basis = None
|
||||
_add_batch_dim: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 1); basis = None
|
||||
|
||||
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim = None
|
||||
batched_outputs: "f32[3, 4]" = _autograd_grad[0]; _autograd_grad = None
|
||||
|
||||
chunked_result: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
|
||||
chunked_result: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
split = chunked_result.split((12,), dim = 0); chunked_result = None
|
||||
split_1: "f32[12, 3, 4]" = split[0]; split = None
|
||||
@ -4172,18 +4172,18 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
basis: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 4]" = torch._C._functorch._add_batch_dim(basis, 0, 1); basis = None
|
||||
_add_batch_dim: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 1); basis = None
|
||||
|
||||
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim = None
|
||||
batched_outputs: "f32[3, 4]" = _autograd_grad[0]; _autograd_grad = None
|
||||
|
||||
chunked_result: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
|
||||
chunked_result: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
split = chunked_result.split((12,), dim = 0); chunked_result = None
|
||||
split_1: "f32[12, 3, 4]" = split[0]; split = None
|
||||
@ -5229,11 +5229,11 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
child: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
|
||||
child_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
|
||||
|
||||
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
|
||||
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
|
||||
@ -5259,9 +5259,9 @@ class GraphModule(torch.nn.Module):
|
||||
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
|
||||
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
|
||||
|
||||
results: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
results: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
movedim: "f32[4, 3, 12]" = results.movedim(0, -1); results = None
|
||||
split = movedim.split((12,), dim = -1); movedim = None
|
||||
@ -5310,11 +5310,11 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
child: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child_1: "f32[3, 4]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
|
||||
child_1: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
|
||||
|
||||
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
|
||||
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
|
||||
@ -5341,9 +5341,9 @@ class GraphModule(torch.nn.Module):
|
||||
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
|
||||
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
|
||||
|
||||
results: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
results: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
movedim: "f32[3, 4, 12]" = results.movedim(0, -1); results = None
|
||||
split = movedim.split((12,), dim = -1); movedim = None
|
||||
@ -5392,11 +5392,11 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
child: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child_1: "f32[3, 4]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
|
||||
child_1: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
|
||||
|
||||
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
|
||||
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
|
||||
@ -5425,10 +5425,10 @@ class GraphModule(torch.nn.Module):
|
||||
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
|
||||
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
|
||||
|
||||
results: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
aux_2: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(aux_1, 1, 12, 0); aux_1 = None
|
||||
results: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
|
||||
aux_2: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(aux_1, 1, 12, 0); aux_1 = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
aux_3: "f32[4, 3]" = aux_2[0]; aux_2 = None
|
||||
|
||||
@ -5479,11 +5479,11 @@ class GraphModule(torch.nn.Module):
|
||||
|
||||
child: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'same'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'same'); _vmap_increment_nesting = None
|
||||
|
||||
child_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
|
||||
child_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
|
||||
|
||||
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
|
||||
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
|
||||
@ -5517,10 +5517,10 @@ class GraphModule(torch.nn.Module):
|
||||
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
|
||||
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
|
||||
|
||||
child_8: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(child_6, 1, 12, 0); child_6 = None
|
||||
child_9: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(child_7, 1, 12, 0); child_7 = None
|
||||
child_8: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(child_6, 1, 12, 0); child_6 = None
|
||||
child_9: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(child_7, 1, 12, 0); child_7 = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
movedim: "f32[3, 4, 12]" = child_8.movedim(0, -1); child_8 = None
|
||||
split = movedim.split((12,), dim = -1); movedim = None
|
||||
@ -6260,19 +6260,19 @@ class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[3, 3, 3]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
|
||||
sum_1: "f32[3]" = _add_batch_dim.sum(0)
|
||||
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
batched_outputs: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim,)
|
||||
""",
|
||||
)
|
||||
@ -6298,20 +6298,20 @@ class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[3, 3, 3]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
|
||||
sum_1: "f32[3]" = _add_batch_dim.sum(0)
|
||||
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
|
||||
batched_outputs: "f32[3]" = add + 3; add = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim,)
|
||||
""",
|
||||
)
|
||||
@ -6338,20 +6338,20 @@ class GraphModule(torch.nn.Module):
|
||||
l_x_ = L_x_
|
||||
l_y_ = L_y_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
|
||||
sum_1: "f32[3]" = _add_batch_dim.sum(0)
|
||||
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
|
||||
batched_outputs: "f32[3, 3]" = add + l_y_; add = l_y_ = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
_remove_batch_dim: "f32[3, 3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim,)
|
||||
""",
|
||||
)
|
||||
@ -6379,21 +6379,21 @@ class GraphModule(torch.nn.Module):
|
||||
l_x_ = L_x_
|
||||
l_y_ = L_y_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim_1: "f32[3]" = torch._C._functorch._add_batch_dim(l_y_, 1, 1); l_y_ = None
|
||||
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim_1: "f32[3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 1, 1); l_y_ = None
|
||||
|
||||
sum_1: "f32[3]" = _add_batch_dim.sum(0)
|
||||
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
|
||||
batched_outputs: "f32[3]" = add + _add_batch_dim_1; add = _add_batch_dim_1 = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim,)
|
||||
""",
|
||||
)
|
||||
@ -6423,21 +6423,21 @@ class GraphModule(torch.nn.Module):
|
||||
l_x_ = L_x_
|
||||
l_y_ = L_y_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim_1: "f32[3]" = torch._C._functorch._add_batch_dim(l_y_, 1, 1); l_y_ = None
|
||||
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim_1: "f32[3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 1, 1); l_y_ = None
|
||||
|
||||
sum_1: "f32[3]" = _add_batch_dim.sum(0)
|
||||
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
|
||||
batched_outputs: "f32[3]" = add + _add_batch_dim_1; add = _add_batch_dim_1 = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim,)
|
||||
""",
|
||||
)
|
||||
@ -6463,29 +6463,29 @@ class GraphModule(torch.nn.Module):
|
||||
l_x_ = L_x_
|
||||
l_y_ = L_y_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
child_1: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_y_, 0, 1); l_y_ = None
|
||||
child: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
child_1: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 0, 1); l_y_ = None
|
||||
|
||||
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
|
||||
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
|
||||
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
|
||||
|
||||
_add_batch_dim_2: "f32[3]" = torch._C._functorch._add_batch_dim(child, 1, 2); child = None
|
||||
_add_batch_dim_3: "f32[3]" = torch._C._functorch._add_batch_dim(child_1, 1, 2); child_1 = None
|
||||
_add_batch_dim_2: "f32[3]" = torch._functorch.predispatch._add_batch_dim(child, 1, 2); child = None
|
||||
_add_batch_dim_3: "f32[3]" = torch._functorch.predispatch._add_batch_dim(child_1, 1, 2); child_1 = None
|
||||
|
||||
batched_outputs: "f32[3]" = _add_batch_dim_2 + _add_batch_dim_3; _add_batch_dim_2 = _add_batch_dim_3 = None
|
||||
|
||||
batched_outputs_1: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
|
||||
batched_outputs_1: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
_remove_batch_dim_1: "f32[3, 3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs_1, 1, 3, 0); batched_outputs_1 = None
|
||||
_remove_batch_dim_1: "f32[3, 3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs_1, 1, 3, 0); batched_outputs_1 = None
|
||||
|
||||
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
return (_remove_batch_dim_1,)
|
||||
""",
|
||||
)
|
||||
@ -6512,27 +6512,27 @@ class GraphModule(torch.nn.Module):
|
||||
l_y_ = L_y_
|
||||
l_x_ = L_x_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(5, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(5, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
child: "f32[3]" = torch._C._functorch._add_batch_dim(l_y_, 0, 1); l_y_ = None
|
||||
child: "f32[3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 0, 1); l_y_ = None
|
||||
|
||||
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
|
||||
|
||||
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
|
||||
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
|
||||
|
||||
_add_batch_dim_1: "f32[]" = torch._C._functorch._add_batch_dim(child, 0, 2); child = None
|
||||
_add_batch_dim_1: "f32[]" = torch._functorch.predispatch._add_batch_dim(child, 0, 2); child = None
|
||||
|
||||
batched_outputs: "f32[2, 3]" = l_x_ * _add_batch_dim_1; l_x_ = _add_batch_dim_1 = None
|
||||
|
||||
batched_outputs_1: "f32[3, 2, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
|
||||
batched_outputs_1: "f32[3, 2, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
|
||||
_remove_batch_dim_1: "f32[5, 3, 2, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs_1, 1, 5, 0); batched_outputs_1 = None
|
||||
_remove_batch_dim_1: "f32[5, 3, 2, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs_1, 1, 5, 0); batched_outputs_1 = None
|
||||
|
||||
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
|
||||
return (_remove_batch_dim_1,)
|
||||
""",
|
||||
)
|
||||
@ -6557,19 +6557,19 @@ class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[2, 4, 3]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
|
||||
child: "f32[3]" = _add_batch_dim.sum(0)
|
||||
child_1: "f32[4]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
|
||||
_remove_batch_dim: "f32[2, 3]" = torch._C._functorch._remove_batch_dim(child, 1, 2, 0); child = None
|
||||
_remove_batch_dim_1: "f32[2, 4]" = torch._C._functorch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
|
||||
_remove_batch_dim: "f32[2, 3]" = torch._functorch.predispatch._remove_batch_dim(child, 1, 2, 0); child = None
|
||||
_remove_batch_dim_1: "f32[2, 4]" = torch._functorch.predispatch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim, _remove_batch_dim_1)
|
||||
""",
|
||||
)
|
||||
@ -6594,19 +6594,19 @@ class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[2, 4, 3]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
|
||||
child: "f32[3]" = _add_batch_dim.sum(0)
|
||||
child_1: "f32[4]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 2]" = torch._C._functorch._remove_batch_dim(child, 1, 2, 1); child = None
|
||||
_remove_batch_dim_1: "f32[2, 4]" = torch._C._functorch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
|
||||
_remove_batch_dim: "f32[3, 2]" = torch._functorch.predispatch._remove_batch_dim(child, 1, 2, 1); child = None
|
||||
_remove_batch_dim_1: "f32[2, 4]" = torch._functorch.predispatch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim, _remove_batch_dim_1)
|
||||
""",
|
||||
)
|
||||
@ -6632,19 +6632,19 @@ class GraphModule(torch.nn.Module):
|
||||
def forward(self, L_x_: "f32[2, 4, 3]"):
|
||||
l_x_ = L_x_
|
||||
|
||||
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
|
||||
|
||||
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
|
||||
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
|
||||
|
||||
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
|
||||
|
||||
child: "f32[3]" = _add_batch_dim.sum(0)
|
||||
child_1: "f32[4]" = _add_batch_dim.sum(1); _add_batch_dim = None
|
||||
|
||||
_remove_batch_dim: "f32[3, 2]" = torch._C._functorch._remove_batch_dim(child, 1, 2, 1); child = None
|
||||
_remove_batch_dim_1: "f32[2, 4]" = torch._C._functorch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
|
||||
_remove_batch_dim: "f32[3, 2]" = torch._functorch.predispatch._remove_batch_dim(child, 1, 2, 1); child = None
|
||||
_remove_batch_dim_1: "f32[2, 4]" = torch._functorch.predispatch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
|
||||
|
||||
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
|
||||
return (_remove_batch_dim, _remove_batch_dim_1)
|
||||
""",
|
||||
)

@ -942,6 +942,7 @@ exclusions = {
    "aot_graphs",
    "aot_graphs_effects",
    "pre_grad_graphs",
    "joint_graph_passes",
    "post_grad_graphs",
    "inductor_metrics",
    "ir_pre_fusion",

@ -12848,6 +12848,36 @@ fn
        res = opt_f(x)
        self.assertEqual(ref, res)

    def test_builtin_complex(self):
        def f(x):
            c = (
                complex(),
                complex(1),
                complex(2, 3),
                complex(imag=2),
                complex(real=1),
                complex(imag=1, real=2),
                complex("1+2j"),
            )
            return [x + z for z in c]

        x = torch.randn(1)
        opt_f = torch.compile(f, backend="eager", fullgraph=True)
        res = opt_f(x)
        ref = f(x)
        self.assertEqual(res, ref)

    def test_builtin_complex_args(self):
        @torch.compile(backend="eager", fullgraph=True)
        def f(*args, **kwargs):
            return torch.tensor(complex(*args, **kwargs))

        self.assertRaises(Unsupported, f, 1, 1, 1)
        self.assertRaises(Unsupported, f, 1, 1, fake_arg=1)
        self.assertRaises(Unsupported, f, fake_arg=1)
        self.assertRaises(Unsupported, f, [])
        self.assertRaises(Unsupported, f, "1 + j")


class TestTracer(JitTestCase):
    def test_jit_save(self):

@ -16,7 +16,7 @@ from torch._dynamo.package import CompilePackage, DiskDynamoStore, DynamoCache
from torch._dynamo.precompile_context import PrecompileContext
from torch._dynamo.testing import reduce_to_scalar_loss
from torch._functorch import config as functorch_config
from torch._inductor.mock_cache import global_stats, PatchCaches
from torch._inductor.mock_cache import global_stats, PatchCaches, Stats
from torch._inductor.runtime.runtime_utils import cache_dir
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
@ -452,33 +452,27 @@ def add(x, y):
        def fn(x, y):
            return x.sin() + y

        arg1 = torch.randn(32, 32, device=device)
        arg2 = torch.randn(32, 32, device=device)
        arg1 = torch.randn(3, 3, device=device)
        arg2 = torch.randn(3, 3, device=device)
        expected = fn(arg1, arg2).clone()

        with PatchCaches():
            compiled_fn1 = torch.compile(fn, mode="max-autotune")
            result = compiled_fn1(arg1, arg2).clone()
            self.assertEqual(expected, result)
            self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
            self.assertEqual(global_stats.autotune_local, Stats(1, 0, 1))
            DynamoCache.clear()

            total_frames = torch._dynamo.convert_frame.FRAME_COUNTER
            self._save_and_reload(
                expected_backends=1, expected_dynamo=1, expected_autotune=1
            )
            # During save, we check the autotune cache another time, and now it should hit
            self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
            compiled_fn1 = torch.compile(fn, mode="max-autotune")
            with torch.compiler.set_stance("fail_on_recompile"):
                result1 = compiled_fn1(arg1, arg2).clone()
            self.assertEqual(expected, result1)
            self.assertEqual(torch._dynamo.convert_frame.FRAME_COUNTER, total_frames)
            # No new hits or misses
            # Unfortunately, we don't *actually* know how many puts there will be, because
            # it's possible the best autotune config was found by coordesc.
            self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
            self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
            self.assertEqual(global_stats.autotune_local, Stats(2, 1, 1))

    @parametrize("device", ("cpu", "cuda", "xpu"))
    @torch._dynamo.config.patch(caching_precompile=True)

@ -245,7 +245,7 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -254,6 +254,8 @@ class StructuredTraceTest(TestCase):
|
||||
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -277,7 +279,7 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -286,6 +288,8 @@ class StructuredTraceTest(TestCase):
|
||||
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -314,10 +318,10 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['y']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"dynamo_output_graph": {"sizes": {"l_x_": [1000, 1000], "l_y_": [1000, 1000], "add": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -326,6 +330,8 @@ class StructuredTraceTest(TestCase):
|
||||
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -336,7 +342,7 @@ class StructuredTraceTest(TestCase):
|
||||
{"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
|
||||
{"create_symbol": {"symbol": "s48", "val": "1", "vr": "[-int_oo, int_oo]", "source": "L['y']", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
|
||||
{"dynamo_output_graph": {"sizes": {"l_x_": [1000, 1000], "add": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -346,6 +352,8 @@ class StructuredTraceTest(TestCase):
|
||||
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -367,7 +375,7 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000], "ones_1": [1000, 1000], "output_1": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -376,6 +384,8 @@ class StructuredTraceTest(TestCase):
|
||||
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -401,28 +411,28 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
|
||||
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
|
||||
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack1']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
|
||||
{"compilation_metrics": "METRICS", "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack0']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
|
||||
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack0']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_output_graph": {"sizes": {"l_stack0_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000], "sum_1": []}}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
|
||||
@ -450,7 +460,7 @@ class StructuredTraceTest(TestCase):
|
||||
{"bwd_compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['output']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"compilation_metrics": "METRICS", "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
""", # noqa: B950
|
||||
@ -470,7 +480,7 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"artifact": {"name": "dynamo_error", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
@ -504,7 +514,7 @@ class StructuredTraceTest(TestCase):
|
||||
"""\
|
||||
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "output": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
@ -615,7 +625,7 @@ class StructuredTraceTest(TestCase):
|
||||
{"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
|
||||
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['args'][0]"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
|
||||
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
|
||||
@ -631,32 +641,32 @@ class StructuredTraceTest(TestCase):
|
||||
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
|
||||
{"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['self']._modules['layers']._modules['0']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['self']._modules['layers']._modules['0']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 2, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "stride": [1024, 1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1024, 1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 2, "source": "L['x']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 3, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 3, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 3, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_source": {"describer_id": "ID", "id": 8, "source": "L['self']._modules['layers']._modules['1']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_storage": {"id": 4, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
|
||||
{"describe_tensor": {"id": 9, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 4, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 9, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 4, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 9, "source": "L['self']._modules['layers']._modules['1']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_self_modules_layers_modules_0_parameters_weight_": [1024, 1024], "l_self_modules_layers_modules_0_parameters_bias_": [1024], "l_x_": [1024, 1024], "l_self_modules_layers_modules_1_parameters_weight_": [1024, 1024], "l_self_modules_layers_modules_1_parameters_bias_": [1024], "input_1": [1024, 1024], "input_2": [1024, 1024]}}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"optimize_ddp_split_graph": {}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"optimize_ddp_split_child": {"name": "submod_0"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"optimize_ddp_split_child": {"name": "submod_1"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['self']._modules['layers']._modules['0']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 2, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 2, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 2, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 2, "source": "L['self']._modules['layers']._modules['0']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -672,10 +682,10 @@ class StructuredTraceTest(TestCase):
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 16, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 29, "source": "L['self']._modules['layers']._modules['1']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 17, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 30, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 17, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 30, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 17, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 30, "source": "L['self']._modules['layers']._modules['1']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -715,7 +725,7 @@ class StructuredTraceTest(TestCase):
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1], "add": [1]}}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -724,6 +734,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -754,10 +766,10 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 800}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 20], "is_leaf": true, "stride": [20, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 20], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [20, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 2400}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [20, 30], "is_leaf": true, "stride": [30, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [20, 30], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [30, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['b']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [10, 20], "l_b_": [20, 30], "matmul": [10, 30]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -765,12 +777,12 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 200}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [5, 10], "is_leaf": true, "stride": [10, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [5, 10], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [10, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s97", "val": "5", "vr": "[2, int_oo]", "source": "L['a'].size()[0]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s98", "val": "10", "vr": "[2, int_oo]", "source": "L['a'].size()[1]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 600}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 15], "is_leaf": true, "stride": [15, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 15], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [15, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['b']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s52", "val": "10", "vr": "[2, int_oo]", "source": "L['b'].size()[0]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s20", "val": "15", "vr": "[2, int_oo]", "source": "L['b'].size()[1]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
@ -806,7 +818,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1], "x": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -814,7 +826,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1], "x": [1]}}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
@ -844,10 +856,10 @@ def forward(self, x, y):
return add
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 12}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 12}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "is_leaf": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['y']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [3], "l_y_": [3], "add": [3]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -874,7 +886,7 @@ def forward(self, x, y):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1], "sin": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -883,6 +895,8 @@ def forward(self, x, y):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -892,7 +906,7 @@ def forward(self, x, y):
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1], "sin": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -289,6 +289,39 @@ class TestDynamoTimed(TestCase):
                "'l_x_': [3], 'linear': [1]}",
            )

    @dynamo_config.patch({"log_compilation_metrics": True})
    @inductor_config.patch({"force_disable_caches": True})
    def test_log_dynamo_start(self):
        import torch._dynamo.convert_frame as convert_frame

        self.warmup()
        self.run_forward_backward()

        # Dummy code object
        def sample_func():
            pass

        code = sample_func.__code__
        stack_strings = convert_frame.log_dynamo_start(code)
        last_entry = stack_strings[-1]
        # Check if the last entry is a valid stack trace, i.e. for the sample_func
        self.assertIn(
            f"Line: {code.co_firstlineno}",
            last_entry,
            "Log does not contain a Line no.",
        )
        self.assertIn(
            f"Name: {code.co_name}", last_entry, "Log does not contain a Name"
        )
        self.assertIn(
            "test_utils.py",
            last_entry,
            "Log file does not contain the expected Filename: 'test_utils.py'",
        )

        # Since the remaining logs are env specific, we just check if they are present instead of checking the exact string
        self.assertGreater(len(stack_strings), 1)

    @dynamo_config.patch(
        {
            "log_compilation_metrics": True,