[ghstack-poisoned]
Tom Ritchford
2025-08-22 12:18:30 +00:00
248 changed files with 9568 additions and 2677 deletions

View File

@ -1 +1 @@
e03a63be43e33596f7f0a43b0f530353785e4a59
22bc29b4d503fc895ff73bc720ff396e9723465f

View File

@ -111,7 +111,10 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.60.0 ; python_version <= "3.12" and platform_machine != "s390x"
numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x"
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@ -130,7 +133,8 @@ numba==0.60.0 ; python_version <= "3.12" and platform_machine != "s390x"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
numpy==1.26.2; python_version < "3.13"
numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
numpy==2.1.2; python_version >= "3.13"
pandas==2.0.3; python_version < "3.13"
@ -259,11 +263,6 @@ scipy==1.14.1 ; python_version >= "3.12"
#Pinned versions:
#test that import:
tb-nightly==2.13.0a20230426
#Description: TensorBoard
#Pinned versions:
#test that import:
# needed by torchgen utils
typing-extensions>=4.10.0
#Description: type hints for python
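# These pins rely on PEP 508 environment markers, so the installer picks the matching
# line per interpreter. A minimal sketch of how such a marker resolves, using the
# third-party `packaging` library (not a dependency listed here, just for illustration):
#
#   from packaging.markers import Marker
#
#   # Marker string copied from the numpy pin above.
#   marker = Marker('python_version == "3.11" or python_version == "3.12"')
#   marker.evaluate({"python_version": "3.11"})  # True  -> numpy==1.26.2 applies
#   marker.evaluate({"python_version": "3.13"})  # False -> the numpy==2.1.2 line applies instead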

View File

@ -55,6 +55,9 @@ python -m pip install pulp==2.9.0
# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
python -m pip install expecttest==0.3.0
# Install intel-openmp
python -m pip install intel-openmp==2025.1.1
run_tests() {
# Run nvidia-smi if available
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do

View File

@ -1,12 +1,22 @@
set ADDITIONAL_OPTIONS=""
set PYTHON_EXEC="python"
if "%DESIRED_PYTHON%" == "3.13t" (
echo Python version is set to 3.13t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.13t"
) else if "%DESIRED_PYTHON%"=="3.14" (
echo Python version is set to 3.14
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
) else if "%DESIRED_PYTHON%"=="3.14t" (
echo Python version is set to 3.14t
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
set PYTHON_EXEC="python3.14t"
) else (
echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON%
echo Python version is set to %DESIRED_PYTHON%
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =%
)

View File

@ -7,6 +7,8 @@ call "internal\install_python.bat"
%PYTHON_EXEC% --version
set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake

View File

@ -48,6 +48,7 @@ per-file-ignores =
torch/__init__.py: F401,TOR901
torch/_custom_op/impl.py: TOR901
torch/_export/serde/upgrade.py: TOR901
torch/_functorch/predispatch.py: TOR901
torch/_functorch/vmap.py: TOR901
torch/_inductor/test_operators.py: TOR901
torch/_library/abstract_impl.py: TOR901

View File

@ -1 +1 @@
dfa5a3a85849f59af5438c7c2811235d52d93a95
a645da617ed8836727cf9c28944d87154700d360

View File

@ -1 +1 @@
c9b38be8aafb02b69ccb704b33d2bb4329fbb0e6
bbea1cefdd1a29b53355b1655f5d2ae343921f85

View File

@ -8,6 +8,9 @@ updates:
target-branch: "main"
allow:
- dependency-name: "transformers"
ignore:
- dependency-name: "*"
update-types: ["version-update:semver-patch"]
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
@ -18,3 +21,4 @@ updates:
- "topic: not user facing"
- "module: ci"
- "module: inductor"
- "ciflow/inductor"

View File

@ -314,8 +314,8 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13t on cpu-s390x
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os not in ["linux", "linux-aarch64", "macos-arm64"] and (
# TODO: Enable python 3.14 for the remaining OSes
if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and (
python_version == "3.14" or python_version == "3.14t"
):
continue
@ -356,29 +356,6 @@ def generate_wheels_matrix(
), # include special case for aarch64 build, remove the -aarch64 postfix
}
)
# Special build building to use on Colab. Python 3.11 for 12.6 CUDA
if python_version == "3.11" and arch_version == CUDA_STABLE:
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"container_image": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[0],
"container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[
arch_version
].split(":")[1],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{

View File

@ -1425,71 +1425,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_8-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_8-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_8-full-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml

File diff suppressed because it is too large

View File

@ -58,9 +58,14 @@ on:
required: false
type: string
default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100
pull_request:
# Changing any of these files guarantees that this workflow will run
paths:
- .github/workflows/inductor-perf-test-nightly-h100.yml
- .ci/docker/ci_commit_pins/huggingface-requirements.txt
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
@ -160,10 +165,9 @@ jobs:
name: cuda12.8-py3.10-gcc9-sm90
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -4,6 +4,10 @@ on:
pull_request:
paths:
- .github/workflows/test-h100.yml
- test/inductor/test_max_autotune.py
- torch/_inductor/kernel/mm.py
- torch/_inductor/kernel/mm_grouped.py
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours

View File

@ -1454,7 +1454,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.12.2', # sync with RUFF
'ruff==0.12.9', # sync with RUFF
]
is_formatter = true
@ -1589,7 +1589,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.12.2', # sync with PYFMT
'ruff==0.12.9', # sync with PYFMT
]
is_formatter = true

View File

@ -279,6 +279,7 @@ header_template_rule(
"@AT_BLAS_F2C@": "0",
"@AT_BLAS_USE_CBLAS_DOT@": "1",
"@AT_KLEIDIAI_ENABLED@": "0",
"@AT_USE_EIGEN_SPARSE@": "0",
},
)

View File

@ -289,6 +289,7 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build."
option(USE_PROF "Use profiling" OFF)
option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF)
option(USE_SYSTEM_EIGEN_INSTALL
"Use system Eigen instead of the one under third_party" OFF)
cmake_dependent_option(

View File

@ -242,7 +242,6 @@ git submodule update --init --recursive
**Common**
```bash
conda install cmake ninja
# Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
pip install -r requirements.txt
```

View File

@ -96,6 +96,8 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
file(GLOB vulkan_cpp "vulkan/*.cpp")
file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp")
file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp")
# Metal
file(GLOB metal_h "metal/*.h")
file(GLOB metal_cpp "metal/*.cpp")
@ -341,6 +343,9 @@ if(USE_VULKAN)
else()
set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp})
endif()
if(USE_EIGEN_SPARSE)
set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp})
endif()
if(USE_MTIA)
set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h})

View File

@ -20,3 +20,4 @@
#define AT_BLAS_F2C() @AT_BLAS_F2C@
#define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
#define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@
#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@

View File

@ -698,6 +698,14 @@ bool Context::hasLAPACK() {
#endif
}
bool Context::hasEigenSparse() {
#if AT_USE_EIGEN_SPARSE()
return true;
#else
return false;
#endif
}
at::QEngine Context::qEngine() const {
static auto _quantized_engine = []() {
at::QEngine qengine = at::kNoQEngine;

View File

@ -133,6 +133,7 @@ class TORCH_API Context {
static bool hasLAPACK();
static bool hasMKLDNN();
static bool ckSupported();
static bool hasEigenSparse();
static bool hasMAGMA() {
return detail::getCUDAHooks().hasMAGMA();
}
@ -615,6 +616,10 @@ inline bool hasLAPACK() {
return globalContext().hasLAPACK();
}
inline bool hasEigenSparse() {
return globalContext().hasEigenSparse();
}
inline bool hasMAGMA() {
return globalContext().hasMAGMA();
}

View File

@ -97,8 +97,8 @@ class TORCH_API KernelFunction final {
KernelFunction();
~KernelFunction();
KernelFunction(const KernelFunction&) = default;
KernelFunction& operator=(const KernelFunction&) = default;
KernelFunction(const KernelFunction& other);
KernelFunction& operator=(const KernelFunction& other);
KernelFunction(KernelFunction&&) noexcept = default;
@ -276,10 +276,6 @@ class TORCH_API KernelFunction final {
// Register a token to be invalidated when this KernelFunction is destroyed
void registerToken(std::weak_ptr<KernelToken> token) const;
// List of tokens that need to be invalidated when this KernelFunction is
// destroyed
mutable std::vector<std::weak_ptr<KernelToken>> tokens_;
private:
explicit KernelFunction(
std::unique_ptr<OperatorKernel> functor,
@ -294,6 +290,9 @@ class TORCH_API KernelFunction final {
BoxedKernel boxed_kernel_func_;
void* unboxed_kernel_func_;
void* sym_unboxed_kernel_func_;
// List of tokens that need to be invalidated when this KernelFunction is
// destroyed (lazy allocation to save memory when empty)
mutable std::unique_ptr<std::vector<std::weak_ptr<KernelToken>>> tokens_;
};
// Token held by SafeKernelFunction that gets invalidated when KernelFunction is

View File

@ -25,13 +25,35 @@ inline KernelFunction::KernelFunction()
sym_unboxed_kernel_func_(nullptr) {}
inline KernelFunction::~KernelFunction() {
for (auto& weak_token : tokens_) {
if (auto token = weak_token.lock()) {
token->invalidate();
if (tokens_) {
for (auto& weak_token : *tokens_) {
if (auto token = weak_token.lock()) {
token->invalidate();
}
}
}
}
inline KernelFunction::KernelFunction(const KernelFunction& other)
: boxed_kernel_func_(other.boxed_kernel_func_),
unboxed_kernel_func_(other.unboxed_kernel_func_),
sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) {
// tokens_ is intentionally not copied as we only care about invalidating
// tokens if the original KernelFunction is destroyed
}
inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) {
if (this != &other) {
boxed_kernel_func_ = other.boxed_kernel_func_;
unboxed_kernel_func_ = other.unboxed_kernel_func_;
sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_;
// tokens_ is intentionally not copied as we only care about invalidating
// tokens if the original KernelFunction is destroyed
}
return *this;
}
inline KernelFunction::KernelFunction(
std::unique_ptr<OperatorKernel> functor,
InternalBoxedKernelFunction* boxed_kernel_func,
@ -167,7 +189,10 @@ C10_ALWAYS_INLINE Return KernelFunction::call(
inline void KernelFunction::registerToken(
std::weak_ptr<KernelToken> token) const {
tokens_.push_back(std::move(token));
if (!tokens_) {
tokens_ = std::make_unique<std::vector<std::weak_ptr<KernelToken>>>();
}
tokens_->push_back(std::move(token));
}
inline KernelFunction KernelFunction::makeFromBoxedKernel(

View File

@ -161,6 +161,11 @@ struct CUDACachingHostAllocatorImpl
return true;
}
bool pinned_use_background_threads() override {
return c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::
pinned_use_background_threads();
}
EventPool::Event create_event_internal(DeviceIndex idx) {
// Leak the event pool to avoid shutdown issue.
static auto* event_pool = new EventPool();

View File

@ -19,10 +19,6 @@
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/irange.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
#endif
#if AT_CUDNN_ENABLED()
#include <ATen/cudnn/cudnn-wrapper.h>
#endif
@ -93,29 +89,6 @@ void CUDAHooks::init() const {
// have a chance to enable vitals.
at::vitals::VitalsAPI.setVital("CUDA", "used", "true", /* force = */ true);
// Sets the CUDA_MODULE_LOADING environment variable
// if it's not set by the user.
// CUDA_MODULE_LOADING="LAZY" is default for all drivers released for CUDA 12.2+.
// Check the driver version and only set the env variable if needed.
bool set_lazy_module_loading = true;
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
auto driver_api = c10::cuda::DriverAPI::get();
// Initialize NVML
if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) {
// Get the driver version
int version = -1;
auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version);
if (res == NVML_SUCCESS) {
// Check if driver is sufficiently new
if (version >= 12020) {
set_lazy_module_loading = false;
}
}
}
#endif
if (set_lazy_module_loading) {
c10::utils::set_env("CUDA_MODULE_LOADING", "LAZY", false);
}
const auto num_devices = c10::cuda::device_count_ensure_non_zero();
c10::cuda::CUDACachingAllocator::init(num_devices);
at::cuda::detail::init_p2p_access_cache(num_devices);

View File

@ -22,7 +22,7 @@ namespace {
// Check if tensor list has either a boolean tensor or an integer tensor
inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
return std::any_of(
tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
tensors.begin(), tensors.end(), [includeBool](const auto& t) {
return at::isIntegralType(t.scalar_type(), includeBool);
});
}

View File

@ -1360,6 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
#endif
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
const int64_t default_min_dim = [&] {
@ -1393,6 +1394,7 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
const int64_t min_size = get_mkldnn_matmul_min_size();
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
}
#endif
static void addmm_impl_cpu_(
@ -1771,6 +1773,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
};
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
try {
@ -1781,6 +1784,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
at::globalContext().setUserEnabledMkldnn(false);
}
}
#endif
if (contraction_size * res_rows * res_cols < 400) {
if (is_bmm_out) {

View File

@ -7,6 +7,7 @@
#include <algorithm>
#include <iterator>
#include <numeric>
#include <vector>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
@ -647,10 +648,10 @@ _vec_softmax(
parallel_for(
0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) {
int64_t idx = begin;
auto temp_vec_input = std::make_unique<float[]>(dim_size * vectorized_step);
auto temp_vec_output = std::make_unique<float[]>(dim_size * vectorized_step);
float* temp_vec_input_data = temp_vec_input.get();
float* temp_vec_output_data = temp_vec_output.get();
std::vector<float> temp_vec_input(dim_size * vectorized_step);
std::vector<float> temp_vec_output(dim_size * vectorized_step);
float* temp_vec_input_data = temp_vec_input.data();
float* temp_vec_output_data = temp_vec_output.data();
while (idx < end) {
int64_t outer_idx = idx / inner_size;
int64_t inner_idx = idx % inner_size;

View File

@ -1347,7 +1347,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
// We are doing row-wise scaling
auto dprops = at::cuda::getCurrentDeviceProperties();
if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
&& (dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)) {
&& ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat1,

View File

@ -436,7 +436,6 @@ static inline void launch_vectorized_templated_kernel(
loader_t l,
storer_t s) {
TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
using traits = function_traits<func_t>;
int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) /
vectorized_templated_config::block_work_size();
auto stream = at::cuda::getCurrentCUDAStream();

View File

@ -14,7 +14,7 @@ namespace {
// pow(double, int)
// pow(float, float)
// pow(double, double)
#ifdef _MSC_VER
#if defined(_MSC_VER) || defined(_LIBCPP_VERSION)
// Functions for pow
// pow for at::Half
static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) {

View File

@ -225,8 +225,9 @@ void launch_stable_sort_kernel(
return;
}
int64_t numel_or_intmax =
std::min(numel, static_cast<int64_t>(std::numeric_limits<int>::max()));
const int64_t intmax = static_cast<int64_t>(std::numeric_limits<int>::max());
// On ROCm, std::min -> ::min did not work as expected when input values >= 2147483648
int64_t numel_or_intmax = numel < intmax ? numel : intmax;
int64_t nsort = self.size(dim);
int64_t nbatch = (numel_or_intmax / nsort) * nsort;
TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort);
@ -238,7 +239,8 @@ void launch_stable_sort_kernel(
scalar_t* values_ptr = values.mutable_data_ptr<scalar_t>();
int64_t remaining = numel;
while (remaining > 0) {
int64_t n = std::min(remaining, nbatch);
// On ROCm, std::min -> ::min did not work as expected when input values >= 2147483648
int64_t n = remaining < nbatch ? remaining : nbatch;
int64_t nsegments = n / nsort;
if (nsegments == 1 ||

View File

@ -285,7 +285,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
sizeof(algos) / sizeof(algos[0]) == num_algos,
"Missing cuDNN convolution forward algorithms");
int perf_count;
auto perf_results = std::make_unique<perf_t[]>(num_algos);
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_FWD_ALGO_COUNT> perf_results;
if (!benchmark) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionForwardAlgorithm_v7(
@ -296,7 +296,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
args.odesc.desc(),
num_algos,
&perf_count,
perf_results.get()),
perf_results.data()),
args);
} else {
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
@ -314,7 +314,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
args.output.data_ptr(),
num_algos,
&perf_count,
perf_results.get(),
perf_results.data(),
ws.data,
ws.size),
args);
@ -324,7 +324,7 @@ struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
// memory, e.g. a few GBs.
c10::cuda::CUDACachingAllocator::emptyCache();
}
return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
}
static void getWorkspaceSize(
@ -369,7 +369,8 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
sizeof(algos) / sizeof(algos[0]) == num_algos,
"Missing cuDNN convolution backward data algorithms.");
int perf_count;
auto perf_results = std::make_unique<perf_t[]>(num_algos);
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT>
perf_results;
if (!benchmark) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionBackwardDataAlgorithm_v7(
@ -380,7 +381,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
args.idesc.desc(),
num_algos,
&perf_count,
perf_results.get()),
perf_results.data()),
args);
} else {
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
@ -398,7 +399,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
args.input.data_ptr(),
num_algos,
&perf_count,
perf_results.get(),
perf_results.data(),
ws.data,
ws.size),
args);
@ -408,7 +409,7 @@ struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
// memory, e.g. a few GBs.
c10::cuda::CUDACachingAllocator::emptyCache();
}
return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
}
static void getWorkspaceSize(
@ -456,7 +457,8 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
static_assert(
sizeof(algos) / sizeof(algos[0]) == num_algos,
"Missing cuDNN convolution backward filter algorithms.");
auto perf_results = std::make_unique<perf_t[]>(num_algos);
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT>
perf_results;
int perf_count;
if (!benchmark) {
AT_CUDNN_CHECK_WITH_SHAPES(
@ -468,7 +470,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
args.wdesc.desc(),
num_algos,
&perf_count,
perf_results.get()),
perf_results.data()),
args);
} else {
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
@ -486,7 +488,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
args.weight.data_ptr(),
num_algos,
&perf_count,
perf_results.get(),
perf_results.data(),
ws.data,
ws.size),
args);
@ -496,7 +498,7 @@ struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
// memory, e.g. a few GBs.
c10::cuda::CUDACachingAllocator::emptyCache();
}
return getValidAlgorithms<perf_t>(perf_results.get(), args, perf_count);
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
}
static void getWorkspaceSize(

View File

@ -512,7 +512,28 @@ TORCH_IMPL_FUNC(index_add_mps_out)
return;
}
TORCH_CHECK(source.scalar_type() != ScalarType::Long, "index_add(): Expected non int64 dtype for source.");
bool use_deterministic_algorithm = globalContext().deterministicAlgorithms();
// TODO: Do not use deterministic algorithm for long/complex but rather implement it as Metal shader
use_deterministic_algorithm |= source.scalar_type() == ScalarType::Long;
use_deterministic_algorithm |= c10::isComplexType(source.scalar_type());
if (use_deterministic_algorithm) {
if (!result.is_same(self)) {
result.copy_(self);
}
torch::List<std::optional<Tensor>> indices;
indices.reserve(dim + 1);
for (const auto i : c10::irange(dim)) {
indices.emplace_back();
}
indices.emplace_back(index.to(at::kLong));
const Tensor result_ = (result.dim() == 0) ? result.view(1) : result;
const Tensor source_ = (source.dim() == 0) ? source.view(1) : source;
result_.index_put_(indices, source_.mul(alpha), true);
return;
}
auto casted_type = isFloatingType(source.scalar_type()) ? ScalarType::Float : ScalarType::Int;
struct CachedGraph : public MPSCachedGraph {
@ -921,6 +942,8 @@ Tensor& index_fill_mps_(Tensor& self, int64_t dim, const Tensor& index, const Te
TORCH_CHECK(index.scalar_type() == ScalarType::Long || index.scalar_type() == ScalarType::Int,
"index_fill_(): Expected dtype int32 or int64 for index");
TORCH_CHECK(dim == 0 || dim < self.dim(), "index_fill_(): Indexing dim ", dim, " is out of bounds of tensor");
// MPS.scatter crashes if used with complex dtypes
TORCH_CHECK(!c10::isComplexType(self.scalar_type()), "index_fill_(): Complex types are not yet supported");
// Empty index
if (num_indices == 0) {
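// For reference, the deterministic index_add fallback added in this file rewrites
// index_add_ as an accumulating index_put_. A rough Python equivalent for the
// dim == 0 case (a sketch of the idea, not the actual dispatch path):
//
//   import torch
//
//   def index_add_via_index_put(result, index, source, alpha=1):
//       # Same effect as result.index_add_(0, index, source, alpha=alpha):
//       # accumulate=True sums the contributions of duplicate indices.
//       result.index_put_((index.to(torch.long),), source * alpha, accumulate=True)
//       return result
//
//   out = torch.zeros(4)
//   index_add_via_index_put(out, torch.tensor([1, 1, 3]), torch.tensor([1.0, 2.0, 5.0]))
//   # out is now tensor([0., 3., 0., 5.])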

View File

@ -17,6 +17,7 @@
#include <c10/util/irange.h>
#include <cstring>
#include <vector>
namespace at::native {
@ -53,8 +54,8 @@ static void upsample_nearest2d_out_frame(
return;
}
auto input_offset_arr = std::make_unique<int64_t[]>(output_width);
int64_t* input_offset = input_offset_arr.get();
std::vector<int64_t> input_offset_arr(output_width);
int64_t* input_offset = input_offset_arr.data();
for (const auto w2 : c10::irange(output_width)) {
const int64_t w1 = nn_compute_source_index_fn(width_scale, w2, input_width);

View File

@ -23,6 +23,9 @@
#include <ATen/Parallel.h>
#endif
#if AT_USE_EIGEN_SPARSE()
#include <ATen/native/sparse/eigen/SparseBlasImpl.h>
#endif
namespace at::native::sparse::impl {
@ -442,13 +445,15 @@ void add_out_sparse_csr(
const Tensor& mat2,
const Scalar& alpha,
const Tensor& result) {
#if !AT_MKL_ENABLED()
TORCH_CHECK(
false,
"Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ",
"Please use PyTorch built MKL support.");
#else
#if AT_USE_MKL_SPARSE()
sparse::impl::mkl::add_out_sparse_csr(mat1, mat2, alpha, result);
#elif AT_USE_EIGEN_SPARSE()
sparse::impl::eigen::add_out_sparse(mat1, mat2, alpha, result);
#else
TORCH_CHECK(
false,
"Calling add on a sparse CPU tensor requires compiling PyTorch with MKL. ",
"Please use PyTorch built MKL support.");
#endif
}
@ -459,7 +464,7 @@ void triangular_solve_out_sparse_csr(
bool upper,
bool transpose,
bool unitriangular) {
#if !AT_MKL_ENABLED()
#if !AT_USE_MKL_SPARSE()
TORCH_CHECK(
false,
"Calling triangular_solve on a sparse CPU tensor requires compiling PyTorch with MKL. ",

View File

@ -127,6 +127,10 @@
#include <ATen/ops/zeros_like.h>
#endif
#if AT_USE_EIGEN_SPARSE()
#include <ATen/native/sparse/eigen/SparseBlasImpl.h>
#endif
#include <algorithm>
namespace at {
@ -536,7 +540,12 @@ static void addmm_out_sparse_csr_native_cpu(
auto values = sparse.values();
scalar_t cast_alpha = alpha.to<scalar_t>();
r.mul_(beta);
// If beta is zero, NaN and Inf should not be propagated to the result
if (beta.toComplexDouble() == 0.) {
r.zero_();
} else {
r.mul_(beta);
}
AT_DISPATCH_INDEX_TYPES(
col_indices.scalar_type(), "csr_mm_crow_indices", [&]() {
auto csr_accessor = csr.accessor<index_t, 1>();
@ -648,6 +657,15 @@ Tensor& addmm_out_sparse_compressed_cpu(
return result;
}
#if AT_USE_EIGEN_SPARSE()
if ((result.layout() == kSparseCsr || result.layout() == kSparseCsc) &&
(mat1.layout() == kSparseCsr || mat1.layout() == kSparseCsc) &&
(mat2.layout() == kSparseCsr || mat2.layout() == kSparseCsc)) {
sparse::impl::eigen::addmm_out_sparse(mat1, mat2, result, alpha, beta);
return result;
}
#endif
#if !AT_USE_MKL_SPARSE()
// The custom impl addmm_out_sparse_csr_native_cpu only supports CSR @
// strided -> strided
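// The beta == 0 branch above follows the documented addmm semantics: when beta is
// zero, the input is ignored and NaN/Inf in it must not propagate. A small
// illustration (CPU, sparse CSR times dense; which backend serves the call depends
// on the build):
//
//   import torch
//
//   A = torch.tensor([[1.0, 0.0], [0.0, 2.0]]).to_sparse_csr()
//   B = torch.eye(2)
//   inp = torch.full((2, 2), float("nan"))
//
//   out = torch.addmm(inp, A, B, beta=0.0, alpha=1.0)
//   assert not torch.isnan(out).any()  # NaNs in `inp` are discarded rather than multiplied by 0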

View File

@ -800,7 +800,7 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
Tensor indices_dim1 = indices[1].to(ScalarType::Int);
Tensor indices_dim2 = indices[2].to(ScalarType::Int);
auto mat_el_end_indices_host = std::make_unique<int64_t[]>(num_matrices);
std::vector<int64_t> mat_el_end_indices_host(num_matrices);
{
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
@ -809,14 +809,14 @@ Tensor& bmm_out_sparse_cuda(const SparseTensor& self, const Tensor& mat2, Tensor
search_end_matrix_indices(mat_el_end_indices_device, num_matrices, indices_dim0);
AT_CUDA_CHECK(cudaMemcpy(
mat_el_end_indices_host.get(),
mat_el_end_indices_host.data(),
mat_el_end_indices_device,
num_matrices*sizeof(int64_t),
cudaMemcpyDeviceToHost
));
}
// Need a pointer to an array to access within a lambda
int64_t* mat_el_end_indices = &mat_el_end_indices_host[0];
int64_t* mat_el_end_indices = mat_el_end_indices_host.data();
Scalar beta = 0;
Scalar alpha = 1;

View File

@ -0,0 +1,329 @@
#include <ATen/native/sparse/eigen/SparseBlasImpl.h>
#if AT_USE_EIGEN_SPARSE()
#include <ATen/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/SparseCsrTensorUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty_like.h>
#endif
#include <c10/core/ScalarType.h>
#include <Eigen/SparseCore>
namespace at::native::sparse::impl::eigen {
namespace {
void inline sparse_indices_to_result_dtype_inplace(
const c10::ScalarType& dtype,
const at::Tensor& input) {
auto [compressed_indices, plain_indices] =
at::sparse_csr::getCompressedPlainIndices(input);
static_cast<at::SparseCsrTensorImpl*>(input.unsafeGetTensorImpl())
->set_member_tensors(
compressed_indices.to(dtype),
plain_indices.to(dtype),
input.values(),
input.sizes());
}
void inline sparse_indices_and_values_resize(
const at::Tensor& input,
int64_t nnz) {
auto [compressed_indices, plain_indices] =
at::sparse_csr::getCompressedPlainIndices(input);
static_cast<SparseCsrTensorImpl*>(input.unsafeGetTensorImpl())
->set_member_tensors(
compressed_indices,
plain_indices.resize_({nnz}),
input.values().resize_({nnz}),
input.sizes());
}
template <typename scalar_t, int eigen_options, typename index_t>
const Eigen::Map<Eigen::SparseMatrix<scalar_t, eigen_options, index_t>>
Tensor_to_Eigen(const at::Tensor& tensor) {
int64_t rows = tensor.size(0);
int64_t cols = tensor.size(1);
int64_t nnz = tensor._nnz();
TORCH_CHECK(tensor.values().is_contiguous(), "Eigen accepts only contiguous tensor values");
auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor);
index_t* c_indices_ptr = compressed_indices.data_ptr<index_t>();
index_t* p_indices_ptr = plain_indices.data_ptr<index_t>();
scalar_t* values_ptr = tensor.values().data_ptr<scalar_t>();
Eigen::Map<Eigen::SparseMatrix<scalar_t, eigen_options, index_t>> map(
rows, cols, nnz, c_indices_ptr, p_indices_ptr, values_ptr);
return map;
}
template <typename scalar_t, int eigen_options, typename index_t>
void Eigen_to_Tensor(
const at::Tensor& tensor,
const Eigen::SparseMatrix<scalar_t, eigen_options, index_t>& matrix) {
const Layout eigen_layout = (eigen_options == Eigen::RowMajor ? kSparseCsr : kSparseCsc);
TORCH_CHECK(
tensor.layout() == eigen_layout,
"Eigen_to_Tensor, expected tensor be ", eigen_layout, ", but got ",
tensor.layout());
int64_t nnz = matrix.nonZeros();
int64_t csize = matrix.outerSize();
sparse_indices_and_values_resize(tensor, nnz);
auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(tensor);
if (nnz > 0) {
std::memcpy(
tensor.values().mutable_data_ptr<scalar_t>(),
matrix.valuePtr(),
nnz * sizeof(scalar_t));
std::memcpy(
plain_indices.mutable_data_ptr<index_t>(),
matrix.innerIndexPtr(),
nnz * sizeof(index_t));
}
if (csize > 0) {
std::memcpy(
compressed_indices.mutable_data_ptr<index_t>(),
matrix.outerIndexPtr(),
csize * sizeof(index_t));
}
compressed_indices.mutable_data_ptr<index_t>()[csize] = nnz;
}
template <typename scalar_t>
void add_out_sparse_eigen(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Scalar& alpha,
const at::Tensor& result) {
// empty matrices
if (mat1._nnz() == 0 && mat2._nnz() == 0) {
return;
}
if (mat2._nnz() == 0 || alpha.toComplexDouble() == 0.) {
sparse_indices_and_values_resize(result, mat1._nnz());
result.copy_(mat1);
return;
} else if (mat1._nnz() == 0) {
sparse_indices_and_values_resize(result, mat2._nnz());
result.copy_(mat2);
result.values().mul_(alpha);
return;
}
c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result);
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1);
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2);
AT_DISPATCH_INDEX_TYPES(
result_index_dtype, "eigen_sparse_add", [&]() {
scalar_t _alpha = alpha.to<scalar_t>();
if (result.layout() == kSparseCsr) {
auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat1);
auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(result, mat1_mat2_eigen);
} else {
auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat1);
auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
auto mat1_mat2_eigen = (mat1_eigen + _alpha * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(result, mat1_mat2_eigen);
}
});
}
template <typename scalar_t>
void addmm_out_sparse_eigen(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& result,
const at::Scalar& alpha,
const at::Scalar& beta) {
// empty matrices
if (mat1._nnz() == 0 || mat2._nnz() == 0) {
return;
}
// If beta is zero, NaN and Inf should not be propagated to the result
// In addition, beta = 0 lets us enable a fast-path for result = alpha * A @ B
bool is_beta_zero = false;
if (beta.toComplexDouble() == 0.) {
is_beta_zero = true;
result.values().zero_();
} else {
result.values().mul_(beta);
}
c10::ScalarType result_index_dtype = at::sparse_csr::getIndexDtype(result);
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat1);
sparse_indices_to_result_dtype_inplace(result_index_dtype, mat2);
AT_DISPATCH_INDEX_TYPES(
result_index_dtype, "eigen_sparse_mm", [&]() {
typedef Eigen::SparseMatrix<scalar_t, Eigen::RowMajor, index_t> EigenCsrMatrix;
typedef Eigen::SparseMatrix<scalar_t, Eigen::ColMajor, index_t> EigenCscMatrix;
at::Tensor mat1_mat2;
if (is_beta_zero) {
mat1_mat2 = result;
} else {
mat1_mat2 = at::empty_like(result, result.options());
}
if (mat1_mat2.layout() == kSparseCsr) {
if (mat1.layout() == kSparseCsr) {
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat1);
if (mat2.layout() == kSparseCsr) {
// Out_csr = M1_csr * M2_csr
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
} else {
// Out_csr = M1_csr * M2_csc
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
}
} else {
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat1);
if (mat2.layout() == kSparseCsr) {
// Out_csr = M1_csc * M2_csr
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
} else {
// Out_csr = M1_csc * M2_csc
// This multiplication will be computationally inefficient, as it will require
// additional conversion of the output matrix from CSC to CSR format.
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
const EigenCsrMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::RowMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
}
}
} else {
if (mat1.layout() == kSparseCsr) {
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat1);
if (mat2.layout() == kSparseCsr) {
// Out_csc = M1_csr * M2_csr
// This multiplication will be computationally inefficient, as it will require
// additional conversion of the output matrix from CSR to CSC format.
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
} else {
// Out_csc = M1_csr * M2_csc
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
}
} else {
const auto mat1_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat1);
if (mat2.layout() == kSparseCsr) {
// Out_csc = M1_csc * M2_csr
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::RowMajor, index_t>(mat2);
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
} else {
// Out_csc = M1_csc * M2_csc
const auto mat2_eigen = Tensor_to_Eigen<scalar_t, Eigen::ColMajor, index_t>(mat2);
const EigenCscMatrix mat1_mat2_eigen = (mat1_eigen * mat2_eigen);
Eigen_to_Tensor<scalar_t, Eigen::ColMajor, index_t>(mat1_mat2, mat1_mat2_eigen);
}
}
}
if (is_beta_zero) {
result.mul_(alpha.to<scalar_t>());
} else {
result.add_(mat1_mat2, alpha.to<scalar_t>());
}
});
}
} // anonymous namespace
void addmm_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& result,
const at::Scalar& alpha,
const at::Scalar& beta) {
AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat1.layout(), "eigen::addmm_out_sparse:mat1", [&]{});
AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(mat2.layout(), "eigen::addmm_out_sparse:mat2", [&]{});
AT_DISPATCH_SPARSE_COMPRESSED_NONBLOCK_LAYOUTS(result.layout(), "eigen::addmm_out_sparse:result", [&]{});
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
result.scalar_type(), "addmm_out_sparse_eigen", [&] {
addmm_out_sparse_eigen<scalar_t>(mat1, mat2, result, alpha, beta);
});
}
void add_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Scalar& alpha,
const at::Tensor& result) {
TORCH_CHECK(
(result.layout() == kSparseCsr && mat1.layout() == kSparseCsr && mat2.layout() == kSparseCsr) ||
(result.layout() == kSparseCsc && mat1.layout() == kSparseCsc && mat2.layout() == kSparseCsc),
"eigen::add_out_sparse: expected the same layout for all operands but got ",
mat1.layout(),
" + ",
mat2.layout(),
" -> ",
result.layout());
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
result.scalar_type(), "add_out_sparse_eigen", [&] {
add_out_sparse_eigen<scalar_t>(mat1, mat2, alpha, result);
});
}
} // namespace at::native::sparse::impl::eigen
#else
namespace at::native::sparse::impl::eigen {
void addmm_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& result,
const at::Scalar& alpha,
const at::Scalar& beta) {
TORCH_CHECK(
false,
"eigen::addmm_out_sparse: Eigen was not enabled for ",
result.layout(),
" + ",
mat1.layout(),
" @ ",
mat2.layout());
}
void add_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Scalar& alpha,
const at::Tensor& result) {
TORCH_CHECK(
false,
"eigen::add_out_sparse: Eigen was not enabled for ",
mat1.layout(),
" + ",
mat2.layout(),
" -> ",
result.layout());
}
} // namespace at::native::sparse::impl::eigen
#endif // AT_USE_EIGEN_SPARSE()

View File

@ -0,0 +1,29 @@
#pragma once
#include <ATen/Config.h>
#if AT_USE_EIGEN_SPARSE()
#ifndef EIGEN_MPL2_ONLY
#define EIGEN_MPL2_ONLY
#endif
#include <ATen/Tensor.h>
namespace at::native::sparse::impl::eigen {
void addmm_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Tensor& result,
const at::Scalar& alpha,
const at::Scalar& beta);
void add_out_sparse(
const at::Tensor& mat1,
const at::Tensor& mat2,
const at::Scalar& alpha,
const at::Tensor& result);
} // namespace at::native::sparse::impl::eigen
#endif
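// For orientation, the operations this backend serves are plain sparse-compressed
// arithmetic: CSR @ CSR products go through addmm_out_sparse above, and CSR + CSR
// through add_out_sparse. A minimal sketch, assuming a CPU build configured with
// USE_EIGEN_SPARSE=1 (and without MKL sparse, which takes precedence):
//
//   import torch
//
//   a = torch.tensor([[1.0, 0.0], [0.0, 2.0]]).to_sparse_csr()
//   b = torch.tensor([[0.0, 3.0], [4.0, 0.0]]).to_sparse_csr()
//
//   # CSR + CSR stays sparse; on an Eigen-enabled build this lands in add_out_sparse.
//   c = torch.add(a, b, alpha=2.0)
//   print(c.to_dense())  # tensor([[1., 6.], [8., 2.]])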

View File

@ -58,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
DistillGPT2,pass,0


View File

@ -58,7 +58,7 @@ DistilBertForQuestionAnswering,pass,0
DistillGPT2,pass,2
DistillGPT2,pass,0


View File

@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dla102,timeout,0


View File

@ -346,7 +346,7 @@ vgg16,pass,0
vision_maskrcnn,fail_accuracy,30
vision_maskrcnn,fail_accuracy,29


View File

@ -46,7 +46,7 @@ deit_base_distilled_patch16_224,pass,0
dla102,pass,0
dla102,timeout,0


View File

@ -146,7 +146,7 @@ hf_Bert_large,pass,0
hf_BigBird,fail_to_run,0
hf_BigBird,pass,0


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,fail_to_run,19
hf_Reformer,fail_to_run,21


View File

@ -34,7 +34,7 @@ basic_gnn_gin,pass,0
basic_gnn_sage,fail_to_run,0
basic_gnn_sage,pass,0


View File

@ -146,7 +146,7 @@ hf_Bert_large,pass,0
hf_BigBird,fail_to_run,0
hf_BigBird,fail_accuracy,0


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,fail_to_run,19
hf_Reformer,fail_to_run,21


View File

@ -634,6 +634,7 @@ libtorch_nativert_sources = [
"torch/nativert/graph/passes/SubgraphRewriter.cpp",
"torch/nativert/graph/passes/pass_manager/GraphPasses.cpp",
"torch/nativert/graph/passes/pass_manager/PassManager.cpp",
"torch/nativert/kernels/KernelHandlerRegistry.cpp",
]
torch_mobile_tracer_sources = [

View File

@ -3,7 +3,6 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Exception.h>
#include <c10/util/env.h>
@ -18,13 +17,9 @@ enum class Expandable_Segments_Handle_Type : int {
// Environment config parser
class C10_CUDA_API CUDAAllocatorConfig {
public:
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_split_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size() instead.")
static size_t max_split_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::max_split_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::garbage_collection_threshold() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::garbage_collection_threshold() instead.")
static double garbage_collection_threshold() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
garbage_collection_threshold();
@ -65,8 +60,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
return instance().m_pinned_num_register_threads;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::pinned_use_background_threads() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::pinned_use_background_threads() instead.")
static bool pinned_use_background_threads() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
pinned_use_background_threads();
@ -79,29 +72,25 @@ class C10_CUDA_API CUDAAllocatorConfig {
return 128;
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
// This is used to round-up allocation size to nearest power of 2 divisions.
// More description below in function roundup_power2_next_division
// As an example, if we want 4 divisions between 2's power, this can be done
// using env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
static size_t roundup_power2_divisions(size_t size) {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions(size);
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::roundup_power2_divisions() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::roundup_power2_divisions() instead.")
static std::vector<size_t> roundup_power2_divisions() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
roundup_power2_divisions();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::max_non_split_rounding_size() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::max_non_split_rounding_size() instead.")
static size_t max_non_split_rounding_size() {
return c10::CachingAllocator::AcceleratorAllocatorConfig::
max_non_split_rounding_size();
}
C10_DEPRECATED_MESSAGE(
"c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig::last_allocator_settings() is deprecated. Please use c10::CachingAllocator::AcceleratorAllocatorConfig::last_allocator_settings() instead.")
static std::string last_allocator_settings() {
return c10::CachingAllocator::getAllocatorSettings();
}
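// The knobs referenced above (max_split_size, roundup_power2_divisions,
// garbage_collection_threshold, pinned_use_background_threads) are still driven by
// the same environment variable regardless of which config class ends up answering;
// a sketch with illustrative values:
//
//   import os
//
//   # Parsed when the CUDA caching allocator initializes, so set it before CUDA is first used.
//   os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
//       "max_split_size_mb:256,"
//       "roundup_power2_divisions:4,"
//       "garbage_collection_threshold:0.8,"
//       "pinned_use_background_threads:True"
//   )
//
//   import torch  # noqa: E402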

View File

@ -1218,7 +1218,7 @@ class DeviceCachingAllocator {
DeviceCachingAllocator()
: large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
stats.max_split_size =
static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
}
@ -1343,8 +1343,7 @@ class DeviceCachingAllocator {
// Do garbage collection if the flag is set.
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() >
0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
garbage_collect_cached_blocks(context);
}
// Attempt allocate
@ -1596,7 +1595,7 @@ class DeviceCachingAllocator {
stats.active_bytes[stat_type].increase(block->size);
stats.requested_bytes[stat_type].increase(block->requested_size);
});
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.increase(1);
auto allocated_bytes_gauge =
@ -1647,7 +1646,7 @@ class DeviceCachingAllocator {
block->pool->owner_MempoolId(),
context ? context : block->context_when_allocated);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_allocations.decrease(1);
if (!block->stream_uses.empty()) {
@ -2196,8 +2195,7 @@ class DeviceCachingAllocator {
if (size < kMinBlockSize) {
return kMinBlockSize;
} else {
auto divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions(size);
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
return roundup_power2_next_division(size, divisions);
} else {
@ -2676,7 +2674,7 @@ class DeviceCachingAllocator {
if (block->pool->is_small || CUDAAllocatorConfig::expandable_segments()) {
return remaining >= kMinBlockSize;
} else {
return (size < AcceleratorAllocatorConfig::max_split_size()) &&
return (size < CUDAAllocatorConfig::max_split_size()) &&
(remaining > kSmallSize);
}
}
@ -2696,7 +2694,7 @@ class DeviceCachingAllocator {
if (C10_UNLIKELY(
set_fraction &&
AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
CUDAAllocatorConfig::garbage_collection_threshold() > 0.0)) {
// Track block reuse interval only when garbage collection is enabled.
++pool.get_free_blocks_call_count;
}
@ -2738,13 +2736,13 @@ class DeviceCachingAllocator {
}
// Do not return an oversized block for a large request
if ((p.size() < AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()))
if ((p.size() < CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()))
return false;
// Allow oversized block size to be rounded up but within a limit
if ((p.size() >= AcceleratorAllocatorConfig::max_split_size()) &&
if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->size >=
p.size() + AcceleratorAllocatorConfig::max_non_split_rounding_size()))
p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
return false;
p.block = *it;
pool.blocks.erase(it);
@ -2767,7 +2765,7 @@ class DeviceCachingAllocator {
// therefore should be of less overheads.
size_t gc_threshold = static_cast<size_t>(
AcceleratorAllocatorConfig::garbage_collection_threshold() *
CUDAAllocatorConfig::garbage_collection_threshold() *
static_cast<double>(allowed_memory_maximum));
// No need to trigger GC yet
if (total_allocated_memory <= gc_threshold) {
@ -2915,7 +2913,7 @@ class DeviceCachingAllocator {
stats.segment[stat_type].increase(1);
stats.reserved_bytes[stat_type].increase(size);
});
if (size >= AcceleratorAllocatorConfig::max_split_size())
if (size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.increase(1);
auto reserved_bytes_gauge =
STATIC_GAUGE(pytorch.CUDACachingAllocator.reserved_bytes);
@ -2944,7 +2942,7 @@ class DeviceCachingAllocator {
bool release_available_cached_blocks(
const AllocParams& p,
const std::shared_ptr<GatheredContext>& context) {
if (AcceleratorAllocatorConfig::max_split_size() ==
if (CUDAAllocatorConfig::max_split_size() ==
std::numeric_limits<size_t>::max())
return false;
BlockPool& pool = *p.pool;
@ -2952,8 +2950,8 @@ class DeviceCachingAllocator {
// because of std::unique_ptr, block cannot be trivially copied
// Use constructor for search key.
Block key(p.search_key.device, p.search_key.stream, p.search_key.size);
key.size = (key.size < AcceleratorAllocatorConfig::max_split_size())
? AcceleratorAllocatorConfig::max_split_size()
key.size = (key.size < CUDAAllocatorConfig::max_split_size())
? CUDAAllocatorConfig::max_split_size()
: key.size;
auto it = pool.blocks.lower_bound(&key);
if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
@ -2966,7 +2964,7 @@ class DeviceCachingAllocator {
--it; // Back up one item. Now on the largest block for the correct
// stream
while ((totalReleased < key.size) &&
((*it)->size >= AcceleratorAllocatorConfig::max_split_size()) &&
((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
((*it)->stream == p.stream())) {
auto cur = it;
bool is_first = cur == pool.blocks.begin();
@ -3091,7 +3089,7 @@ class DeviceCachingAllocator {
stats.reserved_bytes[static_cast<int64_t>(StatType::AGGREGATE)]
.current);
if (block->size >= AcceleratorAllocatorConfig::max_split_size())
if (block->size >= CUDAAllocatorConfig::max_split_size())
stats.oversize_segments.decrease(1);
pool->blocks.erase(block);
delete block;
@ -3718,8 +3716,8 @@ class NativeCachingAllocator : public CUDAAllocator {
auto& md = result.config_metadata;
md.garbage_collection_threshold =
AcceleratorAllocatorConfig::garbage_collection_threshold();
md.max_split_size = AcceleratorAllocatorConfig::max_split_size();
CUDAAllocatorConfig::garbage_collection_threshold();
md.max_split_size = CUDAAllocatorConfig::max_split_size();
md.pinned_num_register_threads =
CUDAAllocatorConfig::pinned_num_register_threads();
md.expandable_segments = CUDAAllocatorConfig::expandable_segments();
@ -3727,10 +3725,9 @@ class NativeCachingAllocator : public CUDAAllocator {
CUDAAllocatorConfig::release_lock_on_cudamalloc();
md.pinned_use_host_register =
CUDAAllocatorConfig::pinned_use_cuda_host_register();
md.last_allocator_settings =
AcceleratorAllocatorConfig::last_allocator_settings();
md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings();
md.roundup_power2_divisions =
AcceleratorAllocatorConfig::roundup_power2_divisions();
CUDAAllocatorConfig::roundup_power2_divisions();
return result;
}

View File

@ -67,7 +67,11 @@
_(nvmlDeviceGetComputeRunningProcesses) \
_(nvmlSystemGetCudaDriverVersion_v2)
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12040)
#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
#else
#define C10_NVML_DRIVER_API_OPTIONAL(_)
#endif
namespace c10::cuda {

View File

@ -1,4 +1,3 @@
#include <c10/core/AllocatorConfig.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <c10/xpu/XPUCachingAllocator.h>
@ -21,6 +20,8 @@ constexpr size_t kMinBlockSize = 512;
constexpr size_t kSmallSize = 1048576;
// "small" allocations are packed in 2 MiB blocks
constexpr size_t kSmallBuffer = 2097152;
// "large" allocations may be packed in 20 MiB blocks
constexpr size_t kLargeBuffer = 20971520;
// allocations between 1 and 10 MiB may use kLargeBuffer
constexpr size_t kMinLargeAlloc = 10485760;
// round up large allocations to 2 MiB

View File

@ -153,6 +153,7 @@ set(AT_MKLDNN_ACL_ENABLED 0)
set(AT_MKLDNN_ENABLED 0)
set(AT_MKL_ENABLED 0)
set(AT_KLEIDIAI_ENABLED 0)
set(AT_USE_EIGEN_SPARSE 0)
# setting default preferred BLAS options if not already present.
if(NOT INTERN_BUILD_MOBILE)
set(BLAS "MKL" CACHE STRING "Selected BLAS library")
@ -262,6 +263,15 @@ if(BLAS_LIBRARIES AND BLAS_CHECK_F2C)
include(cmake/BLAS_ABI.cmake)
endif()
if(USE_EIGEN_SPARSE AND BLAS_INFO STREQUAL "mkl")
message(WARNING "Disabling USE_EIGEN_SPARSE because MKL is enabled")
set(USE_EIGEN_SPARSE OFF)
endif()
if(USE_EIGEN_SPARSE)
set(AT_USE_EIGEN_SPARSE 1)
endif()
if(NOT INTERN_BUILD_MOBILE)
set(AT_MKL_SEQUENTIAL 0)
set(USE_BLAS 1)

View File

@ -135,6 +135,7 @@ function(caffe2_print_configuration_summary)
endif()
message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}")
message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
message(STATUS " USE_EIGEN_FOR_SPARSE : ${USE_EIGEN_SPARSE}")
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
message(STATUS " USE_KINETO : ${USE_KINETO}")
message(STATUS " USE_GFLAGS : ${USE_GFLAGS}")

View File

@ -268,10 +268,6 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
.. py:module:: torch.cuda.comm
```
```{eval-rst}
.. py:module:: torch.cuda.error
```
```{eval-rst}
.. py:module:: torch.cuda.gds
```

View File

@ -9,8 +9,9 @@ This note will eventually contain more details on how to use the APIs in torch/c
| type in custom extension | StableIValue representation | type in libtorch | Schema Type |
| -------- | ------- | ------- | ------- |
| std::optional\<S> | if there is a value, raw bitwise copy into leading bytes of uint64_t of pointer to a new StableIValue representing S. if there is no value, nullptr. | std::optional\<T> | Type? |
| RAIIATH | raw bitwise copy of underlying AtenTensorHandle into leading bytes of uint64_t | at::Tensor | Tensor |
| int32_t | raw bitwise copy into leading bytes of uint64_t | at::ScalarType | ScalarType |
| torch::stable::Tensor | raw bitwise copy of underlying AtenTensorHandle into leading bytes of uint64_t | at::Tensor | Tensor |
| RAIIATH (outdated) | raw bitwise copy of underlying AtenTensorHandle into leading bytes of uint64_t | at::Tensor | Tensor |
| torch::headeronly::ScalarType | raw bitwise copy of the translated underlying enum into leading bytes of uint64_t | torch::headeronly::ScalarType | ScalarType |
| int32_t | raw bitwise copy into leading bytes of uint64_t | at::Layout | Layout |
| int32_t | raw bitwise copy into leading bytes of uint64_t | at::MemoryFormat | MemoryFormat |
| bool | raw bitwise copy into leading bytes of uint64_t | bool | bool |
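As a rough illustration of how these mappings are exercised in practice, a boxed custom-op kernel converts each incoming `StableIValue` with `to<T>()` and boxes its result with `from()`. The sketch below is modeled on the `libtorch_agnostic` test extension elsewhere in this patch; the kernel name is hypothetical, the helpers are assumed to be visible the same way they are in that extension, and the usual `STABLE_TORCH_LIBRARY` registration boilerplate is omitted:

```cpp
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>

using torch::stable::Tensor;

// Hypothetical boxed kernel: unbox the Tensor argument, call a stable op,
// and box the result back into the stack slot, per the table above.
void boxed_row_max(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
  Tensor t = to<Tensor>(stack[0]);                        // StableIValue -> Tensor
  stack[0] = from(amax(t, /*dim=*/0, /*keepdim=*/false)); // Tensor -> StableIValue
}
```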

View File

@ -110,7 +110,6 @@ and supported quantized modules and functions.
.. py:module:: torch.ao.quantization.backend_config.executorch
.. py:module:: torch.ao.quantization.backend_config.fbgemm
.. py:module:: torch.ao.quantization.backend_config.native
.. py:module:: torch.ao.quantization.backend_config.observation_type
.. py:module:: torch.ao.quantization.backend_config.onednn
.. py:module:: torch.ao.quantization.backend_config.qnnpack
.. py:module:: torch.ao.quantization.backend_config.tensorrt

View File

@ -1588,7 +1588,6 @@ def main() -> None:
"networkx>=2.5.1",
"jinja2",
"fsspec>=0.8.5",
'intel-openmp==2025.1.1 ;platform_system == "Windows" ', # for Windows inductor
]
if BUILD_PYTHON_ONLY:
install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]

View File

@ -39,6 +39,7 @@ set(NATIVERT_TEST_SRCS
${TORCH_ROOT}/torch/nativert/graph/passes/SubgraphRewriter.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/GraphPasses.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/pass_manager/PassManager.cpp
${TORCH_ROOT}/torch/nativert/kernels/KernelHandlerRegistry.cpp
)
add_executable(test_nativert

View File

@ -0,0 +1,158 @@
#include <gtest/gtest.h>
#include <ATen/ATen.h>
#include <torch/nativert/executor/Executor.h>
#include <torch/nativert/graph/Graph.h>
#include <torch/torch.h>
#include <torch/nativert/kernels/KernelHandlerRegistry.h>
namespace torch::nativert {
/*
* This is a lightweight version of ModelRunner that executes a model in
* interpreter mode given a string graph with no weights/attributes
*/
class SimpleTestModelRunner {
public:
SimpleTestModelRunner(
const std::string_view source,
const ExecutorConfig& config) {
register_kernel_handlers();
graph_ = stringToGraph(source);
weights_ = std::make_shared<Weights>(graph_.get());
executor_ = std::make_unique<Executor>(config, graph_, weights_);
}
std::vector<c10::IValue> run(const std::vector<c10::IValue>& inputs) const {
return executor_->execute(inputs);
}
ProfileMetrics benchmarkIndividualNodes(
const std::vector<c10::IValue>& inputs) const {
return executor_->benchmarkIndividualNodes({inputs}, 10, 10);
}
private:
std::shared_ptr<Graph> graph_;
std::unique_ptr<Executor> executor_;
std::shared_ptr<Weights> weights_;
};
inline void compareIValue(
const c10::IValue& expected,
const c10::IValue& actual,
bool native = false) {
if (expected.isTensor()) {
EXPECT_TRUE(actual.isTensor());
EXPECT_TRUE(torch::allclose(
expected.toTensor(),
actual.toTensor(),
1e-5,
1e-8,
/*equal_nan*/ true));
if (!native) {
EXPECT_TRUE(expected.toTensor().strides() == actual.toTensor().strides());
}
} else if (expected.isTuple()) {
EXPECT_TRUE(actual.isTuple());
auto expected_tuple = expected.toTupleRef().elements();
auto actual_tuple = actual.toTupleRef().elements();
ASSERT_TRUE(expected_tuple.size() == actual_tuple.size());
for (size_t i = 0; i < expected_tuple.size(); i++) {
compareIValue(expected_tuple[i], actual_tuple[i], native);
}
} else if (expected.isList()) {
EXPECT_TRUE(actual.isList());
auto expected_list = expected.toList();
auto actual_list = actual.toList();
ASSERT_TRUE(expected_list.size() == actual_list.size());
for (size_t i = 0; i < expected_list.size(); i++) {
compareIValue(expected_list[i], actual_list[i], native);
}
} else if (expected.isGenericDict()) {
EXPECT_TRUE(actual.isGenericDict());
auto expected_dict = expected.toGenericDict();
auto actual_dict = actual.toGenericDict();
EXPECT_TRUE(expected_dict.size() == actual_dict.size());
for (auto& expected_kv : expected_dict) {
auto actual_kv = actual_dict.find(expected_kv.key());
ASSERT_FALSE(actual_kv == actual_dict.end());
compareIValue(expected_kv.value(), actual_kv->value(), native);
}
} else {
// Fall back to default comparison from IValue
EXPECT_TRUE(expected == actual);
}
}
void compareIValues(
std::vector<c10::IValue> expected,
std::vector<c10::IValue> actual,
bool native = false) {
ASSERT_TRUE(expected.size() == actual.size());
for (size_t i = 0; i < expected.size(); i++) {
compareIValue(expected[i], actual[i], native);
}
}
inline void testStaticKernelEqualityInternal(
const SimpleTestModelRunner& modelRunner,
const SimpleTestModelRunner& staticModelRunner,
const std::vector<c10::IValue>& args,
bool native = false) {
auto expected = modelRunner.run(args);
auto output = staticModelRunner.run(args);
compareIValues(expected, output, native);
// Run again to test the static kernel when output IValues are cached in the
// execution frame
auto output2 = staticModelRunner.run(args);
compareIValues(expected, output2, native);
}
void testStaticKernelEquality(
const std::string_view source,
const std::vector<c10::IValue>& args,
bool native = false) {
ExecutorConfig config;
config.enableStaticCPUKernels = false;
SimpleTestModelRunner model(source, config);
config.enableStaticCPUKernels = true;
SimpleTestModelRunner staticKernelModel(source, config);
testStaticKernelEqualityInternal(model, staticKernelModel, args, native);
}
inline void testGraphABEquality(
const std::string_view graph_a,
const std::string_view graph_b,
const std::vector<c10::IValue>& args,
const ExecutorConfig& config = {},
bool native = false) {
SimpleTestModelRunner model_a(graph_a, config);
auto expected = model_a.run(args);
SimpleTestModelRunner model_b(graph_b, config);
auto output = model_b.run(args);
compareIValues(expected, output, native);
}
inline void testGraphABPerf(
const std::string_view graph_a,
const std::string_view graph_b,
const std::vector<c10::IValue>& args,
const ExecutorConfig& config = {}) {
SimpleTestModelRunner model_a(graph_a, config);
auto resultA = model_a.benchmarkIndividualNodes(args);
SimpleTestModelRunner model_b(graph_b, config);
auto resultB = model_b.benchmarkIndividualNodes(args);
ASSERT_TRUE(resultA.totalTime > resultB.totalTime);
}
} // namespace torch::nativert

View File

@ -0,0 +1,15 @@
#include <gtest/gtest.h>
#include <torch/nativert/kernels/KernelFactory.h>
#include <torch/nativert/kernels/KernelHandlerRegistry.h>
using namespace ::testing;
using namespace torch::nativert;
TEST(StaticDispatchKernelRegistrationTests, TestRegistration) {
EXPECT_FALSE(KernelFactory::isHandlerRegistered("static_cpu"));
register_kernel_handlers();
EXPECT_TRUE(KernelFactory::isHandlerRegistered("static_cpu"));
// try to re-register, which should be a no-op
register_kernel_handlers();
}

View File

@ -4,6 +4,7 @@
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <torch/headeronly/util/Exception.h>
#include <torch/headeronly/core/ScalarType.h>
#ifdef LAE_USE_CUDA
#include <cuda_runtime.h>
@ -139,12 +140,10 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
const auto num_args = 6;
StableIValue stack[num_args];
int32_t t_dtype;
aoti_torch_get_dtype(t.get(), &t_dtype);
auto mf = aoti_torch_memory_format_contiguous_format();
stack[0] = from(t);
stack[1] = from(std::optional(t_dtype)); // dtype
stack[1] = from(std::optional(t.scalar_type())); // dtype
stack[2] = from(std::nullopt); // layout
stack[3] = from(std::optional(device)); // device
stack[4] = from(std::optional(false)); // pin_memory
@ -342,12 +341,24 @@ void boxed_my_narrow(
stack[0] = from(res);
}
Tensor my_new_empty_dtype_variant(Tensor t) {
std::vector<int64_t> sizes = {2, 5};
auto dtype = std::make_optional(at::ScalarType::BFloat16);
return new_empty(t, sizes, dtype);
}
void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_new_empty_dtype_variant(to<Tensor>(stack[0]));
stack[0] = from(res);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_transpose(Tensor t, int dim0, int dim1) -> Tensor");
m.def("my_empty_like(Tensor t) -> Tensor");
m.def("fill_infinity(Tensor(a!) t) -> Tensor(a!)");
m.def("my_pad(Tensor t) -> Tensor");
m.def("my_narrow(Tensor t, int dim, int start, int length) -> Tensor");
m.def("my_new_empty_dtype_variant(Tensor t) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
@ -355,6 +366,7 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_empty_like", &boxed_empty_like);
m.impl("fill_infinity", &boxed_fill_infinity);
m.impl("my_is_cpu", &boxed_my_is_cpu);
m.impl("my_new_empty_dtype_variant", &boxed_my_new_empty_dtype_variant);
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
@ -371,10 +383,31 @@ void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs
stack[0] = from(res);
}
Tensor my_amax(Tensor t) {
return amax(t, 0, false);
}
void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_amax(to<Tensor>(stack[0]));
stack[0] = from(res);
}
Tensor my_amax_vec(Tensor t) {
std::vector<int64_t> v = {0,1};
return amax(t, v, false);
}
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_amax_vec(to<Tensor>(stack[0]));
stack[0] = from(res);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
m.def("my_amax(Tensor a) -> Tensor");
m.def("my_amax_vec(Tensor a) -> Tensor");
m.def("my_is_cpu(Tensor t) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
@ -414,6 +447,8 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("test_default_constructor", &boxed_test_default_constructor);
m.impl("my_amax", &boxed_my_amax);
m.impl("my_amax_vec", &boxed_my_amax_vec);
}
// Test functions for torch::stable::accelerator APIs

View File

@ -167,6 +167,30 @@ def my_zero_(t) -> Tensor:
return torch.ops.libtorch_agnostic.my_zero_.default(t)
def my_amax(t) -> Tensor:
"""
Returns t.amax(dim=0)
Args:
t: Tensor
Returns: amax(t, dim=0)
"""
return torch.ops.libtorch_agnostic.my_amax.default(t)
def my_amax_vec(t) -> Tensor:
"""
Returns t.amax(dim=(0, 1))
Args:
t: Tensor
Returns: amax(t, dim=(0, 1))
"""
return torch.ops.libtorch_agnostic.my_amax_vec.default(t)
def fill_infinity(t) -> Tensor:
"""
Fills the tensor with inf.
@ -259,3 +283,15 @@ def test_get_current_device_index() -> int:
Returns: Current device index as an integer
"""
return torch.ops.libtorch_agnostic.test_get_current_device_index.default()
def my_new_empty_dtype_variant(t) -> Tensor:
"""
Returns a new empty tensor with shape [2, 5] and dtype bfloat16
Args:
t: Input tensor used as a reference for device and other properties
Returns: New empty tensor with shape [2, 5] and dtype bfloat16
"""
return torch.ops.libtorch_agnostic.my_new_empty_dtype_variant.default(t)

View File

@ -190,7 +190,7 @@ if not IS_WINDOWS:
deterministic = torch.are_deterministic_algorithms_enabled()
try:
# set use_deterministic_algorithms to fill unintialized memory
# set use_deterministic_algorithms to fill uninitialized memory
torch.use_deterministic_algorithms(True)
t = torch.rand(2, 7, device=device)
@ -209,6 +209,20 @@ if not IS_WINDOWS:
self.assertEqual(id(out), id(t))
self.assertEqual(out, torch.zeros_like(t))
def test_my_amax(self, device):
import libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_amax(t)
self.assertEqual(out, torch.amax(t, 0))
def test_my_amax_vec(self, device):
import libtorch_agnostic
t = torch.rand(2, 7, 5, device=device)
out = libtorch_agnostic.ops.my_amax_vec(t)
self.assertEqual(out, torch.amax(t, (0, 1)))
def test_my_is_cpu(self, device):
import libtorch_agnostic
@ -308,6 +322,21 @@ if not IS_WINDOWS:
finally:
torch.cuda.set_device(prev_device)
def test_my_new_empty_dtype_variant(self, device):
import libtorch_agnostic
deterministic = torch.are_deterministic_algorithms_enabled()
try:
# set use_deterministic_algorithms to fill uninitialized memory
torch.use_deterministic_algorithms(True)
t = torch.randn(3, 4, device=device)
out = libtorch_agnostic.ops.my_new_empty_dtype_variant(t)
ref_out = t.new_empty((2, 5), dtype=torch.bfloat16)
self.assertEqual(out, ref_out, exact_device=True)
finally:
torch.use_deterministic_algorithms(deterministic)
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
if __name__ == "__main__":

View File

@ -1486,8 +1486,8 @@ class TestFullyShardWorldSize1(FSDPTest):
@skip_if_lt_x_gpu(1)
def test_train_parity_single_worldsize1(self):
"""
Tests train parity with DDP for a single FSDP group
when sharding parameters on dim-0.
Tests train parity with DDP for a single FSDP group when sharding
parameters on dim-0.
"""
self.run_subtests(
{
@ -1535,7 +1535,9 @@ class TestFullyShardWorldSize1(FSDPTest):
losses.append(model(*inp).sum())
losses[-1].backward()
self.assertEqual(comm_mode.get_total_counts(), 0)
# Before there was 1 all-gather and 1 reduce-scatter
# Now there is 1 reduce-scatter
self.assertEqual(comm_mode.get_total_counts(), 1)
optim.step()
self.assertEqual(losses[0], losses[1])

View File

@ -286,11 +286,11 @@ class TestFullyShard2DTraining(FSDPTest):
with CommDebugMode() as bwd_comm_mode:
loss.backward()
bwd_comm_counts = bwd_comm_mode.get_comm_counts()
self.assertEqual(len(bwd_comm_counts), 1)
self.assertEqual(len(bwd_comm_counts), 2)
# First MLP's input gradient does not need to be all-reduced
self.assertEqual(bwd_comm_counts[funcol.all_reduce], num_mlps - 1)
self.assertEqual(bwd_comm_counts[c10d_ops._allgather_base_], 0)
self.assertEqual(bwd_comm_counts[c10d_ops._reduce_scatter_base_], 0)
self.assertEqual(bwd_comm_counts[c10d_ops._reduce_scatter_base_], num_mlps)
ref_loss.backward()
optim.step()

View File

@ -1,10 +1,10 @@
# Owner(s): ["oncall: distributed"]
from torch.testing._internal.common_distributed import MultiProcContinousTest
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import run_tests
class TestTemplate(MultiProcContinousTest):
class TestTemplate(MultiProcContinuousTest):
def testABC(self):
print(f"rank {self.rank} of {self.world_size} testing ABC")

View File

@ -55,7 +55,7 @@ class TestMakeCheckpointer(TestCase):
# Test that it works for sync operations
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_factory_sync")
result = checkpointer.save(self.state_dict, checkpoint_path)
result = checkpointer.save(checkpoint_path, self.state_dict)
self.assertIsNone(result) # Sync mode returns None
# Verify checkpoint was created
@ -81,7 +81,7 @@ class TestMakeCheckpointer(TestCase):
checkpoint_path = os.path.join(
self.temp_dir, "checkpoint_factory_sync_config_first"
)
result = checkpointer.save(self.state_dict, checkpoint_path)
result = checkpointer.save(checkpoint_path, self.state_dict)
self.assertIsNone(result) # Sync mode returns None
# Verify checkpoint was created
@ -105,7 +105,7 @@ class TestMakeCheckpointer(TestCase):
checkpoint_path = os.path.join(
self.temp_dir, "checkpoint_factory_sync_custom_config"
)
result = checkpointer.save(self.state_dict, checkpoint_path)
result = checkpointer.save(checkpoint_path, self.state_dict)
self.assertIsNone(result) # Sync mode returns None
# Verify checkpoint was created
@ -135,7 +135,7 @@ class TestMakeCheckpointer(TestCase):
# Test that it works for async operations
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_factory_async")
stage_future, write_future = checkpointer.save(
self.state_dict, checkpoint_path
checkpoint_path, self.state_dict
)
# Verify futures are returned

View File

@ -90,7 +90,7 @@ class TestCheckpointWriter(TestCase):
checkpoint_path = os.path.join(self.temp_dir, "checkpoint")
# Call write
self.writer.write(self.state_dict, checkpoint_path)
self.writer.write(checkpoint_path, self.state_dict)
# Verify that the checkpoint file exists
expected_file_path = os.path.join(
@ -111,7 +111,7 @@ class TestCheckpointWriter(TestCase):
checkpoint_path = os.path.join(self.temp_dir, "checkpoint")
# Call write
self.writer.write(self.state_dict, checkpoint_path)
self.writer.write(checkpoint_path, self.state_dict)
# Verify that the barrier was called
self.mock_barrier.execute_barrier.assert_called_once()
@ -123,7 +123,7 @@ class TestCheckpointWriter(TestCase):
# Call write with additional kwargs
kwargs = {"extra": "value"}
self.writer.write(self.state_dict, checkpoint_path, **kwargs)
self.writer.write(checkpoint_path, self.state_dict, **kwargs)
# Verify that the pre_commit hook was called with the correct parameters
self.assertTrue(self.mock_hook.pre_commit_called)
@ -157,7 +157,7 @@ class TestCheckpointWriter(TestCase):
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_no_barrier")
# Call write
writer.write(self.state_dict, checkpoint_path)
writer.write(checkpoint_path, self.state_dict)
# Verify that the checkpoint file exists
expected_file_path = os.path.join(
@ -179,7 +179,7 @@ class TestCheckpointWriter(TestCase):
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_no_hook")
# Call write
writer.write(self.state_dict, checkpoint_path)
writer.write(checkpoint_path, self.state_dict)
# Verify that the checkpoint file exists
expected_file_path = os.path.join(

View File

@ -3,8 +3,14 @@
import os
import shutil
import tempfile
from concurrent.futures import Future
from unittest.mock import Mock
import torch
from torch.distributed.checkpoint._experimental.checkpoint_process import (
CheckpointProcess,
CheckpointProcessConfig,
)
from torch.distributed.checkpoint._experimental.checkpoint_reader import (
CheckpointReader,
)
@ -12,12 +18,39 @@ from torch.distributed.checkpoint._experimental.checkpoint_writer import (
CheckpointWriter,
CheckpointWriterConfig,
)
from torch.distributed.checkpoint._experimental.checkpointer import SyncCheckpointer
from torch.distributed.checkpoint._experimental.checkpointer import (
AsyncCheckpointer,
Checkpointer,
SyncCheckpointer,
)
from torch.distributed.checkpoint._experimental.staging import (
CheckpointStagerConfig,
DefaultStager,
)
from torch.distributed.checkpoint._experimental.types import RankInfo
from torch.testing._internal.common_utils import run_tests, TestCase
class TestSyncCheckpointer(TestCase):
def subprocess_init_fn(name: str, parent_pid: int) -> None:
"""Initialize the subprocess for async checkpointer tests."""
assert name == "test-async-checkpointer", f"Unexpected subprocess name: {name}"
assert os.getpid() != parent_pid, "This was supposed to run in a different process"
assert os.getppid() == parent_pid, (
"This was supposed to run as a child to main process"
)
def ckpt_writer_init_fn(**kwargs) -> CheckpointWriter:
"""Initialize a CheckpointWriter in the subprocess."""
return CheckpointWriter(
config=kwargs.get("config"),
rank_info=kwargs.get("rank_info"),
)
class TestCheckpointer(TestCase):
"""Parameterized tests that work with both sync and async checkpointers."""
def setUp(self):
# Create a temporary directory for checkpoints
self.temp_dir = tempfile.mkdtemp()
@ -28,20 +61,13 @@ class TestSyncCheckpointer(TestCase):
global_rank=0,
)
self.writer_config = CheckpointWriterConfig()
self.writer = CheckpointWriter(
config=self.writer_config,
rank_info=self.rank_info,
)
# Create reader for testing
self.reader = CheckpointReader(
rank_info=self.rank_info,
)
# Create sync checkpointer
self.checkpointer = SyncCheckpointer(self.writer, self.reader)
# Create a test state dictionary
# Create test state dictionary
self.state_dict = {
"model": torch.nn.Linear(10, 5).state_dict(),
"optimizer": {"param_groups": [{"lr": 0.01}]},
@ -53,129 +79,562 @@ class TestSyncCheckpointer(TestCase):
# Clean up the temporary directory
shutil.rmtree(self.temp_dir)
def test_sync_save_and_read(self):
"""Test saving and reading a checkpoint synchronously."""
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_sync")
# Save the checkpoint synchronously
result = self.checkpointer.save(self.state_dict, checkpoint_path)
self.assertIsNone(result) # Sync mode returns None
# Verify that the checkpoint file exists
checkpoint_file = os.path.join(
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
)
self.assertTrue(os.path.exists(checkpoint_file))
# Load the checkpoint using the checkpointer
loaded_state_dict = self.checkpointer.load(checkpoint_path)
# Verify the loaded state dictionary
self.assertIn("model", loaded_state_dict)
self.assertIn("optimizer", loaded_state_dict)
self.assertEqual(loaded_state_dict["epoch"], 5)
self.assertEqual(loaded_state_dict["step"], 1000)
def test_read_with_map_location(self):
"""Test reading a checkpoint with a specific map_location."""
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_map_location")
# Save the checkpoint
self.checkpointer.save(self.state_dict, checkpoint_path)
# Load the checkpoint with map_location='cpu'
loaded_state_dict = self.checkpointer.load(
checkpoint_path, default_map_location="cpu"
)
# Verify the loaded state dictionary
self.assertIn("model", loaded_state_dict)
self.assertIn("optimizer", loaded_state_dict)
self.assertEqual(loaded_state_dict["epoch"], 5)
self.assertEqual(loaded_state_dict["step"], 1000)
def test_partial_load(self):
"""Test loading only specific keys from a checkpoint."""
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_partial")
# Save the full checkpoint
self.checkpointer.save(self.state_dict, checkpoint_path)
# Create a partial state dictionary with only some keys
partial_state_dict = {
"model": torch.nn.Linear(10, 5).state_dict(),
"epoch": None, # Will be loaded from checkpoint
}
# Load only the keys in partial_state_dict
loaded_state_dict = self.checkpointer.load(
checkpoint_path, state_dict=partial_state_dict, default_map_location="cpu"
)
# Verify that the loaded state dictionary contains values from the checkpoint
self.assertIn("model", loaded_state_dict)
self.assertIn("epoch", loaded_state_dict)
self.assertEqual(loaded_state_dict["epoch"], 5) # From checkpoint
# Verify that keys not in the partial_state_dict are not loaded
self.assertNotIn("step", loaded_state_dict)
self.assertNotIn("optimizer", loaded_state_dict)
# Verify that the loaded state dictionary is the same object as the input
self.assertIs(loaded_state_dict, partial_state_dict)
def test_partial_load_with_nested_dict(self):
"""Test loading only specific nested keys from a checkpoint."""
# Create a checkpoint with nested dictionaries
nested_state_dict = {
"model": {
"layer1": {"weight": torch.randn(5, 10), "bias": torch.randn(5)},
"layer2": {"weight": torch.randn(2, 5), "bias": torch.randn(2)},
},
"metadata": {"epoch": 10, "step": 2000},
}
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_nested")
# Create a writer and save the nested state dict
def _create_sync_checkpointer(self) -> SyncCheckpointer:
"""Create a synchronous checkpointer."""
writer = CheckpointWriter(
config=self.writer_config,
rank_info=self.rank_info,
)
writer.write(nested_state_dict, checkpoint_path)
return SyncCheckpointer(writer, self.reader)
# Create a partial state dictionary with nested structure
partial_state_dict = {
"model": {
"layer1": {"weight": None}, # Only request layer1.weight
def _create_async_checkpointer(self) -> AsyncCheckpointer:
"""Create an asynchronous checkpointer."""
# Create staging config for async operations
# Use conservative settings to avoid CUDA issues in test environment
stager_config = CheckpointStagerConfig(
use_async_staging=True,
use_pinned_memory=False, # Disable to avoid CUDA memory issues
use_shared_memory=True,
use_non_blocking_copy=False, # Disable to avoid CUDA issues
)
# Create process config
process_config = CheckpointProcessConfig(
subprocess_init_timeout_secs=30,
subprocess_shutdown_timeout_secs=60,
)
# Create stager
checkpoint_stager = DefaultStager(stager_config)
# Create checkpoint process
checkpoint_process = CheckpointProcess(
rank_info=self.rank_info,
config=process_config,
subprocess_init_fn=subprocess_init_fn,
subprocess_init_args=(
"test-async-checkpointer",
os.getpid(),
),
checkpoint_writer_init_fn=ckpt_writer_init_fn,
checkpoint_writer_init_args={
"config": self.writer_config,
"rank_info": self.rank_info,
},
"metadata": {"epoch": None}, # Only request metadata.epoch
)
# Wait for process initialization
checkpoint_process.process_creation_future.result()
return AsyncCheckpointer(
checkpoint_stager=checkpoint_stager,
checkpoint_process=checkpoint_process,
reader=self.reader,
)
def _get_checkpointers(self):
"""Get both sync and async checkpointers for parameterized testing."""
return [
("sync", self._create_sync_checkpointer()),
("async", self._create_async_checkpointer()),
]
def _save_checkpoint(self, checkpointer: Checkpointer, path, state_dict, **kwargs):
"""Save checkpoint and handle both sync/async return values."""
result = checkpointer.save(path, state_dict, **kwargs)
return (None, None) if result is None else result
def _wait_for_save(self, stage_future, write_future):
"""Wait for save operation to complete."""
if write_future is not None:
write_future.result()
if stage_future is not None:
stage_future.result()
def test_save_and_load_basic(self):
"""Test basic save and load functionality for both sync and async."""
for checkpointer_type, checkpointer in self._get_checkpointers():
with self.subTest(checkpointer_type=checkpointer_type):
try:
checkpoint_path = os.path.join(
self.temp_dir, f"checkpoint_{checkpointer_type}"
)
# Save the checkpoint
stage_future, write_future = self._save_checkpoint(
checkpointer, checkpoint_path, self.state_dict
)
self._wait_for_save(stage_future, write_future)
# Verify that the checkpoint file exists
checkpoint_file = os.path.join(
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
)
self.assertTrue(os.path.exists(checkpoint_file))
# Load the checkpoint using the checkpointer
loaded_state_dict = checkpointer.load(checkpoint_path)
# Verify the loaded state dictionary
self.assertIn("model", loaded_state_dict)
self.assertIn("optimizer", loaded_state_dict)
self.assertEqual(loaded_state_dict["epoch"], 5)
self.assertEqual(loaded_state_dict["step"], 1000)
finally:
checkpointer.close()
def test_load_with_map_location(self):
"""Test loading with map_location for both sync and async."""
for checkpointer_type, checkpointer in self._get_checkpointers():
with self.subTest(checkpointer_type=checkpointer_type):
try:
checkpoint_path = os.path.join(
self.temp_dir, f"checkpoint_map_{checkpointer_type}"
)
# Save the checkpoint
stage_future, write_future = self._save_checkpoint(
checkpointer, checkpoint_path, self.state_dict
)
self._wait_for_save(stage_future, write_future)
# Load with map_location
loaded_state_dict = checkpointer.load(
checkpoint_path, default_map_location="cpu"
)
# Verify the loaded state dictionary
self.assertIn("model", loaded_state_dict)
self.assertEqual(loaded_state_dict["epoch"], 5)
finally:
checkpointer.close()
def test_partial_load(self):
"""Test partial loading for both sync and async."""
for checkpointer_type, checkpointer in self._get_checkpointers():
with self.subTest(checkpointer_type=checkpointer_type):
try:
checkpoint_path = os.path.join(
self.temp_dir, f"checkpoint_partial_{checkpointer_type}"
)
# Save the full checkpoint
stage_future, write_future = self._save_checkpoint(
checkpointer, checkpoint_path, self.state_dict
)
self._wait_for_save(stage_future, write_future)
# Create a partial state dictionary
partial_state_dict = {
"model": torch.nn.Linear(10, 5).state_dict(),
"epoch": None,
}
# Load only the keys in partial_state_dict
loaded_state_dict = checkpointer.load(
checkpoint_path, state_dict=partial_state_dict
)
# Verify partial loading worked
self.assertIn("model", loaded_state_dict)
self.assertIn("epoch", loaded_state_dict)
self.assertEqual(loaded_state_dict["epoch"], 5)
self.assertNotIn("step", loaded_state_dict)
self.assertNotIn("optimizer", loaded_state_dict)
finally:
checkpointer.close()
def test_load_strict_mode(self):
"""Test strict mode loading for both sync and async."""
for checkpointer_type, checkpointer in self._get_checkpointers():
with self.subTest(checkpointer_type=checkpointer_type):
try:
checkpoint_path = os.path.join(
self.temp_dir, f"checkpoint_strict_{checkpointer_type}"
)
# Save a checkpoint with limited keys
limited_state_dict = {"model": torch.nn.Linear(10, 5).state_dict()}
stage_future, write_future = self._save_checkpoint(
checkpointer, checkpoint_path, limited_state_dict
)
self._wait_for_save(stage_future, write_future)
# Try to load with more keys than exist in checkpoint
partial_state_dict = {
"model": torch.nn.Linear(10, 5).state_dict(),
"missing_key": None,
}
# Should raise error in strict mode
with self.assertRaises(RuntimeError) as cm:
checkpointer.load(
checkpoint_path, state_dict=partial_state_dict, strict=True
)
self.assertIn("missing keys", str(cm.exception))
# Should work without strict mode
loaded_state_dict = checkpointer.load(
checkpoint_path, state_dict=partial_state_dict, strict=False
)
self.assertIn("model", loaded_state_dict)
finally:
checkpointer.close()
def test_save_with_kwargs(self):
"""Test save with additional kwargs for both sync and async."""
for checkpointer_type, checkpointer in self._get_checkpointers():
with self.subTest(checkpointer_type=checkpointer_type):
try:
checkpoint_path = os.path.join(
self.temp_dir, f"checkpoint_kwargs_{checkpointer_type}"
)
# For sync checkpointer, we can pass arbitrary kwargs to the writer
# For async checkpointer, we test without kwargs to avoid conflicts
if checkpointer_type == "sync":
# Sync checkpointer passes kwargs directly to writer, so arbitrary kwargs are OK
stage_future, write_future = self._save_checkpoint(
checkpointer,
checkpoint_path,
self.state_dict,
custom_arg="test_value",
another_arg=42,
)
else:
# Async checkpointer has complex kwargs handling between stager and writer
# Just test basic save without kwargs to avoid conflicts
stage_future, write_future = self._save_checkpoint(
checkpointer,
checkpoint_path,
self.state_dict,
)
self._wait_for_save(stage_future, write_future)
# Verify checkpoint was created
checkpoint_file = os.path.join(
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
)
self.assertTrue(os.path.exists(checkpoint_file))
finally:
checkpointer.close()
def test_nested_dict_partial_load(self):
"""Test loading nested dictionaries partially for both sync and async."""
for checkpointer_type, checkpointer in self._get_checkpointers():
with self.subTest(checkpointer_type=checkpointer_type):
try:
# Create a checkpoint with nested dictionaries
nested_state_dict = {
"model": {
"layer1": {
"weight": torch.randn(5, 10),
"bias": torch.randn(5),
},
"layer2": {
"weight": torch.randn(2, 5),
"bias": torch.randn(2),
},
},
"metadata": {"epoch": 10, "step": 2000},
}
checkpoint_path = os.path.join(
self.temp_dir, f"checkpoint_nested_{checkpointer_type}"
)
# Save the nested state dict
stage_future, write_future = self._save_checkpoint(
checkpointer, checkpoint_path, nested_state_dict
)
self._wait_for_save(stage_future, write_future)
# Create a partial state dictionary with nested structure
partial_state_dict = {
"model": {
"layer1": {"weight": None}, # Only request layer1.weight
},
"metadata": {"epoch": None}, # Only request metadata.epoch
}
# Load only the keys in partial_state_dict
loaded_state_dict = checkpointer.load(
checkpoint_path, state_dict=partial_state_dict
)
# Verify that the nested keys were correctly loaded
self.assertIn("model", loaded_state_dict)
self.assertIn("layer1", loaded_state_dict["model"])
self.assertIn("weight", loaded_state_dict["model"]["layer1"])
self.assertIn("metadata", loaded_state_dict)
self.assertIn("epoch", loaded_state_dict["metadata"])
# Verify values were loaded correctly
self.assertTrue(
torch.allclose(
loaded_state_dict["model"]["layer1"]["weight"],
nested_state_dict["model"]["layer1"]["weight"],
)
)
self.assertEqual(loaded_state_dict["metadata"]["epoch"], 10)
# Verify that keys not in the partial_state_dict are not loaded
self.assertNotIn("layer2", loaded_state_dict["model"])
self.assertNotIn("step", loaded_state_dict["metadata"])
finally:
checkpointer.close()
class TestAsyncCheckpointerSpecific(TestCase):
"""Tests specific to AsyncCheckpointer functionality."""
def setUp(self):
# Create a temporary directory for checkpoints
self.temp_dir = tempfile.mkdtemp()
# Create real objects for testing
self.rank_info = RankInfo(
global_world_size=1,
global_rank=0,
)
self.writer_config = CheckpointWriterConfig()
# Create reader for testing
self.reader = CheckpointReader(
rank_info=self.rank_info,
)
# Create test state dictionary
self.state_dict = {
"model": torch.nn.Linear(10, 5).state_dict(),
"optimizer": {"param_groups": [{"lr": 0.01}]},
"epoch": 5,
"step": 1000,
}
# Load only the keys in partial_state_dict
loaded_state_dict = self.checkpointer.load(
checkpoint_path, state_dict=partial_state_dict, default_map_location="cpu"
def tearDown(self):
# Clean up the temporary directory
shutil.rmtree(self.temp_dir)
def _create_async_checkpointer(self) -> AsyncCheckpointer:
"""Helper method to create AsyncCheckpointer with real components."""
# Create staging config for async operations
# Use conservative settings to avoid CUDA issues in test environment
stager_config = CheckpointStagerConfig(
use_async_staging=True,
use_pinned_memory=False, # Disable to avoid CUDA memory issues
use_shared_memory=True,
use_non_blocking_copy=False, # Disable to avoid CUDA issues
)
# Verify that the nested keys were correctly loaded
self.assertIn("model", loaded_state_dict)
self.assertIn("layer1", loaded_state_dict["model"])
self.assertIn("weight", loaded_state_dict["model"]["layer1"])
self.assertIn("metadata", loaded_state_dict)
self.assertIn("epoch", loaded_state_dict["metadata"])
# Create process config
process_config = CheckpointProcessConfig(
subprocess_init_timeout_secs=30,
subprocess_shutdown_timeout_secs=60,
)
# Verify values were loaded correctly
self.assertTrue(
torch.allclose(
loaded_state_dict["model"]["layer1"]["weight"],
nested_state_dict["model"]["layer1"]["weight"],
# Create stager
checkpoint_stager = DefaultStager(stager_config)
# Create checkpoint process
checkpoint_process = CheckpointProcess(
rank_info=self.rank_info,
config=process_config,
subprocess_init_fn=subprocess_init_fn,
subprocess_init_args=(
"test-async-checkpointer",
os.getpid(),
),
checkpoint_writer_init_fn=ckpt_writer_init_fn,
checkpoint_writer_init_args={
"config": self.writer_config,
"rank_info": self.rank_info,
},
)
# Wait for process initialization
checkpoint_process.process_creation_future.result()
return AsyncCheckpointer(
checkpoint_stager=checkpoint_stager,
checkpoint_process=checkpoint_process,
reader=self.reader,
)
def test_async_returns_futures(self):
"""Test that async save returns futures."""
checkpointer = self._create_async_checkpointer()
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_futures")
try:
# Save the checkpoint asynchronously
result = checkpointer.save(checkpoint_path, self.state_dict)
# Verify that futures are returned
self.assertIsInstance(result, tuple)
self.assertEqual(len(result), 2)
stage_future, write_future = result
self.assertIsInstance(stage_future, Future)
self.assertIsInstance(write_future, Future)
# Wait for completion
stage_future.result()
write_future.result()
finally:
checkpointer.close()
def test_async_sequential_saves_wait(self):
"""Test that sequential async saves wait for previous operations."""
checkpointer = self._create_async_checkpointer()
try:
# First save
checkpoint_path1 = os.path.join(self.temp_dir, "checkpoint_seq_1")
stage_future1, write_future1 = checkpointer.save(
checkpoint_path1, self.state_dict
)
)
self.assertEqual(loaded_state_dict["metadata"]["epoch"], 10)
# Verify that keys not in the partial_state_dict are not loaded
self.assertNotIn("layer2", loaded_state_dict["model"])
self.assertNotIn("step", loaded_state_dict["metadata"])
# Second save (should wait for first to complete)
checkpoint_path2 = os.path.join(self.temp_dir, "checkpoint_seq_2")
modified_state_dict = self.state_dict.copy()
modified_state_dict["epoch"] = 10
stage_future2, write_future2 = checkpointer.save(
checkpoint_path2, modified_state_dict
)
# Wait for both to complete
write_future1.result()
write_future2.result()
# Verify both checkpoints were created with correct content
checkpoint_file1 = os.path.join(
checkpoint_path1, f"checkpoint_{self.rank_info.global_rank}.pt"
)
checkpoint_file2 = os.path.join(
checkpoint_path2, f"checkpoint_{self.rank_info.global_rank}.pt"
)
self.assertTrue(os.path.exists(checkpoint_file1))
self.assertTrue(os.path.exists(checkpoint_file2))
loaded1 = torch.load(checkpoint_file1)
loaded2 = torch.load(checkpoint_file2)
self.assertEqual(loaded1["epoch"], 5)
self.assertEqual(loaded2["epoch"], 10)
finally:
checkpointer.close()
def test_async_multiple_saves_ordering(self):
"""Test that multiple async saves maintain proper ordering."""
checkpointer = self._create_async_checkpointer()
try:
# Create multiple state dicts
state_dicts = [
{"epoch": 1, "model": torch.nn.Linear(5, 3).state_dict()},
{"epoch": 2, "model": torch.nn.Linear(5, 3).state_dict()},
{"epoch": 3, "model": torch.nn.Linear(5, 3).state_dict()},
]
# Save multiple checkpoints
futures = []
checkpoint_paths = []
for i, state_dict in enumerate(state_dicts, 1):
checkpoint_path = os.path.join(self.temp_dir, f"multi_{i}")
checkpoint_paths.append(checkpoint_path)
stage_future, write_future = checkpointer.save(
checkpoint_path, state_dict
)
futures.append((stage_future, write_future))
# Wait for all to complete
for stage_future, write_future in futures:
write_future.result()
# Verify all checkpoints exist and have correct content
for i, checkpoint_path in enumerate(checkpoint_paths, 1):
checkpoint_file = os.path.join(
checkpoint_path, f"checkpoint_{self.rank_info.global_rank}.pt"
)
self.assertTrue(os.path.exists(checkpoint_file))
loaded = torch.load(checkpoint_file)
self.assertEqual(loaded["epoch"], i)
finally:
checkpointer.close()
def test_async_error_handling(self):
"""Test error handling in async operations."""
# Create checkpointer with mocked components to simulate errors
mock_stager = Mock()
mock_process = Mock()
mock_reader = Mock()
# Mock staging to return a completed future
mock_staging_future = Future()
mock_staging_future.set_result({"staged": "data"})
mock_stager.stage.return_value = mock_staging_future
# Mock process write to raise an error
mock_write_future = Future()
mock_write_future.set_exception(RuntimeError("Write failed"))
mock_process.write.return_value = mock_write_future
checkpointer = AsyncCheckpointer(
checkpoint_stager=mock_stager,
checkpoint_process=mock_process,
reader=mock_reader,
)
try:
# This should not raise immediately
stage_future, write_future = checkpointer.save("/tmp/test", self.state_dict)
# But waiting for the write future should raise the error
with self.assertRaises(RuntimeError) as cm:
write_future.result()
self.assertIn("Write failed", str(cm.exception))
finally:
checkpointer.close()
def test_async_future_results(self):
"""Test the results returned by async futures."""
checkpointer = self._create_async_checkpointer()
checkpoint_path = os.path.join(self.temp_dir, "checkpoint_results")
try:
# Save checkpoint
stage_future, write_future = checkpointer.save(
checkpoint_path, self.state_dict
)
# Both futures should complete successfully
stage_result = stage_future.result()
write_result = write_future.result()
# Stage result is wrapped by wrap_future() so it returns None on success
# This is intentional - the stage_future indicates completion, not data access
self.assertIsNone(stage_result)
# Write result should be None (success indicator)
self.assertIsNone(write_result)
finally:
checkpointer.close()
if __name__ == "__main__":

View File

@ -25,7 +25,7 @@ from torch.distributed.distributed_c10d import _get_default_group
from torch.distributed.tensor import DTensor
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
MultiProcContinousTest,
MultiProcContinuousTest,
requires_nccl,
)
from torch.testing._internal.common_utils import (
@ -201,7 +201,7 @@ def _test_pg_transport_with_sharded_tensor(self, device) -> None:
torch.testing.assert_close(expected_local_tensor, received_local_tensor)
class PgTransportCPU(MultiProcContinousTest):
class PgTransportCPU(MultiProcContinuousTest):
world_size = 8
timeout: timedelta = timedelta(seconds=20)
@ -227,7 +227,7 @@ class PgTransportCPU(MultiProcContinousTest):
_test_pg_transport_with_sharded_tensor(self, self.device)
class PgTransportCUDA(MultiProcContinousTest):
class PgTransportCUDA(MultiProcContinuousTest):
world_size = 2
timeout: timedelta = timedelta(seconds=20)

View File

@ -31,7 +31,7 @@ from torch.distributed.pipelining.schedules import _PipelineScheduleRuntime
from torch.nn.modules.loss import MSELoss
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
MultiProcContinousTest,
MultiProcContinuousTest,
requires_nccl,
)
from torch.testing._internal.common_utils import (
@ -199,7 +199,7 @@ def zero_gradients(stage_modules):
stage_module.zero_grad()
class ScheduleTest(MultiProcContinousTest):
class ScheduleTest(MultiProcContinuousTest):
world_size = 4
@classmethod
@ -802,7 +802,7 @@ class ScheduleTest(MultiProcContinousTest):
instantiate_parametrized_tests(ScheduleTest)
class CustomSchedulesTest(MultiProcContinousTest):
class CustomSchedulesTest(MultiProcContinuousTest):
"""
These schedules are from the ScheduleRegistry and require world_size == 2
The schedules test weird and unconventional schedules for edge cases

View File

@ -16,7 +16,7 @@ from torch.distributed.pipelining import (
from torch.distributed.pipelining._utils import PipeliningShapeError
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
MultiProcContinousTest,
MultiProcContinuousTest,
MultiProcessTestCase,
requires_nccl,
)
@ -63,7 +63,7 @@ def get_flatten_hook():
return flatten_hook
class StageTest(MultiProcContinousTest):
class StageTest(MultiProcContinuousTest):
@classmethod
def backend_str(cls) -> str:
# Testing with NCCL backend

View File

@ -1,11 +1,15 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import functools
import itertools
import random
import unittest
from typing import Union
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import nn
from torch import nn, Tensor
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DeviceMesh
from torch.distributed.tensor.debug import CommDebugMode
@ -22,7 +26,11 @@ from torch.distributed.tensor.experimental._attention import (
)
from torch.distributed.tensor.parallel import parallelize_module
from torch.nn.attention import sdpa_kernel, SDPBackend
from torch.nn.attention.flex_attention import create_block_mask, flex_attention
from torch.nn.attention.flex_attention import (
_mask_mod_signature,
create_block_mask,
flex_attention,
)
from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_CUDNN_ATTENTION,
PLATFORM_SUPPORTS_FLASH_ATTENTION,
@ -446,18 +454,94 @@ compiled_create_block_mask = torch.compile(
)
def causal_mask(b, h, q_idx, kv_idx):
return q_idx >= kv_idx
# copied from https://github.com/meta-pytorch/attention-gym/blob/main/attn_gym/masks/document_mask.py
def generate_random_lengths(total_length, num_documents):
# Initialize all lengths to 1 to ensure each document has at least one token
lengths = [1] * num_documents
remaining_length = total_length - num_documents
# Randomly distribute the remaining length
for _ in range(remaining_length):
index = random.randint(0, num_documents - 1)
lengths[index] += 1
return lengths
def length_to_offsets(
lengths: list[list[int]], device: Union[str, torch.device]
) -> Tensor:
"""Converts a list of lengths to a list of offsets.
Args:
lengths: A list of lengths.
"""
offsets = [[0] + lengths_in_batch for lengths_in_batch in lengths]
offsets = torch.tensor(offsets, device=device, dtype=torch.int32)
offsets = torch.cumsum(offsets, dim=-1)
return offsets
def _offsets_to_doc_ids_tensor(offsets):
doc_ids = []
device = offsets.device
for batch_idx in range(offsets.size(0)):
counts = offsets[batch_idx][1:] - offsets[batch_idx][:-1]
doc_id = torch.repeat_interleave(
torch.arange(len(counts), device=device, dtype=torch.int32), counts
)
doc_ids.append(doc_id)
return torch.stack(doc_ids)
def generate_doc_mask_mod(
mask_mod: _mask_mod_signature, offsets: Tensor
) -> _mask_mod_signature:
"""Generates mask mods that apply to inputs to flex attention in the sequence stacked
format.
Args:
mask_mod: The mask mod to apply to the documents
offsets: This tensor should be of shape (num_documents + 1);
this should contain the cumulative counts of document tokens.
e.g. if you have 3 documents of length 2, 4, 3 then
offsets = [0, 2, 6, 9]
Note:
What is the sequence stacked format? When assembling batches of inputs, we
take multiple sequences and stack them together to form 1 large sequence. We then
use masking to ensure that the attention scores are only applied to tokens within
the same document.
"""
document_id = _offsets_to_doc_ids_tensor(offsets)
def doc_mask_mod(b, h, q_idx, kv_idx):
same_doc = document_id[b][q_idx] == document_id[b][kv_idx]
q_logical = q_idx - offsets[b, document_id[b, q_idx]]
kv_logical = kv_idx - offsets[b, document_id[b, kv_idx]]
inner_mask = mask_mod(b, h, q_logical, kv_logical)
return same_doc & inner_mask
return doc_mask_mod
class RingFlexAttentionTest(DTensorTestBase):
@property
def world_size(self) -> int:
return 2
def _test_ring_flex_attention(self, qkv_size) -> None:
def causal_mask(b, h, q_idx, kv_idx):
return q_idx >= kv_idx
def _test_ring_flex_attention(
self, qkv_size, B=1, mask_func=causal_mask, atol=1e-6, rtol=1e-2
) -> None:
torch.cuda.manual_seed(10)
dtype = torch.float32
bs = 8
bs = B if B > 1 else 8
query_tokens = context_tokens = qkv_size
dim = 32
nheads = 8
@ -482,8 +566,8 @@ class RingFlexAttentionTest(DTensorTestBase):
)
block_mask = compiled_create_block_mask(
causal_mask,
B=1,
mask_func,
B=B,
H=1,
Q_LEN=query_tokens,
KV_LEN=context_tokens,
@ -531,8 +615,8 @@ class RingFlexAttentionTest(DTensorTestBase):
# NOTE: call create_block_mask() within TorchFunctionMode would cause error in create_fw_bw_graph
cp_block_mask = create_cp_block_mask(
causal_mask,
B=1,
mask_func,
B=B,
H=1,
Q_LEN=query_tokens,
KV_LEN=context_tokens,
@ -574,8 +658,8 @@ class RingFlexAttentionTest(DTensorTestBase):
# unshard the output
cp_out, cp_lse = context_parallel_unshard(device_mesh, [cp_out, cp_lse], [2, 2])
torch.testing.assert_close(cp_out, expect_out, atol=1e-6, rtol=1e-2)
torch.testing.assert_close(cp_lse, expect_lse, atol=1e-6, rtol=1e-2)
torch.testing.assert_close(cp_out, expect_out, atol=atol, rtol=rtol)
torch.testing.assert_close(cp_lse, expect_lse, atol=atol, rtol=rtol)
# unshard the gradient
cp_q_grad, cp_k_grad, cp_v_grad = context_parallel_unshard(
@ -583,9 +667,9 @@ class RingFlexAttentionTest(DTensorTestBase):
[cp_q.grad, cp_k.grad, cp_v.grad],
[2, 2, 2],
)
torch.testing.assert_close(cp_q_grad, q.grad, atol=1e-6, rtol=1e-2)
torch.testing.assert_close(cp_k_grad, k.grad, atol=1e-6, rtol=1e-2)
torch.testing.assert_close(cp_v_grad, v.grad, atol=1e-6, rtol=1e-2)
torch.testing.assert_close(cp_q_grad, q.grad, atol=atol, rtol=rtol)
torch.testing.assert_close(cp_k_grad, k.grad, atol=atol, rtol=rtol)
torch.testing.assert_close(cp_v_grad, v.grad, atol=atol, rtol=rtol)
# reset CP context dispatch mode to default
torch.distributed.tensor.experimental._attention._dispatch_mode = (
@ -607,6 +691,53 @@ class RingFlexAttentionTest(DTensorTestBase):
self._test_ring_flex_attention,
)
# TODO: merge with the above test
@skip_if_lt_x_gpu(2)
@with_comms
def test_ring_flex_attention_document_mask(self) -> None:
random.seed(10)
# NOTE: Each (batch_size, seq_len) tuple introduces 2 create_block_mask
# compilations: 1 for single-rank flex_attention and 1 for CP flex_attention.
# In order to avoid the "exceeds_recompile_limit" error, we need to increase
# the cache_size_limit to 12 which is the total number of compilations in our
# test case.
torch._dynamo.config.cache_size_limit = 12
# initialize document mask
doc_count = 28
batch_size_list = [2, 4, 8]
max_seq_len_list = [
256 * self.world_size,
2048,
# 128 * self.world_size # NOTE: Mismatched elements: 8 / 131072 (0.0%),
]
# TODO: change this for-loop to run_subtests
# Use a for-loop instead of run_subtests because we need to initialize the mask
# for each subtest. This can be baked into self._test_ring_flex_attention as
# a str argument denoting mask type.
for batch_size, max_seq_len in itertools.product(
batch_size_list, max_seq_len_list
):
lengths = [
generate_random_lengths(max_seq_len, doc_count)
for _ in range(batch_size)
]
offsets = length_to_offsets(lengths, self.device_type)
document_causal_mask = generate_doc_mask_mod(causal_mask, offsets)
# construct testing function
test_func = functools.partial(
self._test_ring_flex_attention,
qkv_size=max_seq_len,
B=batch_size,
mask_func=document_causal_mask,
atol=1e-6,
)
test_func()
if __name__ == "__main__":
run_tests()

View File

@ -25,7 +25,7 @@ import torch.distributed as dist
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
init_multigpu_helper,
MultiProcContinousTest,
MultiProcContinuousTest,
requires_nccl,
requires_nccl_version,
sm_is_or_higher_than,
@ -45,7 +45,7 @@ if TEST_WITH_DEV_DBG_ASAN:
sys.exit(0)
class ProcessGroupNCCLOpTest(MultiProcContinousTest):
class ProcessGroupNCCLOpTest(MultiProcContinuousTest):
@classmethod
def backend_str(cls) -> str:
return "nccl"

View File

@ -19,7 +19,7 @@ from torch.distributed.tensor import DTensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
MultiProcContinousTest,
MultiProcContinuousTest,
requires_nccl,
skip_if_lt_x_gpu,
)
@ -91,7 +91,7 @@ def loss_fn(y, target, scale=1e-4):
return torch.nn.functional.cross_entropy(y, target) * scale
class ComposabilityTest(MultiProcContinousTest):
class ComposabilityTest(MultiProcContinuousTest):
@classmethod
def backend_str(cls) -> str:
# Testing with NCCL backend

View File

@ -7,7 +7,7 @@ from dataclasses import dataclass
import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_distributed import MultiProcContinousTest
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
run_tests,
@ -46,7 +46,7 @@ def from_buffer(
@requires_cuda_p2p_access()
class CupyAsTensorTest(MultiProcContinousTest):
class CupyAsTensorTest(MultiProcContinuousTest):
@classmethod
def backend_str(cls):
return "gloo"

View File

@ -1580,14 +1580,65 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
# We want to make sure no unnecessary copy is made.
(
FileCheck()
.check("= torch.ops._c10d_functional.all_gather_into_tensor")
.check("torch.ops._c10d_functional.all_gather_into_tensor_out.default(")
.check("= torch.ops._c10d_functional.all_gather_into_tensor")
.check_count(".all_gather_into_tensor_out.default(", 2, exactly=True)
.run(code)
)
out = compiled(*inputs, **self.get_world_trs())
assert same(out, correct), f"{out} vs {correct}"
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@unittest.skipIf(not SM80OrLater, "bfloat16")
def test_all_gather_bucket_path(self):
def func(x, w, ag_0, ag_1, *, tag, ranks, group_size):
# do some unrelated matmuls
y = torch.mm(x, w)
# cast the inputs
ag_0_cast = ag_0.to(torch.bfloat16)
ag_1_cast = ag_1.to(torch.bfloat16)
# first allgather
group_name = (
torch.distributed.distributed_c10d._get_default_group().group_name
)
ag_0_out = torch.ops._c10d_functional.all_gather_into_tensor(
ag_0_cast, group_size, group_name
)
ag_0_out = torch.ops.c10d_functional.wait_tensor(ag_0_out)
ag_0_out = ag_0_out * 2
# Create dependency: second allgather input depends on first allgather output
# This prevents fusion of the two allgather operations
ag_1_modified = (
ag_1_cast + ag_0_out[: ag_1_cast.shape[0]]
) # Use part of ag_0_out
# second allgather (now depends on the first one)
ag_1_out = torch.ops._c10d_functional.all_gather_into_tensor(
ag_1_modified, group_size, group_name
)
ag_1_out = torch.ops.c10d_functional.wait_tensor(ag_1_out)
return y, ag_0_out, ag_1_out
x = torch.ones(4, 384, device="cuda", dtype=torch.float32)
w = torch.ones(384, 512, device="cuda", dtype=torch.float32)
ag_0 = torch.ones(384, 512, device="cuda", dtype=torch.float32)
ag_1 = torch.ones(384, 512, device="cuda", dtype=torch.float32)
inputs = [x, w, ag_0, ag_1]
with torch._inductor.config.patch(
{
"bucket_all_gathers_fx": "all",
"reorder_for_compute_comm_overlap": False,
}
):
compiled = torch.compile(func)
code = run_and_get_triton_code(compiled, *inputs, **self.get_world_trs())
# shouldn't have bucketed
FileCheck().check_count("wait_tensor.default(", 2, exactly=True).run(code)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@unittest.skipIf(not SM80OrLater, "bfloat16")
def test_reduce_scatter_bucket(self):

View File

@ -14,7 +14,7 @@ from torch.testing._internal.common_device_type import (
instantiate_device_type_tests,
)
from torch.testing._internal.common_distributed import (
MultiProcContinousTest,
MultiProcContinuousTest,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
@ -246,7 +246,7 @@ class TestNCCL(TestCase):
@requires_cuda_p2p_access()
class NCCLSymmetricMemoryTest(MultiProcContinousTest):
class NCCLSymmetricMemoryTest(MultiProcContinuousTest):
@property
def device(self) -> torch.device:
return torch.device("cuda", self.rank)

View File

@ -7,7 +7,7 @@
import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem
from torch.testing._internal.common_distributed import MultiProcContinousTest
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
@ -33,7 +33,7 @@ device_module = torch.get_device_module(device_type)
@requires_nvshmem()
@requires_cuda_p2p_access()
class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
class NVSHMEMSymmetricMemoryTest(MultiProcContinuousTest):
def _init_device(self) -> None:
# TODO: relieve this (seems to hang if without)
device_module.set_device(self.device)
@ -65,6 +65,48 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
out = symm_mem.empty(numel, dtype=dtype, device=self.device)
symm_mem.rendezvous(out, group=group_name)
@skipIfRocm
def test_rendezvous_slice(self) -> None:
# Rendezvous a slice of a tensor
self._init_device()
group_name = dist.group.WORLD.group_name
symm_mem.enable_symm_mem_for_group(group_name)
x = symm_mem.empty((2, 1024), device=self.device)
# Directly rendezvousing a slice should not fail
hdls = [symm_mem.rendezvous(y, group=group_name) for y in torch.chunk(x, 2)]
# Assert that handles are not the same
self.assertIsNot(hdls[0], hdls[1])
@skipIfRocm
def test_rendezvous_view(self) -> None:
# Rendezvous a view of a tensor
self._init_device()
group_name = dist.group.WORLD.group_name
symm_mem.enable_symm_mem_for_group(group_name)
x = symm_mem.empty(1024, device=self.device)
y = x.view(32, 32)
# Directly rendezvousing a view should not fail
hdl_y = symm_mem.rendezvous(y, group=group_name)
# Assert that view's handle is not the same as the original tensor's handle
hdl_x = symm_mem.rendezvous(x, group=group_name)
self.assertIsNot(hdl_x, hdl_y)
@skipIfRocm
def test_rendezvous_same(self) -> None:
# Rendezvous same tensor multiple times
self._init_device()
group_name = dist.group.WORLD.group_name
symm_mem.enable_symm_mem_for_group(group_name)
x = symm_mem.empty(1024, device=self.device)
hdl_0 = symm_mem.rendezvous(x, group=group_name)
hdl_1 = symm_mem.rendezvous(x, group=group_name)
# The handle should point to the same object
self.assertIs(hdl_0, hdl_1)
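Taken together, the three tests above pin down the caching contract of symm_mem.rendezvous: handles are keyed by the exact tensor being rendezvoused, so a slice or a view gets its own handle, while repeating the call on the same tensor returns the cached one. A toy sketch of that contract follows (illustrative only; the real handle cache may key on storage, offset, and size rather than object identity).
import torch
# Illustrative cache keyed by tensor object identity, standing in for the
# real SymmetricMemory handle cache.
_handles = {}
def toy_rendezvous(tensor):
    key = id(tensor)
    if key not in _handles:
        _handles[key] = object()  # stand-in for a SymmetricMemory handle
    return _handles[key]
x = torch.empty(2, 1024)
a, b = torch.chunk(x, 2)
assert toy_rendezvous(a) is not toy_rendezvous(b)   # distinct slices -> distinct handles
assert toy_rendezvous(x) is toy_rendezvous(x)       # same tensor -> same handle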
@skipIfRocm
def test_nvshmem_put(self) -> None:
self._init_device()
@ -117,7 +159,7 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
@instantiate_parametrized_tests
@requires_nvshmem()
@requires_cuda_p2p_access()
class NVSHMEMAll2AllTest(MultiProcContinousTest):
class NVSHMEMAll2AllTest(MultiProcContinuousTest):
def _init_device(self) -> None:
# TODO: relax this requirement (it seems to hang without it)
device_module.set_device(self.device)

View File

@ -9,7 +9,7 @@ import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem
import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
from torch._inductor.runtime.triton_compat import triton
from torch.testing._internal.common_distributed import MultiProcContinousTest
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
@ -246,7 +246,7 @@ def nvshmem_reduce_kernel(
@instantiate_parametrized_tests
@requires_nvshmem()
class NVSHMEMTritonTest(MultiProcContinousTest):
class NVSHMEMTritonTest(MultiProcContinuousTest):
def _init_device(self) -> None:
# TODO: relax this requirement (it seems to hang without it)
device_module.set_device(self.device)

View File

@ -6,7 +6,7 @@
import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_distributed import MultiProcContinousTest
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
run_tests,
@ -20,7 +20,7 @@ device_module = torch.get_device_module(device_type)
@requires_cuda_p2p_access()
class P2PIpcTest(MultiProcContinousTest):
class P2PIpcTest(MultiProcContinuousTest):
@classmethod
def backend_str(cls):
return "gloo"

View File

@ -24,7 +24,7 @@ from torch.distributed._symmetric_memory import (
from torch.testing._internal.common_cuda import _get_torch_cuda_version, SM90OrLater
from torch.testing._internal.common_device_type import e4m3_type
from torch.testing._internal.common_distributed import (
MultiProcContinousTest,
MultiProcContinuousTest,
MultiProcessTestCase,
requires_multicast_support,
skip_if_lt_x_gpu,
@ -52,7 +52,7 @@ device_module = torch.get_device_module(device_type)
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class SymmetricMemoryTest(MultiProcContinousTest):
class SymmetricMemoryTest(MultiProcContinuousTest):
@property
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
@ -636,7 +636,7 @@ class SymmetricMemoryTest(MultiProcContinousTest):
# This Test class is used to test the error handling of SymmetricMemory APIs.
# Since a process restart is often needed after each test, we use the
# MultiProcessTestCase instead of MultiProcContinousTest.
# MultiProcessTestCase instead of MultiProcContinuousTest.
@requires_cuda_p2p_access()
class SymmMemNegativeTest(MultiProcessTestCase):
def setUp(self) -> None:
@ -746,7 +746,7 @@ class SymmMemNegativeTest(MultiProcessTestCase):
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class SymmMemCollectiveTest(MultiProcContinousTest):
class SymmMemCollectiveTest(MultiProcContinuousTest):
@property
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
@ -993,7 +993,7 @@ class SymmMemCollectiveTest(MultiProcContinousTest):
@instantiate_parametrized_tests
@requires_cuda_p2p_access()
class LoweringTest(MultiProcContinousTest):
class LoweringTest(MultiProcContinuousTest):
def _init_process(self) -> None:
torch.cuda.set_device(self.device)
enable_symm_mem_for_group(dist.group.WORLD.group_name)

View File

@ -292,6 +292,56 @@ class AOTAutogradCacheTests(InductorTestCase):
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
@inductor_config.patch("fx_graph_remote_cache", False)
@inductor_config.patch("fx_graph_cache", True)
@functorch_config.patch({"enable_autograd_cache": True})
def test_vmap(self):
"""
Verify that a torch.vmap'd function compiled with inductor is saved to the
AOTAutograd cache on the first call and served from the cache on a later call.
"""
def fn(x, y):
f = lambda x, y: (x * y + 1).sum(dim=0) # noqa: E731
vmapped = torch.vmap(f)(x, y)
return vmapped.sum(dim=0)
x = torch.randn(25, requires_grad=True)
y = torch.randn(25, requires_grad=True)
x2 = x.detach().clone().requires_grad_(True)
y2 = y.detach().clone().requires_grad_(True)
compiled_fn = torch.compile(fn, backend="inductor")
# A first call should miss in the cache.
self.assertEqual(fn(x, y), compiled_fn(x2, y2))
fn(x, y).sum().backward()
compiled_fn(x2, y2).sum().backward()
self.assertEqual(x.grad, x2.grad)
self.assertEqual(y.grad, y2.grad)
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 0)
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
# Reset all tensors
x = torch.randn(25, requires_grad=True)
y = torch.randn(25, requires_grad=True)
x2 = x.detach().clone().requires_grad_(True)
y2 = y.detach().clone().requires_grad_(True)
# A second call should hit. (First reset so in-memory guards
# don't prevent compilation).
self._clear_dynamo_and_codecache()
self.assertEqual(fn(x, y), compiled_fn(x2, y2))
fn(x, y).sum().backward()
compiled_fn(x2, y2).sum().backward()
self.assertEqual(x.grad, x2.grad)
self.assertEqual(y.grad, y2.grad)
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], 1)
self.assertEqual(counters["aot_autograd"]["autograd_cache_saved"], 1)
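For reference, a standalone sketch of the vmap semantics the cached function relies on (plain eager torch, no compilation or caching involved): torch.vmap(f) applied to 1-D inputs maps f over the leading dimension, matching an explicit Python loop.
import torch
f = lambda x, y: x * y + 1.0
x, y = torch.randn(5), torch.randn(5)
vmapped = torch.vmap(f)(x, y)                              # map f over dim 0
looped = torch.stack([f(xi, yi) for xi, yi in zip(x, y)])  # manual equivalent
assert torch.allclose(vmapped, looped)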
@inductor_config.patch("fx_graph_remote_cache", False)
@inductor_config.patch("fx_graph_cache", True)
@functorch_config.patch({"enable_autograd_cache": True})

View File

@ -284,7 +284,7 @@ def fn():
def nothing(*args):
pass
code = bytecode_transformation.transform_code_object(fn.__code__, nothing)
code, _ = bytecode_transformation.transform_code_object(fn.__code__, nothing)
self.assertEqual(code.co_exceptiontable, fn.__code__.co_exceptiontable)
@skipIfNotPy311
@ -300,7 +300,7 @@ def fn():
def nothing(*args):
pass
code = bytecode_transformation.transform_code_object(fn.__code__, nothing)
code, _ = bytecode_transformation.transform_code_object(fn.__code__, nothing)
self.assertEqual(code.co_exceptiontable, fn.__code__.co_exceptiontable)
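The change above reflects transform_code_object now returning a pair instead of a bare code object. A minimal usage sketch under that assumption (the transformation callback receives the instruction list and code options; here it is a no-op, and the second tuple element is ignored):
from torch._dynamo import bytecode_transformation
def sample():
    try:
        return 1
    finally:
        pass
def nothing(instructions, code_options):
    pass  # leave the decompiled bytecode unchanged
# Round-trip the code object through the (no-op) transformation.
new_code, _ = bytecode_transformation.transform_code_object(sample.__code__, nothing)
assert isinstance(new_code, type(sample.__code__))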
@skipIfNotPy311

View File

@ -13,6 +13,7 @@ if dist.is_available():
all_to_all_single_autograd,
wait_tensor,
)
from torch.distributed.device_mesh import init_device_mesh
from torch.testing._internal.distributed.fake_pg import FakeStore
@ -26,6 +27,8 @@ class TestFakeDistributed(DynamoTestCase):
# Use FakeProcessGroup to run tests on a single process
self.store = FakeStore()
dist.init_process_group(backend="fake", rank=0, world_size=2, store=self.store)
self.local_rank = 0
self.world_size = 2
def tearDown(self):
dist.destroy_process_group()
@ -115,6 +118,23 @@ class GraphModule(torch.nn.Module):
""", # noqa: B950
)
def test_device_mesh_get_local_rank(self):
device_mesh = init_device_mesh(
device_type="cpu",
mesh_shape=(self.world_size,),
mesh_dim_names=("dp",), # data parallel dimension
)
@torch.compile(backend="eager", fullgraph=True)
def fn(x):
local_rank = device_mesh.get_local_rank()
global_rank = device_mesh.get_rank()
return x + local_rank + global_rank
x = torch.ones(10)
res = fn(x)
self.assertEqual(res, x)
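A standalone sketch of the same setup outside torch.compile (a single-process fake process group, as in setUp above), showing why res equals x on rank 0: both the local rank and the global rank are 0, so nothing is added.
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.testing._internal.distributed.fake_pg import FakeStore
dist.init_process_group(backend="fake", rank=0, world_size=2, store=FakeStore())
mesh = init_device_mesh(device_type="cpu", mesh_shape=(2,), mesh_dim_names=("dp",))
x = torch.ones(10)
res = x + mesh.get_local_rank() + mesh.get_rank()  # 0 + 0 on rank 0
assert torch.equal(res, x)
dist.destroy_process_group()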
instantiate_parametrized_tests(TestFakeDistributed)

View File

@ -3084,29 +3084,29 @@ def forward(self, L_a_ : torch.SymInt, L_b_ : torch.SymInt, L_c_ : torch.SymInt,
b = torch.arange(l_b_)
c = torch.arange(l_c_)
d = torch.arange(l_d_)
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(l_d_, 'error'); _vmap_increment_nesting = None
child = torch._C._functorch._add_batch_dim(d, 0, 1); d = None
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(l_c_, 'error'); _vmap_increment_nesting_1 = None
child_1 = torch._C._functorch._add_batch_dim(c, 0, 2); c = None
lazy_load_decompositions_2 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_2 = None
_vmap_increment_nesting_2 = torch._C._functorch._vmap_increment_nesting(l_b_, 'error'); _vmap_increment_nesting_2 = None
child_2 = torch._C._functorch._add_batch_dim(b, 0, 3); b = None
lazy_load_decompositions_3 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_3 = None
_vmap_increment_nesting_3 = torch._C._functorch._vmap_increment_nesting(l_a_, 'error'); _vmap_increment_nesting_3 = None
_add_batch_dim_3 = torch._C._functorch._add_batch_dim(a, 0, 4); a = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(l_d_, 'error'); _vmap_increment_nesting = None
child = torch._functorch.predispatch._add_batch_dim(d, 0, 1); d = None
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(l_c_, 'error'); _vmap_increment_nesting_1 = None
child_1 = torch._functorch.predispatch._add_batch_dim(c, 0, 2); c = None
lazy_load_decompositions_2 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_2 = None
_vmap_increment_nesting_2 = torch._functorch.predispatch._vmap_increment_nesting(l_b_, 'error'); _vmap_increment_nesting_2 = None
child_2 = torch._functorch.predispatch._add_batch_dim(b, 0, 3); b = None
lazy_load_decompositions_3 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_3 = None
_vmap_increment_nesting_3 = torch._functorch.predispatch._vmap_increment_nesting(l_a_, 'error'); _vmap_increment_nesting_3 = None
_add_batch_dim_3 = torch._functorch.predispatch._add_batch_dim(a, 0, 4); a = None
add = _add_batch_dim_3 + child_2; _add_batch_dim_3 = child_2 = None
add_1 = add + child_1; add = child_1 = None
batched_outputs = add_1 + child; add_1 = child = None
batched_outputs_1 = torch._C._functorch._remove_batch_dim(batched_outputs, 4, l_a_, 0); batched_outputs = l_a_ = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
batched_outputs_2 = torch._C._functorch._remove_batch_dim(batched_outputs_1, 3, l_b_, 0); batched_outputs_1 = l_b_ = None
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
batched_outputs_3 = torch._C._functorch._remove_batch_dim(batched_outputs_2, 2, l_c_, 0); batched_outputs_2 = l_c_ = None
_vmap_decrement_nesting_2 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_2 = None
_remove_batch_dim_3 = torch._C._functorch._remove_batch_dim(batched_outputs_3, 1, l_d_, 0); batched_outputs_3 = l_d_ = None
_vmap_decrement_nesting_3 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_3 = None
batched_outputs_1 = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 4, l_a_, 0); batched_outputs = l_a_ = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
batched_outputs_2 = torch._functorch.predispatch._remove_batch_dim(batched_outputs_1, 3, l_b_, 0); batched_outputs_1 = l_b_ = None
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
batched_outputs_3 = torch._functorch.predispatch._remove_batch_dim(batched_outputs_2, 2, l_c_, 0); batched_outputs_2 = l_c_ = None
_vmap_decrement_nesting_2 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_2 = None
_remove_batch_dim_3 = torch._functorch.predispatch._remove_batch_dim(batched_outputs_3, 1, l_d_, 0); batched_outputs_3 = l_d_ = None
_vmap_decrement_nesting_3 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_3 = None
return (_remove_batch_dim_3,)""", # noqa: B950
)
@ -3739,11 +3739,11 @@ class GraphModule(torch.nn.Module):
child: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
child_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
child_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
@ -3786,18 +3786,18 @@ class GraphModule(torch.nn.Module):
basis: "f32[12, 4, 3]" = chunk_1.view(12, 4, 3); chunk_1 = None
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
_add_batch_dim_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(basis, 0, 3); basis = None
_add_batch_dim_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 3); basis = None
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim_1], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim_1 = None
batched_outputs: "f32[4, 3]" = _autograd_grad[0]; _autograd_grad = None
chunked_result: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 3, 12, 0); batched_outputs = None
chunked_result: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 3, 12, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
split = chunked_result.split((12,), dim = 0); chunked_result = None
split_1: "f32[12, 4, 3]" = split[0]; split = None
@ -3816,9 +3816,9 @@ class GraphModule(torch.nn.Module):
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
results_1: "f32[12, 4, 3, 4, 3]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
results_1: "f32[12, 4, 3, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
movedim: "f32[4, 3, 4, 3, 12]" = results_1.movedim(0, -1); results_1 = None
split_2 = movedim.split((12,), dim = -1); movedim = None
@ -3867,11 +3867,11 @@ class GraphModule(torch.nn.Module):
child: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
child_1: "f32[3, 4]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
child_1: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
@ -3916,18 +3916,18 @@ class GraphModule(torch.nn.Module):
basis: "f32[12, 4, 3]" = chunk_1.view(12, 4, 3); chunk_1 = None
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting_1 = None
_add_batch_dim_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(basis, 0, 3); basis = None
_add_batch_dim_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 3); basis = None
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [child_4], [_add_batch_dim_1], retain_graph = True, create_graph = True); primals_out = child_4 = _add_batch_dim_1 = None
child_5: "f32[3, 4]" = _autograd_grad[0]; _autograd_grad = None
child_6: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(child_5, 3, 12, 0); child_5 = None
child_6: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(child_5, 3, 12, 0); child_5 = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
split = child_6.split((12,), dim = 0); child_6 = None
split_1: "f32[12, 3, 4]" = split[0]; split = None
@ -3947,9 +3947,9 @@ class GraphModule(torch.nn.Module):
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
child_10: "f32[12, 4, 3, 3, 4]" = torch._C._functorch._remove_batch_dim(child_9, 1, 12, 0); child_9 = None
child_10: "f32[12, 4, 3, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(child_9, 1, 12, 0); child_9 = None
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
movedim: "f32[4, 3, 3, 4, 12]" = child_10.movedim(0, -1); child_10 = None
split_2 = movedim.split((12,), dim = -1); movedim = None
@ -4014,18 +4014,18 @@ class GraphModule(torch.nn.Module):
basis: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(basis, 0, 1); basis = None
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 1); basis = None
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim = None
batched_outputs: "f32[4, 3]" = _autograd_grad[0]; _autograd_grad = None
chunked_result: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
chunked_result: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
split = chunked_result.split((12,), dim = 0); chunked_result = None
split_1: "f32[12, 4, 3]" = split[0]; split = None
@ -4092,18 +4092,18 @@ class GraphModule(torch.nn.Module):
basis: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 4]" = torch._C._functorch._add_batch_dim(basis, 0, 1); basis = None
_add_batch_dim: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 1); basis = None
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim = None
batched_outputs: "f32[3, 4]" = _autograd_grad[0]; _autograd_grad = None
chunked_result: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
chunked_result: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
split = chunked_result.split((12,), dim = 0); chunked_result = None
split_1: "f32[12, 3, 4]" = split[0]; split = None
@ -4172,18 +4172,18 @@ class GraphModule(torch.nn.Module):
basis: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 4]" = torch._C._functorch._add_batch_dim(basis, 0, 1); basis = None
_add_batch_dim: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(basis, 0, 1); basis = None
_autograd_grad = torch._functorch.eager_transforms._autograd_grad([primals_out], [diff_primals], [_add_batch_dim], retain_graph = True, create_graph = True); primals_out = diff_primals = _add_batch_dim = None
batched_outputs: "f32[3, 4]" = _autograd_grad[0]; _autograd_grad = None
chunked_result: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
chunked_result: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 12, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
split = chunked_result.split((12,), dim = 0); chunked_result = None
split_1: "f32[12, 3, 4]" = split[0]; split = None
@ -5229,11 +5229,11 @@ class GraphModule(torch.nn.Module):
child: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
child_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
child_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
@ -5259,9 +5259,9 @@ class GraphModule(torch.nn.Module):
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
results: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
results: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
movedim: "f32[4, 3, 12]" = results.movedim(0, -1); results = None
split = movedim.split((12,), dim = -1); movedim = None
@ -5310,11 +5310,11 @@ class GraphModule(torch.nn.Module):
child: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
child_1: "f32[3, 4]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
child_1: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
@ -5341,9 +5341,9 @@ class GraphModule(torch.nn.Module):
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
results: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
results: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
movedim: "f32[3, 4, 12]" = results.movedim(0, -1); results = None
split = movedim.split((12,), dim = -1); movedim = None
@ -5392,11 +5392,11 @@ class GraphModule(torch.nn.Module):
child: "f32[12, 3, 4]" = chunk.view(12, 3, 4); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'error'); _vmap_increment_nesting = None
child_1: "f32[3, 4]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
child_1: "f32[3, 4]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
@ -5425,10 +5425,10 @@ class GraphModule(torch.nn.Module):
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
results: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
aux_2: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(aux_1, 1, 12, 0); aux_1 = None
results: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(tangents_out_unflatten, 1, 12, 0); tangents_out_unflatten = None
aux_2: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(aux_1, 1, 12, 0); aux_1 = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
aux_3: "f32[4, 3]" = aux_2[0]; aux_2 = None
@ -5479,11 +5479,11 @@ class GraphModule(torch.nn.Module):
child: "f32[12, 4, 3]" = chunk.view(12, 4, 3); chunk = None
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(12, 'same'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(12, 'same'); _vmap_increment_nesting = None
child_1: "f32[4, 3]" = torch._C._functorch._add_batch_dim(child, 0, 1); child = None
child_1: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(child, 0, 1); child = None
_jvp_increment_nesting = torch._C._functorch._jvp_increment_nesting(); _jvp_increment_nesting = None
_set_fwd_grad_enabled = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled = None
@ -5517,10 +5517,10 @@ class GraphModule(torch.nn.Module):
_set_fwd_grad_enabled_1 = torch._C._set_fwd_grad_enabled(True); _set_fwd_grad_enabled_1 = None
_jvp_decrement_nesting = torch._C._functorch._jvp_decrement_nesting(); _jvp_decrement_nesting = None
child_8: "f32[12, 3, 4]" = torch._C._functorch._remove_batch_dim(child_6, 1, 12, 0); child_6 = None
child_9: "f32[12, 4, 3]" = torch._C._functorch._remove_batch_dim(child_7, 1, 12, 0); child_7 = None
child_8: "f32[12, 3, 4]" = torch._functorch.predispatch._remove_batch_dim(child_6, 1, 12, 0); child_6 = None
child_9: "f32[12, 4, 3]" = torch._functorch.predispatch._remove_batch_dim(child_7, 1, 12, 0); child_7 = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
movedim: "f32[3, 4, 12]" = child_8.movedim(0, -1); child_8 = None
split = movedim.split((12,), dim = -1); movedim = None
@ -6260,19 +6260,19 @@ class GraphModule(torch.nn.Module):
def forward(self, L_x_: "f32[3, 3, 3]"):
l_x_ = L_x_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
sum_1: "f32[3]" = _add_batch_dim.sum(0)
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
batched_outputs: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim,)
""",
)
@ -6298,20 +6298,20 @@ class GraphModule(torch.nn.Module):
def forward(self, L_x_: "f32[3, 3, 3]"):
l_x_ = L_x_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
sum_1: "f32[3]" = _add_batch_dim.sum(0)
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
batched_outputs: "f32[3]" = add + 3; add = None
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim,)
""",
)
@ -6338,20 +6338,20 @@ class GraphModule(torch.nn.Module):
l_x_ = L_x_
l_y_ = L_y_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
sum_1: "f32[3]" = _add_batch_dim.sum(0)
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
batched_outputs: "f32[3, 3]" = add + l_y_; add = l_y_ = None
_remove_batch_dim: "f32[3, 3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_remove_batch_dim: "f32[3, 3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim,)
""",
)
@ -6379,21 +6379,21 @@ class GraphModule(torch.nn.Module):
l_x_ = L_x_
l_y_ = L_y_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim_1: "f32[3]" = torch._C._functorch._add_batch_dim(l_y_, 1, 1); l_y_ = None
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim_1: "f32[3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 1, 1); l_y_ = None
sum_1: "f32[3]" = _add_batch_dim.sum(0)
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
batched_outputs: "f32[3]" = add + _add_batch_dim_1; add = _add_batch_dim_1 = None
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim,)
""",
)
@ -6423,21 +6423,21 @@ class GraphModule(torch.nn.Module):
l_x_ = L_x_
l_y_ = L_y_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim_1: "f32[3]" = torch._C._functorch._add_batch_dim(l_y_, 1, 1); l_y_ = None
_add_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim_1: "f32[3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 1, 1); l_y_ = None
sum_1: "f32[3]" = _add_batch_dim.sum(0)
sum_2: "f32[3]" = _add_batch_dim.sum(1); _add_batch_dim = None
add: "f32[3]" = sum_1 + sum_2; sum_1 = sum_2 = None
batched_outputs: "f32[3]" = add + _add_batch_dim_1; add = _add_batch_dim_1 = None
_remove_batch_dim: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_remove_batch_dim: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 1, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim,)
""",
)
@ -6463,29 +6463,29 @@ class GraphModule(torch.nn.Module):
l_x_ = L_x_
l_y_ = L_y_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting = None
child: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
child_1: "f32[3, 3]" = torch._C._functorch._add_batch_dim(l_y_, 0, 1); l_y_ = None
child: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
child_1: "f32[3, 3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 0, 1); l_y_ = None
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
_add_batch_dim_2: "f32[3]" = torch._C._functorch._add_batch_dim(child, 1, 2); child = None
_add_batch_dim_3: "f32[3]" = torch._C._functorch._add_batch_dim(child_1, 1, 2); child_1 = None
_add_batch_dim_2: "f32[3]" = torch._functorch.predispatch._add_batch_dim(child, 1, 2); child = None
_add_batch_dim_3: "f32[3]" = torch._functorch.predispatch._add_batch_dim(child_1, 1, 2); child_1 = None
batched_outputs: "f32[3]" = _add_batch_dim_2 + _add_batch_dim_3; _add_batch_dim_2 = _add_batch_dim_3 = None
batched_outputs_1: "f32[3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
batched_outputs_1: "f32[3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_remove_batch_dim_1: "f32[3, 3, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs_1, 1, 3, 0); batched_outputs_1 = None
_remove_batch_dim_1: "f32[3, 3, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs_1, 1, 3, 0); batched_outputs_1 = None
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
return (_remove_batch_dim_1,)
""",
)
@ -6512,27 +6512,27 @@ class GraphModule(torch.nn.Module):
l_y_ = L_y_
l_x_ = L_x_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(5, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(5, 'error'); _vmap_increment_nesting = None
child: "f32[3]" = torch._C._functorch._add_batch_dim(l_y_, 0, 1); l_y_ = None
child: "f32[3]" = torch._functorch.predispatch._add_batch_dim(l_y_, 0, 1); l_y_ = None
lazy_load_decompositions_1 = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions_1 = None
lazy_load_decompositions_1 = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions_1 = None
_vmap_increment_nesting_1 = torch._C._functorch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
_vmap_increment_nesting_1 = torch._functorch.predispatch._vmap_increment_nesting(3, 'error'); _vmap_increment_nesting_1 = None
_add_batch_dim_1: "f32[]" = torch._C._functorch._add_batch_dim(child, 0, 2); child = None
_add_batch_dim_1: "f32[]" = torch._functorch.predispatch._add_batch_dim(child, 0, 2); child = None
batched_outputs: "f32[2, 3]" = l_x_ * _add_batch_dim_1; l_x_ = _add_batch_dim_1 = None
batched_outputs_1: "f32[3, 2, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
batched_outputs_1: "f32[3, 2, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs, 2, 3, 0); batched_outputs = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_remove_batch_dim_1: "f32[5, 3, 2, 3]" = torch._C._functorch._remove_batch_dim(batched_outputs_1, 1, 5, 0); batched_outputs_1 = None
_remove_batch_dim_1: "f32[5, 3, 2, 3]" = torch._functorch.predispatch._remove_batch_dim(batched_outputs_1, 1, 5, 0); batched_outputs_1 = None
_vmap_decrement_nesting_1 = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
_vmap_decrement_nesting_1 = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting_1 = None
return (_remove_batch_dim_1,)
""",
)
@ -6557,19 +6557,19 @@ class GraphModule(torch.nn.Module):
def forward(self, L_x_: "f32[2, 4, 3]"):
l_x_ = L_x_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
child: "f32[3]" = _add_batch_dim.sum(0)
child_1: "f32[4]" = _add_batch_dim.sum(1); _add_batch_dim = None
_remove_batch_dim: "f32[2, 3]" = torch._C._functorch._remove_batch_dim(child, 1, 2, 0); child = None
_remove_batch_dim_1: "f32[2, 4]" = torch._C._functorch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
_remove_batch_dim: "f32[2, 3]" = torch._functorch.predispatch._remove_batch_dim(child, 1, 2, 0); child = None
_remove_batch_dim_1: "f32[2, 4]" = torch._functorch.predispatch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim, _remove_batch_dim_1)
""",
)
@ -6594,19 +6594,19 @@ class GraphModule(torch.nn.Module):
def forward(self, L_x_: "f32[2, 4, 3]"):
l_x_ = L_x_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
child: "f32[3]" = _add_batch_dim.sum(0)
child_1: "f32[4]" = _add_batch_dim.sum(1); _add_batch_dim = None
_remove_batch_dim: "f32[3, 2]" = torch._C._functorch._remove_batch_dim(child, 1, 2, 1); child = None
_remove_batch_dim_1: "f32[2, 4]" = torch._C._functorch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
_remove_batch_dim: "f32[3, 2]" = torch._functorch.predispatch._remove_batch_dim(child, 1, 2, 1); child = None
_remove_batch_dim_1: "f32[2, 4]" = torch._functorch.predispatch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim, _remove_batch_dim_1)
""",
)
@ -6632,19 +6632,19 @@ class GraphModule(torch.nn.Module):
def forward(self, L_x_: "f32[2, 4, 3]"):
l_x_ = L_x_
lazy_load_decompositions = torch._functorch.vmap.lazy_load_decompositions(); lazy_load_decompositions = None
lazy_load_decompositions = torch._functorch.predispatch.lazy_load_decompositions(); lazy_load_decompositions = None
_vmap_increment_nesting = torch._C._functorch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
_vmap_increment_nesting = torch._functorch.predispatch._vmap_increment_nesting(2, 'error'); _vmap_increment_nesting = None
_add_batch_dim: "f32[4, 3]" = torch._C._functorch._add_batch_dim(l_x_, 0, 1); l_x_ = None
_add_batch_dim: "f32[4, 3]" = torch._functorch.predispatch._add_batch_dim(l_x_, 0, 1); l_x_ = None
child: "f32[3]" = _add_batch_dim.sum(0)
child_1: "f32[4]" = _add_batch_dim.sum(1); _add_batch_dim = None
_remove_batch_dim: "f32[3, 2]" = torch._C._functorch._remove_batch_dim(child, 1, 2, 1); child = None
_remove_batch_dim_1: "f32[2, 4]" = torch._C._functorch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
_remove_batch_dim: "f32[3, 2]" = torch._functorch.predispatch._remove_batch_dim(child, 1, 2, 1); child = None
_remove_batch_dim_1: "f32[2, 4]" = torch._functorch.predispatch._remove_batch_dim(child_1, 1, 2, 0); child_1 = None
_vmap_decrement_nesting = torch._C._functorch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
_vmap_decrement_nesting = torch._functorch.predispatch._vmap_decrement_nesting(); _vmap_decrement_nesting = None
return (_remove_batch_dim, _remove_batch_dim_1)
""",
)

View File

@ -942,6 +942,7 @@ exclusions = {
"aot_graphs",
"aot_graphs_effects",
"pre_grad_graphs",
"joint_graph_passes",
"post_grad_graphs",
"inductor_metrics",
"ir_pre_fusion",

View File

@ -12848,6 +12848,36 @@ fn
res = opt_f(x)
self.assertEqual(ref, res)
def test_builtin_complex(self):
def f(x):
c = (
complex(),
complex(1),
complex(2, 3),
complex(imag=2),
complex(real=1),
complex(imag=1, real=2),
complex("1+2j"),
)
return [x + z for z in c]
x = torch.randn(1)
opt_f = torch.compile(f, backend="eager", fullgraph=True)
res = opt_f(x)
ref = f(x)
self.assertEqual(res, ref)
def test_builtin_complex_args(self):
@torch.compile(backend="eager", fullgraph=True)
def f(*args, **kwargs):
return torch.tensor(complex(*args, **kwargs))
self.assertRaises(Unsupported, f, 1, 1, 1)
self.assertRaises(Unsupported, f, 1, 1, fake_arg=1)
self.assertRaises(Unsupported, f, fake_arg=1)
self.assertRaises(Unsupported, f, [])
self.assertRaises(Unsupported, f, "1 + j")
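For comparison, the eager-Python behaviour of the constructor forms the first test traces through; the argument patterns in the second test (three positional arguments, unknown keywords, a list argument, and the malformed literal "1 + j") are invalid in plain Python as well, which is presumably why the compiled path reports them as Unsupported.
# Eager-Python reference for the complex() builtin forms used above.
assert complex() == 0j
assert complex(1) == 1 + 0j
assert complex(2, 3) == 2 + 3j
assert complex(imag=2) == 2j
assert complex(real=1) == 1 + 0j
assert complex(imag=1, real=2) == 2 + 1j
assert complex("1+2j") == 1 + 2j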
class TestTracer(JitTestCase):
def test_jit_save(self):

View File

@ -16,7 +16,7 @@ from torch._dynamo.package import CompilePackage, DiskDynamoStore, DynamoCache
from torch._dynamo.precompile_context import PrecompileContext
from torch._dynamo.testing import reduce_to_scalar_loss
from torch._functorch import config as functorch_config
from torch._inductor.mock_cache import global_stats, PatchCaches
from torch._inductor.mock_cache import global_stats, PatchCaches, Stats
from torch._inductor.runtime.runtime_utils import cache_dir
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@ -452,33 +452,27 @@ def add(x, y):
def fn(x, y):
return x.sin() + y
arg1 = torch.randn(32, 32, device=device)
arg2 = torch.randn(32, 32, device=device)
arg1 = torch.randn(3, 3, device=device)
arg2 = torch.randn(3, 3, device=device)
expected = fn(arg1, arg2).clone()
with PatchCaches():
compiled_fn1 = torch.compile(fn, mode="max-autotune")
result = compiled_fn1(arg1, arg2).clone()
self.assertEqual(expected, result)
self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
self.assertEqual(global_stats.autotune_local, Stats(1, 0, 1))
DynamoCache.clear()
total_frames = torch._dynamo.convert_frame.FRAME_COUNTER
self._save_and_reload(
expected_backends=1, expected_dynamo=1, expected_autotune=1
)
# During save, we check the autotune cache another time, and now it should hit
self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
compiled_fn1 = torch.compile(fn, mode="max-autotune")
with torch.compiler.set_stance("fail_on_recompile"):
result1 = compiled_fn1(arg1, arg2).clone()
self.assertEqual(expected, result1)
self.assertEqual(torch._dynamo.convert_frame.FRAME_COUNTER, total_frames)
# No new hits or misses
# Unfortunately, we don't *actually* know how many puts there will be, because
# it's possible the best autotune config was found by coordesc.
self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
self.assertEqual(global_stats.autotune_local, Stats(2, 1, 1))
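Illustrative only (not the real Stats class or the autotune cache): a toy get/put cache showing the hit/miss progression the assertions above encode, where the first compile misses and saves a config and a later lookup of the same key hits.
class ToyCache:
    def __init__(self):
        self.store, self.hits, self.misses, self.puts = {}, 0, 0, 0
    def get(self, key):
        if key in self.store:
            self.hits += 1
            return self.store[key]
        self.misses += 1
        return None
    def put(self, key, value):
        self.puts += 1
        self.store[key] = value
cache = ToyCache()
if cache.get("best_config") is None:          # first compile: miss ...
    cache.put("best_config", {"BLOCK": 64})   # ... then save
cache.get("best_config")                      # after reload: hit
assert (cache.hits, cache.misses, cache.puts) == (1, 1, 1)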
@parametrize("device", ("cpu", "cuda", "xpu"))
@torch._dynamo.config.patch(caching_precompile=True)

View File

@ -245,7 +245,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -254,6 +254,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -277,7 +279,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -286,6 +288,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -314,10 +318,10 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['y']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1000, 1000], "l_y_": [1000, 1000], "add": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -326,6 +330,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -336,7 +342,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s48", "val": "1", "vr": "[-int_oo, int_oo]", "source": "L['y']", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1000, 1000], "add": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
@ -346,6 +352,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
@ -367,7 +375,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000], "ones_1": [1000, 1000], "output_1": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -376,6 +384,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -401,28 +411,28 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack1']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
{"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 1, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack0']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['___stack0']"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"dynamo_output_graph": {"sizes": {"l_stack0_": [1000, 1000], "ones": [1000, 1000], "output": [1000, 1000], "sum_1": []}}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
@ -450,7 +460,7 @@ class StructuredTraceTest(TestCase):
{"bwd_compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['output']"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"compilation_metrics": "METRICS", "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
""", # noqa: B950
@ -470,7 +480,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "dynamo_error", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -504,7 +514,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4000000}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1000, 1000], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "stride": [1000, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1000, 1000], "output": [1000, 1000]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -615,7 +625,7 @@ class StructuredTraceTest(TestCase):
{"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "dynamo_graph_break_reason", "encoding": "string"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['args'][0]"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
@ -631,32 +641,32 @@ class StructuredTraceTest(TestCase):
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0}
{"dynamo_start": {"stack": "STACK"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['self']._modules['layers']._modules['0']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['self']._modules['layers']._modules['0']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 2, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "stride": [1024, 1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 2, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1024, 1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 2, "source": "L['x']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 3, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 3, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 8, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 3, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 8, "source": "L['self']._modules['layers']._modules['1']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 4, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 9, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 4, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 9, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 4, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 9, "source": "L['self']._modules['layers']._modules['1']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_self_modules_layers_modules_0_parameters_weight_": [1024, 1024], "l_self_modules_layers_modules_0_parameters_bias_": [1024], "l_x_": [1024, 1024], "l_self_modules_layers_modules_1_parameters_weight_": [1024, 1024], "l_self_modules_layers_modules_1_parameters_bias_": [1024], "input_1": [1024, 1024], "input_2": [1024, 1024]}}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"optimize_ddp_split_graph": {}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"optimize_ddp_split_child": {"name": "submod_0"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"optimize_ddp_split_child": {"name": "submod_1"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1024, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['self']._modules['layers']._modules['0']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 2, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 2, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 2, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 2, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 2, "source": "L['self']._modules['layers']._modules['0']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -672,10 +682,10 @@ class StructuredTraceTest(TestCase):
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 16, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 29, "source": "L['self']._modules['layers']._modules['1']._parameters['weight']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 17, "describer_id": "ID", "size": 4096}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 30, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 17, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 30, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1], "storage": 17, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 30, "source": "L['self']._modules['layers']._modules['1']._parameters['bias']"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -715,7 +725,7 @@ class StructuredTraceTest(TestCase):
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1], "add": [1]}}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -724,6 +734,8 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -754,10 +766,10 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 800}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 20], "is_leaf": true, "stride": [20, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 20], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [20, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 2400}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [20, 30], "is_leaf": true, "stride": [30, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [20, 30], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [30, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['b']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [10, 20], "l_b_": [20, 30], "matmul": [10, 30]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -765,12 +777,12 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 200}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [5, 10], "is_leaf": true, "stride": [10, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [5, 10], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [10, 1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s97", "val": "5", "vr": "[2, int_oo]", "source": "L['a'].size()[0]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s98", "val": "10", "vr": "[2, int_oo]", "source": "L['a'].size()[1]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 600}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 15], "is_leaf": true, "stride": [15, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [10, 15], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [15, 1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['b']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s52", "val": "10", "vr": "[2, int_oo]", "source": "L['b'].size()[0]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"create_symbol": {"symbol": "s20", "val": "15", "vr": "[2, int_oo]", "source": "L['b'].size()[1]", "user_stack": "STACK", "stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
@ -806,7 +818,7 @@ class StructuredTraceTest(TestCase):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1], "x": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -814,7 +826,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "recompile_reasons", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [1], "x": [1]}}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
@ -844,10 +856,10 @@ def forward(self, x, y):
return add
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 12}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['x']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 1, "describer_id": "ID", "size": 12}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "is_leaf": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 1, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [3], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 1, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 1, "source": "L['y']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_x_": [3], "l_y_": [3], "add": [3]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -874,7 +886,7 @@ def forward(self, x, y):
"""\
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1], "sin": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -883,6 +895,8 @@ def forward(self, x, y):
{"artifact": {"name": "aot_forward_graph_fw_metadata", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "torch._functorch.config", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_joint_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -892,7 +906,7 @@ def forward(self, x, y):
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_storage": {"id": 0, "describer_id": "ID", "size": 4}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 0, "ndim": 1, "dtype": "torch.float32", "device": "device(type='cpu')", "size": [1], "dynamo_hint_overrides": {}, "is_leaf": true, "stride": [1], "storage": 0, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"describe_source": {"describer_id": "ID", "id": 0, "source": "L['a']"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"dynamo_output_graph": {"sizes": {"l_a_": [1], "sin": [1]}}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}

View File

@ -289,6 +289,39 @@ class TestDynamoTimed(TestCase):
"'l_x_': [3], 'linear': [1]}",
)
@dynamo_config.patch({"log_compilation_metrics": True})
@inductor_config.patch({"force_disable_caches": True})
def test_log_dynamo_start(self):
import torch._dynamo.convert_frame as convert_frame
self.warmup()
self.run_forward_backward()
# Dummy code object
def sample_func():
pass
code = sample_func.__code__
stack_strings = convert_frame.log_dynamo_start(code)
last_entry = stack_strings[-1]
# Check if the last entry is a valid stack trace i.e for the sample_func
self.assertIn(
f"Line: {code.co_firstlineno}",
last_entry,
"Log does not contain a Line no.",
)
self.assertIn(
f"Name: {code.co_name}", last_entry, "Log does not contain a Name"
)
self.assertIn(
"test_utils.py",
last_entry,
"Log file does not contain the expected Filename: 'test_utils.py'",
)
# Since the remaining logs are env specific, we just check if they are present instead of checking the exact string
self.assertGreater(len(stack_strings), 1)
@dynamo_config.patch(
{
"log_compilation_metrics": True,

Some files were not shown because too many files have changed in this diff.