Compare commits


6 Commits

Author SHA1 Message Date
a8c367127b Revert all changes to torch/cuda/_utils.py 2025-09-17 13:16:34 -07:00
fa839e440c testy test 2025-09-17 13:15:06 -07:00
4ae58a3dd4 simplify nvrtc discovery logic 2025-09-17 13:15:06 -07:00
8e8ec24374 Update _utils.py 2025-09-17 13:15:05 -07:00
4b74106204 lint 2025-09-17 13:15:05 -07:00
693880081c cub and compile_kernel 2025-09-17 13:15:04 -07:00
219 changed files with 2671 additions and 4791 deletions

View File

@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

View File

@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

View File

@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0

View File

@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0
popd
}
setup_executorch() {
pushd executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}
if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

View File

@ -1,24 +1,23 @@
sphinx==6.2.1
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
sphinx-remove-toctrees==1.0.0.post1
#Description: This is used to generate PyTorch docs
#Pinned versions: 1.0.0.post1
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
sphinxcontrib.katex==0.9.10
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.10
#Pinned versions: 0.8.6
sphinx_sitemap==2.7.1
sphinxext-opengraph==0.9.1
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1
sphinx_sitemap==2.6.0
#Description: This is used to generate sitemap for PyTorch docs
#Pinned versions: 2.7.1
#Pinned versions: 2.6.0
matplotlib==3.5.3 ; python_version < "3.13"
matplotlib==3.6.3 ; python_version >= "3.13"
@ -30,17 +29,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.35.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.35.0
#Pinned versions: 4.34.0
exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3
docutils==0.18.1
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.18.1
#Pinned versions: 0.16
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@ -50,24 +49,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
<<<<<<< HEAD
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
=======
myst-nb==1.2.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 1.2.0
>>>>>>> 195382ce28e (Update)
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.2
sphinx-design==0.6.1
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
<<<<<<< HEAD
myst-parser==0.18.1
=======
myst-parser==3.0.1
myst-nb
>>>>>>> 195382ce28e (Update)

View File

@ -83,10 +83,6 @@ rm -rf pytorch || true
pushd "$pt_checkout"
pushd docs
# Profile the docs build to see what is taking the longest
python -m cProfile -o docs_build.prof -m sphinx.cmd.build -b html -d build/doctrees source build/html
python -c "import pstats; p = pstats.Stats('docs_build.prof'); p.sort_stats('cumtime').print_stats(50)"
# Build the docs
if [ "$is_main_doc" = true ]; then
build_docs html || exit $?

View File

@ -1550,10 +1550,14 @@ test_executorch() {
install_torchvision
install_torchaudio
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1567,6 +1571,10 @@ test_executorch() {
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}

View File

@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")
for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
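
For illustration, a minimal pathlib sketch of how the two glob patterns above differ, using a hypothetical wheel stem (the exact matching of a trailing ** also varies across Python versions):

    from pathlib import Path

    # Hypothetical layout mirroring artifacts/dist/<old_stem>/ in the script above.
    root = Path("artifacts/dist/example_stem")

    # Non-recursive: only entries directly inside each *.dist-info directory.
    shallow = list(root.glob("*.dist-info/*"))

    # Recursive: walks each *.dist-info tree. A trailing "**" matches
    # directories (and, on newer Python versions, files as well), so
    # "*.dist-info/**/*" is the usual spelling when every nested file is wanted.
    deep = list(root.glob("*.dist-info/**"))

    print(len(shallow), len(deep))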

View File

@ -1 +1 @@
9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26
d119fc86140785e7efc8f125c17153544d1e0f20

.github/labeler.yml vendored
View File

@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

View File

@ -74,11 +74,7 @@ jobs:
- docs_type: python
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 60
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
timeout-minutes: 30
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}

View File

@ -71,7 +71,8 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]

View File

@ -318,6 +318,32 @@ jobs:
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

View File

@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

View File

@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11

.gitignore vendored
View File

@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak

View File

@ -964,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1409,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',

View File

@ -380,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -664,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -1433,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}
void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
return;
}
#endif
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}
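
As a usage-level sketch of the HIPBLASLT_ALLOW_TF32 gating shown in the hunks above (assuming a ROCm build; on CUDA builds the environment variable is ignored):

    import os
    import torch

    # On ROCm, TF32 matmuls stay disabled unless HIPBLASLT_ALLOW_TF32 is set,
    # so make sure it is in the environment before flipping the backend flag.
    os.environ.setdefault("HIPBLASLT_ALLOW_TF32", "1")

    torch.backends.cuda.matmul.allow_tf32 = True
    print(torch.backends.cuda.matmul.allow_tf32)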

View File

@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
view_value.device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@ -485,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI
c10::Device FunctionalTensorWrapper::device_custom() const {
// The storage pointer already uses the underlying tensor custom device (if
// applicable) to extract the device. So, we dont have to recurse again by
// doing value_.unsafeGetTensorImpl()->device().
return storage().data_ptr().device();
return value_.unsafeGetTensorImpl()->device();
}
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
return value_.unsafeGetTensorImpl()->sizes();

View File

@ -1954,8 +1954,8 @@ void scaled_gemm(
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif

View File

@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}
bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
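
To make the float4 packing arithmetic described above concrete (illustrative numbers only):

    # Float4_e2m1fn_x2 stores two 4-bit values per byte, so a stored last
    # dimension of 64 bytes represents a logical K of 128 elements, which
    # satisfies the "K must be a multiple of 128" constraint checked above.
    k_stored = 64
    packed_factor = 2  # two fp4 values per byte
    k_logical = k_stored * packed_factor
    print(k_logical % 128 == 0)  # True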

View File

@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}
template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
if(tid >= nElements) return;
const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}
/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;
// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;
// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}
catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif
if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\

View File

@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}
Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);
TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);
TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);
auto C = at::empty({M, N}, A.options());
// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();
at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);
return C;
}
} // namespace at::native
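
A rough sketch of calling the op behind this kernel, torch._weight_int8pack_mm, with illustrative shapes; it assumes a build that exposes this private op and falls back to CPU when no XPU device is present:

    import torch

    device = "xpu" if getattr(torch, "xpu", None) and torch.xpu.is_available() else "cpu"

    # A: activations (M x K), B: int8 weights (N x K), scales: per-channel (N,)
    A = torch.randn(4, 8, dtype=torch.bfloat16, device=device)
    B = torch.randint(-128, 127, (16, 8), dtype=torch.int8, device=device)
    scales = torch.rand(16, dtype=torch.bfloat16, device=device)

    # C[m, n] = sum_k A[m, k] * B[n, k] * scales[n]
    C = torch._weight_int8pack_mm(A, B, scales)
    print(C.shape)  # torch.Size([4, 16])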

View File

@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,

View File

@ -534,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);
if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);
int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();
if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}
id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();

View File

@ -4243,7 +4243,6 @@
CPU: _weight_int8pack_mm_cpu
CUDA: _weight_int8pack_mm_cuda
MPS: _weight_int8pack_mm_mps
XPU: _weight_int8pack_mm_xpu
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
@ -10849,7 +10848,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
autogen: _foreach_maximum.Scalar_out
# foreach_minimum/maximum dispatches to clamp_max/min

View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -170,15 +170,15 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_to_run,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_to_run,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_to_run,0
google/gemma-3-4b-it,fail_accuracy,0
@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -122,7 +122,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -142,7 +142,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -178,7 +178,7 @@ llama,fail_to_run,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -198,7 +198,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_failed_to_run,0
google/gemma-2-2b,eager_failed_to_run,0
google/gemma-3-4b-it,eager_failed_to_run,0
openai/whisper-tiny,eager_failed_to_run,0
Qwen/Qwen3-0.6B,eager_failed_to_run,0


View File

@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -198,7 +198,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1
add_loop_inductor,compile_time_instruction_count,29660000000,0.1
add_loop_inductor,compile_time_instruction_count,30280000000,0.1
@ -50,27 +50,27 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1
@ -78,7 +78,7 @@ mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1
@ -86,4 +86,4 @@ basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1


View File

@ -1998,21 +1998,7 @@ def define_buck_targets(
third_party("sleef_arm"),
],
}),
compiler_flags = get_aten_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android-arm32": [
"-mfpu=vfpv3-d16",
"-march=armv7-a",
"-mthumb",
"-mfpu=neon",
],
"ovr_config//os:android-x86_32": [
"-mssse3",
],
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}),
compiler_flags = get_aten_compiler_flags(),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [
":aten_header",

View File

@ -1,5 +1,4 @@
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/FakeGuardImpl.h>
#include <array>
namespace c10::impl {
@ -15,26 +14,4 @@ DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
}
namespace {
thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
nullptr;
}
void ensureCUDADeviceGuardSet() {
constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);
const DeviceGuardImplInterface* p =
device_guard_impl_registry[cuda_idx].load();
// A non-null `ptr` indicates that the CUDA guard is already set up,
// implying this is using cuda build
if (p && p->deviceCount() == 0) {
// In following cases, we override CUDA guard interface with a no-op
// device guard. When p->deviceCount() == 0, cuda build is enabled, but no
// cuda devices available.
tls_fake_device_guard = std::make_unique<FakeGuardImpl<DeviceType::CUDA>>();
device_guard_impl_registry[cuda_idx].store(tls_fake_device_guard.get());
}
}
} // namespace c10::impl

View File

@ -6,7 +6,6 @@
#include <c10/util/Exception.h>
// Just for C10_ANONYMOUS_VARIABLE
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Registry.h>
#include <array>
@ -252,7 +251,7 @@ struct C10_API DeviceGuardImplInterface {
// for devices that don't actually have a concept of device index. Prominent
// examples are CPU and Meta.
template <DeviceType D>
struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface {
struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface {
NoOpDeviceGuardImpl() = default;
DeviceType type() const override {
return D;
@ -372,7 +371,5 @@ inline bool hasDeviceGuardImpl(DeviceType type) {
return device_guard_impl_registry[static_cast<size_t>(type)].load();
}
void C10_API ensureCUDADeviceGuardSet();
} // namespace impl
} // namespace c10

View File

@ -158,7 +158,6 @@ function(caffe2_print_configuration_summary)
if(${USE_KLEIDIAI})
message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
endif()
message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
message(STATUS " USE_UCC : ${USE_UCC}")
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")

View File

@ -482,7 +482,6 @@ function(torch_update_find_cuda_flags)
endfunction()
include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)
##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
@ -512,22 +511,3 @@ function(target_compile_options_if_supported target flag)
target_compile_options(${target} PRIVATE ${flag})
endif()
endfunction()
# Check if a global link option is supported
function(add_link_options_if_supported flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
add_link_options("LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()
function(target_link_options_if_supported tgt flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()

View File

@ -1,7 +0,0 @@
.. role:: hidden
:class: hidden-section
.. currentmodule:: {{ module }}
{{ name | underline }}
.. autofunction:: {{ fullname }}

View File

@ -1,7 +0,0 @@
.. role:: hidden
:class: hidden-section
.. currentmodule:: {{ module }}
{{ name | underline }}
.. automethod:: {{ fullname }}

View File

@ -0,0 +1,62 @@
torch.utils.bottleneck
======================
.. automodule:: torch.utils.bottleneck
.. currentmodule:: torch.utils.bottleneck
`torch.utils.bottleneck` is a tool that can be used as an initial step for
debugging bottlenecks in your program. It summarizes runs of your script with
the Python profiler and PyTorch's autograd profiler.
Run it on the command line with
::
python -m torch.utils.bottleneck /path/to/source/script.py [args]
where [args] are any number of arguments to `script.py`, or run
``python -m torch.utils.bottleneck -h`` for more usage instructions.
.. warning::
Because your script will be profiled, please ensure that it exits in a
finite amount of time.
.. warning::
Due to the asynchronous nature of CUDA kernels, when running against
CUDA code, the cProfile output and CPU-mode autograd profilers may
not show correct timings: the reported CPU time reports the amount of time
used to launch the kernels but does not include the time the kernel
spent executing on a GPU unless the operation does a synchronize.
Ops that do synchronize appear to be extremely expensive under regular
CPU-mode profilers.
In these case where timings are incorrect, the CUDA-mode autograd profiler
may be helpful.
.. note::
To decide which (CPU-only-mode or CUDA-mode) autograd profiler output to
look at, you should first check if your script is CPU-bound
("CPU total time is much greater than CUDA total time").
If it is CPU-bound, looking at the results of the CPU-mode autograd
profiler will help. If on the other hand your script spends most of its
time executing on the GPU, then it makes sense to start
looking for responsible CUDA operators in the output of the CUDA-mode
autograd profiler.
Of course the reality is much more complicated and your script might not be
in one of those two extremes depending on the part of the model you're
evaluating. If the profiler outputs don't help, you could try looking at
the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
However, please take into account that the NVTX overhead is very high and
often gives a heavily skewed timeline. Similarly, ``Intel® VTune™ Profiler``
helps to analyze performance on Intel platforms further with
:func:`torch.autograd.profiler.emit_itt()`.
.. warning::
If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
(cProfile) will include the CUDA startup time (CUDA buffer allocation cost)
in its time reporting. This should not matter if your bottlenecks result
in code much slower than the CUDA startup time.
For more complicated uses of the profilers (like in a multi-GPU case),
please see https://docs.python.org/3/library/profile.html
or :func:`torch.autograd.profiler.profile()` for more information.

View File

@ -66,7 +66,6 @@ extensions = [
"sphinx.ext.linkcode",
"sphinxcontrib.mermaid",
"sphinx_sitemap",
"sphinx_remove_toctrees"
]
myst_enable_extensions = [
@ -75,9 +74,6 @@ myst_enable_extensions = [
"html_image",
]
# Remove the "generated" tag from the toctree to allow for faster builds
remove_from_toctrees = ["generated/*"]
html_baseurl = "https://docs.pytorch.org/docs/stable/" # needed for sphinx-sitemap
sitemap_locales = [None]
sitemap_excludes = [
@ -97,10 +93,8 @@ numpydoc_show_class_members = False
# autosectionlabel throws warnings if section names are duplicated.
# The following tells autosectionlabel to not throw a warning for
# duplicated section names that are in different documents.
autosectionlabel_prefix_document = True
# katex options
#
#
@ -213,41 +207,6 @@ templates_path = [
]
# TODO: document these and remove them from here.
autosummary_filename_map = {
'torch.nn.utils.prune.identity': 'torch.nn.utils.prune.identity_function',
'torch.nn.utils.prune.Identity': 'torch.nn.utils.prune.Identity_class',
'torch.optim.adamw.adamw': 'torch.optim.adamw.adamw_function',
'torch.optim.adamw.AdamW': 'torch.optim.adamw.AdamW_class',
'torch.optim.asgd.asgd': 'torch.optim.asgd.asgd_function',
'torch.optim.asgd.ASGD': 'torch.optim.asgd.ASGD_class',
'torch.optim.nadam.nadam': 'torch.optim.nadam.nadam_function',
'torch.optim.nadam.NAdam': 'torch.optim.nadam.NAdam_class',
'torch.optim.radam.radam': 'torch.optim.radam.radam_function',
'torch.optim.radam.RAdam': 'torch.optim.radam.RAdam_class',
'torch.optim.rmsprop.rmsprop': 'torch.optim.rmsprop.rmsprop_function',
'torch.optim.rmsprop.RMSprop': 'torch.optim.rmsprop.RMSprop_class',
'torch.optim.rprop.rprop': 'torch.optim.rprop.rprop_function',
'torch.optim.rprop.Rprop': 'torch.optim.rprop.Rprop_class',
'torch.optim.sgd.sgd': 'torch.optim.sgd.sgd_function',
'torch.optim.sgd.SGD': 'torch.optim.sgd.SGD_class',
'torch.optim.adadelta.adadelta': 'torch.optim.adadelta.adadelta_function',
'torch.optim.adadelta.Adadelta': 'torch.optim.adadelta.Adadelta_class',
'torch.optim.adagrad.adagrad': 'torch.optim.adagrad.adagrad_function',
'torch.optim.adagrad.Adagrad': 'torch.optim.adagrad.Adagrad_class',
'torch.optim.adam.adam': 'torch.optim.adam.adam_function',
'torch.optim.adam.Adam': 'torch.optim.adam.Adam_class',
'torch.optim.adamax.adamax': 'torch.optim.adamax.adamax_function',
'torch.optim.adamax.Adamax': 'torch.optim.adamax.Adamax_class',
'torch.mtia.stream': 'torch.mtia.stream_function',
'torch.mtia.Stream': 'torch.mtia.Stream_class',
'torch.cpu.stream': 'torch.cpu.stream_function',
'torch.cpu.Stream': 'torch.cpu.Stream_class',
'torch.cuda.stream': 'torch.cuda.stream_function',
'torch.cuda.Stream': 'torch.cuda.Stream_class',
'torch.xpu.stream': 'torch.xpu.stream_function',
'torch.xpu.Stream': 'torch.xpu.Stream_class',
}
coverage_ignore_functions = [
# torch
"typename",

View File

@ -21,10 +21,10 @@
The following operations will fill uninitialized memory when this setting is
turned on:
* :meth:`torch.Tensor.resize_` when called with a tensor that is not
* :func:`torch.Tensor.resize_` when called with a tensor that is not
quantized
* :func:`torch.empty`
* :func:`torch.empty_strided`
* :func:`torch.empty_permuted`
* :func:`torch.empty_like`
```
```
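
A small sketch of the fill_uninitialized_memory setting this note covers, assuming default flag values:

    import torch
    import torch.utils.deterministic  # exposes the fill_uninitialized_memory flag

    # With deterministic algorithms enabled and fill_uninitialized_memory left at
    # its default (True), the factory functions listed above overwrite
    # uninitialized storage: floating-point tensors with NaN, integers with
    # their maximum value.
    torch.use_deterministic_algorithms(True)
    print(torch.utils.deterministic.fill_uninitialized_memory)  # True
    print(torch.empty(3))  # tensor([nan, nan, nan])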

View File

@ -260,73 +260,3 @@ these features.
```{eval-rst}
.. py:module:: torch.distributed.tensor.device_mesh
```
## Mixed Tensor and DTensor operations
So you got the following error message.
```
got mixed torch.Tensor and DTensor, need to convert all
torch.Tensor to DTensor before calling distributed operators!
```
There are two cases.
### Case 1: this is user error
The most common way to run into this error is to create a regular Tensor
(using a factory function) and then perform a Tensor-DTensor operation,
like the following:
```
tensor = torch.arange(10)
return tensor + dtensor
```
We disallow mixed Tensor-DTensor operations: if the input to any operations
(e.g. torch.add) is a DTensor, then all Tensor inputs must be DTensors.
This is because the semantics are ambiguous. We don't know if `tensor` is
the same across ranks or if it is different so we ask that the user
figure out how to construct a DTensor with accurate placements from `tensor`.
If each rank does have the same `tensor`, then please construct a replicated
DTensor:
```
tensor = torch.arange(10)
tensor = DTensor.from_local(tensor, placements=(Replicate(),))
return tensor + dtensor
```
If you wanted to create a DTensor with shards, below is how to do it.
Semantically this means that your Tensor data is split between the shards
and that operations act on the "full stacked data".
```
tensor = torch.full([], RANK)
tensor = DTensor.from_local(tensor, placements=(Shard(0),))
return tensor + dtensor
```
There are other things you may wish to do with your tensor beyond
these situations (these are not the only two options!).
## Case 2: the error came from PyTorch framework code
Sometimes the problem is that PyTorch framework code attempts to perform mixed
Tensor-DTensor operations. These are bugs in PyTorch, please file an issue
so that we can fix them.
On the user side, the only thing you can do is to avoid using the operation
that caused the issue and file a bug report.
For PyTorch Developers: one approach of fixing this is to rewrite PyTorch
framework code to avoid mixed Tensor-DTensor code (like in the previous section).
For PyTorch Developers: the second approach is to turn on DTensor implicit
replication inside the right places in PyTorch framework code.
When on, any mixed Tensor-DTensor operations will assume that the
non-DTensors can be replicated. Please be careful when using this as it
can lead to silent incorrectness.
- [Turning on implicit replication in Python](https://github.com/pytorch/pytorch/blob/d8e6b2fddc54c748d976e8f0ebe4b63ebe36d85b/torch/distributed/tensor/experimental/__init__.py#L15)
- [Turning on implicit replication in C++](https://github.com/pytorch/pytorch/blob/7a0f93344e2c851b9bcf2b9c3225a323d48fde26/aten/src/ATen/DTensorState.h#L10)

View File

@ -8,10 +8,6 @@
These APIs are experimental and subject to change without notice.
:::
```{eval-rst}
.. autoclass:: torch.fx.experimental.sym_node.DynamicInt
```
## torch.fx.experimental.symbolic_shapes
```{eval-rst}

View File

@ -76,6 +76,7 @@ storage
torch.testing <testing>
torch.utils <utils>
torch.utils.benchmark <benchmark_utils>
torch.utils.bottleneck <bottleneck>
torch.utils.checkpoint <checkpoint>
torch.utils.cpp_extension <cpp_extension>
torch.utils.data <data>

View File

@ -227,6 +227,9 @@
# Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
# By default, It is only enabled on Windows.
#
# USE_PRIORITIZED_TEXT_FOR_LD
# Uses prioritized text form cmake/prioritized_text.txt for LD
#
# BUILD_LIBTORCH_WHL
# Builds libtorch.so and its dependencies as a wheel
#
@ -320,6 +323,7 @@ from tools.setup_helpers.env import (
IS_LINUX,
IS_WINDOWS,
)
from tools.setup_helpers.generate_linker_script import gen_linker_script
def str2bool(value: str | None) -> bool:
@ -1623,6 +1627,26 @@ def main() -> None:
if BUILD_PYTHON_ONLY:
install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]
if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")):
gen_linker_script(
filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
)
linker_script_path = os.path.abspath("cmake/linker_script.ld")
os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
os.environ["CFLAGS"] = (
os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
)
os.environ["CXXFLAGS"] = (
os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
)
elif platform.system() == "Linux" and platform.processor() == "aarch64":
print_box(
"""
WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
"""
)
# Parse the command line and check the arguments before we proceed with
# building deps and setup. We need to set values so `--help` works.
dist = Distribution()

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import torch

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import warnings

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import itertools

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import io

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import torch

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import warnings

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import itertools
import re

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import logging

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import random

View File

@ -0,0 +1,7 @@
# Owner(s): ["module: unknown"]
import torch
x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()

View File

@ -0,0 +1,17 @@
# Owner(s): ["module: unknown"]
import argparse
import torch
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required args. Raises error if they aren't passed.
parser.add_argument("--foo", help="foo", required=True)
parser.add_argument("--bar", help="bar", required=True)
_ = parser.parse_args()
x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()

View File

@ -0,0 +1,29 @@
# Owner(s): ["module: unknown"]
import torch
import torch.nn as nn
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.linear = nn.Linear(20, 20)
def forward(self, input):
out = self.linear(input[:, 10:30])
return out.sum()
def main():
data = torch.randn(10, 50).cuda()
model = Model().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
for _ in range(10):
optimizer.zero_grad()
loss = model(data)
loss.backward()
optimizer.step()
if __name__ == "__main__":
main()

View File

@ -117,49 +117,6 @@ class TestFullyShardStateDictMultiProcess(FSDPTest):
for key, value in ref_sharded_sd.items():
self.assertEqual(value, sharded_sd[key])
@skip_if_lt_x_gpu(2)
def test_cached_state_dict(self):
self.run_subtests(
{"mlp_dim": [2, 3, 4, 5], "mutate_after_state_dict": [True, False]},
self._test_cached_state_dict,
)
def _test_cached_state_dict(self, mlp_dim: int, mutate_after_state_dict: bool):
torch.manual_seed(42)
model = nn.Linear(mlp_dim, mlp_dim, bias=False)
fully_shard(model, reshard_after_forward=True)
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
# call .state_dict() once and use `sd` directly to reduce cpu overhead
sd = model.state_dict()
assert isinstance(model.weight, DTensor)
if not mutate_after_state_dict:
self.assertTrue(
sd["weight"]._local_tensor.untyped_storage().data_ptr()
== model.weight._local_tensor.untyped_storage().data_ptr()
)
else:
model = model.cpu()
model = model.cuda()
self.assertTrue(
sd["weight"]._local_tensor.untyped_storage().data_ptr()
!= model.weight._local_tensor.untyped_storage().data_ptr()
)
torch.manual_seed(42 + self.rank)
inp = torch.rand(mlp_dim, mlp_dim, device="cuda")
for _ in range(5):
optim.zero_grad()
loss = model(inp).sum()
loss.backward()
optim.step()
if not mutate_after_state_dict:
self.assertTrue(
sd["weight"]._local_tensor.untyped_storage().data_ptr()
== model.weight._local_tensor.untyped_storage().data_ptr()
)
@skip_if_lt_x_gpu(2)
def test_dp_state_dict_cpu_offload(self):
self.run_subtests(

View File

@ -1,47 +1,22 @@
# Owner(s): ["oncall: distributed"]
import contextlib
import copy
import functools
import itertools
import unittest
from collections.abc import Iterable
from typing import Union
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed._composable.replicate_with_fsdp import replicate
from torch.distributed.fsdp import CPUOffloadPolicy, FSDPModule, OffloadPolicy
from torch.distributed.tensor import DTensor, init_device_mesh
from torch.distributed.fsdp import FSDPModule
from torch.distributed.tensor import DTensor
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import (
check_sharded_parity,
compiled_fsdp_test,
FSDPTest,
FSDPTestMultiThread,
MLP,
patch_all_gather,
patch_reduce_scatter,
)
from torch.testing._internal.common_utils import (
get_cycles_per_ms,
run_tests,
TEST_HPU,
wrapSwapTensorsTest,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
TransformerBlock,
)
from torch.testing._internal.common_fsdp import FSDPTestMultiThread, get_devtype, MLP
from torch.testing._internal.common_utils import run_tests
c10d_ops = torch.ops.c10d
funcol = torch.ops.c10d_functional
from torch.testing._internal.common_fsdp import get_devtype
device_type = torch.device(get_devtype())
@ -194,463 +169,5 @@ class TestReplicateRegisteredParams(FSDPTestMultiThread):
self.assertEqual(param, ref_param)
class TestReplicateCastAfterInit(FSDPTestMultiThread):
@property
def world_size(self) -> int:
return 2
@skip_if_lt_x_gpu(1)
@wrapSwapTensorsTest(True)
def test_to_float64_after_init(self):
"""Tests that the user can cast the module to float64 after init."""
# NOTE: Test fp64 instead of a lower precision dtype like bf16 for
# better numerics. The important part is changing the dtype.
torch.manual_seed(42)
mlp_dim, device, dtype = 4, device_type, torch.float64
model = MLP(mlp_dim, device=device)
for param in model.parameters():
dist.broadcast(param, src=0)
ref_model = copy.deepcopy(model).to(dtype)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
for module in (model.in_proj, model.out_proj, model):
replicate(module)
model.to(dtype)
for param in model.parameters():
self.assertEqual(param.dtype, dtype)
self.assertEqual(param.to_local().dtype, dtype)
self.assertEqual(param._spec.tensor_meta.dtype, dtype)
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
check_sharded_parity(self, ref_model, model)
torch.manual_seed(42 + self.rank + 1)
inp = torch.randn((2, mlp_dim), device=device_type.type, dtype=dtype)
for iter_idx in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
self.assertEqual(losses[0], losses[1])
check_sharded_parity(self, ref_model, model)
for param in model.parameters():
self.assertEqual(param.dtype, dtype)
self.assertEqual(param.to_local().dtype, dtype)
self.assertEqual(param._spec.tensor_meta.dtype, dtype)
self.assertEqual(param.grad.dtype, dtype)
self.assertEqual(param.grad.to_local().dtype, dtype)
self.assertEqual(param.grad._spec.tensor_meta.dtype, dtype)
for _optim in (ref_optim, optim):
_optim.step()
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
class TestReplicate1DTrainingCore(FSDPTest):
@property
def world_size(self) -> int:
return min(8, torch.get_device_module(device_type).device_count())
@skip_if_lt_x_gpu(2)
def test_train_parity_single_group(self):
"""
Tests train parity with DDP for a single FSDP group when sharding
parameters on dim-0.
"""
self.run_subtests(
{
"lin_shapes": [
[(16, 15), (15, 8)],
[(7, 15), (15, 3)],
[(16, 17), (17, 8)],
],
"use_shard_placement_fn": [False],
},
self._test_train_parity_single_group,
)
def _test_train_parity_single_group(
self, lin_shapes: list[tuple[int, int]], use_shard_placement_fn: bool
):
torch.manual_seed(42)
model = nn.Sequential(
nn.Linear(*lin_shapes[0]), nn.ReLU(), nn.Linear(*lin_shapes[1])
)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
replicate(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
torch.manual_seed(42 + self.rank + 1)
inp = (torch.randn((4, lin_shapes[0][0]), device=device_type.type),)
for iter_idx in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(*inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep kernel not supported for HPU")
@compiled_fsdp_test(compile_compute_on_module=Transformer)
def test_train_parity_multi_groups(self):
"""
Tests train parity against DDP when using multiple parameter groups for
communication (for communication and computation overlap plus memory
reduction).
"""
self.run_subtests(
{
"reshard_after_forward": [True, False],
"test_device_type": [device_type.type],
"offload_policy": [OffloadPolicy()],
"delay_after_forward": [False, True],
"delay_before_all_gather": [False, True],
"delay_before_reduce_scatter": [False, True],
"delay_before_optim": [False, True],
"unshard_async_op": [False],
},
self._test_train_parity_multi_group,
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
def test_train_parity_multi_group_cpu_offload_eager(self):
"""
Tests train parity when using multiple parameter groups for
communication and CPU offloading.
"""
self.run_subtests(
{
"reshard_after_forward": [True], # save CI time
"offload_policy": [
CPUOffloadPolicy(pin_memory=True),
CPUOffloadPolicy(pin_memory=False),
],
"test_device_type": [device_type.type],
"delay_after_forward": [False, True],
"delay_before_all_gather": [False, True],
"delay_before_reduce_scatter": [False, True],
"delay_before_optim": [False, True],
"unshard_async_op": [False],
},
self._test_train_parity_multi_group,
)
def _test_train_parity_multi_group(
self,
reshard_after_forward: Union[bool, int],
offload_policy: OffloadPolicy,
test_device_type: str,
delay_after_forward: bool,
delay_before_all_gather: bool,
delay_before_reduce_scatter: bool,
delay_before_optim: bool,
unshard_async_op: bool,
):
# Only test individual delays or all four delays to save test time
if (
delay_after_forward
+ delay_before_all_gather
+ delay_before_reduce_scatter
+ delay_before_optim
in (2, 3)
):
return
assert test_device_type in ("cuda", "hpu", "xpu", "cpu"), f"{test_device_type}"
torch.manual_seed(42)
vocab_size = 1024
model_args = ModelArgs(
n_layers=3,
n_heads=4,
vocab_size=vocab_size,
max_seq_len=64,
dropout_p=0,
)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
mesh = init_device_mesh(
test_device_type,
(self.world_size, 1),
mesh_dim_names=("replicate", "shard"),
)
fully_shard_fn = functools.partial(
replicate,
device_mesh=mesh,
reshard_after_forward=reshard_after_forward,
offload_policy=offload_policy,
)
for module in model.modules():
if isinstance(module, TransformerBlock):
fully_shard_fn(module)
fully_shard_fn(model)
if unshard_async_op:
model._set_unshard_async_op(unshard_async_op)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
delay_in_ms = 100
orig_all_gather = dist.all_gather_into_tensor
orig_reduce_scatter = dist.reduce_scatter_tensor
def delayed_all_gather(*args, **kwargs):
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
return orig_all_gather(*args, **kwargs)
def delayed_reduce_scatter(*args, **kwargs):
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
return orig_reduce_scatter(*args, **kwargs)
torch.manual_seed(42 + self.rank + 1)
patch_all_gather_ctx = (
patch_all_gather(delayed_all_gather)
if delay_before_all_gather
else contextlib.nullcontext()
)
patch_reduce_scatter_ctx = (
patch_reduce_scatter(delayed_reduce_scatter)
if delay_before_reduce_scatter
else contextlib.nullcontext()
)
with patch_all_gather_ctx, patch_reduce_scatter_ctx:
for iter_idx in range(10):
inp = torch.randint(0, vocab_size, (3, 64), device=device_type)
losses: list[torch.Tensor] = []
for _model, _optim in ((ref_model, ref_optim), (model, optim)):
losses.append(_model(inp).sum())
if _model is model and delay_after_forward:
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
losses[-1].backward()
if _model is model and delay_before_optim:
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
def test_non_root_forward_backward(self):
"""
Tests running forward/backward through the root and then through a
non-root. The non-root needs to synchronize streams/queue the callback.
"""
torch.manual_seed(42)
lin_dim = 32
model = nn.Sequential(*[MLP(lin_dim, torch.device("cpu")) for _ in range(3)])
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
for mlp in model:
replicate(mlp)
replicate(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
torch.manual_seed(42 + self.rank)
inp = torch.randn((8, lin_dim), device=device_type)
ref_root_loss = ref_model(inp).sum()
ref_root_loss.backward()
for param in ref_model.parameters():
dist.all_reduce(param.grad)
param.grad.detach().div_(self.world_size)
ref_optim.step()
ref_optim.zero_grad()
ref_nonroot_loss = ref_model[0](inp).sum()
ref_nonroot_loss.backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.detach().div_(self.world_size)
ref_optim.step()
root_loss = model(inp).sum()
root_loss.backward()
torch.get_device_module(device_type)._sleep(int(100 * get_cycles_per_ms()))
optim.step()
optim.zero_grad()
nonroot_loss = model[0](inp).sum()
nonroot_loss.backward()
optim.step()
self.assertEqual(ref_root_loss, root_loss)
self.assertEqual(ref_nonroot_loss, nonroot_loss)
self.assertEqual(ref_model(inp).sum(), model(inp).sum())
@skip_if_lt_x_gpu(2)
def test_multi_forward_module(self):
"""
Tests parity when running a module that participates multiple
times in forward.
"""
self.run_subtests(
{"reshard_after_forward": [True, False]},
self._test_multi_forward_module,
)
def _test_multi_forward_module(self, reshard_after_forward: Union[bool, int]):
class MultiForwardModule(nn.Module):
def __init__(self, device: torch.device):
super().__init__()
self.inner = nn.Linear(4, 4, device=device)
self.outer = nn.Linear(4, 5, device=device)
def forward(self, x):
i = self.inner(x)
j = self.inner(x)
return self.outer(i + j)
torch.manual_seed(42)
model = MultiForwardModule(device=device_type.type)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
replicate(model.inner)
replicate(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
torch.manual_seed(42 + self.rank)
inp = torch.randn((32, 4), device=device_type.type)
for iter_idx in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
def test_explicit_prefetching(self):
torch.manual_seed(42)
model_args = ModelArgs(n_layers=8, dropout_p=0.0)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
for layer in itertools.chain(model.layers, [model]):
replicate(layer)
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
num_to_forward_prefetch = num_to_backward_prefetch = 2
for i, layer in enumerate(model.layers):
if i >= len(model.layers) - num_to_forward_prefetch:
break
layers_to_prefetch = [
model.layers[i + j] for j in range(1, num_to_forward_prefetch + 1)
]
layer.set_modules_to_forward_prefetch(layers_to_prefetch)
for i, layer in enumerate(model.layers):
if i < num_to_backward_prefetch:
continue
layers_to_prefetch = [
model.layers[i - j] for j in range(1, num_to_backward_prefetch + 1)
]
layer.set_modules_to_backward_prefetch(layers_to_prefetch)
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 8), device=device_type.type)
for _ in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad()
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
def test_post_optim_event(self):
torch.manual_seed(42)
model_args = ModelArgs(dropout_p=0.0)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).to(device_type.type)
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
for layer in itertools.chain(model.layers, [model]):
replicate(layer)
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
def step_post_hook(
fsdp_module: FSDPModule, opt: torch.optim.Optimizer, args, kwargs
) -> None:
post_optim_event = (
torch.get_device_module(device_type).current_stream().record_event()
)
fsdp_module.set_post_optim_event(post_optim_event)
optim.register_step_post_hook(functools.partial(step_post_hook, model))
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 8), device=device_type.type)
# Track all losses and check for equality at the end to avoid a CPU
# sync point after each iteration
ref_losses: list[torch.Tensor] = []
losses: list[torch.Tensor] = []
for _ in range(10):
ref_optim.zero_grad()
ref_losses.append(ref_model(inp).sum())
ref_losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
ref_optim.step()
for _ in range(10):
optim.zero_grad()
losses.append(model(inp).sum())
losses[-1].backward()
optim.step()
# Sleep after the optimizer step to allow CPU to run ahead into the
# next iteration's forward, exercising the post-optim stream sync
torch.get_device_module(device_type)._sleep(int(25 * get_cycles_per_ms()))
for ref_loss, loss in zip(ref_losses, losses):
self.assertEqual(ref_loss, loss)
if __name__ == "__main__":
run_tests()

View File

@ -41,7 +41,7 @@ class TestDTensorDebugMode(TestCase):
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
torch.mm(x_dtensor, y_dtensor).sum()
self.assertExpectedInline(
@ -80,7 +80,7 @@ class TestDTensorDebugMode(TestCase):
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
y_dtensor = DTensor.from_local(y, mesh, [Shard(1)], run_check=False)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
z = x_dtensor + y_dtensor
z.sum().backward()
@ -121,7 +121,7 @@ class TestDTensorDebugMode(TestCase):
b_dt = DTensor.from_local(b, mesh, [Replicate(), Partial()], run_check=False)
# Capture the operator decomposition
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
torch.einsum("bld,dnh->blnh", a_dt, b_dt)
self.assertExpectedInline(
@ -176,7 +176,7 @@ class TestDTensorDebugMode(TestCase):
x = torch.randn(8, 8, 8)
linear = torch.nn.Linear(8, 8)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
linear(x).sum()
self.assertExpectedInline(
@ -196,7 +196,7 @@ class TestDTensorDebugMode(TestCase):
x = torch.randn(8, 8)
y = torch.randn(8, 8, 8)
with DebugMode(record_torchfunction=True, record_faketensor=True) as debug_mode:
with DebugMode(record_faketensor=True) as debug_mode:
torch.matmul(y, x)
self.assertExpectedInline(

View File

@ -9,11 +9,12 @@ from typing import Union
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import Tensor
from torch import nn, Tensor
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DeviceMesh
from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.experimental._attention import (
_AttentionContextParallel,
_CausalBehavior,
_cp_options,
_DispatchMode,
@ -23,6 +24,7 @@ from torch.distributed.tensor.experimental._attention import (
context_parallel_unshard,
set_rotate_method,
)
from torch.distributed.tensor.parallel import parallelize_module
from torch.nn.attention import sdpa_kernel, SDPBackend
from torch.nn.attention.flex_attention import (
_mask_mod_signature,
@ -40,6 +42,8 @@ from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests, skipIfRocm
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
ModelArgs,
Transformer,
with_comms,
)
@ -269,6 +273,180 @@ class RingAttentionTest(DTensorTestBase):
behavior,
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
@with_comms
def test_ring_attention_native_transformer(self) -> None:
self.run_subtests(
{
"is_causal": [True, False],
"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL],
},
self._test_ring_attention_native_transformer,
)
@sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
def _test_ring_attention_native_transformer(
self, is_causal: bool, rotater: _RotateMethod
) -> None:
_cp_options.enable_load_balance = is_causal
set_rotate_method(rotater_enum_to_str[rotater])
self.assertEqual(_cp_options.rotate_method, rotater)
device_mesh = DeviceMesh(
self.device_type,
torch.arange(0, self.world_size),
)
dtype = torch.bfloat16
bs = 8
ntokens = 8
dim = 32
nheads = 8
num_layers = 2
encoder_layer = nn.TransformerEncoderLayer(
d_model=dim,
nhead=nheads,
dim_feedforward=dim,
batch_first=True,
).to(dtype)
encoder_layer = parallelize_module(
module=encoder_layer,
device_mesh=device_mesh,
parallelize_plan={
"self_attn": _AttentionContextParallel(),
},
)
model = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
model = model.to(self.device_type).to(dtype)
mask = (
nn.Transformer.generate_square_subsequent_mask(
ntokens, device=self.device_type, dtype=dtype
)
if is_causal
else None
)
seq = torch.rand((bs, ntokens, dim), device=self.device_type, dtype=dtype)
with CommDebugMode() as comm_mode:
out = model(seq, mask=mask, is_causal=is_causal)
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size - 1)
* num_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_gather_into_tensor: num_layers,
},
)
with CommDebugMode() as comm_mode:
out.sum().backward()
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size * 2 - 1)
* num_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_gather_into_tensor: num_layers,
c10d_functional.all_to_all_single: self.world_size * num_layers,
},
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
@with_comms
@sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
def test_ring_attention_custom_transformer(self) -> None:
self.run_subtests(
{"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL]},
self._test_ring_attention_custom_transformer,
)
def _test_ring_attention_custom_transformer(self, rotater: _RotateMethod) -> None:
set_rotate_method(rotater_enum_to_str[rotater])
self.assertEqual(_cp_options.rotate_method, rotater)
device_mesh = DeviceMesh(
self.device_type,
torch.arange(0, self.world_size),
)
# early init DTensor RNG tracker to avoid the broadcast being captured in comm_mode
torch.distributed.tensor._random.manual_seed(10, device_mesh)
dtype = torch.bfloat16
bs = 2
args = ModelArgs()
model = Transformer(args).to(dtype).to(self.device_type)
model = parallelize_module(
module=model,
device_mesh=device_mesh,
parallelize_plan={
f"layers.{i}.attention": _AttentionContextParallel()
for i in range(args.n_layers)
},
)
seq = torch.randint(
args.vocab_size, (bs, args.max_seq_len), device=self.device_type
)
with CommDebugMode() as comm_mode:
out = model(seq)
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size - 1)
* args.n_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{c10d_functional.all_gather_into_tensor: args.n_layers},
)
with CommDebugMode() as comm_mode:
out.sum().backward()
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size * 2 - 1)
* args.n_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_gather_into_tensor: args.n_layers,
c10d_functional.all_to_all_single: self.world_size * args.n_layers,
},
)
# Compile the flex_attention function
compiled_flex_attention = torch.compile(flex_attention, dynamic=False, fullgraph=True)
@ -354,12 +532,12 @@ def generate_doc_mask_mod(
return doc_mask_mod
class CPFlexAttentionTest(DTensorTestBase):
class RingFlexAttentionTest(DTensorTestBase):
@property
def world_size(self) -> int:
return 2
def _test_cp_flex_attention(
def _test_ring_flex_attention(
self, qkv_size, B=1, mask_func=causal_mask, atol=1e-6, rtol=1e-2
) -> None:
torch.cuda.manual_seed(10)
@ -408,6 +586,15 @@ class CPFlexAttentionTest(DTensorTestBase):
mesh_shape=(self.world_size,),
mesh_dim_names=("cp",),
)
# NOTE: cp needs to know the sharding dimension
# TODO: see if this can be moved to the cp context
from torch.distributed.tensor.experimental._attention import _set_cp_global_var
_set_cp_global_var("cp_shard_dim", 2)
self.assertEqual(
torch.distributed.tensor.experimental._attention._cp_global_vars.cp_shard_dim,
2,
)
# NOTE: we do not test load balance here
_cp_options.enable_load_balance = False
@ -497,17 +684,17 @@ class CPFlexAttentionTest(DTensorTestBase):
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
def test_cp_flex_attention(self) -> None:
def test_ring_flex_attention(self) -> None:
self.run_subtests(
{"qkv_size": [128 * self.world_size, 2048]},
self._test_cp_flex_attention,
self._test_ring_flex_attention,
)
# NOTE: Context Parallel should not be used for small attentions (block_size < 128)
with self.assertRaisesRegex(AssertionError, "Tensor-likes are not close"):
self.run_subtests(
{"qkv_size": [64 * self.world_size]},
self._test_cp_flex_attention,
self._test_ring_flex_attention,
)
# TODO: merge with the above test
@ -516,7 +703,7 @@ class CPFlexAttentionTest(DTensorTestBase):
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
def test_cp_flex_attention_document_mask(self) -> None:
def test_ring_flex_attention_document_mask(self) -> None:
random.seed(10)
# NOTE: Each (batch_size, seq_len) tuple introduces 2 create_block_mask
@ -537,7 +724,7 @@ class CPFlexAttentionTest(DTensorTestBase):
# TODO: change this for-loop to run_subtests
# Use a for-loop instead of run_subtests because we need to initialize the mask
# for each subtest. This can be baked into self._test_cp_flex_attention as
# for each subtest. This can be baked into self._test_ring_flex_attention as
# a str argument denoting mask type.
for batch_size, max_seq_len in itertools.product(
batch_size_list, max_seq_len_list
@ -551,7 +738,7 @@ class CPFlexAttentionTest(DTensorTestBase):
# construct testing function
test_func = functools.partial(
self._test_cp_flex_attention,
self._test_ring_flex_attention,
qkv_size=max_seq_len,
B=batch_size,
mask_func=document_causal_mask,

View File

@ -7,7 +7,7 @@ import warnings
import torch
import torch.distributed as dist
import torch.testing._internal.common_methods_invocations as common_ops
from torch.distributed.tensor import distribute_tensor, DTensor, init_device_mesh, Shard
from torch.distributed.tensor import DTensor, init_device_mesh
from torch.overrides import resolve_name
from torch.testing._internal.common_device_type import (
instantiate_device_type_tests,
@ -21,7 +21,6 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
)
from torch.utils import _pytree as pytree
from torch.utils._pytree import tree_map
from torch.utils.debug_mode import DebugMode
# rewrite common size variables to sth can be sharded evenly
@ -118,6 +117,7 @@ dtensor_fails = {
xfail("cholesky"),
xfail("cholesky_inverse"),
xfail("cholesky_solve"),
xfail("chunk"),
xfail("combinations"),
xfail("complex"),
xfail("count_nonzero"),
@ -661,36 +661,6 @@ class TestDTensorOps(DTensorOpTestBase):
sample_inputs_filter=lambda s: s.kwargs["num_classes"] != -1,
)
def test_mean(self):
self.mesh = init_device_mesh(DEVICE_TYPE, (self.world_size,))
shape = [2 * self.world_size + 1, 2 * self.world_size]
tensor = (
torch.arange(shape[0] * shape[1], dtype=torch.float32)
.reshape(shape)
.to(DEVICE_TYPE)
)
for is_evenly_shardable in [True]:
if is_evenly_shardable:
placement = [Shard(1)]
reduce_dim = 1
else:
placement = [Shard(0)]
reduce_dim = 0
dtensor = distribute_tensor(tensor, self.mesh, placement)
with DebugMode(record_torchfunction=False) as debug_mode:
mean = dtensor.mean(dim=reduce_dim)
full_tensor = mean.full_tensor()
self.assertEqual(full_tensor, tensor.mean(dim=reduce_dim))
if is_evenly_shardable:
self.assertFalse("redistribute_input" in debug_mode.debug_string())
else:
self.assertTrue("redistribute_input" in debug_mode.debug_string())
# only instantiate tests for DEVICE_TYPE alone (i.e. either CPU or GPU)
instantiate_device_type_tests(TestDTensorOps, globals(), only_for=(DEVICE_TYPE,))

View File

@ -1,10 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import random
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor._op_schema import OpSchema, RuntimeSchemaInfo
from torch.distributed.tensor._op_schema import OpSchema
from torch.testing._internal.common_utils import run_tests, TestCase
@ -12,108 +10,12 @@ class TestOpSchema(TestCase):
def test_equality_checks_lists_of_dtensor_spec(self):
"""If x == y, then we must have h(x) == h(y)."""
dts = DTensorSpec(mesh=None, placements=tuple(), tensor_meta=None)
schema1 = OpSchema(op=None, args_schema=(dts, [dts]), kwargs_schema={})
schema2 = OpSchema(op=None, args_schema=(dts, [dts, dts]), kwargs_schema={})
schema1 = OpSchema(op=None, args_schema=[dts, [dts]], kwargs_schema={})
schema2 = OpSchema(op=None, args_schema=[dts, [dts, dts]], kwargs_schema={})
# This is a regression test; these schemas used to compare equal.
self.assertNotEqual(schema1, schema2)
self.assertNotEqual(hash(schema1), hash(schema2))
def test_equality_respects_static_attributes(self):
def _get_sample_op_schemas(static_arg_val, static_kwarg_val):
dts = DTensorSpec(mesh=None, placements=tuple(), tensor_meta=None)
static_argnum = 2
static_kwargkey = ["statickwarg"]
annotated_schemas = [
(False, False, None),
(True, False, RuntimeSchemaInfo(static_argnum=static_argnum)),
(False, True, RuntimeSchemaInfo(static_kwargkey=static_kwargkey)),
(
True,
True,
RuntimeSchemaInfo(
static_argnum=static_argnum, static_kwargkey=static_kwargkey
),
),
]
# non-tensor args show up in hash iff the argnum is static;
# kwargs show up in hash iff their name is in static_kwargkey.
# random elements are random because they are not supposed to matter for
# equality at all.
args_schema = (dts, random.randint(1, 1000000), static_arg_val)
kwargs_schema = {
"ignoredkwarg": random.randint(1, 1000000),
"statickwarg": static_kwarg_val,
}
return [
(
has_static_arg,
has_static_kwarg,
OpSchema(
op=None,
args_schema=args_schema,
kwargs_schema=kwargs_schema,
schema_info=si,
),
)
for (has_static_arg, has_static_kwarg, si) in annotated_schemas
]
for lhs_has_static_arg, lhs_has_static_kwarg, lhs in _get_sample_op_schemas(
1, 2
):
# Static arg/kwarg both match
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
1, 2
):
if (
lhs_has_static_arg == rhs_has_static_arg
and lhs_has_static_kwarg == rhs_has_static_kwarg
):
self.assertEqual(lhs, rhs)
else:
self.assertNotEqual(lhs, rhs)
# Static arg mismatch
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
3, 2
):
if (
lhs_has_static_arg
or rhs_has_static_arg
or lhs_has_static_kwarg != rhs_has_static_kwarg
):
self.assertNotEqual(lhs, rhs)
else:
self.assertEqual(lhs, rhs)
# Static kwarg mismatch
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
1, 3
):
if (
lhs_has_static_kwarg
or rhs_has_static_kwarg
or lhs_has_static_arg != rhs_has_static_arg
):
self.assertNotEqual(lhs, rhs)
else:
self.assertEqual(lhs, rhs)
# Static arg/kwarg both mismatch
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
3, 4
):
if (
lhs_has_static_arg
or rhs_has_static_arg
or lhs_has_static_kwarg
or rhs_has_static_kwarg
):
self.assertNotEqual(lhs, rhs)
else:
self.assertEqual(lhs, rhs)
if __name__ == "__main__":
run_tests()

View File

@ -63,20 +63,22 @@ def nvshmem_get_kernel(
@triton.jit
def nvshmem_putmem_signal_block_kernel(
dst,
src,
dst_ptr,
src_ptr,
size_bytes,
signal,
sig_val,
sig_ptr,
signal_val,
sig_op,
peer,
):
nvshmem.putmem_signal_block(dst, src, size_bytes, signal, sig_val, sig_op, peer)
nvshmem.putmem_signal_block(
dst_ptr, src_ptr, size_bytes, sig_ptr, signal_val, sig_op, peer
)
@triton.jit
def nvshmem_signal_wait_until_kernel(signal, cmp_op, cmp_val):
nvshmem.signal_wait_until(signal, cmp_op, cmp_val)
def nvshmem_signal_wait_until_kernel(sig_ptr, cmp_op, cmp_val):
nvshmem.signal_wait_until(sig_ptr, cmp_op, cmp_val)
@triton.jit
@ -417,7 +419,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
val = 11
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
out_hdl = symm_mem.rendezvous(out, group=group_name)
# Use the signal pad attached to the output symmetric memory handle
@ -431,12 +433,15 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 0:
# Rank 0 puts into Rank 1
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=SIGNAL_VAL,
sig_ptr=sig_ptr,
signal_val=SIGNAL_VAL,
sig_op=NVSHMEM_SIGNAL_SET,
peer=peer,
extern_libs=nvshmem_lib,
@ -444,8 +449,9 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 1:
# Wait until signal flag is set by Rank 0
sig_ptr_local = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1,)](
flag,
sig_ptr_local,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=SIGNAL_VAL,
extern_libs=nvshmem_lib,
@ -479,7 +485,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
val = 11
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
out_hdl = symm_mem.rendezvous(out, group=group_name)
# Use the signal pad attached to the output symmetric memory handle
@ -493,20 +499,24 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 0:
# Rank 0 puts into Rank 1
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=SIGNAL_VAL,
sig_ptr=sig_ptr,
signal_val=SIGNAL_VAL,
sig_op=NVSHMEM_SIGNAL_ADD,
peer=peer,
extern_libs=nvshmem_lib,
)
if rank == 1:
sig_ptr_local = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1, 1, 1)](
flag,
sig_ptr_local,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=SIGNAL_VAL,
extern_libs=nvshmem_lib,
@ -536,13 +546,10 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
FLAG_FINAL_VALUE = 42
# Use a single int64 symmetric tensor as our synchronization flag.
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(
FLAG_INITIAL_VALUE
)
symm_mem.rendezvous(flag, group=group_name)
expected_flag = torch.tensor(
[FLAG_FINAL_VALUE], dtype=torch.int32, device=self.device
)
nvshmem_barrier_all_kernel[(1,)](extern_libs=nvshmem_lib)
@ -558,15 +565,19 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
# Verification
torch.testing.assert_close(
flag,
expected_flag,
torch.tensor([FLAG_FINAL_VALUE], dtype=torch.int64, device=self.device),
)
if rank == 1:
# Rank 1 (the signaler)
val_to_put = torch.tensor(
[FLAG_FINAL_VALUE], dtype=torch.int64, device=self.device
)
# Launch a kernel to put the value to Rank 0's flag tensor.
nvshmem_put_kernel[(1,)](
flag, # Destination symmetric tensor on the remote PE
expected_flag, # Source data tensor (local)
val_to_put, # Source data tensor (local)
1, # Number of elements
peer, # The target PE (Rank 0)
extern_libs=nvshmem_lib,
@ -598,7 +609,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
# Producer (rank 0) prepares the data to send
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val_to_put)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
# Consumer (rank 1) prepares the destination buffer
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
out_hdl = symm_mem.rendezvous(out, group=group_name)
@ -608,20 +619,24 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 0:
# Producer (rank 0): Puts data into rank 1's `out` buffer and then sets the flag
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=COMPLETION_FLAG_VAL,
sig_ptr=sig_ptr,
signal_val=COMPLETION_FLAG_VAL,
sig_op=NVSHMEM_SIGNAL_SET,
peer=peer,
extern_libs=nvshmem_lib,
)
elif rank == 1:
# Consumer (rank 1): Waits on the signal variable using `signal_wait_until`.
sig_ptr = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1, 1, 1)](
flag,
sig_ptr,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=COMPLETION_FLAG_VAL,
extern_libs=nvshmem_lib,
@ -674,10 +689,10 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
symm_mem.rendezvous(out2, group=group_name)
# Use regular symmetric memory tensor for flag
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(0)
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(0)
symm_mem.rendezvous(flag, group=group_name)
flag_update_val = torch.tensor(
[flag_val], dtype=torch.int32, device=self.device
[flag_val], dtype=torch.int64, device=self.device
)
NVSHMEM_CMP_EQ = 0 # compare equal
@ -710,7 +725,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
out2, val2 * torch.ones(numel, dtype=dtype, device=self.device)
)
torch.testing.assert_close(
flag, torch.tensor([flag_val], dtype=torch.int32, device=self.device)
flag, torch.tensor([flag_val], dtype=torch.int64, device=self.device)
)
@skipIfRocm
@ -732,9 +747,9 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(0)
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(0)
flag_update_val = torch.tensor(
[flag_val], dtype=torch.int32, device=self.device
[flag_val], dtype=torch.int64, device=self.device
)
symm_mem.rendezvous(inp, group=group_name)
@ -967,7 +982,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
torch.uint8,
torch.float16,
torch.float32,
# torch.float64, # Tensor-likes are not close
torch.float64,
torch.bfloat16,
],
)
@ -1120,7 +1135,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
torch.int64,
torch.float16,
torch.float32,
# torch.float64, # Tensor-likes are not close
torch.float64,
torch.bfloat16,
],
)

View File

@ -386,7 +386,7 @@ class TestCustomBackendAPI(torch._dynamo.test_case.TestCase):
self.assertTrue(backend_run)
devices = ["cpu", "cuda", "hpu", "xpu"]
devices = ["cpu", "cuda", "hpu"]
instantiate_device_type_tests(TestOptimizations, globals(), only_for=devices)
if __name__ == "__main__":

View File

@ -8,12 +8,7 @@ from torch._dynamo.callback import callback_handler, CallbackArgs, CallbackTrigg
from torch._dynamo.test_case import run_tests, TestCase
from torch._guards import CompileId
from torch.testing._internal.common_utils import TEST_WITH_ROCM
from torch.testing._internal.triton_utils import HAS_CUDA_AND_TRITON, requires_gpu
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)
from torch.testing._internal.triton_utils import requires_cuda_and_triton
class CallbackTests(TestCase):
@ -66,7 +61,7 @@ class CallbackTests(TestCase):
@unittest.skipIf(
TEST_WITH_ROCM, "ROCm outputs a different number of autotuning logs"
)
@requires_gpu
@requires_cuda_and_triton
@torch._inductor.config.patch(force_disable_caches=True)
def test_triggers(self) -> None:
torch._dynamo.reset()
@ -96,9 +91,9 @@ class CallbackTests(TestCase):
torch._dynamo.graph_break()
return self.fc2(temp)
model = TinyModel().to(device_type)
model = TinyModel().to("cuda")
compiled_model = torch.compile(model, mode="max-autotune")
x = torch.randn(10, 10, device=device_type)
x = torch.randn(10, 10, device="cuda")
loss = compiled_model(x).sum()
loss.backward()
@ -116,13 +111,9 @@ end=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id
)
order.clear()
if not HAS_CUDA_AND_TRITON:
return
compiled_model.zero_grad()
loss = compiled_model(x).sum()
loss.backward()
self.assertExpectedInline(
"\n".join(order),
"""\

View File

@ -48,6 +48,27 @@ class GenericCtxMgr:
class ErrorMessagesTest(LoggingTestCase):
def test_dynamic_shape_operator(self):
def fn():
return torch.nonzero(torch.rand([10, 10]))
self.assertExpectedInlineMunged(
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(),
"""\
Dynamic shape operator
Explanation: Operator `aten.nonzero.default`'s output shape depends on input Tensor data.
Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`
Developer debug context: aten.nonzero.default
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0036.html
from user code:
File "test_error_messages.py", line N, in fn
return torch.nonzero(torch.rand([10, 10]))""",
)
def test_dynamic_shape_operator_no_meta_kernel(self):
def fn():
return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))
@ -70,6 +91,29 @@ from user code:
return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))""",
)
def test_data_dependent_operator(self):
def fn(x):
return x.item()
self.assertExpectedInlineMunged(
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(
torch.Tensor([1])
),
"""\
Unsupported Tensor.item() call with capture_scalar_outputs=False
Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False.
Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph.
Developer debug context: call_method TensorVariable() item () {}
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0124.html
from user code:
File "test_error_messages.py", line N, in fn
return x.item()""",
)
def test_data_dependent_operator2(self):
def fn(x):
return torch.equal(x, x)

View File

@ -40,16 +40,11 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_GPU
# Defines all the kernels for tests
from torch.testing._internal.triton_utils import * # noqa: F403
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)
T = TypeVar("T")
d = torch.ones(10, 10)
@ -1155,10 +1150,10 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
m = a.to(torch.float16)
return b.type(m.type())
@unittest.skipIf(not HAS_GPU, "requires gpu")
@unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
@make_test
def test_tensor_type2(a, b):
m = a.to(device_type)
m = a.to("cuda")
return m + b.type(m.type())
@make_test
@ -2087,12 +2082,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
mytuple = FunctionTests.MyNamedTuple(a, b)
return mytuple.add(), mytuple.static_method(), mytuple.class_method()
@make_test
def test_namedtuple_replace(a, b):
mytuple = FunctionTests.MyNamedTuple(a, b)
replaced = mytuple._replace(first=b)
return mytuple.first + mytuple.second + replaced.first + replaced.second
@make_test
def test_generic_namedtuple_user_methods(a, b):
mytuple = FunctionTests.MyGenericNamedTuple(a, b)
@ -4051,7 +4040,7 @@ class GraphModule(torch.nn.Module):
def f1():
mod1 = torch.get_device_module()
mod2 = torch.get_device_module("cpu")
mod3 = torch.get_device_module(torch.device(device_type))
mod3 = torch.get_device_module(torch.device("cuda"))
return mod1, mod2, mod3
self.assertEqual(f1(), torch.compile(f1, backend="eager", fullgraph=True)())
@ -4086,7 +4075,6 @@ class GraphModule(torch.nn.Module):
new_device = (
"cpu" if torch._C._get_accelerator() == torch.device("cuda") else "cuda"
)
old_get_device_module = torch.get_device_module
def new_get_device_module(device=None):
@ -4733,12 +4721,10 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
opt_fn(x, ys, zs[:1])
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_gpu_current_device(self):
def test_cuda_current_device(self):
def fn(x):
y = torch.empty(
(2, 3),
dtype=torch.float32,
device=torch.accelerator.current_device_index(),
(2, 3), dtype=torch.float32, device=torch.cuda.current_device()
)
y.copy_(x)
return torch.sin(y + y.device.index)
@ -4746,11 +4732,11 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
counter = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(backend=counter, fullgraph=True)(fn)
with torch.accelerator.device_index(0):
with torch.cuda.device(0):
x = torch.randn(2, 3)
self.assertEqual(opt_fn(x), fn(x))
self.assertEqual(counter.frame_count, 1)
with torch.accelerator.device_index(1):
with torch.cuda.device(1):
self.assertEqual(opt_fn(x), fn(x))
self.assertEqual(counter.frame_count, 2)

View File

@ -1,5 +1,6 @@
# Owner(s): ["module: dynamo"]
import contextlib
import os
import torch
import torch.fx
@ -195,6 +196,21 @@ class GraphRegionTrackerTests(TestCase):
)
def test_mismatched_global_state(self):
@contextlib.contextmanager
def _hip_allow_tf32():
# for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new
# and only for MI300+
hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
os.environ["HIPBLASLT_ALLOW_TF32"] = "1"
try:
yield
finally:
if hip_allow_tf32 is not None:
os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
else:
del os.environ["HIPBLASLT_ALLOW_TF32"]
def inner_fn(x, y):
x1 = x * 1
y1 = y + 1
@ -235,29 +251,31 @@ class GraphRegionTrackerTests(TestCase):
def reset_default_dtype():
torch.set_default_dtype(old_dtype)
for ctx in [
lambda: torch.set_grad_enabled(False),
torch.autograd.grad_mode.inference_mode,
lambda: torch.autograd.graph.disable_saved_tensors_hooks(
"This is not supported"
),
# lambda: torch.set_num_threads(2), : Unsupported
(set_default_dtype_bfloat16, reset_default_dtype),
(
lambda: torch.use_deterministic_algorithms(True),
lambda: torch.use_deterministic_algorithms(False),
),
# (lambda: torch.use_deterministic_algorithms(True, warn_only=True),
# lambda: torch.use_deterministic_algorithms(False)), : Unsupported
create_toggle_fns("allow_bf16_reduced_precision_reduction"),
create_toggle_fns("allow_fp16_reduced_precision_reduction"),
create_toggle_fns("allow_tf32"),
]:
self.assertExpectedInline(
self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx),
"""[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \
tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
for ctx in [
lambda: torch.set_grad_enabled(False),
torch.autograd.grad_mode.inference_mode,
lambda: torch.autograd.graph.disable_saved_tensors_hooks(
"This is not supported"
),
# lambda: torch.set_num_threads(2), : Unsupported
(set_default_dtype_bfloat16, reset_default_dtype),
(
lambda: torch.use_deterministic_algorithms(True),
lambda: torch.use_deterministic_algorithms(False),
),
# (lambda: torch.use_deterministic_algorithms(True, warn_only=True),
# lambda: torch.use_deterministic_algorithms(False)), : Unsupported
create_toggle_fns("allow_bf16_reduced_precision_reduction"),
create_toggle_fns("allow_fp16_reduced_precision_reduction"),
create_toggle_fns("allow_tf32"),
]:
self.assertExpectedInline(
self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx),
"""[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \
[['x1', 'y1', 'sum_1', 'o4'], ['x1_1', 'y1_1', 'sum_2', 'o5']]]""",
)
)
def test_mutation_tracking_simple(self):
def fn(x, y, z):

View File

@ -116,6 +116,8 @@ num_guards_executed=0)
const_guard = guards.LAMBDA_GUARD(
root,
functools.partial(equals_match, expected=5),
{},
False,
equals_match_verbose_code_parts(5),
)
self.assertTrue(const_guard(5))
@ -405,10 +407,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(5), ["type(x) == int"])
guard_manager.add_lambda_guard(
functools.partial(ge_match, expected=5),
{},
False,
ge_match_verbose_code_parts(expected=5),
)
guard_manager.add_lambda_guard(
functools.partial(less_match, expected=10),
{},
False,
less_match_verbose_code_parts(expected=10),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 3)
@ -428,10 +434,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getattr_manager("x", "x", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.x),
{},
False,
equals_match_verbose_code_parts(foo.x),
)
guard_manager.getattr_manager("y", "y", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.y),
{},
False,
equals_match_verbose_code_parts(foo.y),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -474,10 +484,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getitem_manager(0, "", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[0]),
{},
False,
equals_match_verbose_code_parts(foo[0]),
)
guard_manager.getitem_manager(1, "", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[1]),
{},
False,
equals_match_verbose_code_parts(foo[1]),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -585,6 +599,8 @@ num_guards_executed=0)
lambda x: isinstance(x, Pair)
and isinstance(x.x, torch.Tensor)
and isinstance(x.y, int),
{},
False,
"global guard fail",
)
@ -635,6 +651,8 @@ num_guards_executed=0)
)
attr_manager.add_lambda_guard(
lambda x: x == 4,
{},
False,
"Expected value 4",
)
@ -675,6 +693,8 @@ num_guards_executed=0)
weakref_manager.add_lambda_guard(
lambda x: isinstance(x, torch.Tensor),
{},
False,
"global weakref fail",
)
@ -694,6 +714,8 @@ num_guards_executed=0)
)
foo_mgr.add_lambda_guard(
lambda x: x == 3,
{},
False,
"Expected value 3",
)
self.assertTrue(guard_manager.check(a))
@ -779,7 +801,7 @@ num_guards_executed=0)
# Add key-value manager (nothing : {"z" : 3})
self.assertTrue(root.check(f_locals))
dict_mgr.get_key_manager(1, "", nothing, default_mgr_enum).add_lambda_guard(
lambda x: x is nothing, ["x is nothing"]
lambda x: x is nothing, {}, False, ["x is nothing"]
)
self.assertTrue(root.check(f_locals))
value_mgr = dict_mgr.get_value_manager(

View File

@ -7207,7 +7207,9 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1
guard_manager = torch._dynamo.guards.RootGuardManager()
guard_manager.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)
def injected(x):
return x + 42
@ -7232,27 +7234,33 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1
guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(lambda L: isinstance(L["x"], bool), [])
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), {"x": 0}, True, []
)
def injected_bool(x: bool):
return x + 102
guard_manager_int = torch._dynamo.guards.RootGuardManager()
guard_manager_int.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager_int.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)
def injected_int(x: int):
return x + 42
guard_manager_tensor = torch._dynamo.guards.RootGuardManager()
guard_manager_tensor.add_lambda_guard(
lambda L: isinstance(L["x"], torch.Tensor), []
lambda L: isinstance(L["x"], torch.Tensor), {"x": 0}, True, []
)
def injected_tensor(x: torch.Tensor):
return x + 100
guard_manager_str = torch._dynamo.guards.RootGuardManager()
guard_manager_str.add_lambda_guard(lambda L: isinstance(L["x"], str), [])
guard_manager_str.add_lambda_guard(
lambda L: isinstance(L["x"], str), {"x": 0}, True, []
)
def injected_str(x: str):
return x + "1"
@ -7329,7 +7337,10 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), ["isinstance(L['x'], bool)"]
lambda L: isinstance(L["x"], bool),
{"x": 0},
True,
["isinstance(L['x'], bool)"],
)
def injected_bool(x: bool):
@ -8467,24 +8478,43 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
def fn(x):
return x + 1
initial_state = read_state()
y = torch.randn(10)
try:
for round in range(3):
for i in range(len(initial_state)):
new_state = [False] * len(initial_state)
new_state[i] = True
write_state(new_state)
assert read_state() == new_state
last_state.clear()
fn(y)
assert last_state == new_state
if round == 0:
assert cnt == i + 1
else:
assert cnt == len(initial_state)
finally:
write_state(initial_state)
import contextlib
@contextlib.contextmanager
def _hip_allow_tf32():
# for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new
# and only for MI300+
hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
os.environ["HIPBLASLT_ALLOW_TF32"] = "1"
try:
yield
finally:
if hip_allow_tf32 is not None:
os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
else:
del os.environ["HIPBLASLT_ALLOW_TF32"]
tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
initial_state = read_state()
y = torch.randn(10)
try:
for round in range(3):
for i in range(len(initial_state)):
new_state = [False] * len(initial_state)
new_state[i] = True
write_state(new_state)
assert read_state() == new_state
last_state.clear()
fn(y)
assert last_state == new_state
if round == 0:
assert cnt == i + 1
else:
assert cnt == len(initial_state)
finally:
write_state(initial_state)
def test_grad_state_mutated(self):
prior = torch.is_grad_enabled()
@ -13263,7 +13293,7 @@ class MiscTestsDevice(torch._inductor.test_case.TestCase):
self.assertEqual(out, opt_out)
@unittest.skipIf(not TEST_MULTIGPU, "need multiple GPU")
def test_gpu_set_device(self, device):
def test_cuda_set_device(self, device):
def fn():
a = torch.ones(2, device=device)
torch.get_device_module(device).set_device(1)
@ -13343,26 +13373,6 @@ class MiscTestsDevice(torch._inductor.test_case.TestCase):
y = torch.tensor(5)
f(x, y)
def test_full_graph_capture_scalar_outputs(self):
@torch.compile(fullgraph=True)
def foo(a):
return torch.randn(5) * a.item()
# We expect to no longer raise here
foo(torch.tensor(2.0))
def test_full_graph_capture_dynamic_output_shape_ops(self):
def fn(x):
nz = torch.nonzero(x)
squared = nz * nz
sliced = torch.ops.aten.slice.Tensor(squared, dim=1, start=-2, end=None)
view = sliced.unsqueeze(dim=0)
return view.squeeze(dim=0)
example_inputs = (torch.randn(1, 1, 1, 1),)
# we expect to no longer raise here
torch.compile(fn, fullgraph=True)(*example_inputs)
def test_dynamic_float_scalar_tensor_coersion(self):
# Minified version of https://github.com/pytorch/pytorch/issues/158376#issuecomment-3079591367
class Foo:

View File

@ -47,8 +47,7 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
self.assertEqual(len(PrecompileContext._new_cache_artifacts), 0)
result = PrecompileContext.serialize()
@ -83,9 +82,8 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
for key in PrecompileContext._backend_artifacts_by_key.keys():
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
for key in PrecompileContext._new_cache_artifacts_by_key.keys():
result = PrecompileContext.serialize_artifact_by_key(key)
assert isinstance(result, PrecompileCacheArtifact)
self.assertEqual(result.key, key)
@ -111,12 +109,11 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
# Find the key for the artifact of type "precompile_aot_autograd"
key = next(
k
for k, v in PrecompileContext._backend_artifacts_by_key.items()
for k, v in PrecompileContext._new_cache_artifacts_by_key.items()
if isinstance(v, EditablePrecompileCacheArtifact)
)
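Note on the PrecompileContext hunks: the assertions move from the old _dynamo_cache_entries / _backend_artifacts_by_key attributes to _new_cache_artifacts_by_key and _new_cache_artifacts. These are private registries and may change again; the sketch below only illustrates the lookup pattern the updated tests use, with the import path assumed to match the test module.

    # Hedged sketch: walk the renamed private registry and serialize each entry,
    # as the updated assertions do. Private API; shape may differ across versions.
    from torch._dynamo.precompile_context import PrecompileCacheArtifact, PrecompileContext

    for key, artifact in PrecompileContext._new_cache_artifacts_by_key.items():
        serialized = PrecompileContext.serialize_artifact_by_key(key)
        assert isinstance(serialized, PrecompileCacheArtifact)
        assert serialized.key == key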

View File

@ -109,8 +109,6 @@ class StructuredTraceTestingFormatter(logging.Formatter):
metadata["dynamo_start"]["stack"] = "STACK"
if "inductor_output_code" in metadata:
metadata["inductor_output_code"]["filename"] = "FILENAME"
if "file_path" in metadata["inductor_output_code"]:
metadata["inductor_output_code"]["file_path"] = "FILENAME"
if "stack" in metadata:
metadata["stack"] = "STACK"
if "compilation_metrics" in metadata:
@ -261,7 +259,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -295,7 +293,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -337,7 +335,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -359,7 +357,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
@ -391,7 +389,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -448,7 +446,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
@ -457,7 +455,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"bwd_compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -600,7 +598,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -613,7 +611,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -681,7 +679,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 16, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -700,7 +698,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -741,7 +739,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
@ -902,7 +900,7 @@ def forward(self, x, y):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -917,7 +915,7 @@ def forward(self, x, y):
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_post_grad_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "inductor_provenance_tracking_node_mappings", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "inductor_provenance_tracking_kernel_stack_traces", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "fx_graph_cache_hit", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}

Some files were not shown because too many files have changed in this diff.