Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-29 19:24:55 +08:00)

Compare commits: 6 commits, udate-sphi ... msaroufim/

| SHA1 |
|---|
| a8c367127b |
| fa839e440c |
| 4ae58a3dd4 |
| 8e8ec24374 |
| 4b74106204 |
| 693880081c |
@@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi

python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
@@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()

print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
@@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a
@@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0
@@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0

popd
}

setup_executorch() {
pushd executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}

if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch
@@ -1,24 +1,23 @@
sphinx==6.2.1
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2

sphinx-remove-toctrees==1.0.0.post1
#Description: This is used to generate PyTorch docs
#Pinned versions: 1.0.0.post1

# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.

sphinxcontrib.katex==0.9.10
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.10
#Pinned versions: 0.8.6

sphinx_sitemap==2.7.1
sphinxext-opengraph==0.9.1
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1

sphinx_sitemap==2.6.0
#Description: This is used to generate sitemap for PyTorch docs
#Pinned versions: 2.7.1
#Pinned versions: 2.6.0

matplotlib==3.5.3 ; python_version < "3.13"
matplotlib==3.6.3 ; python_version >= "3.13"
@@ -30,17 +29,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0

breathe==4.35.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.35.0
#Pinned versions: 4.34.0

exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3

docutils==0.18.1
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.18.1
#Pinned versions: 0.16

bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@@ -50,24 +49,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0

<<<<<<< HEAD
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
=======
myst-nb==1.2.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 1.2.0
>>>>>>> 195382ce28e (Update)

# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.2
sphinx-design==0.6.1
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
<<<<<<< HEAD
myst-parser==0.18.1
=======
myst-parser==3.0.1
myst-nb
>>>>>>> 195382ce28e (Update)
@@ -83,10 +83,6 @@ rm -rf pytorch || true
pushd "$pt_checkout"
pushd docs

# Profile the docs build to see what is taking the longest
python -m cProfile -o docs_build.prof -m sphinx.cmd.build -b html -d build/doctrees source build/html
python -c "import pstats; p = pstats.Stats('docs_build.prof'); p.sort_stats('cumtime').print_stats(50)"

# Build the docs
if [ "$is_main_doc" = true ]; then
build_docs html || exit $?
@@ -1550,10 +1550,14 @@ test_executorch() {
install_torchvision
install_torchaudio

INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"

pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch

export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake

echo "Run ExecuTorch unit tests"
pytest -v -n auto
@@ -1567,6 +1571,10 @@ test_executorch() {

popd

# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration

assert_git_not_dirty
}
@@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")

for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
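A quick illustration of the glob change above (the paths are hypothetical, not taken from the artifact layout): "*.dist-info/*" matches only the direct children of each *.dist-info directory, while "*.dist-info/**" descends recursively (and, before Python 3.13, "**" yields directories rather than files).

    from pathlib import Path

    root = Path("artifacts/dist/example-dist")   # hypothetical wheel unpack directory
    shallow = list(root.glob("*.dist-info/*"))   # direct children only
    deep = list(root.glob("*.dist-info/**"))     # recursive descent
    print(len(shallow), len(deep))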
.github/ci_commit_pins/vllm.txt (vendored, 2 changed lines)
@@ -1 +1 @@
9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26
d119fc86140785e7efc8f125c17153544d1e0f20
.github/labeler.yml (vendored, 3 changed lines)
@@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py

"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt
.github/workflows/_docs.yml (vendored, 6 changed lines)
@@ -74,11 +74,7 @@ jobs:
- docs_type: python
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 60
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
timeout-minutes: 30
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
.github/workflows/docker-builds.yml (vendored, 3 changed lines)
@@ -71,7 +71,8 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]
.github/workflows/pull.yml (vendored, 26 changed lines)
@@ -318,6 +318,32 @@ jobs:
]}
secrets: inherit

linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit

linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
.github/workflows/trunk.yml (vendored, 24 changed lines)
@@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit

linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
.github/workflows/vllm.yml (vendored, 2 changed lines)
@@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
.gitignore (vendored, 3 changed lines)
@@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*

# Linker scripts for prioritized text optimization
cmake/linker_script.ld

# Bram
plsdontbreak
@@ -964,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@@ -1409,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',
@@ -380,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)

option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@@ -664,11 +657,6 @@ endif(MSVC)

string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()

# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@@ -1433,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()

if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")

add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)

add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")

if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()

if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()

foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()

else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
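For context on the USE_PRIORITIZED_TEXT_FOR_LD block above: the custom command feeds cmake/prioritized_text.txt through tools/setup_helpers/generate_linker_script.py to produce cmake/linker_script.ld, which is then passed to the linker with -T for the listed targets. A very rough sketch of that idea (not the actual generator, whose input and output formats are not shown in this diff) could look like:

    from pathlib import Path

    def write_linker_fragment(filein: str, fout: str) -> None:
        # group the hot symbols listed in `filein` at the front of .text
        symbols = [s.strip() for s in Path(filein).read_text().splitlines() if s.strip()]
        lines = ["SECTIONS", "{", "  .text : {"]
        lines += [f"    *(.text.{sym}*)" for sym in symbols]
        lines += ["    *(.text .text.*)", "  }", "}"]
        Path(fout).write_text("\n".join(lines) + "\n")

    # write_linker_fragment("cmake/prioritized_text.txt", "cmake/linker_script.ld")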
@@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}

bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {

static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif

bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}

bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}

void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
return;
}
#endif
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
@@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}
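The ROCm gating added above only takes effect when the HIPBLASLT_ALLOW_TF32 environment variable is set; from the Python side the user-visible behaviour would roughly be (a sketch, assuming a ROCm build):

    import os
    import torch

    if torch.version.hip is not None and os.environ.get("HIPBLASLT_ALLOW_TF32") != "1":
        # setAllowTF32CuBLAS() logs and returns early, so the flag stays unchanged
        print("Set HIPBLASLT_ALLOW_TF32=1 before enabling TF32 matmul on ROCm")
    torch.backends.cuda.matmul.allow_tf32 = True
    print(torch.backends.cuda.matmul.allow_tf32)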
@@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
view_value.device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@@ -485,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI

c10::Device FunctionalTensorWrapper::device_custom() const {
// The storage pointer already uses the underlying tensor custom device (if
// applicable) to extract the device. So, we dont have to recurse again by
// doing value_.unsafeGetTensorImpl()->device().
return storage().data_ptr().device();
return value_.unsafeGetTensorImpl()->device();
}
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
return value_.unsafeGetTensorImpl()->sizes();
@@ -1954,8 +1954,8 @@ void scaled_gemm(
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
@@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}

bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");

int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");

TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
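As a worked example of the scale-shape check above (a sketch mirroring the round_up/ceil_div arithmetic, with made-up tensor sizes):

    def round_up(x, m):
        return ((x + m - 1) // m) * m

    def ceil_div(a, b):
        return -(-a // b)

    def expected_scale_numel(rows, cols):
        # round_up(t.size(0), 128) * round_up(ceil_div(t.size(1), 32), 4)
        return round_up(rows, 128) * round_up(ceil_div(cols, 32), 4)

    print(expected_scale_numel(256, 1024))      # 8192 for an fp8 operand of shape (256, 1024)
    print(expected_scale_numel(256, 1024 * 2))  # 16384 for the packed-fp4 path, which doubles t.size(1)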
@@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}

template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {

IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;

if(tid >= nElements) return;

const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes

IndexType stride = gridDim.x * blockDim.x;

while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}

/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.

constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;

// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i

at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();

// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.

// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;

// Now we loop
@@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}

catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif

if (stride_size > 1) {
@@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}

if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\
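The vectorized path above moves `alignment` (16) bytes per load/store, so elems_per_vec depends on the element size; a small sketch of that bookkeeping from the Python side:

    import torch

    ALIGNMENT = 16  # bytes per vectorized load/store

    def elems_per_vec(dtype):
        return ALIGNMENT // torch.tensor([], dtype=dtype).element_size()

    x = torch.randn(8, 128)                # contiguous float32 tensor
    print(elems_per_vec(torch.float32))    # 4
    print(elems_per_vec(torch.float16))    # 8
    print(x.data_ptr() % ALIGNMENT == 0)   # whether the input meets the 16-byte requirement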
@@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}

Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);

TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);

TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);

auto C = at::empty({M, N}, A.options());

// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();

at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);

return C;
}
} // namespace at::native
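A reference for what the new XPU kernel is expected to compute (an assumption inferred from the checks above and the existing backend variants, not stated in this diff): A is an (M, K) float/half/bfloat16 matrix, B an (N, K) int8 matrix, scales an (N,) vector, and the result scales each output column.

    import torch

    def weight_int8pack_mm_reference(A, B, scales):
        # assumed semantics: C[m, n] = sum_k A[m, k] * B[n, k] * scales[n]
        return (A @ B.to(A.dtype).t()) * scales

    A = torch.randn(4, 8, dtype=torch.bfloat16)
    B = torch.randint(-128, 127, (16, 8), dtype=torch.int8)
    scales = torch.rand(16, dtype=torch.bfloat16)
    print(weight_int8pack_mm_reference(A, B, scales).shape)  # torch.Size([4, 16])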
@@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,
@@ -534,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);

if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);

int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();

if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}

id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();
@@ -4243,7 +4243,6 @@
CPU: _weight_int8pack_mm_cpu
CUDA: _weight_int8pack_mm_cuda
MPS: _weight_int8pack_mm_mps
XPU: _weight_int8pack_mm_xpu

- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
@@ -10849,7 +10848,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
autogen: _foreach_maximum.Scalar_out

# foreach_minimum/maximum dispatches to clamp_max/min
@@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
@@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25
@@ -170,15 +170,15 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_to_run,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_to_run,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_to_run,0
google/gemma-3-4b-it,fail_accuracy,0
@@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0
@@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
@@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25
@@ -122,7 +122,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@@ -142,7 +142,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
@@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25
@@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0
@@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25
@@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5
@@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25
@@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0
@@ -178,7 +178,7 @@ llama,fail_to_run,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0
@@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -198,7 +198,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0
@@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_failed_to_run,0
google/gemma-2-2b,eager_failed_to_run,0
google/gemma-3-4b-it,eager_failed_to_run,0
openai/whisper-tiny,eager_failed_to_run,0
Qwen/Qwen3-0.6B,eager_failed_to_run,0
@@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@@ -198,7 +198,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0
@@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1
add_loop_inductor,compile_time_instruction_count,29660000000,0.1
add_loop_inductor,compile_time_instruction_count,30280000000,0.1
@@ -50,27 +50,27 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1
@@ -78,7 +78,7 @@ mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1
@@ -86,4 +86,4 @@ basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1
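Each row in these expected-results files pairs a pinned value with a relative tolerance (the trailing 0.1). A sketch of how such a pin is typically checked (the exact harness logic is not shown in this diff):

    def within_tolerance(measured: int, expected: int, rel_tol: float) -> bool:
        return abs(measured - expected) <= rel_tol * expected

    # e.g. add_loop_inductor,compile_time_instruction_count,30280000000,0.1
    print(within_tolerance(31_000_000_000, 30_280_000_000, 0.1))  # True
    print(within_tolerance(35_000_000_000, 30_280_000_000, 0.1))  # False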
@@ -1998,21 +1998,7 @@ def define_buck_targets(
third_party("sleef_arm"),
],
}),
compiler_flags = get_aten_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android-arm32": [
"-mfpu=vfpv3-d16",
"-march=armv7-a",
"-mthumb",
"-mfpu=neon",
],
"ovr_config//os:android-x86_32": [
"-mssse3",
],
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}),
compiler_flags = get_aten_compiler_flags(),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [
":aten_header",
@@ -1,5 +1,4 @@
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/FakeGuardImpl.h>
#include <array>

namespace c10::impl {
@@ -15,26 +14,4 @@ DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
}

namespace {
thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
nullptr;
}

void ensureCUDADeviceGuardSet() {
constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);

const DeviceGuardImplInterface* p =
device_guard_impl_registry[cuda_idx].load();

// A non-null `ptr` indicates that the CUDA guard is already set up,
// implying this is using cuda build
if (p && p->deviceCount() == 0) {
// In following cases, we override CUDA guard interface with a no-op
// device guard. When p->deviceCount() == 0, cuda build is enabled, but no
// cuda devices available.
tls_fake_device_guard = std::make_unique<FakeGuardImpl<DeviceType::CUDA>>();
device_guard_impl_registry[cuda_idx].store(tls_fake_device_guard.get());
}
}

} // namespace c10::impl
@@ -6,7 +6,6 @@
#include <c10/util/Exception.h>

// Just for C10_ANONYMOUS_VARIABLE
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Registry.h>

#include <array>
@@ -252,7 +251,7 @@ struct C10_API DeviceGuardImplInterface {
// for devices that don't actually have a concept of device index. Prominent
// examples are CPU and Meta.
template <DeviceType D>
struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface {
struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface {
NoOpDeviceGuardImpl() = default;
DeviceType type() const override {
return D;
@@ -372,7 +371,5 @@ inline bool hasDeviceGuardImpl(DeviceType type) {
return device_guard_impl_registry[static_cast<size_t>(type)].load();
}

void C10_API ensureCUDADeviceGuardSet();

} // namespace impl
} // namespace c10
@@ -158,7 +158,6 @@ function(caffe2_print_configuration_summary)
if(${USE_KLEIDIAI})
message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
endif()
message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
message(STATUS " USE_UCC : ${USE_UCC}")
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
@@ -482,7 +482,6 @@ function(torch_update_find_cuda_flags)
endfunction()

include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)

##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
@@ -512,22 +511,3 @@ function(target_compile_options_if_supported target flag)
target_compile_options(${target} PRIVATE ${flag})
endif()
endfunction()

# Check if a global link option is supported
function(add_link_options_if_supported flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
add_link_options("LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()

function(target_link_options_if_supported tgt flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()
@@ -1,7 +0,0 @@
.. role:: hidden
    :class: hidden-section
.. currentmodule:: {{ module }}

{{ name | underline }}

.. autofunction:: {{ fullname }}
@@ -1,7 +0,0 @@
.. role:: hidden
    :class: hidden-section
.. currentmodule:: {{ module }}

{{ name | underline }}

.. automethod:: {{ fullname }}
62
docs/source/bottleneck.rst
Normal file
62
docs/source/bottleneck.rst
Normal file
@ -0,0 +1,62 @@
torch.utils.bottleneck
======================

.. automodule:: torch.utils.bottleneck
.. currentmodule:: torch.utils.bottleneck

`torch.utils.bottleneck` is a tool that can be used as an initial step for
debugging bottlenecks in your program. It summarizes runs of your script with
the Python profiler and PyTorch's autograd profiler.

Run it on the command line with

::

    python -m torch.utils.bottleneck /path/to/source/script.py [args]

where [args] are any number of arguments to `script.py`, or run
``python -m torch.utils.bottleneck -h`` for more usage instructions.

.. warning::
    Because your script will be profiled, please ensure that it exits in a
    finite amount of time.

.. warning::
    Due to the asynchronous nature of CUDA kernels, when running against
    CUDA code, the cProfile output and CPU-mode autograd profilers may
    not show correct timings: the reported CPU time reports the amount of time
    used to launch the kernels but does not include the time the kernel
    spent executing on a GPU unless the operation does a synchronize.
    Ops that do synchronize appear to be extremely expensive under regular
    CPU-mode profilers.
    In these cases where timings are incorrect, the CUDA-mode autograd profiler
    may be helpful.

.. note::
    To decide which (CPU-only-mode or CUDA-mode) autograd profiler output to
    look at, you should first check if your script is CPU-bound
    ("CPU total time is much greater than CUDA total time").
    If it is CPU-bound, looking at the results of the CPU-mode autograd
    profiler will help. If on the other hand your script spends most of its
    time executing on the GPU, then it makes sense to start
    looking for responsible CUDA operators in the output of the CUDA-mode
    autograd profiler.

    Of course the reality is much more complicated and your script might not be
    in one of those two extremes depending on the part of the model you're
    evaluating. If the profiler outputs don't help, you could try looking at
    the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
    However, please take into account that the NVTX overhead is very high and
    often gives a heavily skewed timeline. Similarly, ``Intel® VTune™ Profiler``
    helps to analyze performance on Intel platforms further with
    :func:`torch.autograd.profiler.emit_itt()`.

.. warning::
    If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
    (cProfile) will include the CUDA startup time (CUDA buffer allocation cost)
    in its time reporting. This should not matter if your bottlenecks result
    in code much slower than the CUDA startup time.

For more complicated uses of the profilers (like in a multi-GPU case),
please see https://docs.python.org/3/library/profile.html
or :func:`torch.autograd.profiler.profile()` for more information.
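As a quick illustration of the workflow described above, here is a minimal sketch. The script name ``slow_script.py`` and its contents are hypothetical; any short, finite script works:

::

    # slow_script.py -- a tiny, hypothetical script to profile
    import torch

    x = torch.randn(64, 1024)
    w = torch.randn(1024, 1024, requires_grad=True)
    for _ in range(100):
        (x @ w).sum().backward()

    # Then, from the shell:
    #   python -m torch.utils.bottleneck slow_script.py
    # which prints a cProfile summary followed by the autograd profiler summaries.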
@ -66,7 +66,6 @@ extensions = [
    "sphinx.ext.linkcode",
    "sphinxcontrib.mermaid",
    "sphinx_sitemap",
    "sphinx_remove_toctrees"
]

myst_enable_extensions = [
@ -75,9 +74,6 @@ myst_enable_extensions = [
    "html_image",
]

# Remove the "generated" tag from the toctree to allow for faster builds
remove_from_toctrees = ["generated/*"]

html_baseurl = "https://docs.pytorch.org/docs/stable/"  # needed for sphinx-sitemap
sitemap_locales = [None]
sitemap_excludes = [
@ -97,10 +93,8 @@ numpydoc_show_class_members = False
# autosectionlabel throws warnings if section names are duplicated.
# The following tells autosectionlabel to not throw a warning for
# duplicated section names that are in different documents.

autosectionlabel_prefix_document = True


# katex options
#
#
@ -213,41 +207,6 @@ templates_path = [
]
# TODO: document these and remove them from here.

autosummary_filename_map = {
    'torch.nn.utils.prune.identity': 'torch.nn.utils.prune.identity_function',
    'torch.nn.utils.prune.Identity': 'torch.nn.utils.prune.Identity_class',
    'torch.optim.adamw.adamw': 'torch.optim.adamw.adamw_function',
    'torch.optim.adamw.AdamW': 'torch.optim.adamw.AdamW_class',
    'torch.optim.asgd.asgd': 'torch.optim.asgd.asgd_function',
    'torch.optim.asgd.ASGD': 'torch.optim.asgd.ASGD_class',
    'torch.optim.nadam.nadam': 'torch.optim.nadam.nadam_function',
    'torch.optim.nadam.NAdam': 'torch.optim.nadam.NAdam_class',
    'torch.optim.radam.radam': 'torch.optim.radam.radam_function',
    'torch.optim.radam.RAdam': 'torch.optim.radam.RAdam_class',
    'torch.optim.rmsprop.rmsprop': 'torch.optim.rmsprop.rmsprop_function',
    'torch.optim.rmsprop.RMSprop': 'torch.optim.rmsprop.RMSprop_class',
    'torch.optim.rprop.rprop': 'torch.optim.rprop.rprop_function',
    'torch.optim.rprop.Rprop': 'torch.optim.rprop.Rprop_class',
    'torch.optim.sgd.sgd': 'torch.optim.sgd.sgd_function',
    'torch.optim.sgd.SGD': 'torch.optim.sgd.SGD_class',
    'torch.optim.adadelta.adadelta': 'torch.optim.adadelta.adadelta_function',
    'torch.optim.adadelta.Adadelta': 'torch.optim.adadelta.Adadelta_class',
    'torch.optim.adagrad.adagrad': 'torch.optim.adagrad.adagrad_function',
    'torch.optim.adagrad.Adagrad': 'torch.optim.adagrad.Adagrad_class',
    'torch.optim.adam.adam': 'torch.optim.adam.adam_function',
    'torch.optim.adam.Adam': 'torch.optim.adam.Adam_class',
    'torch.optim.adamax.adamax': 'torch.optim.adamax.adamax_function',
    'torch.optim.adamax.Adamax': 'torch.optim.adamax.Adamax_class',
    'torch.mtia.stream': 'torch.mtia.stream_function',
    'torch.mtia.Stream': 'torch.mtia.Stream_class',
    'torch.cpu.stream': 'torch.cpu.stream_function',
    'torch.cpu.Stream': 'torch.cpu.Stream_class',
    'torch.cuda.stream': 'torch.cuda.stream_function',
    'torch.cuda.Stream': 'torch.cuda.Stream_class',
    'torch.xpu.stream': 'torch.xpu.stream_function',
    'torch.xpu.Stream': 'torch.xpu.Stream_class',
}

coverage_ignore_functions = [
    # torch
    "typename",
@ -21,10 +21,10 @@
The following operations will fill uninitialized memory when this setting is
turned on:

* :meth:`torch.Tensor.resize_` when called with a tensor that is not
* :func:`torch.Tensor.resize_` when called with a tensor that is not
  quantized
* :func:`torch.empty`
* :func:`torch.empty_strided`
* :func:`torch.empty_permuted`
* :func:`torch.empty_like`
```
```
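The setting referenced in this list is `torch.utils.deterministic.fill_uninitialized_memory`. A minimal sketch of turning it on alongside deterministic algorithms (the exact fill values are an assumption here, typically NaN for floating-point dtypes):

```
import torch

# Sketch: with deterministic algorithms enabled, this flag asks the listed
# factory/resize ops to fill otherwise-uninitialized memory with a known value.
torch.use_deterministic_algorithms(True)
torch.utils.deterministic.fill_uninitialized_memory = True

t = torch.empty(4)  # contents are now deterministic rather than arbitrary
print(t)
```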
@ -260,73 +260,3 @@ these features.
```{eval-rst}
.. py:module:: torch.distributed.tensor.device_mesh
```

## Mixed Tensor and DTensor operations

So you got the following error message.
```
got mixed torch.Tensor and DTensor, need to convert all
torch.Tensor to DTensor before calling distributed operators!
```

There are two cases.

### Case 1: this is a user error

The most common way to run into this error is to create a regular Tensor
(using a factory function) and then perform a Tensor-DTensor operation,
like the following:

```
tensor = torch.arange(10)
return tensor + dtensor
```

We disallow mixed Tensor-DTensor operations: if the input to any operation
(e.g. torch.add) is a DTensor, then all Tensor inputs must be DTensors.
This is because the semantics are ambiguous. We don't know if `tensor` is
the same across ranks or if it is different, so we ask that the user
figure out how to construct a DTensor with accurate placements from `tensor`.

If each rank does have the same `tensor`, then please construct a replicated
DTensor:

```
tensor = torch.arange(10)
tensor = DTensor.from_local(tensor, placements=(Replicate(),))
return tensor + dtensor
```

If you wanted to create a DTensor with shards, below is how to do it.
Semantically this means that your Tensor data is split between the shards
and that operations act on the "full stacked data".

```
tensor = torch.full([], RANK)
tensor = DTensor.from_local(tensor, placements=(Shard(0),))
return tensor + dtensor
```

There are other things you may wish to do with your tensor beyond
these situations (these are not the only two options!).

### Case 2: the error came from PyTorch framework code

Sometimes the problem is that PyTorch framework code attempts to perform mixed
Tensor-DTensor operations. These are bugs in PyTorch; please file an issue
so that we can fix them.

On the user side, the only thing you can do is to avoid using the operation
that caused the issue and file a bug report.

For PyTorch Developers: one approach to fixing this is to rewrite PyTorch
framework code to avoid mixed Tensor-DTensor code (like in the previous section).

For PyTorch Developers: the second approach is to turn on DTensor implicit
replication inside the right places in PyTorch framework code.
When on, any mixed Tensor-DTensor operations will assume that the
non-DTensors can be replicated. Please be careful when using this as it
can lead to silent incorrectness.

- [Turning on implicit replication in Python](https://github.com/pytorch/pytorch/blob/d8e6b2fddc54c748d976e8f0ebe4b63ebe36d85b/torch/distributed/tensor/experimental/__init__.py#L15)
- [Turning on implicit replication in C++](https://github.com/pytorch/pytorch/blob/7a0f93344e2c851b9bcf2b9c3225a323d48fde26/aten/src/ATen/DTensorState.h#L10)
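For developers experimenting with the second approach, below is a minimal single-process sketch. It assumes the `implicit_replication` context manager exported from `torch.distributed.tensor.experimental` (the Python entry point linked above); treat the exact spelling and import path as an assumption rather than a guarantee.

```
import os
import torch
import torch.distributed as dist
from torch.distributed.tensor import DTensor, Replicate, init_device_mesh
from torch.distributed.tensor.experimental import implicit_replication

# Single-process setup purely for illustration.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

mesh = init_device_mesh("cpu", (1,))
dtensor = DTensor.from_local(torch.ones(4), mesh, placements=(Replicate(),))
plain = torch.arange(4.0)

# Inside this scope, plain torch.Tensors mixed into DTensor ops are assumed to
# be replicated across ranks; outside it, the mixed Tensor/DTensor error above
# is raised instead.
with implicit_replication():
    out = plain + dtensor

dist.destroy_process_group()
```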
@ -8,10 +8,6 @@
These APIs are experimental and subject to change without notice.
:::

```{eval-rst}
.. autoclass:: torch.fx.experimental.sym_node.DynamicInt
```

## torch.fx.experimental.symbolic_shapes

```{eval-rst}
@ -76,6 +76,7 @@ storage
torch.testing <testing>
torch.utils <utils>
torch.utils.benchmark <benchmark_utils>
torch.utils.bottleneck <bottleneck>
torch.utils.checkpoint <checkpoint>
torch.utils.cpp_extension <cpp_extension>
torch.utils.data <data>
24
setup.py
@ -227,6 +227,9 @@
# Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
# By default, it is only enabled on Windows.
#
# USE_PRIORITIZED_TEXT_FOR_LD
#   Uses prioritized text from cmake/prioritized_text.txt for LD
#
# BUILD_LIBTORCH_WHL
#   Builds libtorch.so and its dependencies as a wheel
#
@ -320,6 +323,7 @@ from tools.setup_helpers.env import (
    IS_LINUX,
    IS_WINDOWS,
)
from tools.setup_helpers.generate_linker_script import gen_linker_script


def str2bool(value: str | None) -> bool:
@ -1623,6 +1627,26 @@ def main() -> None:
    if BUILD_PYTHON_ONLY:
        install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]

    if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")):
        gen_linker_script(
            filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
        )
        linker_script_path = os.path.abspath("cmake/linker_script.ld")
        os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
        os.environ["CFLAGS"] = (
            os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
        )
        os.environ["CXXFLAGS"] = (
            os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
        )
    elif platform.system() == "Linux" and platform.processor() == "aarch64":
        print_box(
            """
            WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
            To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
            """
        )

    # Parse the command line and check the arguments before we proceed with
    # building deps and setup. We need to set values so `--help` works.
    dist = Distribution()
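As a usage note (a hypothetical sketch, not part of the diff above): a from-source build can opt into the linker-script path simply by exporting the environment variable before invoking `setup.py`, for example:

```
import os
import subprocess

# Assumes the current working directory is a PyTorch source checkout.
env = dict(os.environ, USE_PRIORITIZED_TEXT_FOR_LD="1")
subprocess.run(["python", "setup.py", "bdist_wheel"], env=env, check=True)
```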
@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]

import copy

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]


import torch

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]

import copy
import warnings

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]

import copy
import itertools

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]

import copy
import io

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]


import torch

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]

import warnings


@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]

import itertools
import re

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]


import logging

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import random


7
test/bottleneck_test/test.py
Normal file
@ -0,0 +1,7 @@
# Owner(s): ["module: unknown"]

import torch


x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()
17
test/bottleneck_test/test_args.py
Normal file
@ -0,0 +1,17 @@
# Owner(s): ["module: unknown"]

import argparse

import torch


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Required args. Raises error if they aren't passed.
    parser.add_argument("--foo", help="foo", required=True)
    parser.add_argument("--bar", help="bar", required=True)
    _ = parser.parse_args()

    x = torch.ones((3, 3), requires_grad=True)
    (3 * x).sum().backward()
29
test/bottleneck_test/test_cuda.py
Normal file
@ -0,0 +1,29 @@
# Owner(s): ["module: unknown"]

import torch
import torch.nn as nn


class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(20, 20)

    def forward(self, input):
        out = self.linear(input[:, 10:30])
        return out.sum()


def main():
    data = torch.randn(10, 50).cuda()
    model = Model().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    for _ in range(10):
        optimizer.zero_grad()
        loss = model(data)
        loss.backward()
        optimizer.step()


if __name__ == "__main__":
    main()
@ -117,49 +117,6 @@ class TestFullyShardStateDictMultiProcess(FSDPTest):
|
||||
for key, value in ref_sharded_sd.items():
|
||||
self.assertEqual(value, sharded_sd[key])
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_cached_state_dict(self):
|
||||
self.run_subtests(
|
||||
{"mlp_dim": [2, 3, 4, 5], "mutate_after_state_dict": [True, False]},
|
||||
self._test_cached_state_dict,
|
||||
)
|
||||
|
||||
def _test_cached_state_dict(self, mlp_dim: int, mutate_after_state_dict: bool):
|
||||
torch.manual_seed(42)
|
||||
model = nn.Linear(mlp_dim, mlp_dim, bias=False)
|
||||
fully_shard(model, reshard_after_forward=True)
|
||||
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
|
||||
|
||||
# call .state_dict() once and use `sd` directly to reduce cpu overhead
|
||||
sd = model.state_dict()
|
||||
assert isinstance(model.weight, DTensor)
|
||||
|
||||
if not mutate_after_state_dict:
|
||||
self.assertTrue(
|
||||
sd["weight"]._local_tensor.untyped_storage().data_ptr()
|
||||
== model.weight._local_tensor.untyped_storage().data_ptr()
|
||||
)
|
||||
else:
|
||||
model = model.cpu()
|
||||
model = model.cuda()
|
||||
self.assertTrue(
|
||||
sd["weight"]._local_tensor.untyped_storage().data_ptr()
|
||||
!= model.weight._local_tensor.untyped_storage().data_ptr()
|
||||
)
|
||||
|
||||
torch.manual_seed(42 + self.rank)
|
||||
inp = torch.rand(mlp_dim, mlp_dim, device="cuda")
|
||||
for _ in range(5):
|
||||
optim.zero_grad()
|
||||
loss = model(inp).sum()
|
||||
loss.backward()
|
||||
optim.step()
|
||||
if not mutate_after_state_dict:
|
||||
self.assertTrue(
|
||||
sd["weight"]._local_tensor.untyped_storage().data_ptr()
|
||||
== model.weight._local_tensor.untyped_storage().data_ptr()
|
||||
)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_dp_state_dict_cpu_offload(self):
|
||||
self.run_subtests(
|
||||
|
||||
@ -1,47 +1,22 @@
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
|
||||
import contextlib
|
||||
import copy
|
||||
import functools
|
||||
import itertools
|
||||
import unittest
|
||||
from collections.abc import Iterable
|
||||
from typing import Union
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
from torch.distributed._composable.replicate_with_fsdp import replicate
|
||||
from torch.distributed.fsdp import CPUOffloadPolicy, FSDPModule, OffloadPolicy
|
||||
from torch.distributed.tensor import DTensor, init_device_mesh
|
||||
from torch.distributed.fsdp import FSDPModule
|
||||
from torch.distributed.tensor import DTensor
|
||||
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
|
||||
from torch.testing._internal.common_fsdp import (
|
||||
check_sharded_parity,
|
||||
compiled_fsdp_test,
|
||||
FSDPTest,
|
||||
FSDPTestMultiThread,
|
||||
MLP,
|
||||
patch_all_gather,
|
||||
patch_reduce_scatter,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
get_cycles_per_ms,
|
||||
run_tests,
|
||||
TEST_HPU,
|
||||
wrapSwapTensorsTest,
|
||||
)
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
ModelArgs,
|
||||
Transformer,
|
||||
TransformerBlock,
|
||||
)
|
||||
from torch.testing._internal.common_fsdp import FSDPTestMultiThread, get_devtype, MLP
|
||||
from torch.testing._internal.common_utils import run_tests
|
||||
|
||||
|
||||
c10d_ops = torch.ops.c10d
|
||||
funcol = torch.ops.c10d_functional
|
||||
|
||||
from torch.testing._internal.common_fsdp import get_devtype
|
||||
|
||||
|
||||
device_type = torch.device(get_devtype())
|
||||
|
||||
@ -194,463 +169,5 @@ class TestReplicateRegisteredParams(FSDPTestMultiThread):
|
||||
self.assertEqual(param, ref_param)
|
||||
|
||||
|
||||
class TestReplicateCastAfterInit(FSDPTestMultiThread):
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return 2
|
||||
|
||||
@skip_if_lt_x_gpu(1)
|
||||
@wrapSwapTensorsTest(True)
|
||||
def test_to_float64_after_init(self):
|
||||
"""Tests that the user can cast the module to float64 after init."""
|
||||
# NOTE: Test fp64 instead of a lower precision dtype like bf16 for
|
||||
# better numerics. The important part is changing the dtype.
|
||||
|
||||
torch.manual_seed(42)
|
||||
mlp_dim, device, dtype = 4, device_type, torch.float64
|
||||
model = MLP(mlp_dim, device=device)
|
||||
for param in model.parameters():
|
||||
dist.broadcast(param, src=0)
|
||||
ref_model = copy.deepcopy(model).to(dtype)
|
||||
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
|
||||
for module in (model.in_proj, model.out_proj, model):
|
||||
replicate(module)
|
||||
model.to(dtype)
|
||||
for param in model.parameters():
|
||||
self.assertEqual(param.dtype, dtype)
|
||||
self.assertEqual(param.to_local().dtype, dtype)
|
||||
self.assertEqual(param._spec.tensor_meta.dtype, dtype)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
|
||||
check_sharded_parity(self, ref_model, model)
|
||||
torch.manual_seed(42 + self.rank + 1)
|
||||
inp = torch.randn((2, mlp_dim), device=device_type.type, dtype=dtype)
|
||||
for iter_idx in range(10):
|
||||
losses: list[torch.Tensor] = []
|
||||
for _model in (ref_model, model):
|
||||
losses.append(_model(inp).sum())
|
||||
losses[-1].backward()
|
||||
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.div_(self.world_size)
|
||||
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
check_sharded_parity(self, ref_model, model)
|
||||
for param in model.parameters():
|
||||
self.assertEqual(param.dtype, dtype)
|
||||
self.assertEqual(param.to_local().dtype, dtype)
|
||||
self.assertEqual(param._spec.tensor_meta.dtype, dtype)
|
||||
self.assertEqual(param.grad.dtype, dtype)
|
||||
self.assertEqual(param.grad.to_local().dtype, dtype)
|
||||
self.assertEqual(param.grad._spec.tensor_meta.dtype, dtype)
|
||||
for _optim in (ref_optim, optim):
|
||||
_optim.step()
|
||||
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
|
||||
|
||||
|
||||
class TestReplicate1DTrainingCore(FSDPTest):
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return min(8, torch.get_device_module(device_type).device_count())
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_train_parity_single_group(self):
|
||||
"""
|
||||
Tests train parity with DDP for a single FSDP group when sharding
|
||||
parameters on dim-0.
|
||||
"""
|
||||
self.run_subtests(
|
||||
{
|
||||
"lin_shapes": [
|
||||
[(16, 15), (15, 8)],
|
||||
[(7, 15), (15, 3)],
|
||||
[(16, 17), (17, 8)],
|
||||
],
|
||||
"use_shard_placement_fn": [False],
|
||||
},
|
||||
self._test_train_parity_single_group,
|
||||
)
|
||||
|
||||
def _test_train_parity_single_group(
|
||||
self, lin_shapes: list[tuple[int, int]], use_shard_placement_fn: bool
|
||||
):
|
||||
torch.manual_seed(42)
|
||||
model = nn.Sequential(
|
||||
nn.Linear(*lin_shapes[0]), nn.ReLU(), nn.Linear(*lin_shapes[1])
|
||||
)
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
|
||||
|
||||
replicate(model)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
|
||||
torch.manual_seed(42 + self.rank + 1)
|
||||
inp = (torch.randn((4, lin_shapes[0][0]), device=device_type.type),)
|
||||
for iter_idx in range(10):
|
||||
losses: list[torch.Tensor] = []
|
||||
for _model in (ref_model, model):
|
||||
losses.append(_model(*inp).sum())
|
||||
losses[-1].backward()
|
||||
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.div_(self.world_size)
|
||||
|
||||
for _optim in (ref_optim, optim):
|
||||
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
|
||||
_optim.step()
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@unittest.skipIf(TEST_HPU, "Sleep kernel not supported for HPU")
|
||||
@compiled_fsdp_test(compile_compute_on_module=Transformer)
|
||||
def test_train_parity_multi_groups(self):
|
||||
"""
|
||||
Tests train parity against DDP when using multiple parameter groups for
|
||||
communication (for communication and computation overlap plus memory
|
||||
reduction).
|
||||
"""
|
||||
self.run_subtests(
|
||||
{
|
||||
"reshard_after_forward": [True, False],
|
||||
"test_device_type": [device_type.type],
|
||||
"offload_policy": [OffloadPolicy()],
|
||||
"delay_after_forward": [False, True],
|
||||
"delay_before_all_gather": [False, True],
|
||||
"delay_before_reduce_scatter": [False, True],
|
||||
"delay_before_optim": [False, True],
|
||||
"unshard_async_op": [False],
|
||||
},
|
||||
self._test_train_parity_multi_group,
|
||||
)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
|
||||
def test_train_parity_multi_group_cpu_offload_eager(self):
|
||||
"""
|
||||
Tests train parity when using multiple parameter groups for
|
||||
communication and CPU offloading.
|
||||
"""
|
||||
self.run_subtests(
|
||||
{
|
||||
"reshard_after_forward": [True], # save CI time
|
||||
"offload_policy": [
|
||||
CPUOffloadPolicy(pin_memory=True),
|
||||
CPUOffloadPolicy(pin_memory=False),
|
||||
],
|
||||
"test_device_type": [device_type.type],
|
||||
"delay_after_forward": [False, True],
|
||||
"delay_before_all_gather": [False, True],
|
||||
"delay_before_reduce_scatter": [False, True],
|
||||
"delay_before_optim": [False, True],
|
||||
"unshard_async_op": [False],
|
||||
},
|
||||
self._test_train_parity_multi_group,
|
||||
)
|
||||
|
||||
def _test_train_parity_multi_group(
|
||||
self,
|
||||
reshard_after_forward: Union[bool, int],
|
||||
offload_policy: OffloadPolicy,
|
||||
test_device_type: str,
|
||||
delay_after_forward: bool,
|
||||
delay_before_all_gather: bool,
|
||||
delay_before_reduce_scatter: bool,
|
||||
delay_before_optim: bool,
|
||||
unshard_async_op: bool,
|
||||
):
|
||||
# Only test individual delays or all four delays to save test time
|
||||
if (
|
||||
delay_after_forward
|
||||
+ delay_before_all_gather
|
||||
+ delay_before_reduce_scatter
|
||||
+ delay_before_optim
|
||||
in (2, 3)
|
||||
):
|
||||
return
|
||||
assert test_device_type in ("cuda", "hpu", "xpu", "cpu"), f"{test_device_type}"
|
||||
torch.manual_seed(42)
|
||||
vocab_size = 1024
|
||||
model_args = ModelArgs(
|
||||
n_layers=3,
|
||||
n_heads=4,
|
||||
vocab_size=vocab_size,
|
||||
max_seq_len=64,
|
||||
dropout_p=0,
|
||||
)
|
||||
model = Transformer(model_args)
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
|
||||
mesh = init_device_mesh(
|
||||
test_device_type,
|
||||
(self.world_size, 1),
|
||||
mesh_dim_names=("replicate", "shard"),
|
||||
)
|
||||
fully_shard_fn = functools.partial(
|
||||
replicate,
|
||||
device_mesh=mesh,
|
||||
reshard_after_forward=reshard_after_forward,
|
||||
offload_policy=offload_policy,
|
||||
)
|
||||
for module in model.modules():
|
||||
if isinstance(module, TransformerBlock):
|
||||
fully_shard_fn(module)
|
||||
fully_shard_fn(model)
|
||||
if unshard_async_op:
|
||||
model._set_unshard_async_op(unshard_async_op)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
|
||||
|
||||
delay_in_ms = 100
|
||||
orig_all_gather = dist.all_gather_into_tensor
|
||||
orig_reduce_scatter = dist.reduce_scatter_tensor
|
||||
|
||||
def delayed_all_gather(*args, **kwargs):
|
||||
torch.get_device_module(device_type)._sleep(
|
||||
int(delay_in_ms * get_cycles_per_ms())
|
||||
)
|
||||
return orig_all_gather(*args, **kwargs)
|
||||
|
||||
def delayed_reduce_scatter(*args, **kwargs):
|
||||
torch.get_device_module(device_type)._sleep(
|
||||
int(delay_in_ms * get_cycles_per_ms())
|
||||
)
|
||||
return orig_reduce_scatter(*args, **kwargs)
|
||||
|
||||
torch.manual_seed(42 + self.rank + 1)
|
||||
patch_all_gather_ctx = (
|
||||
patch_all_gather(delayed_all_gather)
|
||||
if delay_before_all_gather
|
||||
else contextlib.nullcontext()
|
||||
)
|
||||
patch_reduce_scatter_ctx = (
|
||||
patch_reduce_scatter(delayed_reduce_scatter)
|
||||
if delay_before_reduce_scatter
|
||||
else contextlib.nullcontext()
|
||||
)
|
||||
with patch_all_gather_ctx, patch_reduce_scatter_ctx:
|
||||
for iter_idx in range(10):
|
||||
inp = torch.randint(0, vocab_size, (3, 64), device=device_type)
|
||||
losses: list[torch.Tensor] = []
|
||||
for _model, _optim in ((ref_model, ref_optim), (model, optim)):
|
||||
losses.append(_model(inp).sum())
|
||||
if _model is model and delay_after_forward:
|
||||
torch.get_device_module(device_type)._sleep(
|
||||
int(delay_in_ms * get_cycles_per_ms())
|
||||
)
|
||||
losses[-1].backward()
|
||||
if _model is model and delay_before_optim:
|
||||
torch.get_device_module(device_type)._sleep(
|
||||
int(delay_in_ms * get_cycles_per_ms())
|
||||
)
|
||||
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.div_(self.world_size)
|
||||
|
||||
for _optim in (ref_optim, optim):
|
||||
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
|
||||
_optim.step()
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_non_root_forward_backward(self):
|
||||
"""
|
||||
Tests running forward/backward through the root and then through a
|
||||
non-root. The non-root needs to synchronize streams/queue the callback.
|
||||
"""
|
||||
torch.manual_seed(42)
|
||||
lin_dim = 32
|
||||
model = nn.Sequential(*[MLP(lin_dim, torch.device("cpu")) for _ in range(3)])
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
|
||||
for mlp in model:
|
||||
replicate(mlp)
|
||||
replicate(model)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
|
||||
torch.manual_seed(42 + self.rank)
|
||||
inp = torch.randn((8, lin_dim), device=device_type)
|
||||
|
||||
ref_root_loss = ref_model(inp).sum()
|
||||
ref_root_loss.backward()
|
||||
for param in ref_model.parameters():
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.detach().div_(self.world_size)
|
||||
ref_optim.step()
|
||||
ref_optim.zero_grad()
|
||||
ref_nonroot_loss = ref_model[0](inp).sum()
|
||||
ref_nonroot_loss.backward()
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.detach().div_(self.world_size)
|
||||
ref_optim.step()
|
||||
|
||||
root_loss = model(inp).sum()
|
||||
root_loss.backward()
|
||||
torch.get_device_module(device_type)._sleep(int(100 * get_cycles_per_ms()))
|
||||
optim.step()
|
||||
optim.zero_grad()
|
||||
nonroot_loss = model[0](inp).sum()
|
||||
nonroot_loss.backward()
|
||||
optim.step()
|
||||
|
||||
self.assertEqual(ref_root_loss, root_loss)
|
||||
self.assertEqual(ref_nonroot_loss, nonroot_loss)
|
||||
self.assertEqual(ref_model(inp).sum(), model(inp).sum())
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_multi_forward_module(self):
|
||||
"""
|
||||
Tests parity when running a module that participates multiple
|
||||
times in forward.
|
||||
"""
|
||||
self.run_subtests(
|
||||
{"reshard_after_forward": [True, False]},
|
||||
self._test_multi_forward_module,
|
||||
)
|
||||
|
||||
def _test_multi_forward_module(self, reshard_after_forward: Union[bool, int]):
|
||||
class MultiForwardModule(nn.Module):
|
||||
def __init__(self, device: torch.device):
|
||||
super().__init__()
|
||||
self.inner = nn.Linear(4, 4, device=device)
|
||||
self.outer = nn.Linear(4, 5, device=device)
|
||||
|
||||
def forward(self, x):
|
||||
i = self.inner(x)
|
||||
j = self.inner(x)
|
||||
return self.outer(i + j)
|
||||
|
||||
torch.manual_seed(42)
|
||||
model = MultiForwardModule(device=device_type.type)
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
|
||||
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
|
||||
replicate(model.inner)
|
||||
replicate(model)
|
||||
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
|
||||
|
||||
torch.manual_seed(42 + self.rank)
|
||||
inp = torch.randn((32, 4), device=device_type.type)
|
||||
for iter_idx in range(10):
|
||||
losses: list[torch.Tensor] = []
|
||||
for _model in (ref_model, model):
|
||||
losses.append(_model(inp).sum())
|
||||
losses[-1].backward()
|
||||
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.div_(self.world_size)
|
||||
|
||||
for _optim in (ref_optim, optim):
|
||||
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
|
||||
_optim.step()
|
||||
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_explicit_prefetching(self):
|
||||
torch.manual_seed(42)
|
||||
model_args = ModelArgs(n_layers=8, dropout_p=0.0)
|
||||
model = Transformer(model_args)
|
||||
ref_model = copy.deepcopy(model).to(device_type)
|
||||
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
|
||||
|
||||
for layer in itertools.chain(model.layers, [model]):
|
||||
replicate(layer)
|
||||
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
|
||||
|
||||
num_to_forward_prefetch = num_to_backward_prefetch = 2
|
||||
for i, layer in enumerate(model.layers):
|
||||
if i >= len(model.layers) - num_to_forward_prefetch:
|
||||
break
|
||||
layers_to_prefetch = [
|
||||
model.layers[i + j] for j in range(1, num_to_forward_prefetch + 1)
|
||||
]
|
||||
layer.set_modules_to_forward_prefetch(layers_to_prefetch)
|
||||
for i, layer in enumerate(model.layers):
|
||||
if i < num_to_backward_prefetch:
|
||||
continue
|
||||
layers_to_prefetch = [
|
||||
model.layers[i - j] for j in range(1, num_to_backward_prefetch + 1)
|
||||
]
|
||||
layer.set_modules_to_backward_prefetch(layers_to_prefetch)
|
||||
|
||||
torch.manual_seed(42 + self.rank)
|
||||
inp = torch.randint(0, model_args.vocab_size, (2, 8), device=device_type.type)
|
||||
for _ in range(10):
|
||||
losses: list[torch.Tensor] = []
|
||||
|
||||
for _model in (ref_model, model):
|
||||
losses.append(_model(inp).sum())
|
||||
losses[-1].backward()
|
||||
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.div_(self.world_size)
|
||||
|
||||
for _optim in (ref_optim, optim):
|
||||
_optim.zero_grad()
|
||||
_optim.step()
|
||||
|
||||
self.assertEqual(losses[0], losses[1])
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
|
||||
def test_post_optim_event(self):
|
||||
torch.manual_seed(42)
|
||||
model_args = ModelArgs(dropout_p=0.0)
|
||||
model = Transformer(model_args)
|
||||
ref_model = copy.deepcopy(model).to(device_type.type)
|
||||
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
|
||||
for layer in itertools.chain(model.layers, [model]):
|
||||
replicate(layer)
|
||||
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
|
||||
|
||||
def step_post_hook(
|
||||
fsdp_module: FSDPModule, opt: torch.optim.Optimizer, args, kwargs
|
||||
) -> None:
|
||||
post_optim_event = (
|
||||
torch.get_device_module(device_type).current_stream().record_event()
|
||||
)
|
||||
fsdp_module.set_post_optim_event(post_optim_event)
|
||||
|
||||
optim.register_step_post_hook(functools.partial(step_post_hook, model))
|
||||
|
||||
torch.manual_seed(42 + self.rank)
|
||||
inp = torch.randint(0, model_args.vocab_size, (2, 8), device=device_type.type)
|
||||
# Track all losses and check for equality at the end to avoid a CPU
|
||||
# sync point after each iteration
|
||||
ref_losses: list[torch.Tensor] = []
|
||||
losses: list[torch.Tensor] = []
|
||||
for _ in range(10):
|
||||
ref_optim.zero_grad()
|
||||
ref_losses.append(ref_model(inp).sum())
|
||||
ref_losses[-1].backward()
|
||||
|
||||
for param in ref_model.parameters():
|
||||
if param.grad is not None:
|
||||
dist.all_reduce(param.grad)
|
||||
param.grad.div_(self.world_size)
|
||||
|
||||
ref_optim.step()
|
||||
for _ in range(10):
|
||||
optim.zero_grad()
|
||||
losses.append(model(inp).sum())
|
||||
losses[-1].backward()
|
||||
optim.step()
|
||||
# Sleep after the optimizer step to allow CPU to run ahead into the
|
||||
# next iteration's forward, exercising the post-optim stream sync
|
||||
torch.get_device_module(device_type)._sleep(int(25 * get_cycles_per_ms()))
|
||||
for ref_loss, loss in zip(ref_losses, losses):
|
||||
self.assertEqual(ref_loss, loss)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -41,7 +41,7 @@ class TestDTensorDebugMode(TestCase):
|
||||
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
|
||||
y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False)
|
||||
|
||||
with DebugMode(record_torchfunction=True) as debug_mode:
|
||||
with DebugMode() as debug_mode:
|
||||
torch.mm(x_dtensor, y_dtensor).sum()
|
||||
|
||||
self.assertExpectedInline(
|
||||
@ -80,7 +80,7 @@ class TestDTensorDebugMode(TestCase):
|
||||
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
|
||||
y_dtensor = DTensor.from_local(y, mesh, [Shard(1)], run_check=False)
|
||||
|
||||
with DebugMode(record_torchfunction=True) as debug_mode:
|
||||
with DebugMode() as debug_mode:
|
||||
z = x_dtensor + y_dtensor
|
||||
z.sum().backward()
|
||||
|
||||
@ -121,7 +121,7 @@ class TestDTensorDebugMode(TestCase):
|
||||
b_dt = DTensor.from_local(b, mesh, [Replicate(), Partial()], run_check=False)
|
||||
|
||||
# Capture the operator decomposition
|
||||
with DebugMode(record_torchfunction=True) as debug_mode:
|
||||
with DebugMode() as debug_mode:
|
||||
torch.einsum("bld,dnh->blnh", a_dt, b_dt)
|
||||
|
||||
self.assertExpectedInline(
|
||||
@ -176,7 +176,7 @@ class TestDTensorDebugMode(TestCase):
|
||||
x = torch.randn(8, 8, 8)
|
||||
linear = torch.nn.Linear(8, 8)
|
||||
|
||||
with DebugMode(record_torchfunction=True) as debug_mode:
|
||||
with DebugMode() as debug_mode:
|
||||
linear(x).sum()
|
||||
|
||||
self.assertExpectedInline(
|
||||
@ -196,7 +196,7 @@ class TestDTensorDebugMode(TestCase):
|
||||
x = torch.randn(8, 8)
|
||||
y = torch.randn(8, 8, 8)
|
||||
|
||||
with DebugMode(record_torchfunction=True, record_faketensor=True) as debug_mode:
|
||||
with DebugMode(record_faketensor=True) as debug_mode:
|
||||
torch.matmul(y, x)
|
||||
|
||||
self.assertExpectedInline(
|
||||
|
||||
@ -9,11 +9,12 @@ from typing import Union
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn.functional as F
|
||||
from torch import Tensor
|
||||
from torch import nn, Tensor
|
||||
from torch.distributed.device_mesh import init_device_mesh
|
||||
from torch.distributed.tensor import DeviceMesh
|
||||
from torch.distributed.tensor.debug import CommDebugMode
|
||||
from torch.distributed.tensor.experimental._attention import (
|
||||
_AttentionContextParallel,
|
||||
_CausalBehavior,
|
||||
_cp_options,
|
||||
_DispatchMode,
|
||||
@ -23,6 +24,7 @@ from torch.distributed.tensor.experimental._attention import (
|
||||
context_parallel_unshard,
|
||||
set_rotate_method,
|
||||
)
|
||||
from torch.distributed.tensor.parallel import parallelize_module
|
||||
from torch.nn.attention import sdpa_kernel, SDPBackend
|
||||
from torch.nn.attention.flex_attention import (
|
||||
_mask_mod_signature,
|
||||
@ -40,6 +42,8 @@ from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
|
||||
from torch.testing._internal.common_utils import run_tests, skipIfRocm
|
||||
from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
DTensorTestBase,
|
||||
ModelArgs,
|
||||
Transformer,
|
||||
with_comms,
|
||||
)
|
||||
|
||||
@ -269,6 +273,180 @@ class RingAttentionTest(DTensorTestBase):
|
||||
behavior,
|
||||
)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@unittest.skipIf(
|
||||
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
|
||||
)
|
||||
@with_comms
|
||||
def test_ring_attention_native_transformer(self) -> None:
|
||||
self.run_subtests(
|
||||
{
|
||||
"is_causal": [True, False],
|
||||
"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL],
|
||||
},
|
||||
self._test_ring_attention_native_transformer,
|
||||
)
|
||||
|
||||
@sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
|
||||
def _test_ring_attention_native_transformer(
|
||||
self, is_causal: bool, rotater: _RotateMethod
|
||||
) -> None:
|
||||
_cp_options.enable_load_balance = is_causal
|
||||
set_rotate_method(rotater_enum_to_str[rotater])
|
||||
self.assertEqual(_cp_options.rotate_method, rotater)
|
||||
device_mesh = DeviceMesh(
|
||||
self.device_type,
|
||||
torch.arange(0, self.world_size),
|
||||
)
|
||||
dtype = torch.bfloat16
|
||||
bs = 8
|
||||
ntokens = 8
|
||||
dim = 32
|
||||
nheads = 8
|
||||
num_layers = 2
|
||||
|
||||
encoder_layer = nn.TransformerEncoderLayer(
|
||||
d_model=dim,
|
||||
nhead=nheads,
|
||||
dim_feedforward=dim,
|
||||
batch_first=True,
|
||||
).to(dtype)
|
||||
encoder_layer = parallelize_module(
|
||||
module=encoder_layer,
|
||||
device_mesh=device_mesh,
|
||||
parallelize_plan={
|
||||
"self_attn": _AttentionContextParallel(),
|
||||
},
|
||||
)
|
||||
model = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
|
||||
model = model.to(self.device_type).to(dtype)
|
||||
|
||||
mask = (
|
||||
nn.Transformer.generate_square_subsequent_mask(
|
||||
ntokens, device=self.device_type, dtype=dtype
|
||||
)
|
||||
if is_causal
|
||||
else None
|
||||
)
|
||||
seq = torch.rand((bs, ntokens, dim), device=self.device_type, dtype=dtype)
|
||||
|
||||
with CommDebugMode() as comm_mode:
|
||||
out = model(seq, mask=mask, is_causal=is_causal)
|
||||
|
||||
if rotater == _RotateMethod.ALL_TO_ALL:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_to_all_single: (self.world_size - 1)
|
||||
* num_layers,
|
||||
},
|
||||
)
|
||||
else:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_gather_into_tensor: num_layers,
|
||||
},
|
||||
)
|
||||
|
||||
with CommDebugMode() as comm_mode:
|
||||
out.sum().backward()
|
||||
|
||||
if rotater == _RotateMethod.ALL_TO_ALL:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_to_all_single: (self.world_size * 2 - 1)
|
||||
* num_layers,
|
||||
},
|
||||
)
|
||||
else:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_gather_into_tensor: num_layers,
|
||||
c10d_functional.all_to_all_single: self.world_size * num_layers,
|
||||
},
|
||||
)
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@unittest.skipIf(
|
||||
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
|
||||
)
|
||||
@with_comms
|
||||
@sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
|
||||
def test_ring_attention_custom_transformer(self) -> None:
|
||||
self.run_subtests(
|
||||
{"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL]},
|
||||
self._test_ring_attention_custom_transformer,
|
||||
)
|
||||
|
||||
def _test_ring_attention_custom_transformer(self, rotater: _RotateMethod) -> None:
|
||||
set_rotate_method(rotater_enum_to_str[rotater])
|
||||
self.assertEqual(_cp_options.rotate_method, rotater)
|
||||
device_mesh = DeviceMesh(
|
||||
self.device_type,
|
||||
torch.arange(0, self.world_size),
|
||||
)
|
||||
# early init DTensor RNG tracker to avoid the broadcast being captured in comm_mode
|
||||
torch.distributed.tensor._random.manual_seed(10, device_mesh)
|
||||
|
||||
dtype = torch.bfloat16
|
||||
bs = 2
|
||||
args = ModelArgs()
|
||||
|
||||
model = Transformer(args).to(dtype).to(self.device_type)
|
||||
|
||||
model = parallelize_module(
|
||||
module=model,
|
||||
device_mesh=device_mesh,
|
||||
parallelize_plan={
|
||||
f"layers.{i}.attention": _AttentionContextParallel()
|
||||
for i in range(args.n_layers)
|
||||
},
|
||||
)
|
||||
|
||||
seq = torch.randint(
|
||||
args.vocab_size, (bs, args.max_seq_len), device=self.device_type
|
||||
)
|
||||
|
||||
with CommDebugMode() as comm_mode:
|
||||
out = model(seq)
|
||||
|
||||
if rotater == _RotateMethod.ALL_TO_ALL:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_to_all_single: (self.world_size - 1)
|
||||
* args.n_layers,
|
||||
},
|
||||
)
|
||||
else:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{c10d_functional.all_gather_into_tensor: args.n_layers},
|
||||
)
|
||||
|
||||
with CommDebugMode() as comm_mode:
|
||||
out.sum().backward()
|
||||
|
||||
if rotater == _RotateMethod.ALL_TO_ALL:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_to_all_single: (self.world_size * 2 - 1)
|
||||
* args.n_layers,
|
||||
},
|
||||
)
|
||||
else:
|
||||
self.assertDictEqual(
|
||||
comm_mode.get_comm_counts(),
|
||||
{
|
||||
c10d_functional.all_gather_into_tensor: args.n_layers,
|
||||
c10d_functional.all_to_all_single: self.world_size * args.n_layers,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# Compile the flex_attention function
|
||||
compiled_flex_attention = torch.compile(flex_attention, dynamic=False, fullgraph=True)
|
||||
@ -354,12 +532,12 @@ def generate_doc_mask_mod(
|
||||
return doc_mask_mod
|
||||
|
||||
|
||||
class CPFlexAttentionTest(DTensorTestBase):
|
||||
class RingFlexAttentionTest(DTensorTestBase):
|
||||
@property
|
||||
def world_size(self) -> int:
|
||||
return 2
|
||||
|
||||
def _test_cp_flex_attention(
|
||||
def _test_ring_flex_attention(
|
||||
self, qkv_size, B=1, mask_func=causal_mask, atol=1e-6, rtol=1e-2
|
||||
) -> None:
|
||||
torch.cuda.manual_seed(10)
|
||||
@ -408,6 +586,15 @@ class CPFlexAttentionTest(DTensorTestBase):
|
||||
mesh_shape=(self.world_size,),
|
||||
mesh_dim_names=("cp",),
|
||||
)
|
||||
# NOTE: cp needs to know the sharding dimension
|
||||
# TODO: see if this can be moved to the cp context
|
||||
from torch.distributed.tensor.experimental._attention import _set_cp_global_var
|
||||
|
||||
_set_cp_global_var("cp_shard_dim", 2)
|
||||
self.assertEqual(
|
||||
torch.distributed.tensor.experimental._attention._cp_global_vars.cp_shard_dim,
|
||||
2,
|
||||
)
|
||||
|
||||
# NOTE: we do not test load balance here
|
||||
_cp_options.enable_load_balance = False
|
||||
@ -497,17 +684,17 @@ class CPFlexAttentionTest(DTensorTestBase):
|
||||
@unittest.skipIf(
|
||||
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
|
||||
)
|
||||
def test_cp_flex_attention(self) -> None:
|
||||
def test_ring_flex_attention(self) -> None:
|
||||
self.run_subtests(
|
||||
{"qkv_size": [128 * self.world_size, 2048]},
|
||||
self._test_cp_flex_attention,
|
||||
self._test_ring_flex_attention,
|
||||
)
|
||||
|
||||
# NOTE: Context Parallel should not be used for small attentions (block_size < 128)
|
||||
with self.assertRaisesRegex(AssertionError, "Tensor-likes are not close"):
|
||||
self.run_subtests(
|
||||
{"qkv_size": [64 * self.world_size]},
|
||||
self._test_cp_flex_attention,
|
||||
self._test_ring_flex_attention,
|
||||
)
|
||||
|
||||
# TODO: merge with the above test
|
||||
@ -516,7 +703,7 @@ class CPFlexAttentionTest(DTensorTestBase):
|
||||
@unittest.skipIf(
|
||||
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
|
||||
)
|
||||
def test_cp_flex_attention_document_mask(self) -> None:
|
||||
def test_ring_flex_attention_document_mask(self) -> None:
|
||||
random.seed(10)
|
||||
|
||||
# NOTE: Each (batch_size, seq_len) tuple introduces 2 create_block_mask
|
||||
@ -537,7 +724,7 @@ class CPFlexAttentionTest(DTensorTestBase):
|
||||
|
||||
# TODO: change this for-loop to run_subtests
|
||||
# Use a for-loop instead of run_subtests because we need to initialize the mask
|
||||
# for each subtest. This can be baked into self._test_cp_flex_attention as
|
||||
# for each subtest. This can be baked into self._test_ring_flex_attention as
|
||||
# a str argument denoting mask type.
|
||||
for batch_size, max_seq_len in itertools.product(
|
||||
batch_size_list, max_seq_len_list
|
||||
@ -551,7 +738,7 @@ class CPFlexAttentionTest(DTensorTestBase):
|
||||
|
||||
# construct testing function
|
||||
test_func = functools.partial(
|
||||
self._test_cp_flex_attention,
|
||||
self._test_ring_flex_attention,
|
||||
qkv_size=max_seq_len,
|
||||
B=batch_size,
|
||||
mask_func=document_causal_mask,
|
||||
|
||||
@ -7,7 +7,7 @@ import warnings
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.testing._internal.common_methods_invocations as common_ops
|
||||
from torch.distributed.tensor import distribute_tensor, DTensor, init_device_mesh, Shard
|
||||
from torch.distributed.tensor import DTensor, init_device_mesh
|
||||
from torch.overrides import resolve_name
|
||||
from torch.testing._internal.common_device_type import (
|
||||
instantiate_device_type_tests,
|
||||
@ -21,7 +21,6 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
|
||||
)
|
||||
from torch.utils import _pytree as pytree
|
||||
from torch.utils._pytree import tree_map
|
||||
from torch.utils.debug_mode import DebugMode
|
||||
|
||||
|
||||
# rewrite common size variables to sth can be sharded evenly
|
||||
@ -118,6 +117,7 @@ dtensor_fails = {
|
||||
xfail("cholesky"),
|
||||
xfail("cholesky_inverse"),
|
||||
xfail("cholesky_solve"),
|
||||
xfail("chunk"),
|
||||
xfail("combinations"),
|
||||
xfail("complex"),
|
||||
xfail("count_nonzero"),
|
||||
@ -661,36 +661,6 @@ class TestDTensorOps(DTensorOpTestBase):
|
||||
sample_inputs_filter=lambda s: s.kwargs["num_classes"] != -1,
|
||||
)
|
||||
|
||||
def test_mean(self):
|
||||
self.mesh = init_device_mesh(DEVICE_TYPE, (self.world_size,))
|
||||
|
||||
shape = [2 * self.world_size + 1, 2 * self.world_size]
|
||||
tensor = (
|
||||
torch.arange(shape[0] * shape[1], dtype=torch.float32)
|
||||
.reshape(shape)
|
||||
.to(DEVICE_TYPE)
|
||||
)
|
||||
|
||||
for is_evenly_shardable in [True]:
|
||||
if is_evenly_shardable:
|
||||
placement = [Shard(1)]
|
||||
reduce_dim = 1
|
||||
else:
|
||||
placement = [Shard(0)]
|
||||
reduce_dim = 0
|
||||
dtensor = distribute_tensor(tensor, self.mesh, placement)
|
||||
|
||||
with DebugMode(record_torchfunction=False) as debug_mode:
|
||||
mean = dtensor.mean(dim=reduce_dim)
|
||||
full_tensor = mean.full_tensor()
|
||||
|
||||
self.assertEqual(full_tensor, tensor.mean(dim=reduce_dim))
|
||||
|
||||
if is_evenly_shardable:
|
||||
self.assertFalse("redistribute_input" in debug_mode.debug_string())
|
||||
else:
|
||||
self.assertTrue("redistribute_input" in debug_mode.debug_string())
|
||||
|
||||
|
||||
# only instantiate tests for DEVICE_TYPE alone (i.e. either CPU or GPU)
|
||||
instantiate_device_type_tests(TestDTensorOps, globals(), only_for=(DEVICE_TYPE,))
|
||||
|
||||
@ -1,10 +1,8 @@
|
||||
# Copyright (c) Meta Platforms, Inc. and affiliates
|
||||
# Owner(s): ["oncall: distributed"]
|
||||
|
||||
import random
|
||||
|
||||
from torch.distributed.tensor._dtensor_spec import DTensorSpec
|
||||
from torch.distributed.tensor._op_schema import OpSchema, RuntimeSchemaInfo
|
||||
from torch.distributed.tensor._op_schema import OpSchema
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
|
||||
@ -12,108 +10,12 @@ class TestOpSchema(TestCase):
|
||||
def test_equality_checks_lists_of_dtensor_spec(self):
|
||||
"""If x == y, then we must have h(x) == h(y)."""
|
||||
dts = DTensorSpec(mesh=None, placements=tuple(), tensor_meta=None)
|
||||
schema1 = OpSchema(op=None, args_schema=(dts, [dts]), kwargs_schema={})
|
||||
schema2 = OpSchema(op=None, args_schema=(dts, [dts, dts]), kwargs_schema={})
|
||||
schema1 = OpSchema(op=None, args_schema=[dts, [dts]], kwargs_schema={})
|
||||
schema2 = OpSchema(op=None, args_schema=[dts, [dts, dts]], kwargs_schema={})
|
||||
# This is a regression test; these schemas used to compare equal.
|
||||
self.assertNotEqual(schema1, schema2)
|
||||
self.assertNotEqual(hash(schema1), hash(schema2))
|
||||
|
||||
def test_equality_respects_static_attributes(self):
|
||||
def _get_sample_op_schemas(static_arg_val, static_kwarg_val):
|
||||
dts = DTensorSpec(mesh=None, placements=tuple(), tensor_meta=None)
|
||||
static_argnum = 2
|
||||
static_kwargkey = ["statickwarg"]
|
||||
annotated_schemas = [
|
||||
(False, False, None),
|
||||
(True, False, RuntimeSchemaInfo(static_argnum=static_argnum)),
|
||||
(False, True, RuntimeSchemaInfo(static_kwargkey=static_kwargkey)),
|
||||
(
|
||||
True,
|
||||
True,
|
||||
RuntimeSchemaInfo(
|
||||
static_argnum=static_argnum, static_kwargkey=static_kwargkey
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
# non-tensor args show up in hash iff the argnum is static/
|
||||
# kwargs show up in hash iff their name is in static_kwargkey.
|
||||
# random elements are random because they are not supposed to matter for
|
||||
# equality at all.
|
||||
args_schema = (dts, random.randint(1, 1000000), static_arg_val)
|
||||
kwargs_schema = {
|
||||
"ignoredkwarg": random.randint(1, 1000000),
|
||||
"statickwarg": static_kwarg_val,
|
||||
}
|
||||
return [
|
||||
(
|
||||
has_static_arg,
|
||||
has_static_kwarg,
|
||||
OpSchema(
|
||||
op=None,
|
||||
args_schema=args_schema,
|
||||
kwargs_schema=kwargs_schema,
|
||||
schema_info=si,
|
||||
),
|
||||
)
|
||||
for (has_static_arg, has_static_kwarg, si) in annotated_schemas
|
||||
]
|
||||
|
||||
for lhs_has_static_arg, lhs_has_static_kwarg, lhs in _get_sample_op_schemas(
|
||||
1, 2
|
||||
):
|
||||
# Static arg/kwarg both match
|
||||
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
|
||||
1, 2
|
||||
):
|
||||
if (
|
||||
lhs_has_static_arg == rhs_has_static_arg
|
||||
and lhs_has_static_kwarg == rhs_has_static_kwarg
|
||||
):
|
||||
self.assertEqual(lhs, rhs)
|
||||
else:
|
||||
self.assertNotEqual(lhs, rhs)
|
||||
|
||||
# Static arg mismatch
|
||||
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
|
||||
3, 2
|
||||
):
|
||||
if (
|
||||
lhs_has_static_arg
|
||||
or rhs_has_static_arg
|
||||
or lhs_has_static_kwarg != rhs_has_static_kwarg
|
||||
):
|
||||
self.assertNotEqual(lhs, rhs)
|
||||
else:
|
||||
self.assertEqual(lhs, rhs)
|
||||
|
||||
# Static kwarg mismatch
|
||||
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
|
||||
1, 3
|
||||
):
|
||||
if (
|
||||
lhs_has_static_kwarg
|
||||
or rhs_has_static_kwarg
|
||||
or lhs_has_static_arg != rhs_has_static_arg
|
||||
):
|
||||
self.assertNotEqual(lhs, rhs)
|
||||
else:
|
||||
self.assertEqual(lhs, rhs)
|
||||
|
||||
# Static arg/kwarg both mismatch
|
||||
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
|
||||
3, 4
|
||||
):
|
||||
if (
|
||||
lhs_has_static_arg
|
||||
or rhs_has_static_arg
|
||||
or lhs_has_static_kwarg
|
||||
or rhs_has_static_kwarg
|
||||
):
|
||||
self.assertNotEqual(lhs, rhs)
|
||||
else:
|
||||
self.assertEqual(lhs, rhs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -63,20 +63,22 @@ def nvshmem_get_kernel(
|
||||
|
||||
@triton.jit
|
||||
def nvshmem_putmem_signal_block_kernel(
|
||||
dst,
|
||||
src,
|
||||
dst_ptr,
|
||||
src_ptr,
|
||||
size_bytes,
|
||||
signal,
|
||||
sig_val,
|
||||
sig_ptr,
|
||||
signal_val,
|
||||
sig_op,
|
||||
peer,
|
||||
):
|
||||
nvshmem.putmem_signal_block(dst, src, size_bytes, signal, sig_val, sig_op, peer)
|
||||
nvshmem.putmem_signal_block(
|
||||
dst_ptr, src_ptr, size_bytes, sig_ptr, signal_val, sig_op, peer
|
||||
)
|
||||
|
||||
|
||||
@triton.jit
|
||||
def nvshmem_signal_wait_until_kernel(signal, cmp_op, cmp_val):
|
||||
nvshmem.signal_wait_until(signal, cmp_op, cmp_val)
|
||||
def nvshmem_signal_wait_until_kernel(sig_ptr, cmp_op, cmp_val):
|
||||
nvshmem.signal_wait_until(sig_ptr, cmp_op, cmp_val)
|
||||
|
||||
|
||||
@triton.jit
|
||||
@ -417,7 +419,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||
val = 11
|
||||
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
|
||||
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
|
||||
symm_mem.rendezvous(inp, group=group_name)
|
||||
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
|
||||
out_hdl = symm_mem.rendezvous(out, group=group_name)
|
||||
|
||||
# Use the signal pad attached to the output symmetric memory handle
|
||||
@ -431,12 +433,15 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||
|
||||
if rank == 0:
|
||||
# Rank 0 puts into Rank 1
|
||||
dst_ptr = out_hdl.buffer_ptrs[peer]
|
||||
src_ptr = inp_hdl.buffer_ptrs[rank]
|
||||
sig_ptr = out_hdl.signal_pad_ptrs[peer]
|
||||
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
|
||||
out,
|
||||
inp,
|
||||
dst_ptr,
|
||||
src_ptr,
|
||||
size_bytes=msg_size_bytes,
|
||||
signal=flag,
|
||||
sig_val=SIGNAL_VAL,
|
||||
sig_ptr=sig_ptr,
|
||||
signal_val=SIGNAL_VAL,
|
||||
sig_op=NVSHMEM_SIGNAL_SET,
|
||||
peer=peer,
|
||||
extern_libs=nvshmem_lib,
|
||||
@ -444,8 +449,9 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||
|
||||
if rank == 1:
|
||||
# Wait until signal flag is set by Rank 0
|
||||
sig_ptr_local = out_hdl.signal_pad_ptrs[rank]
|
||||
nvshmem_signal_wait_until_kernel[(1,)](
|
||||
flag,
|
||||
sig_ptr_local,
|
||||
cmp_op=NVSHMEM_CMP_EQ,
|
||||
cmp_val=SIGNAL_VAL,
|
||||
extern_libs=nvshmem_lib,
|
||||
@ -479,7 +485,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||
val = 11
|
||||
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
|
||||
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
|
||||
symm_mem.rendezvous(inp, group=group_name)
|
||||
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
|
||||
out_hdl = symm_mem.rendezvous(out, group=group_name)
|
||||
|
||||
# Use the signal pad attached to the output symmetric memory handle
|
||||
@ -493,20 +499,24 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
|
||||
|
||||
if rank == 0:
|
||||
# Rank 0 puts into Rank 1
|
||||
dst_ptr = out_hdl.buffer_ptrs[peer]
|
||||
src_ptr = inp_hdl.buffer_ptrs[rank]
|
||||
sig_ptr = out_hdl.signal_pad_ptrs[peer]
|
||||
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
|
||||
out,
|
||||
inp,
|
||||
dst_ptr,
|
||||
src_ptr,
|
||||
size_bytes=msg_size_bytes,
|
||||
signal=flag,
|
||||
sig_val=SIGNAL_VAL,
|
||||
sig_ptr=sig_ptr,
|
||||
signal_val=SIGNAL_VAL,
|
||||
sig_op=NVSHMEM_SIGNAL_ADD,
|
||||
peer=peer,
|
||||
extern_libs=nvshmem_lib,
|
||||
)
|
||||
|
||||
if rank == 1:
|
||||
sig_ptr_local = out_hdl.signal_pad_ptrs[rank]
|
||||
nvshmem_signal_wait_until_kernel[(1, 1, 1)](
|
||||
flag,
|
||||
sig_ptr_local,
|
||||
cmp_op=NVSHMEM_CMP_EQ,
|
||||
cmp_val=SIGNAL_VAL,
|
||||
extern_libs=nvshmem_lib,
|
||||
@ -536,13 +546,10 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
FLAG_FINAL_VALUE = 42

# Use a single int64 symmetric tensor as our synchronization flag.
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(
FLAG_INITIAL_VALUE
)
symm_mem.rendezvous(flag, group=group_name)
expected_flag = torch.tensor(
[FLAG_FINAL_VALUE], dtype=torch.int32, device=self.device
)

nvshmem_barrier_all_kernel[(1,)](extern_libs=nvshmem_lib)

@ -558,15 +565,19 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
# Verification
torch.testing.assert_close(
flag,
expected_flag,
torch.tensor([FLAG_FINAL_VALUE], dtype=torch.int64, device=self.device),
)

if rank == 1:
# Rank 1 (the signaler)
val_to_put = torch.tensor(
[FLAG_FINAL_VALUE], dtype=torch.int64, device=self.device
)

# Launch a kernel to put the value to Rank 0's flag tensor.
nvshmem_put_kernel[(1,)](
flag, # Destination symmetric tensor on the remote PE
expected_flag, # Source data tensor (local)
val_to_put, # Source data tensor (local)
1, # Number of elements
peer, # The target PE (Rank 0)
extern_libs=nvshmem_lib,
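These hunks widen the synchronization flag from torch.int32 to torch.int64, consistent with the "single int64 symmetric tensor" comment above (NVSHMEM signal words being 64-bit is the apparent rationale, stated here as an assumption rather than something this diff confirms). A small runnable sketch of the dtype-consistent check the test now performs, with the distributed pieces stripped out:

import torch

FLAG_INITIAL_VALUE, FLAG_FINAL_VALUE = 0, 42

# 64-bit flag word, matching the updated test.
flag = torch.empty(1, dtype=torch.int64).fill_(FLAG_INITIAL_VALUE)

# A remote rank would overwrite the flag via a put; emulate that locally.
flag.fill_(FLAG_FINAL_VALUE)

torch.testing.assert_close(
    flag, torch.tensor([FLAG_FINAL_VALUE], dtype=torch.int64)
)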
@ -598,7 +609,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):

# Producer (rank 0) prepares the data to send
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val_to_put)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
# Consumer (rank 1) prepares the destination buffer
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
out_hdl = symm_mem.rendezvous(out, group=group_name)
@ -608,20 +619,24 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):

if rank == 0:
# Producer (rank 0): Puts data into rank 1's `out` buffer and then sets the flag
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=COMPLETION_FLAG_VAL,
sig_ptr=sig_ptr,
signal_val=COMPLETION_FLAG_VAL,
sig_op=NVSHMEM_SIGNAL_SET,
peer=peer,
extern_libs=nvshmem_lib,
)
elif rank == 1:
# Consumer (rank 1): Waits on the signal variable using `signal_wait_until`.
sig_ptr = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1, 1, 1)](
flag,
sig_ptr,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=COMPLETION_FLAG_VAL,
extern_libs=nvshmem_lib,
@ -674,10 +689,10 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
symm_mem.rendezvous(out2, group=group_name)

# Use regular symmetric memory tensor for flag
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(0)
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(0)
symm_mem.rendezvous(flag, group=group_name)
flag_update_val = torch.tensor(
[flag_val], dtype=torch.int32, device=self.device
[flag_val], dtype=torch.int64, device=self.device
)
NVSHMEM_CMP_EQ = 0 # compare equal

@ -710,7 +725,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
out2, val2 * torch.ones(numel, dtype=dtype, device=self.device)
)
torch.testing.assert_close(
flag, torch.tensor([flag_val], dtype=torch.int32, device=self.device)
flag, torch.tensor([flag_val], dtype=torch.int64, device=self.device)
)

@skipIfRocm
@ -732,9 +747,9 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):

inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(0)
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(0)
flag_update_val = torch.tensor(
[flag_val], dtype=torch.int32, device=self.device
[flag_val], dtype=torch.int64, device=self.device
)

symm_mem.rendezvous(inp, group=group_name)
@ -967,7 +982,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
torch.uint8,
torch.float16,
torch.float32,
# torch.float64, # Tensor-likes are not close
torch.float64,
torch.bfloat16,
],
)
@ -1120,7 +1135,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
torch.int64,
torch.float16,
torch.float32,
# torch.float64, # Tensor-likes are not close
torch.float64,
torch.bfloat16,
],
)

@ -386,7 +386,7 @@ class TestCustomBackendAPI(torch._dynamo.test_case.TestCase):
self.assertTrue(backend_run)


devices = ["cpu", "cuda", "hpu", "xpu"]
devices = ["cpu", "cuda", "hpu"]
instantiate_device_type_tests(TestOptimizations, globals(), only_for=devices)

if __name__ == "__main__":

@ -8,12 +8,7 @@ from torch._dynamo.callback import callback_handler, CallbackArgs, CallbackTrigg
from torch._dynamo.test_case import run_tests, TestCase
from torch._guards import CompileId
from torch.testing._internal.common_utils import TEST_WITH_ROCM
from torch.testing._internal.triton_utils import HAS_CUDA_AND_TRITON, requires_gpu


device_type = (
acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)
from torch.testing._internal.triton_utils import requires_cuda_and_triton


class CallbackTests(TestCase):
@ -66,7 +61,7 @@ class CallbackTests(TestCase):
@unittest.skipIf(
TEST_WITH_ROCM, "ROCm outputs a different number of autotuning logs"
)
@requires_gpu
@requires_cuda_and_triton
@torch._inductor.config.patch(force_disable_caches=True)
def test_triggers(self) -> None:
torch._dynamo.reset()
@ -96,9 +91,9 @@ class CallbackTests(TestCase):
torch._dynamo.graph_break()
return self.fc2(temp)

model = TinyModel().to(device_type)
model = TinyModel().to("cuda")
compiled_model = torch.compile(model, mode="max-autotune")
x = torch.randn(10, 10, device=device_type)
x = torch.randn(10, 10, device="cuda")

loss = compiled_model(x).sum()
loss.backward()
@ -116,13 +111,9 @@ end=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id
)
order.clear()

if not HAS_CUDA_AND_TRITON:
return

compiled_model.zero_grad()
loss = compiled_model(x).sum()
loss.backward()

self.assertExpectedInline(
"\n".join(order),
"""\

@ -48,6 +48,27 @@ class GenericCtxMgr:


class ErrorMessagesTest(LoggingTestCase):
def test_dynamic_shape_operator(self):
def fn():
return torch.nonzero(torch.rand([10, 10]))

self.assertExpectedInlineMunged(
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(),
"""\
Dynamic shape operator
Explanation: Operator `aten.nonzero.default`'s output shape depends on input Tensor data.
Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`

Developer debug context: aten.nonzero.default

For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0036.html

from user code:
File "test_error_messages.py", line N, in fn
return torch.nonzero(torch.rand([10, 10]))""",
)

def test_dynamic_shape_operator_no_meta_kernel(self):
def fn():
return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))
@ -70,6 +91,29 @@ from user code:
return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))""",
)

def test_data_dependent_operator(self):
def fn(x):
return x.item()

self.assertExpectedInlineMunged(
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(
torch.Tensor([1])
),
"""\
Unsupported Tensor.item() call with capture_scalar_outputs=False
Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False.
Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph.

Developer debug context: call_method TensorVariable() item () {}

For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0124.html

from user code:
File "test_error_messages.py", line N, in fn
return x.item()""",
)

def test_data_dependent_operator2(self):
def fn(x):
return torch.equal(x, x)

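The two graph-break messages added above each point at a Dynamo config switch. A small runnable illustration of both switches, independent of the test harness:

import torch

# Allow Tensor.item() inside a fullgraph compile (per the gb0124 hint).
torch._dynamo.config.capture_scalar_outputs = True
# Allow data-dependent output shapes such as torch.nonzero (per the gb0036 hint).
torch._dynamo.config.capture_dynamic_output_shape_ops = True


@torch.compile(backend="eager", fullgraph=True)
def scalar_fn(x):
    return x.item() + 1


@torch.compile(backend="eager", fullgraph=True)
def nonzero_fn(x):
    return torch.nonzero(x)


print(scalar_fn(torch.tensor([1.0])))
print(nonzero_fn(torch.tensor([0.0, 2.0, 0.0, 3.0])))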
@ -40,16 +40,11 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_GPU

# Defines all the kernels for tests
from torch.testing._internal.triton_utils import * # noqa: F403


device_type = (
acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)

T = TypeVar("T")

d = torch.ones(10, 10)
@ -1155,10 +1150,10 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
m = a.to(torch.float16)
return b.type(m.type())

@unittest.skipIf(not HAS_GPU, "requires gpu")
@unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
@make_test
def test_tensor_type2(a, b):
m = a.to(device_type)
m = a.to("cuda")
return m + b.type(m.type())

@make_test
@ -2087,12 +2082,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
mytuple = FunctionTests.MyNamedTuple(a, b)
return mytuple.add(), mytuple.static_method(), mytuple.class_method()

@make_test
def test_namedtuple_replace(a, b):
mytuple = FunctionTests.MyNamedTuple(a, b)
replaced = mytuple._replace(first=b)
return mytuple.first + mytuple.second + replaced.first + replaced.second

@make_test
def test_generic_namedtuple_user_methods(a, b):
mytuple = FunctionTests.MyGenericNamedTuple(a, b)
@ -4051,7 +4040,7 @@ class GraphModule(torch.nn.Module):
def f1():
mod1 = torch.get_device_module()
mod2 = torch.get_device_module("cpu")
mod3 = torch.get_device_module(torch.device(device_type))
mod3 = torch.get_device_module(torch.device("cuda"))
return mod1, mod2, mod3

self.assertEqual(f1(), torch.compile(f1, backend="eager", fullgraph=True)())
@ -4086,7 +4075,6 @@ class GraphModule(torch.nn.Module):
new_device = (
"cpu" if torch._C._get_accelerator() == torch.device("cuda") else "cuda"
)

old_get_device_module = torch.get_device_module

def new_get_device_module(device=None):
@ -4733,12 +4721,10 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
opt_fn(x, ys, zs[:1])

@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_gpu_current_device(self):
def test_cuda_current_device(self):
def fn(x):
y = torch.empty(
(2, 3),
dtype=torch.float32,
device=torch.accelerator.current_device_index(),
(2, 3), dtype=torch.float32, device=torch.cuda.current_device()
)
y.copy_(x)
return torch.sin(y + y.device.index)
@ -4746,11 +4732,11 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
counter = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(backend=counter, fullgraph=True)(fn)

with torch.accelerator.device_index(0):
with torch.cuda.device(0):
x = torch.randn(2, 3)
self.assertEqual(opt_fn(x), fn(x))
self.assertEqual(counter.frame_count, 1)
with torch.accelerator.device_index(1):
with torch.cuda.device(1):
self.assertEqual(opt_fn(x), fn(x))
self.assertEqual(counter.frame_count, 2)


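The hunk above moves this test back to the CUDA-specific device APIs. A condensed sketch of the same pattern outside the test harness (assumes a CUDA build; the second context needs at least two GPUs):

import torch


def fn(x):
    # torch.cuda.current_device() returns an index; passing an int as `device`
    # selects that CUDA device.
    y = torch.empty((2, 3), dtype=torch.float32, device=torch.cuda.current_device())
    y.copy_(x)
    return torch.sin(y + y.device.index)


if torch.cuda.is_available():
    with torch.cuda.device(0):
        out0 = fn(torch.randn(2, 3))
    if torch.cuda.device_count() > 1:
        with torch.cuda.device(1):
            out1 = fn(torch.randn(2, 3))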
@ -1,5 +1,6 @@
# Owner(s): ["module: dynamo"]
import contextlib
import os

import torch
import torch.fx
@ -195,6 +196,21 @@ class GraphRegionTrackerTests(TestCase):
)

def test_mismatched_global_state(self):
@contextlib.contextmanager
def _hip_allow_tf32():
# for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new
# and only for MI300+
hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
os.environ["HIPBLASLT_ALLOW_TF32"] = "1"

try:
yield
finally:
if hip_allow_tf32 is not None:
os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
else:
del os.environ["HIPBLASLT_ALLOW_TF32"]

def inner_fn(x, y):
x1 = x * 1
y1 = y + 1
@ -235,29 +251,31 @@ class GraphRegionTrackerTests(TestCase):
def reset_default_dtype():
torch.set_default_dtype(old_dtype)

for ctx in [
lambda: torch.set_grad_enabled(False),
torch.autograd.grad_mode.inference_mode,
lambda: torch.autograd.graph.disable_saved_tensors_hooks(
"This is not supported"
),
# lambda: torch.set_num_threads(2), : Unsupported
(set_default_dtype_bfloat16, reset_default_dtype),
(
lambda: torch.use_deterministic_algorithms(True),
lambda: torch.use_deterministic_algorithms(False),
),
# (lambda: torch.use_deterministic_algorithms(True, warn_only=True),
# lambda: torch.use_deterministic_algorithms(False)), : Unsupported
create_toggle_fns("allow_bf16_reduced_precision_reduction"),
create_toggle_fns("allow_fp16_reduced_precision_reduction"),
create_toggle_fns("allow_tf32"),
]:
self.assertExpectedInline(
self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx),
"""[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \
tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
for ctx in [
lambda: torch.set_grad_enabled(False),
torch.autograd.grad_mode.inference_mode,
lambda: torch.autograd.graph.disable_saved_tensors_hooks(
"This is not supported"
),
# lambda: torch.set_num_threads(2), : Unsupported
(set_default_dtype_bfloat16, reset_default_dtype),
(
lambda: torch.use_deterministic_algorithms(True),
lambda: torch.use_deterministic_algorithms(False),
),
# (lambda: torch.use_deterministic_algorithms(True, warn_only=True),
# lambda: torch.use_deterministic_algorithms(False)), : Unsupported
create_toggle_fns("allow_bf16_reduced_precision_reduction"),
create_toggle_fns("allow_fp16_reduced_precision_reduction"),
create_toggle_fns("allow_tf32"),
]:
self.assertExpectedInline(
self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx),
"""[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \
[['x1', 'y1', 'sum_1', 'o4'], ['x1_1', 'y1_1', 'sum_2', 'o5']]]""",
)
)

def test_mutation_tracking_simple(self):
def fn(x, y, z):

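The two hunks above add an environment-variable guard so the allow_tf32 toggle can run on ROCm. A minimal standalone sketch of the selection pattern; the context manager body mirrors the hunk, while the wrapped statements are only a placeholder for TF32-sensitive work:

import contextlib
import os

import torch


@contextlib.contextmanager
def _hip_allow_tf32():
    # On HIP/AMDGPU, TF32 support is new (MI300+) and gated behind this env var.
    saved = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
    os.environ["HIPBLASLT_ALLOW_TF32"] = "1"
    try:
        yield
    finally:
        if saved is not None:
            os.environ["HIPBLASLT_ALLOW_TF32"] = saved
        else:
            del os.environ["HIPBLASLT_ALLOW_TF32"]


# Only engage the guard on ROCm builds; elsewhere it is a no-op.
tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
    torch.backends.cuda.matmul.allow_tf32 = True   # placeholder for the guarded body
    torch.backends.cuda.matmul.allow_tf32 = False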
@ -116,6 +116,8 @@ num_guards_executed=0)
const_guard = guards.LAMBDA_GUARD(
root,
functools.partial(equals_match, expected=5),
{},
False,
equals_match_verbose_code_parts(5),
)
self.assertTrue(const_guard(5))
@ -405,10 +407,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(5), ["type(x) == int"])
guard_manager.add_lambda_guard(
functools.partial(ge_match, expected=5),
{},
False,
ge_match_verbose_code_parts(expected=5),
)
guard_manager.add_lambda_guard(
functools.partial(less_match, expected=10),
{},
False,
less_match_verbose_code_parts(expected=10),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 3)
@ -428,10 +434,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getattr_manager("x", "x", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.x),
{},
False,
equals_match_verbose_code_parts(foo.x),
)
guard_manager.getattr_manager("y", "y", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.y),
{},
False,
equals_match_verbose_code_parts(foo.y),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -474,10 +484,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getitem_manager(0, "", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[0]),
{},
False,
equals_match_verbose_code_parts(foo[0]),
)
guard_manager.getitem_manager(1, "", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[1]),
{},
False,
equals_match_verbose_code_parts(foo[1]),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -585,6 +599,8 @@ num_guards_executed=0)
lambda x: isinstance(x, Pair)
and isinstance(x.x, torch.Tensor)
and isinstance(x.y, int),
{},
False,
"global guard fail",
)

@ -635,6 +651,8 @@ num_guards_executed=0)
)
attr_manager.add_lambda_guard(
lambda x: x == 4,
{},
False,
"Expected value 4",
)

@ -675,6 +693,8 @@ num_guards_executed=0)

weakref_manager.add_lambda_guard(
lambda x: isinstance(x, torch.Tensor),
{},
False,
"global weakref fail",
)

@ -694,6 +714,8 @@ num_guards_executed=0)
)
foo_mgr.add_lambda_guard(
lambda x: x == 3,
{},
False,
"Expected value 3",
)
self.assertTrue(guard_manager.check(a))
@ -779,7 +801,7 @@ num_guards_executed=0)
# Add key-value manager (nothing : {"z" : 3})
self.assertTrue(root.check(f_locals))
dict_mgr.get_key_manager(1, "", nothing, default_mgr_enum).add_lambda_guard(
lambda x: x is nothing, ["x is nothing"]
lambda x: x is nothing, {}, False, ["x is nothing"]
)
self.assertTrue(root.check(f_locals))
value_mgr = dict_mgr.get_value_manager(

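Throughout these hunks, lambda guards gain two positional arguments ahead of the verbose code parts. A minimal sketch exercising the updated call shape; the call is copied verbatim from the RootGuardManager usages in this diff, not from any documented or stable API, and the meaning of the extra arguments is not spelled out here:

import torch

# Four-argument form used by these tests:
# (guard_fn, dict, bool, verbose_code_parts).
guard_manager = torch._dynamo.guards.RootGuardManager()
guard_manager.add_lambda_guard(
    lambda L: isinstance(L["x"], int), {"x": 0}, True, ["isinstance(L['x'], int)"]
)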
@ -7207,7 +7207,9 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1

guard_manager = torch._dynamo.guards.RootGuardManager()
guard_manager.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)

def injected(x):
return x + 42
@ -7232,27 +7234,33 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1

guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(lambda L: isinstance(L["x"], bool), [])
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), {"x": 0}, True, []
)

def injected_bool(x: bool):
return x + 102

guard_manager_int = torch._dynamo.guards.RootGuardManager()
guard_manager_int.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager_int.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)

def injected_int(x: int):
return x + 42

guard_manager_tensor = torch._dynamo.guards.RootGuardManager()
guard_manager_tensor.add_lambda_guard(
lambda L: isinstance(L["x"], torch.Tensor), []
lambda L: isinstance(L["x"], torch.Tensor), {"x": 0}, True, []
)

def injected_tensor(x: torch.Tensor):
return x + 100

guard_manager_str = torch._dynamo.guards.RootGuardManager()
guard_manager_str.add_lambda_guard(lambda L: isinstance(L["x"], str), [])
guard_manager_str.add_lambda_guard(
lambda L: isinstance(L["x"], str), {"x": 0}, True, []
)

def injected_str(x: str):
return x + "1"
@ -7329,7 +7337,10 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):

guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), ["isinstance(L['x'], bool)"]
lambda L: isinstance(L["x"], bool),
{"x": 0},
True,
["isinstance(L['x'], bool)"],
)

def injected_bool(x: bool):
@ -8467,24 +8478,43 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
def fn(x):
return x + 1

initial_state = read_state()
y = torch.randn(10)
try:
for round in range(3):
for i in range(len(initial_state)):
new_state = [False] * len(initial_state)
new_state[i] = True
write_state(new_state)
assert read_state() == new_state
last_state.clear()
fn(y)
assert last_state == new_state
if round == 0:
assert cnt == i + 1
else:
assert cnt == len(initial_state)
finally:
write_state(initial_state)
import contextlib

@contextlib.contextmanager
def _hip_allow_tf32():
# for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new
# and only for MI300+
hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
os.environ["HIPBLASLT_ALLOW_TF32"] = "1"

try:
yield
finally:
if hip_allow_tf32 is not None:
os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
else:
del os.environ["HIPBLASLT_ALLOW_TF32"]

tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
initial_state = read_state()
y = torch.randn(10)
try:
for round in range(3):
for i in range(len(initial_state)):
new_state = [False] * len(initial_state)
new_state[i] = True
write_state(new_state)
assert read_state() == new_state
last_state.clear()
fn(y)
assert last_state == new_state
if round == 0:
assert cnt == i + 1
else:
assert cnt == len(initial_state)
finally:
write_state(initial_state)

def test_grad_state_mutated(self):
prior = torch.is_grad_enabled()
@ -13263,7 +13293,7 @@ class MiscTestsDevice(torch._inductor.test_case.TestCase):
self.assertEqual(out, opt_out)

@unittest.skipIf(not TEST_MULTIGPU, "need multiple GPU")
def test_gpu_set_device(self, device):
def test_cuda_set_device(self, device):
def fn():
a = torch.ones(2, device=device)
torch.get_device_module(device).set_device(1)
@ -13343,26 +13373,6 @@ class MiscTestsDevice(torch._inductor.test_case.TestCase):
y = torch.tensor(5)
f(x, y)

def test_full_graph_capture_scalar_outputs(self):
@torch.compile(fullgraph=True)
def foo(a):
return torch.randn(5) * a.item()

# We expect to no longer raise here
foo(torch.tensor(2.0))

def test_full_graph_capture_dynamic_output_shape_ops(self):
def fn(x):
nz = torch.nonzero(x)
squared = nz * nz
sliced = torch.ops.aten.slice.Tensor(squared, dim=1, start=-2, end=None)
view = sliced.unsqueeze(dim=0)
return view.squeeze(dim=0)

example_inputs = (torch.randn(1, 1, 1, 1),)
# we expect to no longer raise here
torch.compile(fn, fullgraph=True)(*example_inputs)

def test_dynamic_float_scalar_tensor_coersion(self):
# Minified version of https://github.com/pytorch/pytorch/issues/158376#issuecomment-3079591367
class Foo:

@ -47,8 +47,7 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
self.assertEqual(len(PrecompileContext._new_cache_artifacts), 0)

result = PrecompileContext.serialize()
@ -83,9 +82,8 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
for key in PrecompileContext._backend_artifacts_by_key.keys():
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
for key in PrecompileContext._new_cache_artifacts_by_key.keys():
result = PrecompileContext.serialize_artifact_by_key(key)
assert isinstance(result, PrecompileCacheArtifact)
self.assertEqual(result.key, key)
@ -111,12 +109,11 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
# Find the key for the artifact of type "precompile_aot_autograd"
key = next(
k
for k, v in PrecompileContext._backend_artifacts_by_key.items()
for k, v in PrecompileContext._new_cache_artifacts_by_key.items()
if isinstance(v, EditablePrecompileCacheArtifact)
)


@ -109,8 +109,6 @@ class StructuredTraceTestingFormatter(logging.Formatter):
metadata["dynamo_start"]["stack"] = "STACK"
if "inductor_output_code" in metadata:
metadata["inductor_output_code"]["filename"] = "FILENAME"
if "file_path" in metadata["inductor_output_code"]:
metadata["inductor_output_code"]["file_path"] = "FILENAME"
if "stack" in metadata:
metadata["stack"] = "STACK"
if "compilation_metrics" in metadata:
@ -261,7 +259,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -295,7 +293,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -337,7 +335,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -359,7 +357,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
@ -391,7 +389,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -448,7 +446,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
@ -457,7 +455,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"bwd_compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -600,7 +598,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -613,7 +611,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -681,7 +679,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 16, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -700,7 +698,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -741,7 +739,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
@ -902,7 +900,7 @@ def forward(self, x, y):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -917,7 +915,7 @@ def forward(self, x, y):
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_post_grad_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "inductor_provenance_tracking_node_mappings", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "inductor_provenance_tracking_kernel_stack_traces", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "fx_graph_cache_hit", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}

Some files were not shown because too many files have changed in this diff.