mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-19 01:54:54 +08:00
Compare commits
120 Commits
ciflow/tru
...
dev/joona/
| Author | SHA1 | Date | |
|---|---|---|---|
| f203b98062 | |||
| b7208877c8 | |||
| f69815d77f | |||
| 1c04a43959 | |||
| 661fb53449 | |||
| 4e1b772103 | |||
| bdd3c3a29c | |||
| 9d8ceaa36f | |||
| 927899dc05 | |||
| a892f76d06 | |||
| 2ddcf53e1a | |||
| 689d731ece | |||
| b288d0020b | |||
| 4414e1bff0 | |||
| 694f9b943c | |||
| 01deee228a | |||
| 1233be0923 | |||
| 02b55c3f4a | |||
| ae3ce54f27 | |||
| 2f3bb7482c | |||
| 567dcdba75 | |||
| 77acc66df9 | |||
| 95d1df7d4e | |||
| 094e529c64 | |||
| a4c7bf7e8d | |||
| 22ccd44d73 | |||
| 39ebab1dd9 | |||
| 4c152a71ad | |||
| 1b43d6cd4e | |||
| 2b69673bbf | |||
| 2f74916e36 | |||
| 2b5eabc74b | |||
| 9ff95f6835 | |||
| 6fdb974f4a | |||
| 661d1653aa | |||
| 53809f9640 | |||
| 93ddd38ecd | |||
| 5804408f1b | |||
| 99117c1238 | |||
| b9bccec3bc | |||
| ca3aaef66e | |||
| f2e6f94081 | |||
| aa504d4d2a | |||
| d8ce6f8df9 | |||
| 4322354770 | |||
| 363385ad3e | |||
| e2e10753d7 | |||
| 5d99a795f5 | |||
| 2245d7d3b9 | |||
| 98b94b90dd | |||
| 5cdbda140c | |||
| 0ec53beaeb | |||
| 79fc0a9141 | |||
| d01a7b0241 | |||
| deabb3e36d | |||
| 79d2397b6b | |||
| 6ef3a62c36 | |||
| 530e782239 | |||
| c66a6c432e | |||
| 3d7a8b7e61 | |||
| de0d69b2c4 | |||
| bc60b86066 | |||
| d7782ddde7 | |||
| fb04e9ad03 | |||
| cfe799b4aa | |||
| b7f52773e6 | |||
| f6b54d8899 | |||
| da91bf5262 | |||
| 1c1638297e | |||
| ee0b5b4b1c | |||
| fcfb213c5a | |||
| 08042bbb9c | |||
| e20ca3bc2e | |||
| 4ed26f7382 | |||
| 4c79305b87 | |||
| f4b8c4f907 | |||
| d629b7a459 | |||
| 0922ba5f42 | |||
| c87295c044 | |||
| 7aa210d215 | |||
| 5a368b8010 | |||
| 602102be50 | |||
| 200156e385 | |||
| a2daf3fc86 | |||
| 52b45c16de | |||
| 2ef85bed5a | |||
| d99c6bcf69 | |||
| 8378abda84 | |||
| 5b42a5d9a6 | |||
| caca3f2eec | |||
| 9e2bf129e1 | |||
| c429b1fc5c | |||
| 1176b2b0b7 | |||
| dd37a1a434 | |||
| a74adcf80e | |||
| 5eac46a011 | |||
| e0fff31ae3 | |||
| 7ede33b8e3 | |||
| 065176cd97 | |||
| 02ee7dd7d3 | |||
| 99fdca8f4d | |||
| 9d1a74cb0c | |||
| 40e6f090d9 | |||
| bfddfde50c | |||
| b6570615f8 | |||
| 226850cc66 | |||
| f8a2ce3b9a | |||
| e2c6834584 | |||
| 0e7235ed73 | |||
| 3522e0ce74 | |||
| 50bf1f0b81 | |||
| c78e64622e | |||
| 5623628894 | |||
| 2aba180114 | |||
| 45b2c3d312 | |||
| 5b1e112cf9 | |||
| 5e6ac5c6e1 | |||
| 79317dc7a7 | |||
| 96a4c4b3d1 | |||
| 05bcfcc5d1 |
@ -75,9 +75,11 @@ if [[ "$ARCH" == "aarch64" ]]; then
|
||||
# ARM system libraries
|
||||
DEPS_LIST+=(
|
||||
"/usr/lib64/libgfortran.so.5"
|
||||
"/opt/OpenBLAS/lib/libopenblas.so.0"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libgfortran.so.5"
|
||||
"libopenblas.so.0"
|
||||
)
|
||||
fi
|
||||
|
||||
|
||||
@ -100,337 +100,6 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
def _compile_and_extract_symbols(
|
||||
cpp_content: str, compile_flags: list[str], exclude_list: list[str] | None = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Helper to compile a C++ file and extract all symbols.
|
||||
|
||||
Args:
|
||||
cpp_content: C++ source code to compile
|
||||
compile_flags: Compilation flags
|
||||
exclude_list: List of symbol names to exclude. Defaults to ["main"].
|
||||
|
||||
Returns:
|
||||
List of all symbols found in the object file (excluding those in exclude_list).
|
||||
"""
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
if exclude_list is None:
|
||||
exclude_list = ["main"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmppath = Path(tmpdir)
|
||||
cpp_file = tmppath / "test.cpp"
|
||||
obj_file = tmppath / "test.o"
|
||||
|
||||
cpp_file.write_text(cpp_content)
|
||||
|
||||
result = subprocess.run(
|
||||
compile_flags + [str(cpp_file), "-o", str(obj_file)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Compilation failed: {result.stderr}")
|
||||
|
||||
symbols = get_symbols(str(obj_file))
|
||||
|
||||
# Return all symbol names, excluding those in the exclude list
|
||||
return [name for _addr, _stype, name in symbols if name not in exclude_list]
|
||||
|
||||
|
||||
def check_stable_only_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test TORCH_STABLE_ONLY and TORCH_TARGET_VERSION by compiling test code and comparing symbol counts.
|
||||
|
||||
This approach tests:
|
||||
1. WITHOUT macros -> many torch symbols exposed
|
||||
2. WITH TORCH_STABLE_ONLY -> zero torch symbols (all hidden)
|
||||
3. WITH TORCH_TARGET_VERSION -> zero torch symbols (all hidden)
|
||||
4. WITH both macros -> zero torch symbols (all hidden)
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
test_cpp_content = """
|
||||
// Main torch C++ API headers
|
||||
#include <torch/torch.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
// ATen tensor library
|
||||
#include <ATen/ATen.h>
|
||||
|
||||
// Core c10 headers (commonly used)
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/core/TensorOptions.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
int main() { return 0; }
|
||||
"""
|
||||
|
||||
base_compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c", # Compile only, don't link
|
||||
]
|
||||
|
||||
# Compile WITHOUT any macros
|
||||
symbols_without = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=base_compile_flags,
|
||||
)
|
||||
|
||||
# We expect constexpr symbols, inline functions used by other headers etc.
|
||||
# to produce symbols
|
||||
num_symbols_without = len(symbols_without)
|
||||
print(f"Found {num_symbols_without} symbols without any macros defined")
|
||||
assert num_symbols_without != 0, (
|
||||
"Expected a non-zero number of symbols without any macros"
|
||||
)
|
||||
|
||||
# Compile WITH TORCH_STABLE_ONLY (expect 0 symbols)
|
||||
compile_flags_with_stable_only = base_compile_flags + ["-DTORCH_STABLE_ONLY"]
|
||||
|
||||
symbols_with_stable_only = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=compile_flags_with_stable_only,
|
||||
)
|
||||
|
||||
num_symbols_with_stable_only = len(symbols_with_stable_only)
|
||||
assert num_symbols_with_stable_only == 0, (
|
||||
f"Expected no symbols with TORCH_STABLE_ONLY macro, but found {num_symbols_with_stable_only}"
|
||||
)
|
||||
|
||||
# Compile WITH TORCH_TARGET_VERSION (expect 0 symbols)
|
||||
compile_flags_with_target_version = base_compile_flags + [
|
||||
"-DTORCH_TARGET_VERSION=1"
|
||||
]
|
||||
|
||||
symbols_with_target_version = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=compile_flags_with_target_version,
|
||||
)
|
||||
|
||||
num_symbols_with_target_version = len(symbols_with_target_version)
|
||||
assert num_symbols_with_target_version == 0, (
|
||||
f"Expected no symbols with TORCH_TARGET_VERSION macro, but found {num_symbols_with_target_version}"
|
||||
)
|
||||
|
||||
# Compile WITH both macros (expect 0 symbols)
|
||||
compile_flags_with_both = base_compile_flags + [
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
"-DTORCH_TARGET_VERSION=1",
|
||||
]
|
||||
|
||||
symbols_with_both = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=compile_flags_with_both,
|
||||
)
|
||||
|
||||
num_symbols_with_both = len(symbols_with_both)
|
||||
assert num_symbols_with_both == 0, (
|
||||
f"Expected no symbols with both macros, but found {num_symbols_with_both}"
|
||||
)
|
||||
|
||||
|
||||
def check_stable_api_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that stable API headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
The torch/csrc/stable/c/shim.h header is tested in check_stable_c_shim_symbols
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
stable_dir = include_dir / "torch" / "csrc" / "stable"
|
||||
assert stable_dir.exists(), f"Expected {stable_dir} to be present"
|
||||
|
||||
stable_headers = list(stable_dir.rglob("*.h"))
|
||||
if not stable_headers:
|
||||
raise RuntimeError("Could not find any stable headers")
|
||||
|
||||
includes = []
|
||||
for header in stable_headers:
|
||||
rel_path = header.relative_to(include_dir)
|
||||
includes.append(f"#include <{rel_path.as_posix()}>")
|
||||
|
||||
includes_str = "\n".join(includes)
|
||||
test_stable_content = f"""
|
||||
{includes_str}
|
||||
int main() {{ return 0; }}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_stable = _compile_and_extract_symbols(
|
||||
cpp_content=test_stable_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_stable = len(symbols_stable)
|
||||
print(f"Found {num_symbols_stable} symbols in torch/csrc/stable")
|
||||
assert num_symbols_stable > 0, (
|
||||
f"Expected stable headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_stable} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_headeronly_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that header-only utility headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
# Find all headers in torch/headeronly
|
||||
headeronly_dir = include_dir / "torch" / "headeronly"
|
||||
assert headeronly_dir.exists(), f"Expected {headeronly_dir} to be present"
|
||||
headeronly_headers = list(headeronly_dir.rglob("*.h"))
|
||||
if not headeronly_headers:
|
||||
raise RuntimeError("Could not find any headeronly headers")
|
||||
|
||||
# Filter out platform-specific headers that may not compile everywhere
|
||||
platform_specific_keywords = [
|
||||
"cpu/vec",
|
||||
]
|
||||
|
||||
filtered_headers = []
|
||||
for header in headeronly_headers:
|
||||
rel_path = header.relative_to(include_dir).as_posix()
|
||||
if not any(
|
||||
keyword in rel_path.lower() for keyword in platform_specific_keywords
|
||||
):
|
||||
filtered_headers.append(header)
|
||||
|
||||
includes = []
|
||||
for header in filtered_headers:
|
||||
rel_path = header.relative_to(include_dir)
|
||||
includes.append(f"#include <{rel_path.as_posix()}>")
|
||||
|
||||
includes_str = "\n".join(includes)
|
||||
test_headeronly_content = f"""
|
||||
{includes_str}
|
||||
int main() {{ return 0; }}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_headeronly = _compile_and_extract_symbols(
|
||||
cpp_content=test_headeronly_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_headeronly = len(symbols_headeronly)
|
||||
print(f"Found {num_symbols_headeronly} symbols in torch/headeronly")
|
||||
assert num_symbols_headeronly > 0, (
|
||||
f"Expected headeronly headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_headeronly} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_aoti_shim_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that AOTI shim headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
# There are no constexpr symbols etc., so we need to actually use functions
|
||||
# so that some symbols are found.
|
||||
test_shim_content = """
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
int main() {
|
||||
int32_t (*fp1)() = &aoti_torch_device_type_cpu;
|
||||
int32_t (*fp2)() = &aoti_torch_dtype_float32;
|
||||
(void)fp1; (void)fp2;
|
||||
return 0;
|
||||
}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_shim = _compile_and_extract_symbols(
|
||||
cpp_content=test_shim_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_shim = len(symbols_shim)
|
||||
assert num_symbols_shim > 0, (
|
||||
f"Expected shim headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_shim} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_stable_c_shim_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that stable C shim headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
# Check if the stable C shim exists
|
||||
stable_shim = include_dir / "torch" / "csrc" / "stable" / "c" / "shim.h"
|
||||
if not stable_shim.exists():
|
||||
raise RuntimeError("Could not find stable c shim")
|
||||
|
||||
# There are no constexpr symbols etc., so we need to actually use functions
|
||||
# so that some symbols are found.
|
||||
test_stable_shim_content = """
|
||||
#include <torch/csrc/stable/c/shim.h>
|
||||
int main() {
|
||||
// Reference stable C API functions to create undefined symbols
|
||||
AOTITorchError (*fp1)(const char*, uint32_t*, int32_t*) = &torch_parse_device_string;
|
||||
AOTITorchError (*fp2)(uint32_t*) = &torch_get_num_threads;
|
||||
(void)fp1; (void)fp2;
|
||||
return 0;
|
||||
}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_stable_shim = _compile_and_extract_symbols(
|
||||
cpp_content=test_stable_shim_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_stable_shim = len(symbols_stable_shim)
|
||||
assert num_symbols_stable_shim > 0, (
|
||||
f"Expected stable C shim headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_stable_shim} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_lib_symbols_for_abi_correctness(lib: str) -> None:
|
||||
print(f"lib: {lib}")
|
||||
cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
|
||||
@ -460,13 +129,6 @@ def main() -> None:
|
||||
check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
|
||||
check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)
|
||||
|
||||
# Check symbols when TORCH_STABLE_ONLY is defined
|
||||
check_stable_only_symbols(install_root)
|
||||
check_stable_api_symbols(install_root)
|
||||
check_headeronly_symbols(install_root)
|
||||
check_aoti_shim_symbols(install_root)
|
||||
check_stable_c_shim_symbols(install_root)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -389,6 +389,13 @@ test_lazy_tensor_meta_reference_disabled() {
|
||||
export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
|
||||
}
|
||||
|
||||
test_dynamo_core() {
|
||||
time python test/run_test.py \
|
||||
--include-dynamo-core-tests \
|
||||
--verbose \
|
||||
--upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_dynamo_wrapped_shard() {
|
||||
if [[ -z "$NUM_TEST_SHARDS" ]]; then
|
||||
@ -1814,6 +1821,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
test_inductor_shard "${SHARD_NUMBER}"
|
||||
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
|
||||
test_einops
|
||||
elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
|
||||
test_dynamo_core
|
||||
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
|
||||
install_torchvision
|
||||
test_dynamo_wrapped_shard "${SHARD_NUMBER}"
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
07b6cbde121417a70e4dc871adb6d27030e0ce3f
|
||||
ee1a1350eb37804b94334768f328144f058f14e9
|
||||
|
||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
||||
acccf86477759b2d3500f1ae1be065f7b1e409ec
|
||||
2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
|
||||
|
||||
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
||||
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
|
||||
94631807d22c09723dd006f7be5beb649d5f88d0
|
||||
|
||||
1
.github/pytorch-probot.yml
vendored
1
.github/pytorch-probot.yml
vendored
@ -7,6 +7,7 @@ ciflow_push_tags:
|
||||
- ciflow/binaries
|
||||
- ciflow/binaries_libtorch
|
||||
- ciflow/binaries_wheel
|
||||
- ciflow/dynamo
|
||||
- ciflow/h100
|
||||
- ciflow/h100-cutlass-backend
|
||||
- ciflow/h100-distributed
|
||||
|
||||
2
.github/workflows/_linux-test.yml
vendored
2
.github/workflows/_linux-test.yml
vendored
@ -326,7 +326,7 @@ jobs:
|
||||
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
|
||||
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
|
||||
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
|
||||
DOCKER_IMAGE: ${{ inputs.docker-image }}
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
|
||||
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
|
||||
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
||||
|
||||
70
.github/workflows/dynamo-unittest.yml
vendored
Normal file
70
.github/workflows/dynamo-unittest.yml
vendored
Normal file
@ -0,0 +1,70 @@
|
||||
# Workflow: Dynamo Unit Test
|
||||
# runs unit tests for dynamo.
|
||||
name: dynamo-unittest
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/dynamo/*
|
||||
workflow_call:
|
||||
schedule:
|
||||
- cron: 29 8 * * * # about 1:29am PDT
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
opt_out_experiments: lf
|
||||
|
||||
dynamo-build:
|
||||
name: dynamo-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11', '3.12']
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
dynamo-test:
|
||||
name: dynamo-test
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: [get-label-type, dynamo-build]
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11', '3.12']
|
||||
with:
|
||||
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
330
.spin/cmds.py
Normal file
330
.spin/cmds.py
Normal file
@ -0,0 +1,330 @@
|
||||
import hashlib
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import spin
|
||||
|
||||
|
||||
def file_digest(file, algorithm: str):
|
||||
try:
|
||||
return hashlib.file_digest(file, algorithm)
|
||||
except AttributeError:
|
||||
pass # Fallback to manual implementation below
|
||||
hash = hashlib.new(algorithm)
|
||||
while chunk := file.read(8192):
|
||||
hash.update(chunk)
|
||||
return hash
|
||||
|
||||
|
||||
def _hash_file(file):
|
||||
with open(file, "rb") as f:
|
||||
hash = file_digest(f, "sha256")
|
||||
return hash.hexdigest()
|
||||
|
||||
|
||||
def _hash_files(files):
|
||||
hashes = {file: _hash_file(file) for file in files}
|
||||
return hashes
|
||||
|
||||
|
||||
def _read_hashes(hash_file: Path):
|
||||
if not hash_file.exists():
|
||||
return {}
|
||||
with hash_file.open("r") as f:
|
||||
lines = f.readlines()
|
||||
hashes = {}
|
||||
for line in lines:
|
||||
hash = line[:64]
|
||||
file = line[66:].strip()
|
||||
hashes[file] = hash
|
||||
return hashes
|
||||
|
||||
|
||||
def _updated_hashes(hash_file, files_to_hash):
|
||||
old_hashes = _read_hashes(hash_file)
|
||||
new_hashes = _hash_files(files_to_hash)
|
||||
if new_hashes != old_hashes:
|
||||
return new_hashes
|
||||
return None
|
||||
|
||||
|
||||
@click.command()
|
||||
def regenerate_version():
|
||||
"""Regenerate version.py."""
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"tools.generate_torch_version",
|
||||
"--is-debug=false",
|
||||
]
|
||||
spin.util.run(cmd)
|
||||
|
||||
|
||||
TYPE_STUBS = [
|
||||
(
|
||||
"Pytorch type stubs",
|
||||
Path(".lintbin/.pytorch-type-stubs.sha256"),
|
||||
[
|
||||
"aten/src/ATen/native/native_functions.yaml",
|
||||
"aten/src/ATen/native/tags.yaml",
|
||||
"tools/autograd/deprecated.yaml",
|
||||
],
|
||||
[
|
||||
sys.executable,
|
||||
"-m",
|
||||
"tools.pyi.gen_pyi",
|
||||
"--native-functions-path",
|
||||
"aten/src/ATen/native/native_functions.yaml",
|
||||
"--tags-path",
|
||||
"aten/src/ATen/native/tags.yaml",
|
||||
"--deprecated-functions-path",
|
||||
"tools/autograd/deprecated.yaml",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Datapipes type stubs",
|
||||
None,
|
||||
[],
|
||||
[
|
||||
sys.executable,
|
||||
"torch/utils/data/datapipes/gen_pyi.py",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@click.command()
|
||||
def regenerate_type_stubs():
|
||||
"""Regenerate type stubs."""
|
||||
for name, hash_file, files_to_hash, cmd in TYPE_STUBS:
|
||||
if hash_file:
|
||||
if hashes := _updated_hashes(hash_file, files_to_hash):
|
||||
click.echo(
|
||||
f"Changes detected in type stub files for {name}. Regenerating..."
|
||||
)
|
||||
spin.util.run(cmd)
|
||||
hash_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
with hash_file.open("w") as f:
|
||||
for file, hash in hashes.items():
|
||||
f.write(f"{hash} {file}\n")
|
||||
click.echo("Type stubs and hashes updated.")
|
||||
else:
|
||||
click.echo(f"No changes detected in type stub files for {name}.")
|
||||
else:
|
||||
click.echo(f"No hash file for {name}. Regenerating...")
|
||||
spin.util.run(cmd)
|
||||
click.echo("Type stubs regenerated.")
|
||||
|
||||
|
||||
@click.command()
|
||||
def regenerate_clangtidy_files():
|
||||
"""Regenerate clang-tidy files."""
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"tools.linter.clang_tidy.generate_build_files",
|
||||
]
|
||||
spin.util.run(cmd)
|
||||
|
||||
|
||||
#: These linters are expected to need less than 3s cpu time total
|
||||
VERY_FAST_LINTERS = {
|
||||
"ATEN_CPU_GPU_AGNOSTIC",
|
||||
"BAZEL_LINTER",
|
||||
"C10_NODISCARD",
|
||||
"C10_UNUSED",
|
||||
"CALL_ONCE",
|
||||
"CMAKE_MINIMUM_REQUIRED",
|
||||
"CONTEXT_DECORATOR",
|
||||
"COPYRIGHT",
|
||||
"CUBINCLUDE",
|
||||
"DEPLOY_DETECTION",
|
||||
"ERROR_PRONE_ISINSTANCE",
|
||||
"EXEC",
|
||||
"HEADER_ONLY_LINTER",
|
||||
"IMPORT_LINTER",
|
||||
"INCLUDE",
|
||||
"LINTRUNNER_VERSION",
|
||||
"MERGE_CONFLICTLESS_CSV",
|
||||
"META_NO_CREATE_UNBACKED",
|
||||
"NEWLINE",
|
||||
"NOQA",
|
||||
"NO_WORKFLOWS_ON_FORK",
|
||||
"ONCE_FLAG",
|
||||
"PYBIND11_INCLUDE",
|
||||
"PYBIND11_SPECIALIZATION",
|
||||
"PYPIDEP",
|
||||
"PYPROJECT",
|
||||
"RAWCUDA",
|
||||
"RAWCUDADEVICE",
|
||||
"ROOT_LOGGING",
|
||||
"TABS",
|
||||
"TESTOWNERS",
|
||||
"TYPEIGNORE",
|
||||
"TYPENOSKIP",
|
||||
"WORKFLOWSYNC",
|
||||
}
|
||||
|
||||
|
||||
#: These linters are expected to take a few seconds, but less than 10s cpu time total
|
||||
FAST_LINTERS = {
|
||||
"CMAKE",
|
||||
"DOCSTRING_LINTER",
|
||||
"GHA",
|
||||
"NATIVEFUNCTIONS",
|
||||
"RUFF",
|
||||
"SET_LINTER",
|
||||
"SHELLCHECK",
|
||||
"SPACES",
|
||||
}
|
||||
|
||||
|
||||
#: These linters are expected to take more than 10s cpu time total;
|
||||
#: some need more than 1 hour.
|
||||
SLOW_LINTERS = {
|
||||
"ACTIONLINT",
|
||||
"CLANGFORMAT",
|
||||
"CLANGTIDY",
|
||||
"CODESPELL",
|
||||
"FLAKE8",
|
||||
"GB_REGISTRY",
|
||||
"PYFMT",
|
||||
"PYREFLY",
|
||||
"TEST_DEVICE_BIAS",
|
||||
"TEST_HAS_MAIN",
|
||||
}
|
||||
|
||||
|
||||
ALL_LINTERS = VERY_FAST_LINTERS | FAST_LINTERS | SLOW_LINTERS
|
||||
|
||||
|
||||
LINTRUNNER_CACHE_INFO = (
|
||||
Path(".lintbin/.lintrunner.sha256"),
|
||||
[
|
||||
"requirements.txt",
|
||||
"pyproject.toml",
|
||||
".lintrunner.toml",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
LINTRUNNER_BASE_CMD = [
|
||||
"uvx",
|
||||
"--python",
|
||||
"3.10",
|
||||
"lintrunner@0.12.7",
|
||||
]
|
||||
|
||||
|
||||
@click.command()
|
||||
def setup_lint():
|
||||
"""Set up lintrunner with current CI version."""
|
||||
cmd = LINTRUNNER_BASE_CMD + ["init"]
|
||||
subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||
|
||||
|
||||
def _check_linters():
|
||||
cmd = LINTRUNNER_BASE_CMD + ["list"]
|
||||
ret = spin.util.run(cmd, output=False, stderr=subprocess.PIPE)
|
||||
linters = {l.strip() for l in ret.stdout.decode().strip().split("\n")[1:]}
|
||||
unknown_linters = linters - ALL_LINTERS
|
||||
missing_linters = ALL_LINTERS - linters
|
||||
if unknown_linters:
|
||||
click.secho(
|
||||
f"Unknown linters found; please add them to the correct category "
|
||||
f"in .spin/cmds.py: {', '.join(unknown_linters)}",
|
||||
fg="yellow",
|
||||
)
|
||||
if missing_linters:
|
||||
click.secho(
|
||||
f"Missing linters found; please update the corresponding category "
|
||||
f"in .spin/cmds.py: {', '.join(missing_linters)}",
|
||||
fg="yellow",
|
||||
)
|
||||
return unknown_linters, missing_linters
|
||||
|
||||
|
||||
@spin.util.extend_command(
|
||||
setup_lint,
|
||||
doc=f"""
|
||||
If configuration has changed, update lintrunner.
|
||||
|
||||
Compares the stored old hashes of configuration files with new ones and
|
||||
performs setup via setup-lint if the hashes have changed.
|
||||
Hashes are stored in {LINTRUNNER_CACHE_INFO[0]}; the following files are
|
||||
considered: {", ".join(LINTRUNNER_CACHE_INFO[1])}.
|
||||
""",
|
||||
)
|
||||
@click.pass_context
|
||||
def lazy_setup_lint(ctx, parent_callback, **kwargs):
|
||||
if hashes := _updated_hashes(*LINTRUNNER_CACHE_INFO):
|
||||
click.echo(
|
||||
"Changes detected in lint configuration files. Setting up linting tools..."
|
||||
)
|
||||
parent_callback(**kwargs)
|
||||
hash_file = LINTRUNNER_CACHE_INFO[0]
|
||||
hash_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
with hash_file.open("w") as f:
|
||||
for file, hash in hashes.items():
|
||||
f.write(f"{hash} {file}\n")
|
||||
click.echo("Linting tools set up and hashes updated.")
|
||||
else:
|
||||
click.echo("No changes detected in lint configuration files. Skipping setup.")
|
||||
click.echo("Regenerating version...")
|
||||
ctx.invoke(regenerate_version)
|
||||
click.echo("Regenerating type stubs...")
|
||||
ctx.invoke(regenerate_type_stubs)
|
||||
click.echo("Done.")
|
||||
_check_linters()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("-a", "--apply-patches", is_flag=True)
|
||||
@click.pass_context
|
||||
def lint(ctx, apply_patches, **kwargs):
|
||||
"""Lint all files."""
|
||||
ctx.invoke(lazy_setup_lint)
|
||||
all_files_linters = VERY_FAST_LINTERS | FAST_LINTERS
|
||||
changed_files_linters = SLOW_LINTERS
|
||||
cmd = LINTRUNNER_BASE_CMD
|
||||
if apply_patches:
|
||||
cmd += ["--apply-patches"]
|
||||
all_files_cmd = cmd + [
|
||||
"--take",
|
||||
",".join(all_files_linters),
|
||||
"--all-files",
|
||||
]
|
||||
spin.util.run(all_files_cmd)
|
||||
changed_files_cmd = cmd + [
|
||||
"--take",
|
||||
",".join(changed_files_linters),
|
||||
]
|
||||
spin.util.run(changed_files_cmd)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.pass_context
|
||||
def fixlint(ctx, **kwargs):
|
||||
"""Autofix all files."""
|
||||
ctx.invoke(lint, apply_patches=True)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("-a", "--apply-patches", is_flag=True)
|
||||
@click.pass_context
|
||||
def quicklint(ctx, apply_patches, **kwargs):
|
||||
"""Lint changed files."""
|
||||
ctx.invoke(lazy_setup_lint)
|
||||
cmd = LINTRUNNER_BASE_CMD
|
||||
if apply_patches:
|
||||
cmd += ["--apply-patches"]
|
||||
spin.util.run(cmd)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.pass_context
|
||||
def quickfix(ctx, **kwargs):
|
||||
"""Autofix changed files."""
|
||||
ctx.invoke(quicklint, apply_patches=True)
|
||||
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/headeronly/core/TensorAccessor.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <c10/util/Deprecated.h>
|
||||
@ -11,252 +12,37 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
|
||||
// is used to enable the __restrict__ keyword/modifier for the data
|
||||
// passed to cuda.
|
||||
template <typename T>
|
||||
struct DefaultPtrTraits {
|
||||
typedef T* PtrType;
|
||||
};
|
||||
|
||||
using torch::headeronly::DefaultPtrTraits;
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
template <typename T>
|
||||
struct RestrictPtrTraits {
|
||||
typedef T* __restrict__ PtrType;
|
||||
};
|
||||
using torch::headeronly::RestrictPtrTraits;
|
||||
#endif
|
||||
|
||||
// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
|
||||
// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
|
||||
// to functions and types available there (e.g. IntArrayRef isn't).
|
||||
|
||||
// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
|
||||
class TensorAccessorBase {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;
|
||||
|
||||
C10_HOST_DEVICE TensorAccessorBase(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: data_(data_), sizes_(sizes_), strides_(strides_) {}
|
||||
C10_HOST IntArrayRef sizes() const {
|
||||
return IntArrayRef(sizes_,N);
|
||||
}
|
||||
C10_HOST IntArrayRef strides() const {
|
||||
return IntArrayRef(strides_,N);
|
||||
}
|
||||
C10_HOST_DEVICE index_t stride(index_t i) const {
|
||||
return strides_[i];
|
||||
}
|
||||
C10_HOST_DEVICE index_t size(index_t i) const {
|
||||
return sizes_[i];
|
||||
}
|
||||
C10_HOST_DEVICE PtrType data() {
|
||||
return data_;
|
||||
}
|
||||
C10_HOST_DEVICE const PtrType data() const {
|
||||
return data_;
|
||||
}
|
||||
protected:
|
||||
PtrType data_;
|
||||
const index_t* sizes_;
|
||||
const index_t* strides_;
|
||||
};
|
||||
|
||||
// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
|
||||
// `Tensor.accessor<T, N>()`.
|
||||
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
|
||||
// indexing on the device uses `TensorAccessor`s.
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
|
||||
class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
using TensorAccessor = torch::headeronly::detail::TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;
|
||||
|
||||
C10_HOST_DEVICE TensorAccessor(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}
|
||||
namespace detail {
|
||||
|
||||
C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
|
||||
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
|
||||
}
|
||||
|
||||
C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
|
||||
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, template <typename U> class PtrTraits, typename index_t>
|
||||
class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
|
||||
C10_HOST_DEVICE TensorAccessor(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
|
||||
C10_HOST_DEVICE T & operator[](index_t i) {
|
||||
// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
|
||||
return this->data_[this->strides_[0]*i];
|
||||
}
|
||||
C10_HOST_DEVICE const T & operator[](index_t i) const {
|
||||
return this->data_[this->strides_[0]*i];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host
|
||||
// and as
|
||||
// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
|
||||
// in order to transfer them on the device when calling kernels.
|
||||
// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s.
|
||||
// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
|
||||
// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
|
||||
// on the device, so those functions are host only.
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
|
||||
class GenericPackedTensorAccessorBase {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
C10_HOST GenericPackedTensorAccessorBase(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: data_(data_) {
|
||||
std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
|
||||
std::copy(strides_, strides_ + N, std::begin(this->strides_));
|
||||
}
|
||||
|
||||
// if index_t is not int64_t, we want to have an int64_t constructor
|
||||
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
|
||||
C10_HOST GenericPackedTensorAccessorBase(
|
||||
PtrType data_,
|
||||
const source_index_t* sizes_,
|
||||
const source_index_t* strides_)
|
||||
: data_(data_) {
|
||||
for (const auto i : c10::irange(N)) {
|
||||
this->sizes_[i] = sizes_[i];
|
||||
this->strides_[i] = strides_[i];
|
||||
}
|
||||
}
|
||||
|
||||
C10_HOST_DEVICE index_t stride(index_t i) const {
|
||||
return strides_[i];
|
||||
}
|
||||
C10_HOST_DEVICE index_t size(index_t i) const {
|
||||
return sizes_[i];
|
||||
}
|
||||
C10_HOST_DEVICE PtrType data() {
|
||||
return data_;
|
||||
}
|
||||
C10_HOST_DEVICE const PtrType data() const {
|
||||
return data_;
|
||||
}
|
||||
protected:
|
||||
PtrType data_;
|
||||
// NOLINTNEXTLINE(*c-arrays*)
|
||||
index_t sizes_[N];
|
||||
// NOLINTNEXTLINE(*c-arrays*)
|
||||
index_t strides_[N];
|
||||
C10_HOST void bounds_check_(index_t i) const {
|
||||
TORCH_CHECK_INDEX(
|
||||
template <size_t N, typename index_t>
|
||||
struct IndexBoundsCheck {
|
||||
IndexBoundsCheck(index_t i) {
|
||||
TORCH_CHECK_INDEX(
|
||||
0 <= i && i < index_t{N},
|
||||
"Index ",
|
||||
i,
|
||||
" is not within bounds of a tensor of dimension ",
|
||||
N);
|
||||
}
|
||||
}
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
|
||||
class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
|
||||
C10_HOST GenericPackedTensorAccessor(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
|
||||
|
||||
// if index_t is not int64_t, we want to have an int64_t constructor
|
||||
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
|
||||
C10_HOST GenericPackedTensorAccessor(
|
||||
PtrType data_,
|
||||
const source_index_t* sizes_,
|
||||
const source_index_t* strides_)
|
||||
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
|
||||
|
||||
C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
|
||||
index_t* new_sizes = this->sizes_ + 1;
|
||||
index_t* new_strides = this->strides_ + 1;
|
||||
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
|
||||
}
|
||||
|
||||
C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
|
||||
const index_t* new_sizes = this->sizes_ + 1;
|
||||
const index_t* new_strides = this->strides_ + 1;
|
||||
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
|
||||
}
|
||||
|
||||
/// Returns a PackedTensorAccessor of the same dimension after transposing the
|
||||
/// two dimensions given. Does not actually move elements; transposition is
|
||||
/// made by permuting the size/stride arrays. If the dimensions are not valid,
|
||||
/// asserts.
|
||||
C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
|
||||
index_t dim1,
|
||||
index_t dim2) const {
|
||||
this->bounds_check_(dim1);
|
||||
this->bounds_check_(dim2);
|
||||
GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
|
||||
this->data_, this->sizes_, this->strides_);
|
||||
std::swap(result.strides_[dim1], result.strides_[dim2]);
|
||||
std::swap(result.sizes_[dim1], result.sizes_[dim2]);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T, template <typename U> class PtrTraits, typename index_t>
|
||||
class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
C10_HOST GenericPackedTensorAccessor(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
|
||||
|
||||
// if index_t is not int64_t, we want to have an int64_t constructor
|
||||
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
|
||||
C10_HOST GenericPackedTensorAccessor(
|
||||
PtrType data_,
|
||||
const source_index_t* sizes_,
|
||||
const source_index_t* strides_)
|
||||
: GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
|
||||
|
||||
C10_DEVICE T & operator[](index_t i) {
|
||||
return this->data_[this->strides_[0] * i];
|
||||
}
|
||||
C10_DEVICE const T& operator[](index_t i) const {
|
||||
return this->data_[this->strides_[0]*i];
|
||||
}
|
||||
|
||||
// Same as in the general N-dimensional case, but note that in the
|
||||
// 1-dimensional case the returned PackedTensorAccessor will always be an
|
||||
// identical copy of the original
|
||||
C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
|
||||
index_t dim1,
|
||||
index_t dim2) const {
|
||||
this->bounds_check_(dim1);
|
||||
this->bounds_check_(dim2);
|
||||
return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
|
||||
this->data_, this->sizes_, this->strides_);
|
||||
}
|
||||
};
|
||||
using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase<detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
|
||||
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
|
||||
using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor<TensorAccessor<T, N-1, PtrTraits, index_t>, detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
|
||||
|
||||
// Can't put this directly into the macro function args because of commas
|
||||
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>
|
||||
|
||||
@ -245,6 +245,9 @@ class TORCH_API TensorBase {
|
||||
size_t weak_use_count() const noexcept {
|
||||
return impl_.weak_use_count();
|
||||
}
|
||||
bool is_uniquely_owned() const noexcept {
|
||||
return impl_.is_uniquely_owned();
|
||||
}
|
||||
|
||||
std::string toString() const;
|
||||
|
||||
|
||||
@ -223,6 +223,62 @@ CONVERT_FROM_BF16_TEMPLATE(double)
|
||||
CONVERT_FROM_BF16_TEMPLATE(float16_t)
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
|
||||
// clang-[17, 20] crashes when autovectorizing static cast to bf16
|
||||
// Below is a workaround to have some vectorization
|
||||
// Works decently well for smaller int types
|
||||
template <typename from_type>
|
||||
inline void convertToBf16Impl(
|
||||
const from_type* __restrict src,
|
||||
c10::BFloat16* __restrict dst,
|
||||
uint64_t n) {
|
||||
bfloat16_t* dstPtr = reinterpret_cast<bfloat16_t*>(dst);
|
||||
uint64_t loopBound = n - (n % 16);
|
||||
uint64_t i = 0;
|
||||
for (; i < loopBound; i += 16) {
|
||||
float32x4_t a, b, c, d;
|
||||
a[0] = static_cast<float>(src[i]);
|
||||
a[1] = static_cast<float>(src[i + 1]);
|
||||
a[2] = static_cast<float>(src[i + 2]);
|
||||
a[3] = static_cast<float>(src[i + 3]);
|
||||
b[0] = static_cast<float>(src[i + 4]);
|
||||
b[1] = static_cast<float>(src[i + 5]);
|
||||
b[2] = static_cast<float>(src[i + 6]);
|
||||
b[3] = static_cast<float>(src[i + 7]);
|
||||
c[0] = static_cast<float>(src[i + 8]);
|
||||
c[1] = static_cast<float>(src[i + 9]);
|
||||
c[2] = static_cast<float>(src[i + 10]);
|
||||
c[3] = static_cast<float>(src[i + 11]);
|
||||
d[0] = static_cast<float>(src[i + 12]);
|
||||
d[1] = static_cast<float>(src[i + 13]);
|
||||
d[2] = static_cast<float>(src[i + 14]);
|
||||
d[3] = static_cast<float>(src[i + 15]);
|
||||
|
||||
vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b));
|
||||
vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d));
|
||||
}
|
||||
|
||||
#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
|
||||
for (; i < n; i++) {
|
||||
float a = static_cast<float>(src[i]);
|
||||
dstPtr[i] = vcvth_bf16_f32(a);
|
||||
}
|
||||
}
|
||||
|
||||
#define CONVERT_TO_BF16_TEMPLATE(from_type) \
|
||||
template <> \
|
||||
inline void convert(const from_type* src, c10::BFloat16* dst, int64_t n) { \
|
||||
return convertToBf16Impl<from_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
CONVERT_TO_BF16_TEMPLATE(uint8_t)
|
||||
CONVERT_TO_BF16_TEMPLATE(int8_t)
|
||||
CONVERT_TO_BF16_TEMPLATE(int16_t)
|
||||
CONVERT_TO_BF16_TEMPLATE(int32_t)
|
||||
|
||||
#endif
|
||||
|
||||
inline void convertBoolToBfloat16Impl(
|
||||
const bool* __restrict src,
|
||||
c10::BFloat16* __restrict dst,
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <shared_mutex>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <cusparse.h>
|
||||
@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
|
||||
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
|
||||
|
||||
TORCH_CUDA_CPP_API void clearCublasWorkspaces();
|
||||
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
|
||||
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
|
||||
struct WorkspaceMapWithMutex {
|
||||
std::map<std::tuple<void*, void*>, at::DataPtr> map;
|
||||
std::shared_mutex mutex;
|
||||
};
|
||||
|
||||
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
|
||||
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
|
||||
TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
|
||||
TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
|
||||
TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
|
||||
|
||||
@ -175,17 +175,24 @@ void CUDAGraph::instantiate() {
|
||||
// Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
|
||||
// who prefer not to report error message through these arguments moving forward
|
||||
// (they prefer return value, or errors on api calls internal to the capture)
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
|
||||
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
|
||||
// ROCM appears to fail with HIP error: invalid argument
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM)
|
||||
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
|
||||
#else
|
||||
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
|
||||
#endif
|
||||
//Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
|
||||
//It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
|
||||
} else {
|
||||
#if !defined(USE_ROCM)
|
||||
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
|
||||
graph_,
|
||||
cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
|
||||
#else
|
||||
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
|
||||
graph_,
|
||||
cudaGraphInstantiateFlagAutoFreeOnLaunch));
|
||||
#endif
|
||||
}
|
||||
has_graph_exec_ = true;
|
||||
}
|
||||
|
||||
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
|
||||
// - Comments of @soumith copied from cuDNN handle pool implementation
|
||||
#ifdef NO_CUDNN_DESTROY_HANDLE
|
||||
#else
|
||||
cublasDestroy(handle);
|
||||
cublasDestroy(handle);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle
|
||||
|
||||
} // namespace
|
||||
|
||||
std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
|
||||
static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
|
||||
WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
|
||||
static auto& instance = *new WorkspaceMapWithMutex;
|
||||
return instance;
|
||||
}
|
||||
|
||||
std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
|
||||
static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
|
||||
WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
|
||||
static auto& instance = *new WorkspaceMapWithMutex;
|
||||
return instance;
|
||||
}
|
||||
|
||||
void clearCublasWorkspaces() {
|
||||
cublas_handle_stream_to_workspace().clear();
|
||||
cublaslt_handle_stream_to_workspace().clear();
|
||||
{
|
||||
auto& workspace = cublas_handle_stream_to_workspace();
|
||||
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
workspace.map.clear();
|
||||
}
|
||||
{
|
||||
auto& workspace = cublaslt_handle_stream_to_workspace();
|
||||
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
workspace.map.clear();
|
||||
}
|
||||
}
|
||||
|
||||
size_t parseChosenWorkspaceSize() {
|
||||
@ -233,6 +241,38 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
|
||||
return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
|
||||
}
|
||||
|
||||
void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
|
||||
auto& workspace = cublas_handle_stream_to_workspace();
|
||||
|
||||
size_t workspace_size = getChosenWorkspaceSize();
|
||||
|
||||
// Fast path: check if workspace already exists
|
||||
{
|
||||
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.find(key);
|
||||
if (workspace_it != workspace.map.end()) {
|
||||
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
|
||||
handle, workspace_it->second.get(), workspace_size));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Slow path: allocate workspace outside the lock
|
||||
auto new_workspace = getNewWorkspace();
|
||||
|
||||
// Insert with lock (double-check in case another thread inserted while we
|
||||
// were allocating)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
|
||||
TORCH_CUDABLAS_CHECK(
|
||||
cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
|
||||
}
|
||||
}
|
||||
|
||||
void* getCUDABlasLtWorkspace() {
|
||||
#ifndef USE_ROCM
|
||||
static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
|
||||
@ -241,8 +281,10 @@ void* getCUDABlasLtWorkspace() {
|
||||
auto stream = c10::cuda::getCurrentCUDAStream();
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
|
||||
TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
|
||||
auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
|
||||
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.find(key);
|
||||
TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
#endif
|
||||
@ -250,11 +292,29 @@ void* getCUDABlasLtWorkspace() {
|
||||
auto stream = c10::cuda::getCurrentCUDAStream();
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
|
||||
if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
|
||||
workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
|
||||
|
||||
auto& workspace = cublaslt_handle_stream_to_workspace();
|
||||
|
||||
// Fast path: check if workspace already exists
|
||||
{
|
||||
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.find(key);
|
||||
if (workspace_it != workspace.map.end()) {
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
}
|
||||
|
||||
// Slow path: allocate workspace outside the lock
|
||||
auto new_workspace = getNewCUDABlasLtWorkspace();
|
||||
|
||||
// Insert with lock (double-check in case another thread inserted while we
|
||||
// were allocating)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it =
|
||||
workspace.map.try_emplace(key, std::move(new_workspace)).first;
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
|
||||
cublasHandle_t getCurrentCUDABlasHandle() {
|
||||
@ -298,13 +358,8 @@ cublasHandle_t getCurrentCUDABlasHandle() {
|
||||
// will allocate memory dynamically (even if they're cheap) outside
|
||||
// PyTorch's CUDA caching allocator. It's possible that CCA used up
|
||||
// all the memory and cublas's cudaMallocAsync will return OOM
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
auto workspace_it = cublas_handle_stream_to_workspace().find(key);
|
||||
if (workspace_it == cublas_handle_stream_to_workspace().end()) {
|
||||
workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
|
||||
}
|
||||
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
|
||||
setWorkspaceForHandle(handle, stream);
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
// On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
|
||||
// FP32 data type calculations based on the value of the allow_tf32 flag.
|
||||
|
||||
@ -22,6 +22,7 @@ enum class MacOSVersion : uint32_t {
|
||||
MACOS_VER_15_0_PLUS,
|
||||
MACOS_VER_15_1_PLUS,
|
||||
MACOS_VER_15_2_PLUS,
|
||||
MACOS_VER_26_0_PLUS,
|
||||
};
|
||||
|
||||
//-----------------------------------------------------------------
|
||||
|
||||
@ -65,6 +65,7 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
|
||||
static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
|
||||
static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
|
||||
static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
|
||||
static bool _macos_26_0_plus = is_os_version_at_least(26, 0);
|
||||
|
||||
switch (version) {
|
||||
case MacOSVersion::MACOS_VER_14_4_PLUS:
|
||||
@ -75,6 +76,8 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
|
||||
return _macos_15_1_plus;
|
||||
case MacOSVersion::MACOS_VER_15_2_PLUS:
|
||||
return _macos_15_2_plus;
|
||||
case MacOSVersion::MACOS_VER_26_0_PLUS:
|
||||
return _macos_26_0_plus;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1936,7 +1936,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
|
||||
|
||||
// We order the tensors. t1 will be the larger tensor
|
||||
// We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul)
|
||||
// and tensor1_larger iff tensor2.dim() > tensor1.dim()
|
||||
// and tensor1_larger iff tensor2.dim() > tensor1.dim(9
|
||||
const auto t1 = tensor1_larger ? MaybeOwned<Tensor>::borrowed(tensor1)
|
||||
: MaybeOwned<Tensor>::owned(tensor2.mT());
|
||||
const int64_t dim_t1 = t1->dim();
|
||||
@ -1948,11 +1948,20 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
|
||||
return false;
|
||||
}
|
||||
|
||||
// If we require a gradient, we should fold to minimize backward memory usage - even if this
|
||||
// leads to a copy in forward because is needed in backward,
|
||||
// only time we avoid this strict pre-allocated memory usage (has_out = True)
|
||||
bool requires_grad = tensor1.requires_grad() || tensor2.requires_grad();
|
||||
if (requires_grad && !has_out) {
|
||||
// In this case we *do* incur in an extra copy to avoid creating an unnecessary large tensor in the backward
|
||||
// Suppose we don't fold here. Let t1.shape = [b, m, n] t2.shape = [n, k] like in a transformer
|
||||
// t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded)
|
||||
// The issue appears in the backward.
|
||||
// The output gradient g of this operation would have shape [b, m, k]
|
||||
// The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
|
||||
// Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
|
||||
// of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the
|
||||
// worst case, an OOM
|
||||
bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
|
||||
if (t2_requires_grad && !has_out) {
|
||||
// We should be checking !at::GradMode::is_enabled(), but apparently
|
||||
// this regresses performance in some cases:
|
||||
// https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -1087,7 +1087,8 @@ TORCH_IMPL_FUNC(index_copy_out)
|
||||
result.copy_(self);
|
||||
|
||||
// See Note [Enabling Deterministic Operations]
|
||||
if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
|
||||
if ((result.is_cuda() || result.is_xpu()) &&
|
||||
globalContext().deterministicAlgorithms()) {
|
||||
torch::List<std::optional<Tensor>> indices;
|
||||
indices.resize(dim + 1);
|
||||
indices.set(dim, index);
|
||||
|
||||
@ -904,19 +904,11 @@ Tensor mvlgamma(const Tensor& self, int64_t p) {
|
||||
return args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi<double>) * QUARTER);
|
||||
}
|
||||
|
||||
// since mvlgamma_ has different signature from its
|
||||
// out and functional variant, we explicitly
|
||||
// define it (instead of using structured kernel).
|
||||
Tensor& mvlgamma_(Tensor& self, int64_t p) {
|
||||
mvlgamma_check(self, p);
|
||||
Tensor args = native::arange(
|
||||
-p *HALF + HALF,
|
||||
HALF,
|
||||
HALF,
|
||||
optTypeMetaToScalarType(self.options().dtype_opt()),
|
||||
self.options().layout_opt(),
|
||||
self.options().device_opt(),
|
||||
self.options().pinned_memory_opt());
|
||||
args = args.add(self.unsqueeze(-1));
|
||||
const auto p2_sub_p = static_cast<double>(p * (p - 1));
|
||||
return self.copy_(args.lgamma_().sum(-1).add_(p2_sub_p * std::log(c10::pi<double>) * QUARTER));
|
||||
return at::mvlgamma_out(self, self, p);
|
||||
}
|
||||
|
||||
Tensor& mvlgamma_out(const Tensor& self, int64_t p, Tensor& result) {
|
||||
|
||||
@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
|
||||
bool launchGemmAndBiasCublasLt(
|
||||
// args contains result which is modified
|
||||
cublasCommonArgs& args,
|
||||
const Tensor& self,
|
||||
const std::optional<Tensor>& self,
|
||||
const Scalar& alpha,
|
||||
Activation activation = Activation::None
|
||||
) {
|
||||
@ -304,12 +304,8 @@ bool launchGemmAndBiasCublasLt(
|
||||
// or when it can be squeezed to 1D.
|
||||
// self_ptr == nullptr implies ignore bias epilogue
|
||||
// and use standard gemm-like API.
|
||||
const auto* self_ptr = [&]() -> auto {
|
||||
if (self.dim() == 1 || self.squeeze().dim() == 1) {
|
||||
return self.const_data_ptr<scalar_t>();
|
||||
}
|
||||
return static_cast<const scalar_t*>(nullptr);
|
||||
}();
|
||||
const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
|
||||
|
||||
|
||||
const auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
@ -392,35 +388,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
|
||||
#ifdef USE_ROCM
|
||||
// Conditioned on the device index, which is not persistent
|
||||
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
|
||||
disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
|
||||
#endif
|
||||
// Condition on the input
|
||||
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
|
||||
// }
|
||||
disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
|
||||
|
||||
at::ScalarType scalar_type = mat1.scalar_type();
|
||||
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
|
||||
|
||||
#ifdef USE_ROCM
|
||||
disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
|
||||
#endif
|
||||
|
||||
bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
|
||||
// for float output with half input cublasLT with bias produces wrong results
|
||||
use_bias_ptr_lt &= !is_float_output_with_half_input;
|
||||
|
||||
// Handle result/self shapes
|
||||
if (!result.is_same(self)) {
|
||||
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});
|
||||
|
||||
// We use bias ptr in the Lt path only when bias is 1D
|
||||
const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
|
||||
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
|
||||
if (!use_bias_ptr_lt) {
|
||||
// We do expand self even before
|
||||
// check for beta != 0.0 to make sure that
|
||||
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
|
||||
// runs green.
|
||||
return expand_size(self, result.sizes(), "addmm");
|
||||
}
|
||||
return c10::MaybeOwned<Tensor>::borrowed(self);
|
||||
}();
|
||||
// We do not copy bias only when we need the bias ptr
|
||||
// We do not copy bias only when we need the bias ptr
|
||||
if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
|
||||
// NOTE: self should broadcast over result
|
||||
at::native::copy_(result, *self_maybe_expanded);
|
||||
at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
|
||||
}
|
||||
}
|
||||
|
||||
@ -468,7 +459,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
|
||||
}
|
||||
);
|
||||
#endif
|
||||
@ -480,7 +471,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
|
||||
}
|
||||
);
|
||||
} // end is_float_output_with_half_input
|
||||
@ -936,7 +927,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
|
||||
return _int_mm_out_cuda(self, mat2, result);
|
||||
}
|
||||
|
||||
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
|
||||
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
|
||||
// ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
|
||||
TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
|
||||
TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
|
||||
@ -960,7 +951,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
|
||||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
if (!is_bmm && self_baddbmm.has_value()) {
|
||||
if (self_baddbmm.has_value()) {
|
||||
const auto& self = self_baddbmm.value();
|
||||
TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
|
||||
TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
|
||||
@ -968,15 +959,12 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
|
||||
}
|
||||
|
||||
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
|
||||
IntArrayRef batch1_sizes = batch1.sizes();
|
||||
IntArrayRef batch2_sizes = batch2.sizes();
|
||||
|
||||
Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
|
||||
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
|
||||
return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
|
||||
}
|
||||
|
||||
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
|
||||
Scalar beta(0.0);
|
||||
Scalar alpha(1.0);
|
||||
{
|
||||
@ -988,14 +976,16 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
|
||||
}
|
||||
|
||||
Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
|
||||
// We need to copy the tensor
|
||||
Tensor out = self.clone().to(self.options().dtype(out_dtype));
|
||||
|
||||
return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
|
||||
TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
|
||||
"self dtype must match either out_dtype or batch1 dtype");
|
||||
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
|
||||
return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
|
||||
}
|
||||
|
||||
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
|
||||
// We need to copy the tensor
|
||||
out.copy_(self);
|
||||
{
|
||||
NoNamesGuard guard;
|
||||
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
|
||||
@ -1030,24 +1020,27 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
|
||||
}
|
||||
|
||||
Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
|
||||
Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
|
||||
Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
|
||||
return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
|
||||
}
|
||||
|
||||
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
|
||||
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
|
||||
// repeat dimensionality checks for direct calls to `out` overload
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
|
||||
TORCH_CHECK(
|
||||
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
|
||||
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
|
||||
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(out_dtype == mat1.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
TORCH_CHECK(out_dtype == self.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
|
||||
"self dtype must match either out_dtype or mat1 dtype");
|
||||
|
||||
addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);
|
||||
|
||||
|
||||
@ -78,9 +78,18 @@ __global__ void EmbeddingBag_updateOutputKernel_max(
|
||||
scalar_t weightFeatMax = 0;
|
||||
int64_t bag_size_ = 0;
|
||||
int64_t maxWord = -1;
|
||||
|
||||
// Separate validation loop reduces register pressure in the main loop below.
|
||||
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
|
||||
bool has_invalid_index = false;
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
index_t input_idx = input[emb];
|
||||
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
|
||||
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
bool pad = (input[emb] == padding_idx);
|
||||
CUDA_KERNEL_ASSERT(input[emb] < numRows);
|
||||
const int64_t weightRow = input[emb] * weight_stride0;
|
||||
scalar_t weightValue = weightFeat[weightRow];
|
||||
if (bag_size_ == 0 || weightValue > weightFeatMax) {
|
||||
@ -129,10 +138,19 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
|
||||
CUDA_KERNEL_ASSERT(end >= begin);
|
||||
accscalar_t weightFeatSum = 0;
|
||||
int64_t bag_size_ = 0;
|
||||
|
||||
// Separate validation loop reduces register pressure in the main loop below.
|
||||
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
|
||||
bool has_invalid_index = false;
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
index_t input_idx = input[emb];
|
||||
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
|
||||
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
index_t input_idx = input[emb];
|
||||
bool pad = (input_idx == padding_idx);
|
||||
CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
|
||||
const int64_t weightRow = input_idx * weight_stride0;
|
||||
scalar_t weightValue = weightFeat[weightRow];
|
||||
weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
|
||||
|
||||
@ -78,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const SwizzleType& swizzle_a,
|
||||
const SwizzleType swizzle_a,
|
||||
const Tensor& scale_b,
|
||||
const SwizzleType& swizzle_b,
|
||||
const SwizzleType swizzle_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
Tensor& out) {
|
||||
const bool a_is_2d = mat_a.dim() == 2;
|
||||
|
||||
@ -5,69 +5,11 @@
|
||||
#include <cuda_bf16.h>
|
||||
#endif
|
||||
|
||||
// ROCm 6.3 is planned to have these functions, but until then here they are.
|
||||
#if defined(USE_ROCM)
|
||||
#include <device_functions.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <hip/hip_bf16.h>
|
||||
|
||||
__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
|
||||
#if (defined(__gfx942__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
|
||||
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
|
||||
static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
|
||||
union {
|
||||
__hip_bfloat162_raw bf162_raw;
|
||||
vec_short2 vs2;
|
||||
} u{static_cast<__hip_bfloat162_raw>(value)};
|
||||
u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
|
||||
return static_cast<__hip_bfloat162>(u.bf162_raw);
|
||||
#else
|
||||
static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
|
||||
union u_hold {
|
||||
__hip_bfloat162_raw h2r;
|
||||
unsigned int u32;
|
||||
};
|
||||
u_hold old_val, new_val;
|
||||
old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
do {
|
||||
new_val.h2r = __hadd2(old_val.h2r, value);
|
||||
} while (!__hip_atomic_compare_exchange_strong(
|
||||
(unsigned int*)address, &old_val.u32, new_val.u32,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
|
||||
return old_val.h2r;
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
|
||||
#if (defined(__gfx942__)) && \
|
||||
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
|
||||
// The api expects an ext_vector_type of half
|
||||
typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
|
||||
static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
|
||||
union {
|
||||
__half2_raw h2r;
|
||||
vec_fp162 fp16;
|
||||
} u {static_cast<__half2_raw>(value)};
|
||||
u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
|
||||
return static_cast<__half2>(u.h2r);
|
||||
#else
|
||||
static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
|
||||
union u_hold {
|
||||
__half2_raw h2r;
|
||||
unsigned int u32;
|
||||
};
|
||||
u_hold old_val, new_val;
|
||||
old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
|
||||
do {
|
||||
new_val.h2r = __hadd2(old_val.h2r, value);
|
||||
} while (!__hip_atomic_compare_exchange_strong(
|
||||
(unsigned int*)address, &old_val.u32, new_val.u32,
|
||||
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
|
||||
return old_val.h2r;
|
||||
#endif
|
||||
}
|
||||
#define ATOMICADD preview_unsafeAtomicAdd
|
||||
#define ATOMICADD unsafeAtomicAdd
|
||||
#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
|
||||
#else
|
||||
#define ATOMICADD atomicAdd
|
||||
|
||||
@ -2,18 +2,250 @@
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
#include <ATen/native/cuda/Loops.cuh>
|
||||
#include <ATen/native/cuda/JitLoops.cuh>
|
||||
#include <ATen/native/cuda/jit_utils.h>
|
||||
#include <ATen/native/cuda/ScanUtils.cuh>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
#include <ATen/native/BinaryOps.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <c10/util/MathConstants.h>
|
||||
#include <c10/util/complex.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
// NOTE: CUDA on Windows requires that the enclosing function
|
||||
// of a __device__ lambda not have internal linkage.
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// custom min and max to be used in logaddexp for complex arguments
|
||||
template <typename scalar_t, bool min>
|
||||
__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
|
||||
scalar_t xr = std::real(x);
|
||||
scalar_t yr = std::real(y);
|
||||
if (::isnan(yr) || (::isnan(std::imag(y)))) {
|
||||
return y;
|
||||
} else if (::isnan(xr) || (::isnan(std::imag(x)))) {
|
||||
return x;
|
||||
} else if (min) { // min
|
||||
return (xr < yr) ? x : y;
|
||||
} else { // max
|
||||
return (xr >= yr) ? x : y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
|
||||
// Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
|
||||
// Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
|
||||
const auto isnan_x = at::_isnan(x);
|
||||
const auto isnan_y = at::_isnan(y);
|
||||
scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
|
||||
scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
|
||||
if (min != max || ::isfinite(min)) {
|
||||
// nan will be propagated here
|
||||
return ::log1p(std::exp(min - max)) + max;
|
||||
} else {
|
||||
// special case to correctly handle infinite cases
|
||||
return x;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
|
||||
// complex exponential function, but implemented manually to get fast compilation time
|
||||
// this function only handles the case where the x is finite (not inf nor nan)
|
||||
const auto xreal = std::real(x);
|
||||
const auto ximag = std::imag(x);
|
||||
const auto exp_x_abs = std::exp(xreal);
|
||||
auto exp_x_real = exp_x_abs * std::cos(ximag);
|
||||
auto exp_x_imag = exp_x_abs * std::sin(ximag);
|
||||
return {exp_x_real, exp_x_imag};
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
|
||||
// complex exponential function, but implemented manually to get fast compilation time
|
||||
// this function only handles the case where the real part of x is infinite
|
||||
const auto ximag = std::imag(x);
|
||||
constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
|
||||
if (!::isfinite(ximag)) { // add this to make consitent with std::exp(x+yi)
|
||||
return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
|
||||
}
|
||||
const auto sin = std::sin(ximag);
|
||||
const auto cos = std::cos(ximag);
|
||||
// special case if the angle is exactly the multiple of pi/2
|
||||
auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
|
||||
auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
|
||||
return {exp_x_real, exp_x_imag};
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
|
||||
c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
|
||||
c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
|
||||
scalar_t min_real = std::real(min);
|
||||
scalar_t max_real = std::real(max);
|
||||
|
||||
if (::isnan(min_real) || ::isnan(std::imag(min))) {
|
||||
// handling the "infectious" NaNs
|
||||
return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
|
||||
}
|
||||
else if ((!::isfinite(min_real)) && (min_real == max_real)) {
|
||||
if (min_real < 0) {
|
||||
// handle the -inf case, the imaginary part here does not really matter as the exp(value)
|
||||
// will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
|
||||
// It does not matter if we're taking the exp of this value
|
||||
return min;
|
||||
} else {
|
||||
// handle the +inf case, we don't need the special precision for log1p for small values
|
||||
// and to avoid producing nan in case of real(max) == real(min) == +inf
|
||||
const auto exp_min = _fast_build_exp_inf(min);
|
||||
const auto exp_max = _fast_build_exp_inf(max);
|
||||
return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log
|
||||
}
|
||||
} else {
|
||||
const auto minmax = min - max;
|
||||
c10::complex<scalar_t> exp_minmax;
|
||||
if (!::isfinite(minmax.real())) {
|
||||
exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
|
||||
} else {
|
||||
exp_minmax = _fast_build_exp(minmax);
|
||||
}
|
||||
return ::log1p(exp_minmax) + max;
|
||||
}
|
||||
}
|
||||
|
||||
// Complex logaddexp jiterator string
|
||||
const auto logaddexp_complex_string = jiterator_stringify(
|
||||
template<typename T>
|
||||
std::complex<T> log1p(const std::complex<T>& z)
|
||||
{
|
||||
using complex_t = std::complex<T>;
|
||||
T x = z.real();
|
||||
T y = z.imag();
|
||||
T zabs = abs(z);
|
||||
T theta = atan2(y, x + T(1));
|
||||
if (zabs < 0.5) {
|
||||
T r = x * (T(2) + x) + y * y;
|
||||
if (r == 0) { // handle underflow
|
||||
return complex_t(x, theta);
|
||||
}
|
||||
return complex_t(T(0.5) * std::log1p(r), theta);
|
||||
} else {
|
||||
T z0 = std::hypot(x + 1, y);
|
||||
return complex_t(log(z0), theta);
|
||||
}
|
||||
}
|
||||
|
||||
// separated _logaddexp_minmax into 2 different functions for jiterator_string
|
||||
template <typename T>
|
||||
std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
|
||||
T xr = x.real();
|
||||
T yr = y.real();
|
||||
if (isnan(yr) || isnan(y.imag())) {
|
||||
return y;
|
||||
} else if (isnan(xr) || isnan(x.imag())) {
|
||||
return x;
|
||||
} else {
|
||||
return (xr < yr) ? x : y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
|
||||
T xr = x.real();
|
||||
T yr = y.real();
|
||||
if (isnan(yr) || isnan(y.imag())) {
|
||||
return y;
|
||||
} else if (isnan(xr) || isnan(x.imag())) {
|
||||
return x;
|
||||
} else {
|
||||
return (xr >= yr) ? x : y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::complex<T> fast_build_exp(const std::complex<T>& x) {
|
||||
const auto xreal = x.real();
|
||||
const auto ximag = x.imag();
|
||||
const auto exp_x_abs = exp(xreal);
|
||||
auto exp_x_real = exp_x_abs * cos(ximag);
|
||||
auto exp_x_imag = exp_x_abs * sin(ximag);
|
||||
return std::complex<T>(exp_x_real, exp_x_imag);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
|
||||
using complex_t = std::complex<T>;
|
||||
const auto ximag = x.imag();
|
||||
const T exp_x_abs = INFINITY;
|
||||
if (!isfinite(ximag)) {
|
||||
return complex_t(exp_x_abs, NAN);
|
||||
}
|
||||
const auto sin_val = sin(ximag);
|
||||
const auto cos_val = cos(ximag);
|
||||
auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
|
||||
auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
|
||||
return complex_t(exp_x_real, exp_x_imag);
|
||||
}
|
||||
|
||||
template <typename complex_t>
|
||||
complex_t logaddexp_complex(complex_t x, complex_t y) {
|
||||
using T = typename complex_t::value_type;
|
||||
complex_t min_val = logaddexp_min(x, y);
|
||||
complex_t max_val = logaddexp_max(x, y);
|
||||
T min_real = min_val.real();
|
||||
T max_real = max_val.real();
|
||||
|
||||
if (isnan(min_real) || isnan(min_val.imag())) {
|
||||
return complex_t(NAN, NAN);
|
||||
}
|
||||
else if ((!isfinite(min_real)) && (min_real == max_real)) {
|
||||
if (min_real < T(0)) {
|
||||
return min_val;
|
||||
} else {
|
||||
const auto exp_min = fast_build_exp_inf<T>(min_val);
|
||||
const auto exp_max = fast_build_exp_inf<T>(max_val);
|
||||
return log1p(exp_min + exp_max - complex_t(1, 0));
|
||||
}
|
||||
} else {
|
||||
const auto minmax = min_val - max_val;
|
||||
complex_t exp_minmax;
|
||||
if (!isfinite(minmax.real())) {
|
||||
exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
|
||||
} else {
|
||||
exp_minmax = fast_build_exp<T>(minmax);
|
||||
}
|
||||
return log1p(exp_minmax) + max_val;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
constexpr char logaddexp_complex_name[] = "logaddexp_complex";
|
||||
void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
if (at::isComplexType(iter.dtype())) {
|
||||
#if AT_USE_JITERATOR()
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
|
||||
jitted_gpu_kernel<
|
||||
/*name=*/logaddexp_complex_name,
|
||||
/*return_dtype=*/scalar_t,
|
||||
/*common_dtype=*/scalar_t,
|
||||
/*arity=*/2>(iter, logaddexp_complex_string);
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
|
||||
const auto a = static_cast<opmath_t>(a_);
|
||||
const auto b = static_cast<opmath_t>(b_);
|
||||
return static_cast<scalar_t>(_log_add_exp_helper(a, b));
|
||||
});
|
||||
});
|
||||
#endif
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
ScalarType::BFloat16, ScalarType::Half,
|
||||
iter.dtype(), "logaddexp_cuda",
|
||||
[&]() {
|
||||
@ -29,6 +261,7 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
|
||||
|
||||
@ -740,7 +740,12 @@ _scaled_rowwise_rowwise(
|
||||
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
|
||||
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
|
||||
// if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.stride(1) == 1 ||
|
||||
scale_a.size(1) == 1,
|
||||
"expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)
|
||||
);
|
||||
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::RowWise;
|
||||
@ -1096,6 +1101,19 @@ _scaled_mxfp8_mxfp8(
|
||||
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
}
|
||||
|
||||
void
|
||||
_check_mxfp4_support() {
|
||||
#ifndef USE_ROCM
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
// Only on B200 GPUs
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
// B200 = 10.0, B300 = 10.3
|
||||
dprops->major == 10,
|
||||
"MXFP4 scaling only supported in CUDA for B200/B300"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
Tensor&
|
||||
_scaled_mxfp4_mxfp4(
|
||||
@ -1108,6 +1126,7 @@ _scaled_mxfp4_mxfp4(
|
||||
#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
|
||||
#else
|
||||
_check_mxfp4_support();
|
||||
// Restrictions:
|
||||
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
|
||||
|
||||
342
aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
Normal file
342
aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
Normal file
@ -0,0 +1,342 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/WrapDimUtilsMulti.h>
|
||||
#include <ATen/ceil_div.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
|
||||
#include <ATen/native/xpu/Blas.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_addmm_activation_native.h>
|
||||
#include <ATen/ops/_efficientzerotensor.h>
|
||||
#include <ATen/ops/_scaled_mm_native.h>
|
||||
#include <ATen/ops/_unsafe_view_native.h>
|
||||
#include <ATen/ops/abs.h>
|
||||
#include <ATen/ops/addmm_native.h>
|
||||
#include <ATen/ops/addmv_native.h>
|
||||
#include <ATen/ops/baddbmm_native.h>
|
||||
#include <ATen/ops/bmm_native.h>
|
||||
#include <ATen/ops/copy_native.h>
|
||||
#include <ATen/ops/dot_native.h>
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_strided.h>
|
||||
#include <ATen/ops/gelu.h>
|
||||
#include <ATen/ops/max.h>
|
||||
#include <ATen/ops/mm_native.h>
|
||||
#include <ATen/ops/mul.h>
|
||||
#include <ATen/ops/ones.h>
|
||||
#include <ATen/ops/relu.h>
|
||||
#include <ATen/ops/scalar_tensor_native.h>
|
||||
#include <ATen/ops/vdot_native.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::blas::SwizzleType;
|
||||
|
||||
namespace {
|
||||
/*
|
||||
* Scaling Type Determination:
|
||||
* ---------------------------
|
||||
* Conditions and corresponding Scaling Types:
|
||||
*
|
||||
* - If scale tensor is `Float8_e8m0fnu` or `Float8_e4m3fn`:
|
||||
* - Returns BlockWise (with additional size checks).
|
||||
*
|
||||
* - Else if scale.numel() == 1:
|
||||
* - Returns TensorWise.
|
||||
*
|
||||
* - Else if scale.dim() == 2 && scale.size(0) == outer_dim && scale.size(1) ==
|
||||
* 1:
|
||||
* - Returns RowWise.
|
||||
*
|
||||
* - Otherwise:
|
||||
* - Returns Error.
|
||||
*/
|
||||
|
||||
bool is_tensorwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
return at::isFloat8Type(t.scalar_type()) &&
|
||||
scale.scalar_type() == at::kFloat && scale.numel() == 1;
|
||||
}
|
||||
|
||||
bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
return (
|
||||
at::isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat &&
|
||||
scale.dim() == 2 && scale.size(0) == t.size(0) && scale.size(1) == 1 &&
|
||||
scale.is_contiguous());
|
||||
}
|
||||
|
||||
bool is_desired_scaling(
|
||||
const at::Tensor& t,
|
||||
const at::Tensor& scale,
|
||||
ScalingType desired_scaling) {
|
||||
auto result = desired_scaling == ScalingType::TensorWise
|
||||
? is_tensorwise_scaling(t, scale)
|
||||
: is_rowwise_scaling(t, scale);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::pair<ScalingType, ScalingType> get_joint_scaling(
|
||||
std::initializer_list<std::pair<ScalingType, ScalingType>> options,
|
||||
const at::Tensor& a,
|
||||
const at::Tensor& b,
|
||||
const at::Tensor& scale_a,
|
||||
const at::Tensor& scale_b) {
|
||||
for (auto [lhs, rhs] : options) {
|
||||
if (is_desired_scaling(a, scale_a, lhs) &&
|
||||
is_desired_scaling(b.t(), scale_b.t(), rhs)) {
|
||||
return {lhs, rhs};
|
||||
}
|
||||
}
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Invalid scaling configuration.\n"
|
||||
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
|
||||
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
|
||||
a.size(0),
|
||||
", 1) and scale_b should be (1, ",
|
||||
b.size(1),
|
||||
"), and both should be contiguous.\n"
|
||||
"Got a.dtype()=",
|
||||
a.scalar_type(),
|
||||
", scale_a.dtype()=",
|
||||
scale_a.scalar_type(),
|
||||
", scale_a.size()=",
|
||||
scale_a.sizes(),
|
||||
", scale_a.stride()=",
|
||||
scale_a.strides(),
|
||||
", ",
|
||||
"b.dtype()=",
|
||||
b.scalar_type(),
|
||||
", scale_b.dtype()=",
|
||||
scale_b.scalar_type(),
|
||||
", scale_b.size()=",
|
||||
scale_b.sizes(),
|
||||
" and scale_b.stride()=",
|
||||
scale_b.strides());
|
||||
}
|
||||
|
||||
Tensor& _scaled_gemm(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const ScalingType scaling_choice_a,
|
||||
const ScalingType scaling_choice_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out,
|
||||
const std::optional<Tensor>& alpha = std::nullopt) {
|
||||
// TODO: scale_result and alpha is not defined or used!
|
||||
std::optional<Tensor> scaled_result = std::nullopt;
|
||||
at::native::onednn::scaled_matmul(
|
||||
mat1,
|
||||
mat2,
|
||||
out,
|
||||
scale_a,
|
||||
scale_b,
|
||||
scaling_choice_a,
|
||||
scaling_choice_b,
|
||||
bias,
|
||||
scaled_result,
|
||||
use_fast_accum);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Computes matrix multiply + bias while applying scaling to input and output
|
||||
// matrices Scales are only applicable when matrices are of Float8 type and
|
||||
// assumed to be equal to 1.0 by default. If output matrix type is 16 or 32-bit
|
||||
// type, scale_result is not applied. Known limitations:
|
||||
// - Only works if mat1 is row-major and mat2 is column-major
|
||||
// - Only works if matrices sizes are divisible by 32
|
||||
// - If 1-dimensional tensors are used then scale_a should be size =
|
||||
// mat1.size(0)
|
||||
// and scale_b should have size = to mat2.size(1)
|
||||
// Arguments:
|
||||
// - `mat1`: the first operand of the matrix multiply, can be type
|
||||
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `mat2`: the second operand of the matrix multiply, can be type
|
||||
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
|
||||
// - `out_dtype`: the output dtype, can either be a float8 or a higher
|
||||
// precision floating point type
|
||||
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose
|
||||
// shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose
|
||||
// shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_result`: a scalar tensor with the scale of the output, only
|
||||
// utilized if the output is a float8 type
|
||||
// - `use_fast_accum`: Not applicable for XPU. For now, it should always be
|
||||
// false.
|
||||
// - `out`: a reference to the output tensor
|
||||
|
||||
Tensor& _scaled_mm_out_xpu(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
std::optional<c10::ScalarType> out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Note: fast_accum is not supported in XPU for now.
|
||||
TORCH_CHECK(!use_fast_accum, "fast_accum is not supported in XPU for now.");
|
||||
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
|
||||
|
||||
TORCH_CHECK(
|
||||
mat1.sizes()[1] == mat2.sizes()[0],
|
||||
"mat1 and mat2 shapes cannot be multiplied (",
|
||||
mat1.sizes()[0],
|
||||
"x",
|
||||
mat1.sizes()[1],
|
||||
" and ",
|
||||
mat2.sizes()[0],
|
||||
"x",
|
||||
mat2.sizes()[1],
|
||||
")");
|
||||
|
||||
// Check what type of scaling we are doing based on inputs. This list is
|
||||
// sorted by decreasing priority.
|
||||
|
||||
// List of supported datatypes for XPU with oneDNN:
|
||||
// https://uxlfoundation.github.io/oneDNN/dev_guide_matmul.html#data-types
|
||||
auto [scaling_choice_a, scaling_choice_b] = get_joint_scaling(
|
||||
{
|
||||
std::make_pair(ScalingType::TensorWise, ScalingType::TensorWise),
|
||||
std::make_pair(ScalingType::RowWise, ScalingType::RowWise),
|
||||
},
|
||||
mat1,
|
||||
mat2,
|
||||
scale_a,
|
||||
scale_b);
|
||||
TORCH_CHECK(
|
||||
!scale_result ||
|
||||
(scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
|
||||
"scale_result must be a float scalar");
|
||||
TORCH_CHECK(
|
||||
!bias || bias->numel() == mat2.sizes()[1],
|
||||
"Bias must be size ",
|
||||
mat2.sizes()[1],
|
||||
" but got ",
|
||||
bias->numel());
|
||||
TORCH_CHECK(
|
||||
mat1.sizes()[1] % 16 == 0,
|
||||
"Expected trailing dimension of mat1 to be divisible by 16 ",
|
||||
"but got mat1 shape: (",
|
||||
mat1.sizes()[0],
|
||||
"x",
|
||||
mat1.sizes()[1],
|
||||
").");
|
||||
TORCH_CHECK(
|
||||
mat2.sizes()[0] % 16 == 0 && mat2.sizes()[1] % 16 == 0,
|
||||
"mat2 shape (",
|
||||
mat2.sizes()[0],
|
||||
"x",
|
||||
mat2.sizes()[1],
|
||||
") must be divisible by 16");
|
||||
// Check types
|
||||
TORCH_CHECK(
|
||||
!out_dtype || *out_dtype == out.scalar_type(),
|
||||
"out_dtype must match output matrix type");
|
||||
TORCH_CHECK(
|
||||
at::isFloat8Type(mat1.scalar_type()),
|
||||
"Expected mat1 to be Float8 matrix got ",
|
||||
mat1.scalar_type());
|
||||
TORCH_CHECK(
|
||||
at::isFloat8Type(mat2.scalar_type()),
|
||||
"Expected mat2 to be Float8 matrix got ",
|
||||
mat2.scalar_type());
|
||||
// TODO: oneDNN Currently only supports e4m3 with group scales on BMG. Not
|
||||
// support 2D scales, only 1D. Needs to add more checks there.
|
||||
|
||||
if (bias) {
|
||||
TORCH_CHECK(
|
||||
bias->scalar_type() == kFloat ||
|
||||
bias->scalar_type() == c10::ScalarType::BFloat16 ||
|
||||
bias->scalar_type() == c10::ScalarType::Half,
|
||||
"Bias must be Float32 or BFloat16 or Half, but got ",
|
||||
bias->scalar_type());
|
||||
}
|
||||
|
||||
{
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
auto scale_result_ = scale_result.value_or(Tensor());
|
||||
|
||||
// NOLINTNEXTLINE(*c-array*)
|
||||
TensorArg targs[]{
|
||||
{out, "out", 0},
|
||||
{mat1, "mat1", 1},
|
||||
{mat2, "mat2", 2},
|
||||
{bias_, "bias", 3},
|
||||
{scale_a, "scale_a", 4},
|
||||
{scale_b, "scale_b", 5},
|
||||
{scale_result_, "scale_result", 6}};
|
||||
checkAllSameGPU(__func__, targs);
|
||||
}
|
||||
|
||||
// Validation checks have passed lets resize the output to actual size
|
||||
IntArrayRef mat1_sizes = mat1.sizes();
|
||||
IntArrayRef mat2_sizes = mat2.sizes();
|
||||
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
|
||||
|
||||
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
|
||||
// kernels do not support this case).
|
||||
if (mat1_sizes[0] == 0 || mat1_sizes[1] == 0 || mat2_sizes[1] == 0) {
|
||||
// `out` was created with `at::empty`. In the case where we are multiplying
|
||||
// MxK by KxN and K is the zero dim, we need to initialize here to properly
|
||||
// return a tensor of zeros.
|
||||
if (mat1_sizes[1] == 0) {
|
||||
out.zero_();
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// TODO: Scale_result is not supported by now!!
|
||||
return _scaled_gemm(
|
||||
mat1,
|
||||
mat2,
|
||||
scale_a,
|
||||
scale_b,
|
||||
scaling_choice_a,
|
||||
scaling_choice_b,
|
||||
bias,
|
||||
use_fast_accum,
|
||||
out);
|
||||
}
|
||||
|
||||
Tensor _scaled_mm_xpu(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
std::optional<c10::ScalarType> out_dtype,
|
||||
bool use_fast_accum) {
|
||||
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
|
||||
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
|
||||
return _scaled_mm_out_xpu(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
bias,
|
||||
scale_result,
|
||||
out_dtype,
|
||||
use_fast_accum,
|
||||
out);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
@ -1,3 +1,4 @@
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
@ -8,7 +9,6 @@
|
||||
#include <oneapi/dnnl/dnnl.hpp>
|
||||
|
||||
namespace at::native::onednn {
|
||||
|
||||
at::Tensor broadcast_bias2D(
|
||||
at::Tensor& dst,
|
||||
at::Tensor& bias,
|
||||
@ -328,4 +328,236 @@ void quantized_matmul(
|
||||
result.copy_(dst);
|
||||
}
|
||||
|
||||
// Describes how to configure oneDNN scales for a given role/ScalingType
|
||||
struct ScaleSpec {
|
||||
// specifies the way scale values will be applied to an ARG tensor.
|
||||
int mask;
|
||||
// specifies how scales are grouped along dimensions where
|
||||
// multiple scale factors are used.
|
||||
dnnl::memory::dims groups;
|
||||
// specifies data type for scale factors.
|
||||
dnnl::memory::data_type dtype;
|
||||
|
||||
// Helper to compute expected number of elements for scale tensors
|
||||
// arg_type: "src" for SRC (groups pattern {1, X}),
|
||||
// "wei" for WEIGHTS (groups pattern {X, 1})
|
||||
int64_t expected_numel(
|
||||
int64_t outer_dim,
|
||||
int64_t inner_dim,
|
||||
const std::string& arg_type) const {
|
||||
if (groups == dnnl::memory::dims{1, 1})
|
||||
return 1; // tensorwise scaling
|
||||
|
||||
TORCH_CHECK(
|
||||
arg_type == "src" || arg_type == "wei",
|
||||
"Expected arg_type to be 'src' or 'wei', but got '",
|
||||
arg_type,
|
||||
"'");
|
||||
|
||||
// For rowwise: SRC groups={1, K}, WEI groups={K, 1}
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
(groups == dnnl::memory::dims{1, inner_dim} ||
|
||||
groups == dnnl::memory::dims{inner_dim, 1}),
|
||||
"The groups must be either {1, inner_dim} or {inner_dim, 1}. But got ",
|
||||
groups,
|
||||
".");
|
||||
return outer_dim;
|
||||
}
|
||||
|
||||
// Normalize an incoming scale tensor to contiguous storage and appropriate
|
||||
// dtype/view
|
||||
at::Tensor normalize(const at::Tensor& scale) const {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
dtype == dnnl::memory::data_type::f32,
|
||||
"tensor scale currently must be f32, but got scale dtype: ",
|
||||
scale.scalar_type());
|
||||
return scale.to(at::kFloat).contiguous();
|
||||
}
|
||||
};
|
||||
|
||||
// This function defines how to set scales mask and groups according to:
|
||||
// https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/doc/knobs_attr.md#--attr-scales
|
||||
// The returned value will be used in
|
||||
// `set_scales(arg, mask, groups, data_type)`.
|
||||
inline ScaleSpec make_scale_spec(
|
||||
at::blas::ScalingType scaling_type,
|
||||
int64_t M,
|
||||
int64_t K,
|
||||
int64_t N,
|
||||
const std::string& arg_type) {
|
||||
TORCH_CHECK(
|
||||
arg_type == "src" || arg_type == "wei",
|
||||
"Expected arg_type to be 'src' or 'wei', but got '",
|
||||
arg_type,
|
||||
"'");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
(scaling_type == at::blas::ScalingType::TensorWise ||
|
||||
scaling_type == at::blas::ScalingType::RowWise),
|
||||
"Currently only support scaling_type for TensorWise or RowWise");
|
||||
int64_t dim = K; // Currently only K is used for grouping
|
||||
bool is_src = (arg_type == "src");
|
||||
if (scaling_type == at::blas::ScalingType::TensorWise) {
|
||||
// Scale tensorwise. The same as `--attr-scales=common`.
|
||||
// mask=0 : scale whole tensor
|
||||
// groups={1, 1}: indicates that there is only one group for scaling
|
||||
return {0, {1, 1}, dnnl::memory::data_type::f32};
|
||||
} else {
|
||||
// (scaling_type == at::blas::ScalingType::RowWise)
|
||||
// Scale RowWise. The same as `--attr-scales=per_dim_01`.
|
||||
// mask={(1 << 0) | (1 << 1)}: Scale on both dim0 and dim1
|
||||
// SRC: groups={1, K}, WEIGHTS: groups={K, 1}
|
||||
return {
|
||||
(1 << 0) | (1 << 1),
|
||||
is_src ? dnnl::memory::dims{1, dim} : dnnl::memory::dims{dim, 1},
|
||||
dnnl::memory::data_type::f32};
|
||||
}
|
||||
}
|
||||
|
||||
sycl::event scaled_matmul(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
Tensor& result,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
at::blas::ScalingType scaling_choice_a,
|
||||
at::blas::ScalingType scaling_choice_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
bool use_fast_accum) {
|
||||
auto& engine = GpuEngineManager::Instance().get_engine();
|
||||
auto& stream = GpuStreamManager::Instance().get_stream();
|
||||
|
||||
// This function will do steps with following steps
|
||||
// 1. create memory descriptor
|
||||
// 2. call write_to_dnnl_memory() to actually write memory
|
||||
// 3. execute
|
||||
|
||||
const int64_t M = mat1.size(0);
|
||||
const int64_t K = mat1.size(1);
|
||||
const int64_t N = mat2.size(1);
|
||||
|
||||
// 1.1 Create memory descriptor
|
||||
dnnl::memory::desc src_md = get_onednn_md(mat1);
|
||||
dnnl::memory::desc weights_md = get_onednn_md(mat2);
|
||||
dnnl::memory::desc dst_md = get_onednn_md(result);
|
||||
|
||||
// scale_a and scale_b has already be checked in `is_desired_scaling()` call.
|
||||
// So we could directly get their memory desc and set later.
|
||||
dnnl::memory::desc scale_a_md = get_onednn_md(scale_a);
|
||||
dnnl::memory::desc scale_b_md = get_onednn_md(scale_b);
|
||||
|
||||
dnnl::memory::desc bias_md;
|
||||
bool with_bias = bias.has_value();
|
||||
at::Tensor possible_reshaped_bias = bias.value_or(at::Tensor());
|
||||
if (with_bias) {
|
||||
if (possible_reshaped_bias.dim() == 1) {
|
||||
possible_reshaped_bias =
|
||||
possible_reshaped_bias.reshape({1, possible_reshaped_bias.size(0)});
|
||||
bias_md = get_onednn_md(possible_reshaped_bias);
|
||||
} else {
|
||||
bias_md = get_onednn_md(possible_reshaped_bias);
|
||||
}
|
||||
}
|
||||
|
||||
// 1.2 Create primitive descriptor and set scales mask
|
||||
const ScaleSpec src_spec = make_scale_spec(scaling_choice_a, M, K, N, "src");
|
||||
const ScaleSpec wei_spec = make_scale_spec(scaling_choice_b, M, K, N, "wei");
|
||||
|
||||
dnnl::primitive_attr op_attr = dnnl::primitive_attr();
|
||||
|
||||
#if ONEDNN_SUPPORT_DETERMINISTIC
|
||||
if (at::globalContext().deterministicAlgorithms() ||
|
||||
at::globalContext().deterministicMkldnn())
|
||||
op_attr.set_deterministic(true);
|
||||
#endif
|
||||
|
||||
std::vector<int64_t> default_groups;
|
||||
op_attr.set_scales(
|
||||
DNNL_ARG_SRC, src_spec.mask, src_spec.groups, src_spec.dtype);
|
||||
op_attr.set_scales(
|
||||
DNNL_ARG_WEIGHTS, wei_spec.mask, wei_spec.groups, wei_spec.dtype);
|
||||
// scale_result tensor currently only supports scalar(TensorWise Scaling).
|
||||
bool with_dst_scale = scale_result && scale_result->defined();
|
||||
if (with_dst_scale) {
|
||||
op_attr.set_scales(DNNL_ARG_DST, 0, {1}, dnnl::memory::data_type::f32);
|
||||
}
|
||||
|
||||
op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
|
||||
|
||||
// 1.3 Create the matmul primitive descriptor
|
||||
dnnl::matmul::primitive_desc matmul_pd = with_bias
|
||||
? dnnl::matmul::primitive_desc(
|
||||
engine, src_md, weights_md, bias_md, dst_md, op_attr)
|
||||
: dnnl::matmul::primitive_desc(
|
||||
engine, src_md, weights_md, dst_md, op_attr);
|
||||
|
||||
// 1.4 (Possible) Additional Checks
|
||||
// TODO: In case there are memory desc does not align with the actual tensor,
|
||||
// we might need to reorder weights similar to CPU's reorder_if_differ_in()
|
||||
// call. For example, weights not the same as matmul_pd.weights_desc(),
|
||||
|
||||
// 2. Prepare memory
|
||||
|
||||
// Create memory
|
||||
auto src_usr_m = make_onednn_memory(src_md, engine, mat1.data_ptr());
|
||||
auto weights_usr_m = make_onednn_memory(weights_md, engine, mat2.data_ptr());
|
||||
auto dst_usr_m = make_onednn_memory(dst_md, engine, result.data_ptr());
|
||||
dnnl::memory b_usr_m;
|
||||
if (with_bias) {
|
||||
b_usr_m =
|
||||
make_onednn_memory(bias_md, engine, possible_reshaped_bias.data_ptr());
|
||||
}
|
||||
|
||||
// Prepare runtime scale memories (flat 1-D views) using the specs
|
||||
auto make_scale_mem_from_spec = [&](const ScaleSpec& spec,
|
||||
int64_t expected_numel,
|
||||
const at::Tensor& scale_tensor) {
|
||||
at::Tensor prepared = spec.normalize(scale_tensor);
|
||||
TORCH_CHECK(
|
||||
prepared.numel() == expected_numel,
|
||||
"Scale buffer length mismatch. Expected ",
|
||||
expected_numel,
|
||||
", got ",
|
||||
prepared.numel());
|
||||
dnnl::memory::desc scale_md(
|
||||
{prepared.numel()}, spec.dtype, dnnl::memory::format_tag::x);
|
||||
return make_onednn_memory(scale_md, engine, prepared.data_ptr());
|
||||
};
|
||||
|
||||
auto scratchpad =
|
||||
make_onednn_memory(matmul_pd.scratchpad_desc(), engine, nullptr);
|
||||
|
||||
// 3. Setup Args for exec
|
||||
std::unordered_map<int, dnnl::memory> args;
|
||||
args.insert({DNNL_ARG_SRC, src_usr_m});
|
||||
args.insert({DNNL_ARG_WEIGHTS, weights_usr_m});
|
||||
args.insert({DNNL_ARG_DST, dst_usr_m});
|
||||
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
|
||||
if (with_bias) {
|
||||
args.insert({DNNL_ARG_BIAS, b_usr_m});
|
||||
}
|
||||
|
||||
// Attach runtime scales using specs
|
||||
auto src_sc_mem = make_scale_mem_from_spec(
|
||||
src_spec, src_spec.expected_numel(M, K, "src"), scale_a);
|
||||
auto wei_sc_mem = make_scale_mem_from_spec(
|
||||
wei_spec, wei_spec.expected_numel(N, K, "wei"), scale_b);
|
||||
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_sc_mem});
|
||||
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_sc_mem});
|
||||
if (with_dst_scale) {
|
||||
// Bind single f32 scalar as DST scale
|
||||
at::Tensor dst_scale_f32 = scale_result->to(at::kFloat).contiguous();
|
||||
dnnl::memory::desc dst_sc_md(
|
||||
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
|
||||
auto dst_sc_mem =
|
||||
make_onednn_memory(dst_sc_md, engine, dst_scale_f32.data_ptr());
|
||||
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_sc_mem});
|
||||
}
|
||||
|
||||
dnnl::matmul matmul_p = dnnl::matmul(matmul_pd);
|
||||
sycl::event matmul_fwd_event =
|
||||
dnnl::sycl_interop::execute(matmul_p, stream, args);
|
||||
return matmul_fwd_event;
|
||||
}
|
||||
|
||||
} // namespace at::native::onednn
|
||||
|
||||
@ -78,6 +78,10 @@ dnnl::memory::data_type get_onednn_dtype(
|
||||
return dnnl::memory::data_type::f32;
|
||||
case at::ScalarType::BFloat16:
|
||||
return dnnl::memory::data_type::bf16;
|
||||
case at::ScalarType::Float8_e4m3fn:
|
||||
return dnnl::memory::data_type::f8_e4m3;
|
||||
case at::ScalarType::Float8_e5m2:
|
||||
return dnnl::memory::data_type::f8_e5m2;
|
||||
default:
|
||||
if (!allow_undef) {
|
||||
TORCH_CHECK(
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/native/mkldnn/xpu/detail/Attr.h>
|
||||
#include <ATen/native/mkldnn/xpu/detail/Utils.h>
|
||||
#include <ATen/native/mkldnn/xpu/detail/oneDNNContext.h>
|
||||
@ -202,4 +203,16 @@ void sdpa_backward(
|
||||
Tensor& grad_query,
|
||||
Tensor& grad_key,
|
||||
Tensor& grad_value);
|
||||
|
||||
sycl::event scaled_matmul(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
Tensor& result,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
at::blas::ScalingType scaling_choice_a,
|
||||
at::blas::ScalingType scaling_choice_b,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
const std::optional<at::Tensor>& scale_result,
|
||||
bool use_fast_accum);
|
||||
} // namespace at::native::onednn
|
||||
|
||||
@ -82,6 +82,7 @@ NSArray<NSNumber*>* getTensorAxes(const TensorBase& t);
|
||||
NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
|
||||
std::string getMPSShapeString(MPSShape* shape);
|
||||
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false);
|
||||
std::string to_hex_key(float);
|
||||
std::string getArrayRefString(const IntArrayRef s);
|
||||
// use has_storage() on the returned tensor to determine if src actually is a view
|
||||
Tensor gatherViewTensor(const Tensor& src, Tensor& dst);
|
||||
|
||||
@ -301,6 +301,10 @@ std::string getArrayRefString(const IntArrayRef s) {
|
||||
return fmt::to_string(fmt::join(s, ","));
|
||||
}
|
||||
|
||||
std::string to_hex_key(float f) {
|
||||
return fmt::format("{:a}", f);
|
||||
}
|
||||
|
||||
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
|
||||
fmt::basic_memory_buffer<char, 100> buffer;
|
||||
auto buf_iterator = std::back_inserter(buffer);
|
||||
|
||||
@ -96,7 +96,9 @@ kernel void addmm(
|
||||
auto bias =
|
||||
biasData[thread_id.y * strides[3].x + thread_id.x * strides[3].y];
|
||||
outputData[thread_id.y * strides[2].x + thread_id.x * strides[2].y] =
|
||||
static_cast<T>(alpha_beta[0] * sum + alpha_beta[1] * bias);
|
||||
static_cast<T>(
|
||||
c10::metal::mul(alpha_beta[0], sum) +
|
||||
c10::metal::mul(alpha_beta[1], bias));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -69,75 +69,139 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
|
||||
auto out = at::empty({batchSize, num_head, qSize, headSize}, query.options());
|
||||
auto attn = at::empty({batchSize, num_head, qSize, maxSeqLength}, query.options());
|
||||
auto scale_factor = sdp::calculate_scale(query, scale).expect_float();
|
||||
static const bool is_macOS_26_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_26_0_PLUS);
|
||||
@autoreleasepool {
|
||||
auto mkey = __func__ + getTensorsStringKey({query, key, value}) + ":" + std::to_string(is_causal) + ":" +
|
||||
std::to_string(attn_mask.has_value());
|
||||
auto cachedGraph =
|
||||
LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
|
||||
auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
|
||||
auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
|
||||
auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
|
||||
auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
|
||||
auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
|
||||
shape:getMPSShape({1})
|
||||
dataType:MPSDataTypeFloat32];
|
||||
|
||||
auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];
|
||||
CachedGraph* cachedGraph;
|
||||
//if(is_macOS_26_0_or_newer) {
|
||||
if(true) {
|
||||
cachedGraph =
|
||||
LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
|
||||
auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
|
||||
auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
|
||||
auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
|
||||
|
||||
if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
|
||||
// bug in MacOS15, without this trick SDPA leaks memory, adding 0.0f gets ignored(still takes SDPA sequence
|
||||
// path which leaks)
|
||||
auto oneTensor = [mpsGraph constantWithScalar:1e-20f shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
|
||||
maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:oneTensor name:nil];
|
||||
}
|
||||
if (is_causal) {
|
||||
MPSShape* maskShape = @[@(qSize), @(maxSeqLength)];
|
||||
auto x = [mpsGraph coordinateAlongAxis:-1 withShape:@[@(qSize), @1] name:nil];
|
||||
auto y = [mpsGraph coordinateAlongAxis:-2 withShape:@[@1, @(maxSeqLength)] name:nil];
|
||||
auto isLess = [mpsGraph lessThanOrEqualToWithPrimaryTensor:x secondaryTensor:y name:nil];
|
||||
auto causalMask = [mpsGraph selectWithPredicateTensor:isLess
|
||||
truePredicateTensor:[mpsGraph constantWithScalar:0 dataType:qTensor.dataType]
|
||||
falsePredicateTensor:[mpsGraph constantWithScalar:-INFINITY dataType:qTensor.dataType]
|
||||
name:nil];
|
||||
graph->maskTensor = causalMask;
|
||||
} else if (attn_mask) {
|
||||
graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
|
||||
}
|
||||
|
||||
// upcasting to float32 if needed to improve precision when multiplying by the scale factor
|
||||
maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
|
||||
maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
|
||||
// Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
|
||||
// Overwrites expected NANs in sm with zeros.
|
||||
// auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
// auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
|
||||
// auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
|
||||
// auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
|
||||
// auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
//
|
||||
// auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
|
||||
// MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
|
||||
// truePredicateTensor:zeroTensor
|
||||
// falsePredicateTensor:sm
|
||||
// name:nil];
|
||||
//
|
||||
// auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
|
||||
|
||||
if (is_causal) {
|
||||
auto causalMask = [mpsGraph constantWithScalar:1.0f
|
||||
shape:getMPSShape({qSize, maxSeqLength})
|
||||
dataType:MPSDataTypeBool];
|
||||
causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
|
||||
auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
|
||||
truePredicateTensor:maskedMM
|
||||
falsePredicateTensor:minusInf
|
||||
name:nil];
|
||||
} else if (attn_mask) {
|
||||
graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
|
||||
maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
|
||||
secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
|
||||
name:nil];
|
||||
}
|
||||
MPSGraphTensor* output;
|
||||
if(graph->maskTensor != nil) {
|
||||
output = [mpsGraph scaledDotProductAttentionWithQueryTensor:qTensor
|
||||
keyTensor:kTensor
|
||||
valueTensor:vTensor
|
||||
maskTensor:graph->maskTensor
|
||||
scale:scale_factor
|
||||
name:@"MPSGraph SDPA"];
|
||||
} else {
|
||||
output = [mpsGraph scaledDotProductAttentionWithQueryTensor:qTensor
|
||||
keyTensor:kTensor
|
||||
valueTensor:vTensor
|
||||
scale:scale_factor
|
||||
name:@"MPSGraph SDPA"];
|
||||
}
|
||||
graph->qTensor = qTensor;
|
||||
graph->kTensor = kTensor;
|
||||
graph->vTensor = vTensor;
|
||||
graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
|
||||
// graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
|
||||
});
|
||||
} else {
|
||||
cachedGraph =
|
||||
LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
|
||||
auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
|
||||
auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
|
||||
auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
|
||||
auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
|
||||
auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
|
||||
shape:getMPSShape({1})
|
||||
dataType:MPSDataTypeFloat32];
|
||||
|
||||
// Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
|
||||
// Overwrites expected NANs in sm with zeros.
|
||||
auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
|
||||
auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
|
||||
auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
|
||||
auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];
|
||||
|
||||
auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
|
||||
MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
|
||||
truePredicateTensor:zeroTensor
|
||||
falsePredicateTensor:sm
|
||||
name:nil];
|
||||
if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
|
||||
// bug in MacOS15, without this trick SDPA leaks memory, adding 0.0f gets ignored(still takes SDPA sequence
|
||||
// path which leaks)
|
||||
auto oneTensor = [mpsGraph constantWithScalar:1e-20f shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
|
||||
maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:oneTensor name:nil];
|
||||
}
|
||||
|
||||
auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
|
||||
graph->qTensor = qTensor;
|
||||
graph->kTensor = kTensor;
|
||||
graph->vTensor = vTensor;
|
||||
graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
|
||||
graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
|
||||
});
|
||||
// upcasting to float32 if needed to improve precision when multiplying by the scale factor
|
||||
maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
|
||||
maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
|
||||
|
||||
if (is_causal) {
|
||||
auto causalMask = [mpsGraph constantWithScalar:1.0f
|
||||
shape:getMPSShape({qSize, maxSeqLength})
|
||||
dataType:MPSDataTypeBool];
|
||||
causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
|
||||
auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
|
||||
truePredicateTensor:maskedMM
|
||||
falsePredicateTensor:minusInf
|
||||
name:nil];
|
||||
} else if (attn_mask) {
|
||||
graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
|
||||
maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
|
||||
secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
|
||||
name:nil];
|
||||
}
|
||||
|
||||
// Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
|
||||
// Overwrites expected NANs in sm with zeros.
|
||||
auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
|
||||
auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
|
||||
auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
|
||||
auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
|
||||
|
||||
auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
|
||||
MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
|
||||
truePredicateTensor:zeroTensor
|
||||
falsePredicateTensor:sm
|
||||
name:nil];
|
||||
|
||||
auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
|
||||
graph->qTensor = qTensor;
|
||||
graph->kTensor = kTensor;
|
||||
graph->vTensor = vTensor;
|
||||
graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
|
||||
graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
|
||||
});
|
||||
}
|
||||
auto qPlaceholder = Placeholder(cachedGraph->qTensor, query);
|
||||
auto kPlaceholder = Placeholder(cachedGraph->kTensor, key);
|
||||
auto vPlaceholder = Placeholder(cachedGraph->vTensor, value);
|
||||
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, out);
|
||||
auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
|
||||
// auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
|
||||
NSDictionary* feeds = nil;
|
||||
if (!attn_mask) {
|
||||
feeds = dictionaryFromPlaceholders(qPlaceholder, kPlaceholder, vPlaceholder);
|
||||
@ -145,7 +209,8 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
|
||||
auto mPlaceholder = Placeholder(cachedGraph->maskTensor, *attn_mask);
|
||||
feeds = dictionaryFromPlaceholders(qPlaceholder, kPlaceholder, vPlaceholder, mPlaceholder);
|
||||
}
|
||||
NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
|
||||
// NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
|
||||
NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder);
|
||||
runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outs);
|
||||
}
|
||||
|
||||
|
||||
@ -121,7 +121,7 @@ Tensor& do_metal_addmm(const Tensor& self,
|
||||
const Scalar& alpha,
|
||||
const Scalar& beta,
|
||||
const Tensor& bias) {
|
||||
if (beta.toDouble() == 0 && alpha.toDouble() == 1) {
|
||||
if (beta.isFloatingPoint() && alpha.isFloatingPoint() && beta.toDouble() == 0 && alpha.toDouble() == 1) {
|
||||
return do_metal_mm(self, other, output);
|
||||
}
|
||||
auto stream = getCurrentMPSStream();
|
||||
@ -147,13 +147,15 @@ Tensor& do_metal_addmm(const Tensor& self,
|
||||
std::array<int64_t, 2> i64;
|
||||
std::array<int32_t, 2> i32;
|
||||
std::array<float, 2> f32;
|
||||
} alpha_beta;
|
||||
std::array<c10::complex<float>, 2> c64;
|
||||
} alpha_beta{};
|
||||
if (output.scalar_type() == kLong) {
|
||||
alpha_beta.i64 = {alpha.toLong(), beta.toLong()};
|
||||
} else if (c10::isIntegralType(output.scalar_type(), true)) {
|
||||
alpha_beta.i32 = {alpha.toInt(), beta.toInt()};
|
||||
} else if (c10::isComplexType(output.scalar_type())) {
|
||||
alpha_beta.c64 = {alpha.toComplexFloat(), beta.toComplexFloat()};
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT(c10::isFloatingType(output.scalar_type()));
|
||||
alpha_beta.f32 = {alpha.toFloat(), beta.toFloat()};
|
||||
}
|
||||
constexpr uint32_t TILE_DIM = 16; // fastest performance from tests on multiple macs
|
||||
|
||||
@ -91,25 +91,30 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
|
||||
#include <ATen/native/mps/Repeat_metallib.h>
|
||||
#endif
|
||||
|
||||
template <typename index_t>
|
||||
void computeRepeatIndices(const index_t* repeat_ptr,
|
||||
const int64_t* cumsum_ptr,
|
||||
index_t* result_ptr,
|
||||
int64_t size,
|
||||
int64_t result_size) {
|
||||
id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
|
||||
id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
|
||||
id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
|
||||
TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
|
||||
|
||||
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
|
||||
TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
|
||||
std::string scalar_type;
|
||||
if constexpr (std::is_same_v<index_t, int32_t>) {
|
||||
if (repeat.scalar_type() == kInt) {
|
||||
scalar_type = "int32_t";
|
||||
} else if constexpr (std::is_same_v<index_t, int64_t>) {
|
||||
} else if (repeat.scalar_type() == kLong) {
|
||||
scalar_type = "int64_t";
|
||||
} else {
|
||||
TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
|
||||
TORCH_CHECK(false, "repeats has to be Long or Int tensor");
|
||||
}
|
||||
if (repeat.size(0) == 0) {
|
||||
return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
}
|
||||
Tensor repeat_ = repeat.contiguous();
|
||||
Tensor cumsum = repeat.cumsum(0);
|
||||
int64_t total = 0;
|
||||
if (output_size.has_value()) {
|
||||
total = output_size.value();
|
||||
} else {
|
||||
total = cumsum[-1].item<int64_t>();
|
||||
TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
|
||||
}
|
||||
|
||||
auto result = at::empty({total}, repeat.options());
|
||||
|
||||
MPSStream* mpsStream = getCurrentMPSStream();
|
||||
dispatch_sync(mpsStream->queue(), ^() {
|
||||
@ -121,20 +126,13 @@ void computeRepeatIndices(const index_t* repeat_ptr,
|
||||
getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);
|
||||
|
||||
[computeEncoder setComputePipelineState:pipelineState];
|
||||
mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
|
||||
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);
|
||||
mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
|
||||
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
|
||||
|
||||
getMPSProfiler().endProfileKernel(pipelineState);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
|
||||
Tensor output;
|
||||
AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
|
||||
output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
|
||||
});
|
||||
return output;
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/TensorCompare.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
#include <algorithm>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
@ -89,13 +90,21 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor&
|
||||
auto clamp_shape = clamp_opt->sizes();
|
||||
auto input_shape = input_t.sizes();
|
||||
|
||||
TORCH_CHECK(num_clamp_dims <= num_input_dims,
|
||||
op_name + ": clamp tensor number of dims must not be greater than that of input tensor")
|
||||
if (num_clamp_dims > num_input_dims) {
|
||||
auto leading_dims = num_clamp_dims - num_input_dims;
|
||||
for (int64_t i = 0; i < leading_dims; ++i) {
|
||||
TORCH_CHECK(clamp_shape[i] == 1,
|
||||
op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor");
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_clamp_dims; i++)
|
||||
auto clamp_idx = num_clamp_dims - 1;
|
||||
auto input_idx = num_input_dims - 1;
|
||||
auto common_dims = std::min(num_clamp_dims, num_input_dims);
|
||||
for (int64_t i = 0; i < common_dims; ++i)
|
||||
// One of the indices is allowed to be 1; will be handled by broadcast
|
||||
TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] ||
|
||||
clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1,
|
||||
TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 ||
|
||||
input_shape[input_idx - i] == 1,
|
||||
op_name + ": clamp tensor trailing shape must match input tensor")
|
||||
}
|
||||
}
|
||||
@ -136,9 +145,6 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
|
||||
|
||||
auto result_type = output_t.scalar_type();
|
||||
|
||||
IntArrayRef new_min_shape;
|
||||
IntArrayRef new_max_shape;
|
||||
|
||||
auto num_min_dims = min_opt->dim();
|
||||
auto num_max_dims = max_opt->dim();
|
||||
auto num_input_dims = input_t.dim();
|
||||
@ -146,24 +152,32 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
|
||||
std::vector<int64_t> new_min_arr(num_input_dims);
|
||||
std::vector<int64_t> new_max_arr(num_input_dims);
|
||||
|
||||
if (has_min && num_min_dims < num_input_dims) {
|
||||
fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes());
|
||||
new_min_shape = IntArrayRef(new_min_arr);
|
||||
}
|
||||
|
||||
if (has_max && num_max_dims < num_input_dims) {
|
||||
fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes());
|
||||
new_max_shape = IntArrayRef(new_max_arr);
|
||||
}
|
||||
|
||||
Tensor min_opt_tensor;
|
||||
Tensor max_opt_tensor;
|
||||
|
||||
auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref,
|
||||
int64_t num_clamp_dims,
|
||||
std::vector<int64_t>& new_shape_storage) -> Tensor {
|
||||
IntArrayRef clamp_shape = clamp_tensor_ref->sizes();
|
||||
bool requires_view = false;
|
||||
|
||||
if (num_clamp_dims > num_input_dims) {
|
||||
clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims);
|
||||
requires_view = true;
|
||||
} else if (num_clamp_dims < num_input_dims) {
|
||||
fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape);
|
||||
clamp_shape = IntArrayRef(new_shape_storage);
|
||||
requires_view = true;
|
||||
}
|
||||
|
||||
return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref;
|
||||
};
|
||||
|
||||
if (has_min) {
|
||||
min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt;
|
||||
min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr);
|
||||
}
|
||||
if (has_max) {
|
||||
max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt;
|
||||
max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr);
|
||||
}
|
||||
|
||||
@autoreleasepool {
|
||||
@ -244,8 +258,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t,
|
||||
|
||||
@autoreleasepool {
|
||||
// the optional min/max refs could affect how we build the cached graph
|
||||
std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") +
|
||||
(has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
|
||||
std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") +
|
||||
(has_max ? ("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
if (has_min)
|
||||
newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar
|
||||
|
||||
@ -4225,7 +4225,7 @@
|
||||
MTIA: mm_out_mtia
|
||||
MPS: mm_out_mps
|
||||
XPU: mm_out_xpu
|
||||
SparseCPU, SparseCUDA: _sparse_mm_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out
|
||||
|
||||
- func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
|
||||
|
||||
@ -1426,9 +1426,6 @@ static at::Tensor _fp8_convolution_onednn_ref(
|
||||
w_scales_new_shape[0] = -1;
|
||||
auto dqw = weight.to(at::kFloat) * weight_scales.reshape(w_scales_new_shape);
|
||||
auto output_padding = std::vector<int64_t>(kSpatialDim, 0);
|
||||
if (bias.has_value()){
|
||||
bias = bias.value().to(at::kFloat);
|
||||
}
|
||||
auto y_f32 = at::convolution(
|
||||
dqx, dqw, bias, stride.vec(), padding.vec(), dilation.vec(), /* transposed */false, output_padding, groups
|
||||
);
|
||||
|
||||
@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
|
||||
|
||||
77
aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
Normal file
77
aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
Normal file
@ -0,0 +1,77 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
|
||||
// to verify that the data race fix is working correctly
|
||||
|
||||
TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
|
||||
if (!at::cuda::is_available()) {
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int num_accessor_threads = 15;
|
||||
constexpr int num_clear_threads = 5;
|
||||
constexpr int iterations_per_thread = 50;
|
||||
|
||||
std::atomic<bool> stop{false};
|
||||
std::atomic<int> error_count{0};
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(num_accessor_threads + num_clear_threads);
|
||||
|
||||
// Launch accessor threads
|
||||
for (int i = 0; i < num_accessor_threads; ++i) {
|
||||
threads.emplace_back([&stop, &error_count]() {
|
||||
try {
|
||||
at::cuda::CUDAGuard device_guard(0);
|
||||
|
||||
while (!stop.load(std::memory_order_relaxed)) {
|
||||
const auto handle = at::cuda::getCurrentCUDABlasHandle();
|
||||
const auto workspace = at::cuda::getCUDABlasLtWorkspace();
|
||||
|
||||
if (handle == nullptr || workspace == nullptr) {
|
||||
error_count++;
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
error_count++;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Launch threads that clear workspaces
|
||||
for (int i = 0; i < num_clear_threads; ++i) {
|
||||
threads.emplace_back([&error_count]() {
|
||||
try {
|
||||
for (int j = 0; j < iterations_per_thread; ++j) {
|
||||
at::cuda::clearCublasWorkspaces();
|
||||
std::this_thread::yield();
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
error_count++;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Let them run for a bit
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
stop.store(true, std::memory_order_relaxed);
|
||||
|
||||
for (auto& thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
|
||||
EXPECT_EQ(error_count.load(), 0);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
c10::cuda::CUDACachingAllocator::init(1);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
@ -10,6 +10,13 @@
|
||||
...
|
||||
}
|
||||
|
||||
{
|
||||
ignore_empty_generic_uninitialised_conditional_jump
|
||||
Memcheck:Cond
|
||||
fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
|
||||
...
|
||||
}
|
||||
|
||||
{
|
||||
Cond_cuda
|
||||
Memcheck:Cond
|
||||
|
||||
@ -9,28 +9,61 @@ def check_perf_csv(filename, threshold, threshold_scale):
|
||||
"""
|
||||
Basic performance checking.
|
||||
"""
|
||||
try:
|
||||
df = pd.read_csv(filename)
|
||||
except FileNotFoundError:
|
||||
print(f"Error: File {filename} not found")
|
||||
sys.exit(1)
|
||||
|
||||
df = pd.read_csv(filename)
|
||||
effective_threshold = threshold * threshold_scale
|
||||
print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n")
|
||||
|
||||
failed = []
|
||||
for _, row in df.iterrows():
|
||||
model_name = row["name"]
|
||||
speedup = row["speedup"]
|
||||
if speedup < threshold * threshold_scale:
|
||||
failed.append(model_name)
|
||||
speedup = float(row["speedup"])
|
||||
abs_latency = float(row["abs_latency"])
|
||||
compilation_latency = float(row["compilation_latency"])
|
||||
compression_ratio = float(row["compression_ratio"])
|
||||
eager_peak_mem = float(row["eager_peak_mem"])
|
||||
dynamo_peak_mem = float(row["dynamo_peak_mem"])
|
||||
|
||||
print(f"{model_name:34} {speedup}")
|
||||
perf_summary = f"{model_name:34} speedup={speedup:.3f}x"
|
||||
if pd.notna(abs_latency):
|
||||
perf_summary += f", latency={abs_latency:.1f} ms/iter"
|
||||
if pd.notna(compilation_latency):
|
||||
perf_summary += f", compile={compilation_latency:.3f}s"
|
||||
if pd.notna(compression_ratio):
|
||||
perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x"
|
||||
if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem):
|
||||
perf_summary += (
|
||||
f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)"
|
||||
)
|
||||
|
||||
if speedup < effective_threshold:
|
||||
failed.append((model_name, speedup))
|
||||
|
||||
print(perf_summary)
|
||||
|
||||
if failed:
|
||||
print(
|
||||
textwrap.dedent(
|
||||
f"""
|
||||
Error {len(failed)} models performance regressed
|
||||
{" ".join(failed)}
|
||||
Error {len(failed)} model(s) performance regressed
|
||||
{" ".join([name for name, _ in failed])}
|
||||
"""
|
||||
)
|
||||
)
|
||||
for name, sp in sorted(failed, key=lambda x: x[1]):
|
||||
pct_from_target = (sp / effective_threshold - 1.0) * 100.0
|
||||
print(
|
||||
f" - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)"
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(
|
||||
f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -44,7 +77,7 @@ if __name__ == "__main__":
|
||||
"-s",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help="multiple threshold by this value to relax the check",
|
||||
help="multiply threshold by this value to relax the check",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
check_perf_csv(args.file, args.threshold, args.threshold_scale)
|
||||
|
||||
@ -2379,7 +2379,9 @@ class BenchmarkRunner:
|
||||
print(
|
||||
f"Load model outputs from {self.args.compare_model_outputs_with} to compare"
|
||||
)
|
||||
saved_result = torch.load(self.args.compare_model_outputs_with)
|
||||
saved_result = torch.load(
|
||||
self.args.compare_model_outputs_with, weights_only=False
|
||||
)
|
||||
is_bitwise_same = bitwise_same(saved_result, new_result)
|
||||
if not is_bitwise_same:
|
||||
print(
|
||||
|
||||
@ -189,6 +189,10 @@ skip:
|
||||
- hf_Whisper
|
||||
- hf_distil_whisper
|
||||
- timm_vision_transformer_large
|
||||
# https://github.com/pytorch/pytorch/issues/167895
|
||||
- stable_diffusion
|
||||
- stable_diffusion_text_encoder
|
||||
- stable_diffusion_unet
|
||||
|
||||
device:
|
||||
cpu:
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
# These load paths point to different files in internal and OSS environment
|
||||
|
||||
load("@bazel_skylib//lib:paths.bzl", "paths")
|
||||
load("//tools/build_defs:cell_defs.bzl", "get_fbsource_cell")
|
||||
load("//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
|
||||
load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
|
||||
load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule")
|
||||
@ -590,6 +591,9 @@ def pt_operator_query_codegen(
|
||||
pt_allow_forced_schema_registration = True,
|
||||
compatible_with = [],
|
||||
apple_sdks = None):
|
||||
if get_fbsource_cell() == "fbcode":
|
||||
return
|
||||
|
||||
oplist_dir_name = name + "_pt_oplist"
|
||||
|
||||
# @lint-ignore BUCKLINT
|
||||
@ -865,6 +869,9 @@ def define_buck_targets(
|
||||
pt_xplat_cxx_library = fb_xplat_cxx_library,
|
||||
c2_fbandroid_xplat_compiler_flags = [],
|
||||
labels = []):
|
||||
if get_fbsource_cell() == "fbcode":
|
||||
return
|
||||
|
||||
# @lint-ignore BUCKLINT
|
||||
fb_native.filegroup(
|
||||
name = "metal_build_srcs",
|
||||
|
||||
@ -44,7 +44,7 @@ struct C10_API SafePyObject {
|
||||
(*other.pyinterpreter_)->incref(other.data_);
|
||||
}
|
||||
if (data_ != nullptr) {
|
||||
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
|
||||
(*pyinterpreter_)->decref(data_);
|
||||
}
|
||||
data_ = other.data_;
|
||||
pyinterpreter_ = other.pyinterpreter_;
|
||||
@ -53,7 +53,7 @@ struct C10_API SafePyObject {
|
||||
|
||||
~SafePyObject() {
|
||||
if (data_ != nullptr) {
|
||||
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
|
||||
(*pyinterpreter_)->decref(data_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -34,20 +34,6 @@ namespace c10 {
|
||||
// See [dtype Macros note] in torch/headeronly/core/ScalarType.h
|
||||
// regarding macros.
|
||||
|
||||
template <typename T>
|
||||
struct CppTypeToScalarType;
|
||||
|
||||
#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
|
||||
template <> \
|
||||
struct CppTypeToScalarType<cpp_type> \
|
||||
: std:: \
|
||||
integral_constant<c10::ScalarType, c10::ScalarType::scalar_type> { \
|
||||
};
|
||||
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
|
||||
|
||||
#undef SPECIALIZE_CppTypeToScalarType
|
||||
|
||||
#define DEFINE_CONSTANT(_, name) \
|
||||
constexpr ScalarType k##name = ScalarType::name;
|
||||
|
||||
@ -106,13 +92,6 @@ inline bool isComplexType(ScalarType t) {
|
||||
t == ScalarType::ComplexDouble);
|
||||
}
|
||||
|
||||
inline bool isQIntType(ScalarType t) {
|
||||
// Don't forget to extend this when adding new QInt types
|
||||
return t == ScalarType::QInt8 || t == ScalarType::QUInt8 ||
|
||||
t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 ||
|
||||
t == ScalarType::QUInt2x4;
|
||||
}
|
||||
|
||||
inline bool isBitsType(ScalarType t) {
|
||||
return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
|
||||
t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||
|
||||
|
||||
@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() {
|
||||
TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
|
||||
}
|
||||
|
||||
void StorageImpl::incref_pyobject() const {
|
||||
// Because intrusive_ptr incref uses relaxed memory order, we need to
|
||||
// do an acquire fence to ensure that the kHasPyObject bit was
|
||||
// observed before the load of the PyObject* below.
|
||||
// NB: This is a no-op on x86/x86-64
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
|
||||
}
|
||||
|
||||
void StorageImpl::decref_pyobject() const {
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
|
||||
}
|
||||
|
||||
bool StorageImpl::try_incref_pyobject() const {
|
||||
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
|
||||
if (C10_UNLIKELY(!interp)) {
|
||||
return false;
|
||||
}
|
||||
return (*interp)->try_incref(pyobj_slot_);
|
||||
}
|
||||
|
||||
void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
|
||||
// Allowlist verification.
|
||||
// Only if the devicetype is in the allowlist,
|
||||
|
||||
@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
|
||||
data_ptr_.clear();
|
||||
}
|
||||
|
||||
void incref_pyobject() const override final;
|
||||
|
||||
void decref_pyobject() const override final;
|
||||
|
||||
bool try_incref_pyobject() const override final;
|
||||
|
||||
size_t nbytes() const {
|
||||
// OK to do this instead of maybe_as_int as nbytes is guaranteed positive
|
||||
TORCH_CHECK(!size_bytes_is_heap_allocated_);
|
||||
@ -370,4 +376,18 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
|
||||
bool resizable,
|
||||
std::optional<at::Device> device_opt);
|
||||
|
||||
namespace detail {
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
template <class T>
|
||||
struct TargetTraits<
|
||||
T,
|
||||
std::enable_if_t<
|
||||
std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
|
||||
} // namespace c10
|
||||
|
||||
@ -277,7 +277,6 @@ void TensorImpl::release_resources() {
|
||||
if (storage_) {
|
||||
storage_ = {};
|
||||
}
|
||||
pyobj_slot_.maybe_destroy_pyobj();
|
||||
}
|
||||
|
||||
#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
|
||||
@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
|
||||
}
|
||||
}
|
||||
|
||||
void TensorImpl::incref_pyobject() const {
|
||||
// Because intrusive_ptr incref uses relaxed memory order, we need to
|
||||
// do an acquire fence to ensure that the kHasPyObject bit was
|
||||
// observed before the load of the PyObject* below.
|
||||
// NB: This is a no-op on x86/x86-64
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
|
||||
}
|
||||
|
||||
void TensorImpl::decref_pyobject() const {
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
|
||||
}
|
||||
|
||||
bool TensorImpl::try_incref_pyobject() const {
|
||||
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
|
||||
if (C10_UNLIKELY(!interp)) {
|
||||
return false;
|
||||
}
|
||||
return (*interp)->try_incref(pyobj_slot_);
|
||||
}
|
||||
|
||||
namespace impl {
|
||||
|
||||
namespace {
|
||||
|
||||
@ -2178,6 +2178,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
|
||||
return &pyobj_slot_;
|
||||
}
|
||||
|
||||
void incref_pyobject() const override final;
|
||||
|
||||
void decref_pyobject() const override final;
|
||||
|
||||
bool try_incref_pyobject() const override final;
|
||||
|
||||
private:
|
||||
// See NOTE [std::optional operator usage in CUDA]
|
||||
// We probably don't want to expose this publicly until
|
||||
@ -3079,6 +3085,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
|
||||
friend class C10_TensorImpl_Size_Check_Dummy_Class;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
template <class T>
|
||||
struct TargetTraits<
|
||||
T,
|
||||
std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Note [TensorImpl size constraints]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Changed the size of TensorImpl? If the size went down, good for
|
||||
|
||||
@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
|
||||
|
||||
void incref(PyObject* pyobj) const override {} // do nothing
|
||||
|
||||
void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
|
||||
} // do nothing
|
||||
void decref(PyObject* pyobj) const override {} // do nothing
|
||||
|
||||
bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
#define PANIC(m) \
|
||||
TORCH_INTERNAL_ASSERT( \
|
||||
@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
|
||||
"attempted to call " #m \
|
||||
" on a Tensor with nontrivial PyObject after corresponding interpreter died")
|
||||
|
||||
size_t refcnt(PyObject* pyobj) const override {
|
||||
PANIC(refcnt);
|
||||
}
|
||||
|
||||
c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
|
||||
PANIC(detach);
|
||||
}
|
||||
|
||||
@ -18,6 +18,9 @@ namespace c10 {
|
||||
struct IValue;
|
||||
class OperatorHandle;
|
||||
struct TensorImpl;
|
||||
namespace impl {
|
||||
struct PyObjectSlot;
|
||||
} // namespace impl
|
||||
} // namespace c10
|
||||
|
||||
namespace torch::jit {
|
||||
@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable {
|
||||
|
||||
// Run Py_INCREF on a PyObject.
|
||||
virtual void incref(PyObject* pyobj) const = 0;
|
||||
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call
|
||||
// See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
|
||||
virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;
|
||||
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call.
|
||||
virtual void decref(PyObject* pyobj) const = 0;
|
||||
// Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
|
||||
virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
|
||||
// Run Py_REFCNT on a PyObject.
|
||||
virtual size_t refcnt(PyObject* pyobj) const = 0;
|
||||
|
||||
// Perform a detach by deferring to the __torch_dispatch__ implementation of
|
||||
// detach, which will also arrange for the PyObject to get copied in this
|
||||
|
||||
@ -1,56 +0,0 @@
|
||||
#include <c10/core/impl/PyObjectSlot.h>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
|
||||
PyObjectSlot::~PyObjectSlot() {
|
||||
maybe_destroy_pyobj();
|
||||
}
|
||||
|
||||
void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
if (owns_pyobj()) {
|
||||
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
|
||||
(*pyobj_interpreter_.load(std::memory_order_acquire))
|
||||
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
|
||||
// NB: this destructor can only be entered when there are no
|
||||
// references to this C++ object (obviously), NOR any references
|
||||
// to the PyObject (if there are references to the PyObject,
|
||||
// then the PyObject holds an owning reference to the tensor).
|
||||
// So it is OK to clear pyobj_ here as it is impossible for it to
|
||||
// be used again (modulo weak reference races)
|
||||
pyobj_ = nullptr; // for safety
|
||||
}
|
||||
}
|
||||
|
||||
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return reinterpret_cast<PyObject*>(
|
||||
reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
|
||||
}
|
||||
|
||||
bool PyObjectSlot::owns_pyobj() {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return reinterpret_cast<uintptr_t>(pyobj_) & 1;
|
||||
}
|
||||
|
||||
void PyObjectSlot::set_owns_pyobj(bool b) {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
pyobj_ = reinterpret_cast<PyObject*>(
|
||||
reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
|
||||
}
|
||||
|
||||
} // namespace c10::impl
|
||||
@ -8,117 +8,58 @@
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace torch::utils {
|
||||
class PyObjectPreservation;
|
||||
}
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
struct C10_API PyObjectSlot {
|
||||
public:
|
||||
PyObjectSlot();
|
||||
|
||||
~PyObjectSlot();
|
||||
|
||||
void maybe_destroy_pyobj();
|
||||
|
||||
// Associate the TensorImpl with the specified PyObject, and, if necessary,
|
||||
// also tag the interpreter.
|
||||
//
|
||||
// NB: This lives in a header so that we can inline away the switch on status
|
||||
//
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
|
||||
// Query the PyObject interpreter. This may return null if there is no
|
||||
// interpreter. This is racy!
|
||||
PyInterpreter* pyobj_interpreter();
|
||||
|
||||
PyObject* _unchecked_untagged_pyobj() const;
|
||||
|
||||
// Test the interpreter tag. If tagged for the current interpreter, return
|
||||
// a non-nullopt (but possibly null) PyObject. If (possibly) untagged,
|
||||
// returns a nullopt. If it is definitely invalid, raises an error.
|
||||
//
|
||||
// If `ignore_hermetic_tls` is false and this function is called from a
|
||||
// hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
|
||||
// nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
|
||||
// context is ignored, allowing you to check the interpreter tag of a
|
||||
// nonhermetic PyObject from within a hermetic context. This is necessary
|
||||
// because there are some cases where the deallocator function of a
|
||||
// nonhermetic PyObject is called from within a hermetic context, so it must
|
||||
// be properly treated as a nonhermetic PyObject.
|
||||
//
|
||||
// NB: this lives in header so that we can avoid actually creating the
|
||||
// std::optional
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
// interpreter.
|
||||
PyInterpreter* pyobj_interpreter() const {
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
PyInterpreter& load_pyobj_interpreter() const {
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
interpreter, "cannot access PyObject for Tensor - no interpreter set");
|
||||
return *interpreter;
|
||||
}
|
||||
|
||||
bool owns_pyobj();
|
||||
PyObject* load_pyobj() const {
|
||||
return pyobj_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
void set_owns_pyobj(bool b);
|
||||
void store_pyobj(PyObject* obj) {
|
||||
pyobj_.store(obj, std::memory_order_release);
|
||||
}
|
||||
|
||||
bool has_unique_reference() const {
|
||||
PyObject* pyobj = load_pyobj();
|
||||
return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
pyobj_.store(nullptr, std::memory_order_relaxed);
|
||||
pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
private:
|
||||
// This field contains the interpreter tag for this object. See
|
||||
// Note [Python interpreter tag] for general context
|
||||
//
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// What memory_order do we need when accessing this atomic? We don't
|
||||
// need a single total modification order (as provided by
|
||||
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
|
||||
// transition from -1 to some positive integer and never changes afterwards.
|
||||
// Because there is only one modification, it trivially already has a total
|
||||
// modification order (e.g., we don't need fences or locked instructions on
|
||||
// x86)
|
||||
//
|
||||
// In fact, one could make a reasonable argument that relaxed reads are OK,
|
||||
// due to the presence of external locking (GIL) to ensure that interactions
|
||||
// with other data structures are still correctly synchronized, so that
|
||||
// we fall in the "Single-Location Data Structures" case as described in
|
||||
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
|
||||
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
|
||||
// as I get the same assembly in both cases. So I just use the more
|
||||
// conservative acquire (which will impede compiler optimizations but I don't
|
||||
// care)
|
||||
// This is now always the global interpreter if the PyObject is set.
|
||||
// Maybe we can remove this field some day...
|
||||
std::atomic<PyInterpreter*> pyobj_interpreter_;
|
||||
|
||||
// This field contains a reference to a PyObject representing this Tensor.
|
||||
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
|
||||
// PyObject for it and set this field. This field does not have to be
|
||||
// protected by an atomic as it is only allowed to be accessed when you hold
|
||||
// the GIL, or during destruction of the tensor.
|
||||
//
|
||||
// When a PyObject dies, you are obligated to clear this field
|
||||
// (otherwise, you will try to use-after-free the pyobj); this currently
|
||||
// occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
|
||||
//
|
||||
// NB: Ordinarily, this should not be a strong reference, as if the
|
||||
// PyObject owns the Tensor, this would create a reference cycle.
|
||||
// However, sometimes this ownership flips. To track who owns
|
||||
// who, this has a single pointer tag indicating whether or not the
|
||||
// C++ object owns the PyObject (the common case, zero, means PyObject
|
||||
// owns the C++ object); see _unchecked_untagged_pyobj for raw access
|
||||
// or check_pyobj for checked access. See references to PyObject
|
||||
// resurrection in torch/csrc/autograd/python_variable.cpp
|
||||
PyObject* pyobj_;
|
||||
// The PyObject representing this Tensor or nullptr. Ownership is managed
|
||||
// by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
|
||||
// reference is already dead.
|
||||
std::atomic<PyObject*> pyobj_;
|
||||
|
||||
friend class torch::utils::PyObjectPreservation;
|
||||
};
|
||||
|
||||
} // namespace c10::impl
|
||||
|
||||
@ -50,7 +50,13 @@ namespace c10 {
|
||||
/// However, you should prefer to use ArrayRef when possible, because its use
|
||||
/// of TORCH_CHECK will lead to better user-facing error messages.
|
||||
template <typename T>
|
||||
class ArrayRef final : public HeaderOnlyArrayRef<T> {
|
||||
// ArrayRef cannot be derived from. Normally, we would use `final`
|
||||
// specifier to force this constraint at compile time. However, Intel
|
||||
// compiler does not recognize ArrayRef as a class template (which is
|
||||
// required in the definition of at::TensorAccessor, for instance)
|
||||
// when `final` specifier is used. So, we cannot define ArrayRef as
|
||||
// final because of the Intel compiler issue.
|
||||
class ArrayRef : public HeaderOnlyArrayRef<T> {
|
||||
public:
|
||||
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
|
||||
/// SmallVector. As inherited constructors won't work with class template
|
||||
|
||||
@ -379,7 +379,11 @@ C10_API std::string GetExceptionString(const std::exception& e);
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
#ifdef STRIP_ERROR_MESSAGES
|
||||
#define TORCH_RETHROW(e, ...) throw
|
||||
#define TORCH_RETHROW(e, ...) \
|
||||
do { \
|
||||
(void)e; /* Suppress unused variable warning */ \
|
||||
throw; \
|
||||
} while (false)
|
||||
#else
|
||||
#define TORCH_RETHROW(e, ...) \
|
||||
do { \
|
||||
|
||||
@ -12,6 +12,10 @@ template <typename, typename...>
|
||||
class class_;
|
||||
}
|
||||
|
||||
namespace torch::utils {
|
||||
class PyObjectPreservation;
|
||||
}
|
||||
|
||||
namespace c10 {
|
||||
class intrusive_ptr_target;
|
||||
namespace raw {
|
||||
@ -33,6 +37,8 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
|
||||
constexpr uint64_t kReferenceCountOne = 1;
|
||||
constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
|
||||
constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
|
||||
// Indicates whether the object has a PyObject wrapper.
|
||||
constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);
|
||||
|
||||
template <class TTarget>
|
||||
struct intrusive_target_default_null_type final {
|
||||
@ -55,7 +61,11 @@ inline uint32_t refcount(uint64_t combined_refcount) {
|
||||
}
|
||||
|
||||
inline uint32_t weakcount(uint64_t combined_refcount) {
|
||||
return static_cast<uint32_t>(combined_refcount >> 32);
|
||||
return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
|
||||
}
|
||||
|
||||
inline bool has_pyobject(uint64_t combined_refcount) {
|
||||
return (combined_refcount & kHasPyObject) != 0;
|
||||
}
|
||||
|
||||
// The only requirement for refcount increment is that it happens-before
|
||||
@ -66,12 +76,6 @@ inline uint64_t atomic_combined_refcount_increment(
|
||||
return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
|
||||
}
|
||||
|
||||
inline uint32_t atomic_refcount_increment(
|
||||
std::atomic<uint64_t>& combined_refcount) {
|
||||
return detail::refcount(atomic_combined_refcount_increment(
|
||||
combined_refcount, kReferenceCountOne));
|
||||
}
|
||||
|
||||
inline uint32_t atomic_weakcount_increment(
|
||||
std::atomic<uint64_t>& combined_refcount) {
|
||||
return detail::weakcount(atomic_combined_refcount_increment(
|
||||
@ -99,6 +103,11 @@ inline uint32_t atomic_weakcount_decrement(
|
||||
combined_refcount, kWeakReferenceCountOne));
|
||||
}
|
||||
|
||||
template <class T, class = void>
|
||||
struct TargetTraits {
|
||||
static constexpr bool can_have_pyobject = false;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
@ -155,6 +164,23 @@ class C10_API intrusive_ptr_target {
|
||||
// we can atomically operate on both at the same time for performance
|
||||
// and defined behaviors.
|
||||
//
|
||||
// Note [PyObject preservation for Tensor and Storages]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// intrusive_ptr has special support for preserving PyObject wrappers
|
||||
// for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
|
||||
// the combined_refcount_ is used to indicate whether the object has a
|
||||
// PyObject wrapper.
|
||||
//
|
||||
// - The PyObject, if it exists, holds a strong reference to the
|
||||
// intrusive_ptr_target.
|
||||
//
|
||||
// - When the refcount goes from 1 to 2, we incref the PyObject.
|
||||
//
|
||||
// - When the refcount goes from 2 to 1, we decref the PyObject.
|
||||
//
|
||||
// In other words, the intrusive_ptr keeps the PyObject alive as long as there
|
||||
// are other C++ references to the intrusive_ptr_target.
|
||||
|
||||
mutable std::atomic<uint64_t> combined_refcount_;
|
||||
static_assert(sizeof(std::atomic<uint64_t>) == 8);
|
||||
static_assert(alignof(std::atomic<uint64_t>) == 8);
|
||||
@ -172,6 +198,8 @@ class C10_API intrusive_ptr_target {
|
||||
template <typename T>
|
||||
friend struct ExclusivelyOwnedTensorTraits;
|
||||
|
||||
friend class torch::utils::PyObjectPreservation;
|
||||
|
||||
protected:
|
||||
// protected destructor. We never want to destruct intrusive_ptr_target*
|
||||
// directly.
|
||||
@ -255,6 +283,16 @@ class C10_API intrusive_ptr_target {
|
||||
*/
|
||||
virtual void release_resources() {}
|
||||
|
||||
/**
|
||||
* These two methods are called when the refcount transitions between one
|
||||
* and two and the object has a PyObject wrapper.
|
||||
*/
|
||||
virtual void incref_pyobject() const {}
|
||||
virtual void decref_pyobject() const {}
|
||||
virtual bool try_incref_pyobject() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
|
||||
return detail::refcount(combined_refcount_.load(order));
|
||||
}
|
||||
@ -265,6 +303,19 @@ class C10_API intrusive_ptr_target {
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
template <>
|
||||
struct TargetTraits<c10::intrusive_ptr_target> {
|
||||
// A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
|
||||
// or StorageImpl, so we have to allow for PyObject support.
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <class TTarget, class NullType>
|
||||
class weak_intrusive_ptr;
|
||||
|
||||
@ -314,18 +365,34 @@ class intrusive_ptr final {
|
||||
|
||||
void retain_() {
|
||||
if (target_ != NullType::singleton()) {
|
||||
uint32_t new_refcount =
|
||||
detail::atomic_refcount_increment(target_->combined_refcount_);
|
||||
uint64_t combined = detail::atomic_combined_refcount_increment(
|
||||
target_->combined_refcount_, detail::kReferenceCountOne);
|
||||
uint32_t new_refcount = detail::refcount(combined);
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
new_refcount != 1,
|
||||
"intrusive_ptr: Cannot increase refcount after it reached zero.");
|
||||
|
||||
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
// If the refcount transitioned from 1 to 2, we need to incref the
|
||||
// PyObject. In other words, we need to ensure that the PyObject stays
|
||||
// alive now that we have a C++ reference to this object in addition to
|
||||
// the PyObject itself.
|
||||
if (C10_UNLIKELY(
|
||||
detail::has_pyobject(combined) &&
|
||||
detail::refcount(combined) == 2)) {
|
||||
target_->incref_pyobject();
|
||||
}
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
!detail::has_pyobject(combined),
|
||||
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reset_() noexcept {
|
||||
if (target_ != NullType::singleton()) {
|
||||
if (target_->combined_refcount_.load(std::memory_order_acquire) ==
|
||||
detail::kUniqueRef) {
|
||||
if (is_uniquely_owned()) {
|
||||
// Both counts are 1, so there are no weak references and
|
||||
// we are releasing the last strong reference. No other
|
||||
// threads can observe the effects of this target_ deletion
|
||||
@ -337,9 +404,10 @@ class intrusive_ptr final {
|
||||
|
||||
auto combined_refcount = detail::atomic_combined_refcount_decrement(
|
||||
target_->combined_refcount_, detail::kReferenceCountOne);
|
||||
if (detail::refcount(combined_refcount) == 0) {
|
||||
bool should_delete =
|
||||
(combined_refcount == detail::kWeakReferenceCountOne);
|
||||
uint32_t new_refcount = detail::refcount(combined_refcount);
|
||||
bool has_pyobject = detail::has_pyobject(combined_refcount);
|
||||
if (new_refcount == 0) {
|
||||
bool should_delete = detail::weakcount(combined_refcount) == 1;
|
||||
// See comment above about weakcount. As long as refcount>0,
|
||||
// weakcount is one larger than the actual number of weak references.
|
||||
// So we need to decrement it here.
|
||||
@ -356,6 +424,18 @@ class intrusive_ptr final {
|
||||
if (should_delete) {
|
||||
delete target_;
|
||||
}
|
||||
} else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
// If the refcount transitioned from 2 to 1, we need to decref the
|
||||
// PyObject. In other words, we don't want to keep the PyObject alive if
|
||||
// there are no C++ references to this object other than the PyObject
|
||||
// itself.
|
||||
if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
|
||||
target_->decref_pyobject();
|
||||
}
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
!has_pyobject,
|
||||
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -522,6 +602,16 @@ class intrusive_ptr final {
|
||||
return use_count() == 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stronger than unique() in that it must not have any weakrefs as well.
|
||||
*/
|
||||
bool is_uniquely_owned() const noexcept {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
|
||||
uint64_t combined =
|
||||
target_->combined_refcount_.load(std::memory_order_acquire);
|
||||
return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an owning (!) pointer to the underlying object and makes the
|
||||
* intrusive_ptr instance invalid. That means the refcount is not decreased.
|
||||
@ -932,6 +1022,7 @@ class weak_intrusive_ptr final {
|
||||
if (target_ == NullType::singleton()) {
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
} else {
|
||||
bool increfed = false;
|
||||
auto combined_refcount =
|
||||
target_->combined_refcount_.load(std::memory_order_relaxed);
|
||||
do {
|
||||
@ -940,12 +1031,31 @@ class weak_intrusive_ptr final {
|
||||
// Return nullptr.
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
}
|
||||
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
if (detail::has_pyobject(combined_refcount) &&
|
||||
detail::refcount(combined_refcount) == 1 && !increfed) {
|
||||
// Object has a python wrapper with no other C++ references.
|
||||
// We need to to incref the Python object before we acquire a
|
||||
// strong reference to the C++ object to avoid a situation
|
||||
// where the Python object is deallocated concurrently.
|
||||
if (!target_->try_incref_pyobject()) {
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
}
|
||||
increfed = true;
|
||||
}
|
||||
}
|
||||
} while (!target_->combined_refcount_.compare_exchange_weak(
|
||||
combined_refcount,
|
||||
combined_refcount + detail::kReferenceCountOne,
|
||||
std::memory_order_acquire,
|
||||
std::memory_order_relaxed));
|
||||
|
||||
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
if (increfed && detail::refcount(combined_refcount) != 1) {
|
||||
target_->decref_pyobject();
|
||||
}
|
||||
}
|
||||
|
||||
return intrusive_ptr<TTarget, NullType>(
|
||||
target_, raw::DontIncreaseRefcount{});
|
||||
}
|
||||
@ -1060,7 +1170,18 @@ namespace intrusive_ptr {
|
||||
// NullType::singleton to this function
|
||||
inline void incref(intrusive_ptr_target* self) {
|
||||
if (self) {
|
||||
detail::atomic_refcount_increment(self->combined_refcount_);
|
||||
uint64_t combined = detail::atomic_combined_refcount_increment(
|
||||
self->combined_refcount_, detail::kReferenceCountOne);
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
if (C10_UNLIKELY(
|
||||
detail::has_pyobject(combined) &&
|
||||
detail::refcount(combined) == 2)) {
|
||||
self->incref_pyobject();
|
||||
}
|
||||
#else
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -15,6 +15,8 @@ using namespace c10::CachingDeviceAllocator;
|
||||
// newly allocated memory with 512-byte alignment.
|
||||
constexpr size_t kDeviceAlignment = 512;
|
||||
|
||||
class XPUAllocator;
|
||||
|
||||
namespace {
|
||||
using stream_set = ska::flat_hash_set<xpu::XPUStream>;
|
||||
|
||||
@ -23,14 +25,19 @@ typedef bool (*Comparison)(const Block*, const Block*);
|
||||
bool BlockComparatorSize(const Block* a, const Block* b);
|
||||
bool BlockComparatorAddress(const Block* a, const Block* b);
|
||||
|
||||
struct PrivatePool;
|
||||
|
||||
struct BlockPool {
|
||||
BlockPool(bool small)
|
||||
BlockPool(bool small, PrivatePool* private_pool = nullptr)
|
||||
: blocks(BlockComparatorSize),
|
||||
unmapped(BlockComparatorAddress),
|
||||
is_small(small) {}
|
||||
is_small(small),
|
||||
owner_PrivatePool(private_pool) {}
|
||||
|
||||
std::set<Block*, Comparison> blocks;
|
||||
std::set<Block*, Comparison> unmapped;
|
||||
const bool is_small;
|
||||
PrivatePool* owner_PrivatePool;
|
||||
};
|
||||
|
||||
struct ExpandableSegment;
|
||||
@ -349,6 +356,43 @@ struct AllocParams {
|
||||
StatTypes stat_types = {};
|
||||
};
|
||||
|
||||
// Internal implementation that manages actual memory blocks.
|
||||
// high level MemPool interface wraps PrivatePool via MempoolId.
|
||||
struct PrivatePool {
|
||||
PrivatePool(MempoolId_t id, XPUAllocator* allocator = nullptr)
|
||||
: id(std::move(id)),
|
||||
allocator_(allocator),
|
||||
large_blocks(/*small=*/false, this),
|
||||
small_blocks(/*small=*/true, this) {}
|
||||
PrivatePool(const PrivatePool&) = delete;
|
||||
PrivatePool(PrivatePool&&) = delete;
|
||||
PrivatePool& operator=(const PrivatePool&) = delete;
|
||||
PrivatePool& operator=(PrivatePool&&) = delete;
|
||||
~PrivatePool() = default;
|
||||
|
||||
// default Mempool when no Mempool is specified
|
||||
MempoolId_t id{0, 0};
|
||||
// Number of live graphs using this pool
|
||||
int use_count{1};
|
||||
// Number of unfreed allocations made for this pool. When use_count and
|
||||
// allocation_count drop to zero, we can delete this PrivatePool from
|
||||
// graph_pools.
|
||||
int allocation_count{0};
|
||||
XPUAllocator* allocator_;
|
||||
BlockPool large_blocks;
|
||||
BlockPool small_blocks;
|
||||
|
||||
public:
|
||||
XPUAllocator* allocator() {
|
||||
return allocator_;
|
||||
}
|
||||
};
|
||||
struct MempoolIdHash {
|
||||
std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
|
||||
return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
|
||||
}
|
||||
};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
class DeviceCachingAllocator {
|
||||
@ -365,6 +409,13 @@ class DeviceCachingAllocator {
|
||||
bool set_fraction = false;
|
||||
std::vector<ExpandableSegment*> expandable_segments;
|
||||
std::vector<c10::DeviceIndex> devices_with_peer_access; // reserved
|
||||
std::vector<std::pair<MempoolId_t, std::function<bool(sycl::queue*)>>>
|
||||
captures_underway;
|
||||
ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
|
||||
graph_pools;
|
||||
// Pools no longer referenced by any graph.
|
||||
ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
|
||||
graph_pools_freeable;
|
||||
|
||||
size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
|
||||
if (!src || src->allocated || src->event_count > 0 ||
|
||||
@ -463,7 +514,22 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
BlockPool& get_pool(size_t size) {
|
||||
BlockPool& get_pool(size_t size, sycl::queue* queue) {
|
||||
if (C10_UNLIKELY(!captures_underway.empty())) {
|
||||
for (auto& entry : captures_underway) {
|
||||
// lookup for mempool id matching current capture graph
|
||||
if (entry.second(queue)) {
|
||||
auto it1 = graph_pools.find(entry.first);
|
||||
// lookup mempool
|
||||
TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
|
||||
if (size <= kSmallSize) {
|
||||
return it1->second->small_blocks;
|
||||
} else {
|
||||
return it1->second->large_blocks;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (size < kSmallSize) {
|
||||
return small_blocks;
|
||||
} else {
|
||||
@ -669,6 +735,10 @@ class DeviceCachingAllocator {
|
||||
if (!ptr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (p.pool->owner_PrivatePool) {
|
||||
p.pool->owner_PrivatePool->allocation_count++;
|
||||
}
|
||||
p.block = new Block(device, p.queue(), size, p.pool, ptr);
|
||||
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
|
||||
stats.reserved_bytes[stat_type].increase(size);
|
||||
@ -677,11 +747,14 @@ class DeviceCachingAllocator {
|
||||
return true;
|
||||
}
|
||||
|
||||
void synchronize_and_free_events() {
|
||||
void synchronize_and_free_events(PrivatePool* pool = nullptr) {
|
||||
for (auto& xe : xpu_events) {
|
||||
for (auto& e : xe.second) {
|
||||
auto event = e.first;
|
||||
auto* block = e.second;
|
||||
if (pool && block->pool->owner_PrivatePool != pool) {
|
||||
continue;
|
||||
}
|
||||
event.wait();
|
||||
block->event_count--;
|
||||
if (block->event_count == 0) {
|
||||
@ -785,6 +858,13 @@ class DeviceCachingAllocator {
|
||||
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
|
||||
stats.reserved_bytes[stat_type].decrease(unmapped.size);
|
||||
});
|
||||
|
||||
if (block->pool->owner_PrivatePool) {
|
||||
// The Freed block belonged to a XPU graph's PrivatePool.
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
block->pool->owner_PrivatePool->allocation_count > 0);
|
||||
block->pool->owner_PrivatePool->allocation_count--;
|
||||
}
|
||||
}
|
||||
|
||||
void release_blocks(BlockPool& pool) {
|
||||
@ -812,13 +892,41 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
bool release_cached_blocks() {
|
||||
synchronize_and_free_events();
|
||||
// See Note [Safe to Free Blocks on BlockPool]
|
||||
c10::xpu::syncStreamsOnDevice(device_index);
|
||||
bool release_cached_blocks(MempoolId_t mempool_id) {
|
||||
if (mempool_id.first == 0 && mempool_id.second == 0 &&
|
||||
captures_underway.empty()) {
|
||||
synchronize_and_free_events();
|
||||
// See Note [Safe to Free Blocks on BlockPool]
|
||||
c10::xpu::syncStreamsOnDevice(device_index);
|
||||
|
||||
release_blocks(large_blocks);
|
||||
release_blocks(small_blocks);
|
||||
release_blocks(large_blocks);
|
||||
release_blocks(small_blocks);
|
||||
}
|
||||
|
||||
for (auto it = graph_pools_freeable.begin();
|
||||
it != graph_pools_freeable.end();) {
|
||||
if (mempool_id.first != 0 || mempool_id.second != 0) {
|
||||
if (it->first == mempool_id) {
|
||||
// If there is an active mempool, we sync only the events
|
||||
// associated with the pool
|
||||
synchronize_and_free_events(it->second);
|
||||
} else {
|
||||
// otherwise we move on
|
||||
++it;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
|
||||
release_blocks(it->second->small_blocks);
|
||||
release_blocks(it->second->large_blocks);
|
||||
if (it->second->allocation_count == 0) {
|
||||
auto erase_count = graph_pools.erase(it->first);
|
||||
TORCH_INTERNAL_ASSERT(erase_count == 1);
|
||||
it = graph_pools_freeable.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -903,6 +1011,30 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
void create_or_incref_pool(
|
||||
MempoolId_t mempool_id,
|
||||
XPUAllocator* allocator = nullptr) {
|
||||
auto it = graph_pools.find(mempool_id);
|
||||
if (it == graph_pools.end()) {
|
||||
// mempool_id does not reference an existing pool.
|
||||
// Make a new pool for XPU graph capture or memory pool usage.
|
||||
graph_pools.emplace(
|
||||
mempool_id, std::make_unique<PrivatePool>(mempool_id, allocator));
|
||||
} else {
|
||||
// mempool_id references an existing pool, which the current XPU graph
|
||||
// capture will share.
|
||||
TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
|
||||
TORCH_INTERNAL_ASSERT(allocator == nullptr);
|
||||
it->second->use_count++;
|
||||
}
|
||||
}
|
||||
|
||||
PrivatePool* get_private_pool(MempoolId_t mempool_id) {
|
||||
auto it = graph_pools.find(mempool_id);
|
||||
TORCH_INTERNAL_ASSERT(it != graph_pools.end());
|
||||
return it->second.get();
|
||||
}
|
||||
|
||||
public:
|
||||
DeviceCachingAllocator(DeviceIndex device_index)
|
||||
: large_blocks(/* small */ false),
|
||||
@ -911,9 +1043,11 @@ class DeviceCachingAllocator {
|
||||
|
||||
Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
process_events();
|
||||
if (C10_LIKELY(captures_underway.empty())) {
|
||||
process_events();
|
||||
}
|
||||
size_t size = round_size(orig_size);
|
||||
auto& pool = get_pool(size);
|
||||
auto& pool = get_pool(size, &queue);
|
||||
const size_t alloc_size = get_allocation_size(size);
|
||||
AllocParams params(device, size, &queue, &pool, alloc_size);
|
||||
params.stat_types = get_stat_types_for_pool(pool);
|
||||
@ -923,7 +1057,7 @@ class DeviceCachingAllocator {
|
||||
// Can't reuse an existing block, try to get a new one.
|
||||
if (!block_found) {
|
||||
block_found = alloc_block(params, false) ||
|
||||
(release_cached_blocks() && alloc_block(params, true));
|
||||
(release_cached_blocks({0, 0}) && alloc_block(params, true));
|
||||
}
|
||||
if (!block_found) {
|
||||
const auto& raw_device = c10::xpu::get_raw_device(device);
|
||||
@ -1016,9 +1150,9 @@ class DeviceCachingAllocator {
|
||||
block->stream_uses.insert(stream);
|
||||
}
|
||||
|
||||
void emptyCache() {
|
||||
void emptyCache(MempoolId_t mempool_id) {
|
||||
std::scoped_lock<std::recursive_mutex> lock(mutex);
|
||||
release_cached_blocks();
|
||||
release_cached_blocks(mempool_id);
|
||||
}
|
||||
|
||||
DeviceStats getStats() {
|
||||
@ -1172,9 +1306,9 @@ class XPUAllocator : public DeviceAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
|
||||
void emptyCache(MempoolId_t mempool_id) override {
|
||||
for (auto& da : device_allocators) {
|
||||
da->emptyCache();
|
||||
da->emptyCache(mempool_id);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1290,8 +1424,8 @@ void init(DeviceIndex device_count) {
|
||||
return allocator.init(device_count);
|
||||
}
|
||||
|
||||
void emptyCache() {
|
||||
return allocator.emptyCache();
|
||||
void emptyCache(MempoolId_t mempool_id) {
|
||||
return allocator.emptyCache(mempool_id);
|
||||
}
|
||||
|
||||
void resetPeakStats(DeviceIndex device) {
|
||||
|
||||
@ -10,7 +10,7 @@ C10_XPU_API Allocator* get();
|
||||
|
||||
C10_XPU_API void init(DeviceIndex device_count);
|
||||
|
||||
C10_XPU_API void emptyCache();
|
||||
C10_XPU_API void emptyCache(MempoolId_t mempool_id = {0, 0});
|
||||
|
||||
C10_XPU_API void resetPeakStats(DeviceIndex device);
|
||||
|
||||
|
||||
@ -734,7 +734,7 @@ void PyTorchStreamWriter::setup(const string& file_name) {
|
||||
file_name,
|
||||
std::ofstream::out | std::ofstream::trunc | std::ofstream::binary
|
||||
);
|
||||
} catch (const std::ios_base::failure& e) {
|
||||
} catch (const std::ios_base::failure&) {
|
||||
#ifdef _WIN32
|
||||
// Windows have verbose error code, we prefer to use it than std errno.
|
||||
uint32_t error_code = GetLastError();
|
||||
@ -773,8 +773,20 @@ void PyTorchStreamWriter::writeRecord(
|
||||
bool compress) {
|
||||
AT_ASSERT(!finalized_);
|
||||
AT_ASSERT(!archive_name_plus_slash_.empty());
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
files_written_.count(name) == 0, "Tried to serialize file twice: ", name);
|
||||
if (files_written_.count(name) > 0) {
|
||||
// Allow multiple writes for triton binaries
|
||||
bool is_triton_extension =
|
||||
c10::ends_with(name, ".so") ||
|
||||
c10::ends_with(name, ".cubin") ||
|
||||
c10::ends_with(name, ".hsaco");
|
||||
|
||||
if (is_triton_extension) {
|
||||
LOG(WARNING) << "File '" << name << "' is being serialized multiple times";
|
||||
return;
|
||||
}
|
||||
|
||||
TORCH_INTERNAL_ASSERT(false, "Tried to serialize file twice: ", name);
|
||||
}
|
||||
if (name == kSerializationIdRecordName && serialization_id_.empty()) {
|
||||
// In case of copying records from another file, skip writing a different
|
||||
// serialization_id than the one computed in this writer.
|
||||
|
||||
@ -10,7 +10,7 @@ API. This API can roughly be divided into five parts:
|
||||
- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter.
|
||||
- **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines.
|
||||
|
||||
Combining, these building blocks form a research and
|
||||
Combined, these building blocks form a research and
|
||||
production ready C++ library for tensor computation and dynamic neural
|
||||
networks with strong emphasis on GPU acceleration as well as fast CPU
|
||||
performance. It is currently in use at Facebook in research and
|
||||
@ -76,7 +76,7 @@ C++ Frontend
|
||||
------------
|
||||
|
||||
The PyTorch C++ frontend provides a high level, pure C++ modeling interface for
|
||||
neural network and general ML(Machine Learning) research and production use cases,
|
||||
neural networks and general ML (Machine Learning) research and production use cases,
|
||||
largely following the Python API in design and provided functionality. The C++
|
||||
frontend includes the following:
|
||||
|
||||
|
||||
113
docs/source/accelerator/device.md
Normal file
113
docs/source/accelerator/device.md
Normal file
@ -0,0 +1,113 @@
|
||||
# Device Management
|
||||
|
||||
## Background
|
||||
|
||||
Device management handles basic operations like querying how many devices are available and switching between them. Accelerator backends need to wrap their device runtime's APIs and expose them to PyTorch.
|
||||
|
||||
The OpenReg implementation ([`OpenRegFunctions.h/cpp`][OpenReg Device Management]) shows how to wrap a third-party runtime. These functions are used throughout the backend - by streams, events, generators, and Python bindings.
|
||||
|
||||
## Design
|
||||
|
||||
Accelerator vendors need to implement these core functions:
|
||||
|
||||
| Function Name | Description | Application Scenarios |
|
||||
| ------------------------- | ---------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
|
||||
| `device_count()` | Query the total number of available devices in the system | - Application initialization<br>- Multi-device workload distribution<br>- Validating device indices before use |
|
||||
| `current_device()` | Get the currently active device for the calling thread | - Debugging and logging<br>- Determining tensor placement<br>- Guard implementations |
|
||||
| `set_device()` | Change the active device for subsequent operations | - Switching context between devices<br>- Initializing specific device resources<br>- Multi-GPU training loops |
|
||||
| `exchange_device()` | Atomically swap device and return the previous device | - Implementing device guards<br>- Temporarily switching device context<br>- RAII-based device management |
|
||||
| `maybe_exchange_device()` | Conditionally exchange device only if the index is valid (-1 OK) | - Safe device switching with optional indices<br>- Guard implementations with nullable device values |
|
||||
|
||||
These functions are building blocks for more complex features like streams, events, and memory management. Make sure to validate inputs and handle errors properly.
|
||||
|
||||
## Implementation
|
||||
|
||||
This section shows how to implement device management using `set_device` as an example. The implementation requires:
|
||||
1. C++ wrappers around the device runtime
|
||||
2. Python bindings to expose the C++ functions
|
||||
3. User-friendly Python APIs
|
||||
|
||||
### C++ Side
|
||||
|
||||
Wrap the device runtime's API and add error handling. The `SetDevice` function shows this pattern:
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/runtime/OpenRegFunctions.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: OPENREG SetDevice FUNCTION
|
||||
:end-before: LITERALINCLUDE END: OPENREG SetDevice FUNCTION
|
||||
:linenos:
|
||||
```
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/runtime/OpenRegFunctions.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: OPENREG set_device FUNCTION
|
||||
:end-before: LITERALINCLUDE END: OPENREG set_device FUNCTION
|
||||
:linenos:
|
||||
```
|
||||
|
||||
### Binding
|
||||
|
||||
Expose the C++ functions to Python using pybind11:
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/csrc/Module.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: MODULE SET DEVICE HELPER
|
||||
:end-before: LITERALINCLUDE END: MODULE SET DEVICE HELPER
|
||||
:linenos:
|
||||
```
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/csrc/Module.cpp
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: OPENREG MODULE METHODS
|
||||
:end-before: LITERALINCLUDE END: OPENREG MODULE METHODS
|
||||
:linenos:
|
||||
:emphasize-lines: 5
|
||||
```
|
||||
|
||||
### Python Side
|
||||
|
||||
Wrap the C++ bindings with user-friendly Python functions:
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/torch_openreg/openreg/__init__.py
|
||||
:language: python
|
||||
:start-after: LITERALINCLUDE START: PYTHON SET DEVICE FUNCTION
|
||||
:end-before: LITERALINCLUDE END: PYTHON SET DEVICE FUNCTION
|
||||
:linenos:
|
||||
```
|
||||
|
||||
Here's the complete mapping from C++ to Python:
|
||||
|
||||
| C++ Binding Function | C++ Binding API (pybind11) | Python User API | Description |
|
||||
| -------------------- | ---------------------------------------- | -------------------------------- | -------------------------------------------- |
|
||||
| `_getDeviceCount` | `torch_openreg._C._get_device_count()` | `torch.openreg.device_count()` | Returns the total number of devices |
|
||||
| `_getDevice` | `torch_openreg._C._get_device()` | `torch.openreg.current_device()` | Returns the current active device index |
|
||||
| `_setDevice` | `torch_openreg._C._set_device(idx)` | `torch.openreg.set_device(idx)` | Sets the active device |
|
||||
| `_exchangeDevice` | `torch_openreg._C._exchange_device(idx)` | N/A (internal use only) | Atomically swaps device and returns previous |
|
||||
|
||||
## Guard
|
||||
|
||||
Device guards provide automatic device switching with exception safety. They're similar to lock guards in C++ - they switch device on construction and restore it on destruction.
|
||||
|
||||
Implement `DeviceGuardImplInterface` to integrate with PyTorch's guard system:
|
||||
|
||||
```{eval-rst}
|
||||
.. literalinclude:: ../../../test/cpp_extensions/open_registration_extension/torch_openreg/csrc/runtime/OpenRegGuard.h
|
||||
:language: c++
|
||||
:start-after: LITERALINCLUDE START: OPENREG DEVICE MGMT GUARD IMPL EXAMPLE
|
||||
:end-before: LITERALINCLUDE END: OPENREG DEVICE MGMT GUARD IMPL EXAMPLE
|
||||
:linenos:
|
||||
```
|
||||
|
||||
**What needs to be implemented:**
|
||||
|
||||
1. **exchangeDevice()**: Switch to a new device and return the old one (used by guard constructors)
|
||||
2. **getDevice()**: Get the current device
|
||||
3. **setDevice()**: Set the active device
|
||||
4. **Type checking**: Validate that device type matches the backend
|
||||
|
||||
This makes the guard available to PyTorch for the `PrivateUse1` device type. Users can then use standard PyTorch device guards with the custom backend.
|
||||
|
||||
[OpenReg Device Management]: https://github.com/pytorch/pytorch/blob/main/test/cpp_extensions/open_registration_extension/torch_openreg/csrc/runtime/OpenRegFunctions.cpp "OpenReg Device Management"
|
||||
@ -42,6 +42,7 @@ Next, we will delve into each chapter of this guide. Each chapter focuses on a k
|
||||
:glob:
|
||||
:maxdepth: 1
|
||||
|
||||
device
|
||||
hooks
|
||||
autoload
|
||||
operators
|
||||
|
||||
@ -254,7 +254,7 @@ To toggle the reduced precision reduction flags in C++, one can do
|
||||
|
||||
.. _fp16accumulation:
|
||||
|
||||
Full FP16 Accmumulation in FP16 GEMMs
|
||||
Full FP16 Accumulation in FP16 GEMMs
|
||||
-------------------------------------
|
||||
|
||||
Certain GPUs have increased performance when doing _all_ FP16 GEMM accumulation
|
||||
|
||||
@ -30,5 +30,6 @@ For a quick overview of `torch.compiler`, see {ref}`torch.compiler_overview`.
|
||||
skip_guard_on_all_nn_modules_unsafe
|
||||
keep_tensor_guards_unsafe
|
||||
skip_guard_on_globals_unsafe
|
||||
skip_all_guards_unsafe
|
||||
nested_compile_region
|
||||
```
|
||||
|
||||
@ -376,3 +376,19 @@ keep-runtime-typing = true
|
||||
|
||||
[tool.codespell]
|
||||
ignore-words = "tools/linter/dictionary.txt"
|
||||
|
||||
[tool.spin]
|
||||
package = 'torch'
|
||||
|
||||
[tool.spin.commands]
|
||||
"Build" = [
|
||||
".spin/cmds.py:lint",
|
||||
".spin/cmds.py:fixlint",
|
||||
".spin/cmds.py:quicklint",
|
||||
".spin/cmds.py:quickfix",
|
||||
]
|
||||
"Regenerate" = [
|
||||
".spin/cmds.py:regenerate_version",
|
||||
".spin/cmds.py:regenerate_type_stubs",
|
||||
".spin/cmds.py:regenerate_clangtidy_files",
|
||||
]
|
||||
|
||||
@ -32,7 +32,7 @@ project-excludes = [
|
||||
"torch/utils/tensorboard/summary.py",
|
||||
# formatting issues, will turn on after adjusting where suppressions can be
|
||||
# in import statements
|
||||
"tools/flight_recorder/components/types.py",
|
||||
"torch/distributed/flight_recorder/components/types.py",
|
||||
"torch/linalg/__init__.py",
|
||||
"torch/package/importer.py",
|
||||
"torch/package/_package_pickler.py",
|
||||
|
||||
@ -14,6 +14,7 @@ lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64"
|
||||
networkx>=2.5.1
|
||||
optree>=0.13.0
|
||||
psutil
|
||||
spin
|
||||
sympy>=1.13.3
|
||||
typing-extensions>=4.13.2
|
||||
wheel
|
||||
|
||||
49
setup.py
49
setup.py
@ -1358,45 +1358,6 @@ class concat_license_files:
|
||||
|
||||
# Need to create the proper LICENSE.txt for the wheel
|
||||
class bdist_wheel(setuptools.command.bdist_wheel.bdist_wheel):
|
||||
def _wrap_headers_with_macro(self, bdist_dir: Path) -> None:
|
||||
"""Wrap all header files with #if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION).
|
||||
|
||||
Excludes:
|
||||
- torch/include/torch/headeronly/*
|
||||
- torch/include/torch/csrc/stable/*
|
||||
- torch/include/torch/csrc/inductor/aoti_torch/c/ (only shim headers)
|
||||
- torch/include/torch/csrc/inductor/aoti_torch/generated/
|
||||
"""
|
||||
header_extensions = (".h", ".hpp", ".cuh")
|
||||
header_files = [
|
||||
f for ext in header_extensions for f in bdist_dir.rglob(f"*{ext}")
|
||||
]
|
||||
|
||||
# Paths to exclude from wrapping
|
||||
exclude_dir_patterns = [
|
||||
"torch/include/torch/headeronly/",
|
||||
"torch/include/torch/csrc/stable/",
|
||||
"torch/include/torch/csrc/inductor/aoti_torch/c/",
|
||||
"torch/include/torch/csrc/inductor/aoti_torch/generated/",
|
||||
]
|
||||
|
||||
for header_file in header_files:
|
||||
rel_path = header_file.relative_to(bdist_dir).as_posix()
|
||||
|
||||
if any(rel_path.startswith(pattern) for pattern in exclude_dir_patterns):
|
||||
report(f"Skipping header: {rel_path}")
|
||||
continue
|
||||
|
||||
original_content = header_file.read_text(encoding="utf-8")
|
||||
wrapped_content = (
|
||||
"#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)\n"
|
||||
f"{original_content}"
|
||||
"\n#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)\n"
|
||||
)
|
||||
|
||||
header_file.write_text(wrapped_content, encoding="utf-8")
|
||||
report(f"Wrapped header: {rel_path}")
|
||||
|
||||
def run(self) -> None:
|
||||
with concat_license_files(include_files=True):
|
||||
super().run()
|
||||
@ -1419,14 +1380,6 @@ class bdist_wheel(setuptools.command.bdist_wheel.bdist_wheel):
|
||||
# need an __init__.py file otherwise we wouldn't have a package
|
||||
(bdist_dir / "torch" / "__init__.py").touch()
|
||||
|
||||
# Wrap all header files with TORCH_STABLE_ONLY macro
|
||||
assert self.bdist_dir is not None, "bdist_dir should be set during wheel build"
|
||||
bdist_dir = Path(self.bdist_dir)
|
||||
report(
|
||||
"-- Wrapping header files with if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)"
|
||||
)
|
||||
self._wrap_headers_with_macro(bdist_dir)
|
||||
|
||||
|
||||
class clean(Command):
|
||||
user_options: ClassVar[list[tuple[str, str | None, str]]] = []
|
||||
@ -1632,7 +1585,7 @@ def configure_extension_build() -> tuple[
|
||||
if cmake_cache_vars["USE_DISTRIBUTED"]:
|
||||
# Only enable fr_trace command if distributed is enabled
|
||||
entry_points["console_scripts"].append(
|
||||
"torchfrtrace = tools.flight_recorder.fr_trace:main",
|
||||
"torchfrtrace = torch.distributed.flight_recorder.fr_trace:main",
|
||||
)
|
||||
return ext_modules, cmdclass, packages, entry_points, extra_install_requires
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)
|
||||
# Build the cpp gtest binary containing the cpp-only tests.
|
||||
set(AOTI_ABI_CHECK_TEST_SRCS
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/main.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_accessor.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_cast.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch.cpp
|
||||
|
||||
50
test/cpp/aoti_abi_check/test_accessor.cpp
Normal file
50
test/cpp/aoti_abi_check/test_accessor.cpp
Normal file
@ -0,0 +1,50 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/headeronly/core/TensorAccessor.h>
|
||||
#include <string>
|
||||
|
||||
TEST(TestAccessor, HeaderOnlyTensorAccessor) {
|
||||
std::vector<int32_t> v = {11, 12, 13, 21, 22, 23};
|
||||
std::vector<int64_t> sizes = {2, 3};
|
||||
std::vector<int64_t> strides = {3, 1};
|
||||
|
||||
auto acc = torch::headeronly::HeaderOnlyTensorAccessor<int32_t, 2>(
|
||||
v.data(), sizes.data(), strides.data());
|
||||
EXPECT_EQ(acc[0][0], 11);
|
||||
EXPECT_EQ(acc[0][1], 12);
|
||||
EXPECT_EQ(acc[0][2], 13);
|
||||
EXPECT_EQ(acc[1][0], 21);
|
||||
EXPECT_EQ(acc[1][1], 22);
|
||||
EXPECT_EQ(acc[1][2], 23);
|
||||
}
|
||||
|
||||
TEST(TestAccessor, HeaderOnlyGenericPackedTensorAccessor) {
|
||||
std::vector<int32_t> v = {11, 12, 13, 21, 22, 23};
|
||||
std::vector<int64_t> sizes = {2, 3};
|
||||
std::vector<int64_t> strides = {3, 1};
|
||||
|
||||
auto acc =
|
||||
torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<int32_t, 2>(
|
||||
v.data(), sizes.data(), strides.data());
|
||||
EXPECT_EQ(acc[0][0], 11);
|
||||
EXPECT_EQ(acc[0][1], 12);
|
||||
EXPECT_EQ(acc[0][2], 13);
|
||||
EXPECT_EQ(acc[1][0], 21);
|
||||
EXPECT_EQ(acc[1][1], 22);
|
||||
EXPECT_EQ(acc[1][2], 23);
|
||||
|
||||
auto tacc = acc.transpose(0, 1);
|
||||
EXPECT_EQ(tacc[0][0], 11);
|
||||
EXPECT_EQ(tacc[0][1], 21);
|
||||
EXPECT_EQ(tacc[1][0], 12);
|
||||
EXPECT_EQ(tacc[1][1], 22);
|
||||
EXPECT_EQ(tacc[2][0], 13);
|
||||
EXPECT_EQ(tacc[2][1], 23);
|
||||
|
||||
try {
|
||||
acc.transpose(0, 2);
|
||||
} catch (const std::exception& e) {
|
||||
EXPECT_TRUE(
|
||||
std::string(e.what()).find("HeaderOnlyIndexBoundsCheck") !=
|
||||
std::string::npos);
|
||||
}
|
||||
}
|
||||
@ -13,6 +13,17 @@ TEST(TestScalarType, ScalarTypeToCPPTypeT) {
|
||||
#undef DEFINE_CHECK
|
||||
}
|
||||
|
||||
TEST(TestScalarType, CppTypeToScalarType) {
|
||||
using torch::headeronly::CppTypeToScalarType;
|
||||
using torch::headeronly::ScalarType;
|
||||
|
||||
#define DEFINE_CHECK(TYPE, SCALARTYPE) \
|
||||
EXPECT_EQ(CppTypeToScalarType<TYPE>::value, ScalarType::SCALARTYPE);
|
||||
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
|
||||
#undef DEFINE_CHECK
|
||||
}
|
||||
|
||||
#define DEFINE_CHECK(TYPE, SCALARTYPE) \
|
||||
{ \
|
||||
EXPECT_EQ( \
|
||||
@ -90,3 +101,14 @@ TEST(TestScalarType, toUnderlying) {
|
||||
AT_FORALL_FLOAT8_TYPES(DEFINE_CHECK);
|
||||
#undef DEFINE_CHECK
|
||||
}
|
||||
|
||||
TEST(TestScalarType, isQIntType) {
|
||||
using torch::headeronly::isQIntType;
|
||||
using torch::headeronly::ScalarType;
|
||||
#define DEFINE_CHECK(_, name) EXPECT_TRUE(isQIntType(ScalarType::name));
|
||||
AT_FORALL_QINT_TYPES(DEFINE_CHECK);
|
||||
#undef DEFINE_CHECK
|
||||
#define DEFINE_CHECK(_, name) EXPECT_FALSE(isQIntType(ScalarType::name));
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CHECK);
|
||||
#undef DEFINE_CHECK
|
||||
}
|
||||
|
||||
@ -15,7 +15,7 @@ namespace jit {
|
||||
TEST(CustomOperatorTest, InferredSchema) {
|
||||
torch::RegisterOperators reg(
|
||||
"foo::bar", [](double a, at::Tensor b) { return a + b; });
|
||||
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
|
||||
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
|
||||
ASSERT_EQ(ops.size(), 1);
|
||||
|
||||
auto& op = ops.front();
|
||||
@ -43,8 +43,7 @@ TEST(CustomOperatorTest, ExplicitSchema) {
|
||||
"foo::bar_with_schema(float a, Tensor b) -> Tensor",
|
||||
[](double a, at::Tensor b) { return a + b; });
|
||||
|
||||
auto& ops =
|
||||
getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
|
||||
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
|
||||
ASSERT_EQ(ops.size(), 1);
|
||||
|
||||
auto& op = ops.front();
|
||||
@ -77,7 +76,7 @@ TEST(CustomOperatorTest, ListParameters) {
|
||||
torch::List<c10::complex<double>> complexdoubles,
|
||||
torch::List<at::Tensor> tensors) { return floats; });
|
||||
|
||||
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
|
||||
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
|
||||
ASSERT_EQ(ops.size(), 1);
|
||||
|
||||
auto& op = ops.front();
|
||||
@ -123,7 +122,7 @@ TEST(CustomOperatorTest, ListParameters2) {
|
||||
"foo::lists2(Tensor[] tensors) -> Tensor[]",
|
||||
[](torch::List<at::Tensor> tensors) { return tensors; });
|
||||
|
||||
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
|
||||
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
|
||||
ASSERT_EQ(ops.size(), 1);
|
||||
|
||||
auto& op = ops.front();
|
||||
@ -213,7 +212,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
|
||||
},
|
||||
aliasAnalysisFromSchema())});
|
||||
|
||||
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
|
||||
auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
|
||||
ASSERT_EQ(ops.size(), 0);
|
||||
}
|
||||
|
||||
@ -232,7 +231,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
|
||||
},
|
||||
aliasAnalysisFromSchema())});
|
||||
|
||||
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
|
||||
auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
|
||||
ASSERT_EQ(ops.size(), 1);
|
||||
|
||||
auto& op = ops.front();
|
||||
|
||||
@ -0,0 +1,30 @@
|
||||
#include "kernel.h"
|
||||
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
|
||||
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
|
||||
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
|
||||
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
|
||||
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
|
||||
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
|
||||
Tensor res = new_empty(m, {m.size(0)});
|
||||
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cuda",
|
||||
AT_WRAP(([&]() {
|
||||
auto resa = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
|
||||
auto ma = Accessor_cuda<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
|
||||
auto va = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
|
||||
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t><<<1, 1, 0, 0>>>(resa, ma, va);
|
||||
})),
|
||||
AT_FLOATING_TYPES);
|
||||
return res;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
|
||||
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
|
||||
}
|
||||
@ -1,3 +1,5 @@
|
||||
#include "kernel.h"
|
||||
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <torch/csrc/stable/accelerator.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
@ -308,7 +310,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("my_amax(Tensor a) -> Tensor");
|
||||
m.def("my_amax_vec(Tensor a) -> Tensor");
|
||||
m.def("my_is_cpu(Tensor t) -> bool");
|
||||
m.def("test_default_constructor(bool undefined) -> bool");
|
||||
m.def("test_default_constructor(bool undefined) -> bool");
|
||||
}
|
||||
|
||||
bool test_default_constructor(bool defined) {
|
||||
@ -330,12 +332,47 @@ bool test_default_constructor(bool defined) {
|
||||
return out.defined();
|
||||
}
|
||||
|
||||
uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
|
||||
if (mutable_) {
|
||||
return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
|
||||
} else {
|
||||
return reinterpret_cast<uint64_t>(t.const_data_ptr());
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t get_template_any_data_ptr(Tensor t, c10::ScalarType dtype, bool mutable_) {
|
||||
#define DEFINE_CASE(T, name) \
|
||||
case torch::headeronly::ScalarType::name: { \
|
||||
if (mutable_) { \
|
||||
return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
|
||||
} else { \
|
||||
return reinterpret_cast<uint64_t>(t.const_data_ptr<T>()); \
|
||||
} \
|
||||
}
|
||||
switch (dtype) {
|
||||
// per aten/src/ATen/templates/TensorMethods.cpp:
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
|
||||
DEFINE_CASE(uint16_t, UInt16)
|
||||
DEFINE_CASE(uint32_t, UInt32)
|
||||
DEFINE_CASE(uint64_t, UInt64)
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
#undef DEFINE_CASE
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
|
||||
m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_zero_", TORCH_BOX(&my_zero_));
|
||||
m.impl("my_amax", TORCH_BOX(&my_amax));
|
||||
m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec));
|
||||
m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor));
|
||||
m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
|
||||
m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
|
||||
}
|
||||
|
||||
std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
|
||||
@ -514,6 +551,32 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
|
||||
}
|
||||
|
||||
Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
|
||||
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
|
||||
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
|
||||
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
|
||||
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
|
||||
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
|
||||
Tensor res = new_empty(m, {m.size(0)});
|
||||
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
|
||||
AT_WRAP(([&]() {
|
||||
auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
|
||||
auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
|
||||
auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
|
||||
mv_tensor_accessor_kernel<Accessor_cpu, scalar_t>(resa, ma, va);
|
||||
})),
|
||||
AT_FLOATING_TYPES);
|
||||
return res;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("mv_tensor_accessor(Tensor m, Tensor v) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
|
||||
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cpu));
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::accelerator APIs
|
||||
|
||||
#ifdef LAE_USE_CUDA
|
||||
@ -634,3 +697,38 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_parallel_for", &boxed_test_parallel_for);
|
||||
m.impl("test_get_num_threads", &boxed_test_get_num_threads);
|
||||
}
|
||||
|
||||
Tensor my_empty(
|
||||
torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
|
||||
std::optional<torch::headeronly::ScalarType> dtype,
|
||||
std::optional<torch::stable::Device> device,
|
||||
std::optional<bool> pin_memory) {
|
||||
return empty(size, dtype, device, pin_memory);
|
||||
}
|
||||
|
||||
Tensor my_flatten(Tensor t, int64_t start_dim, int64_t end_dim) {
|
||||
return flatten(t, start_dim, end_dim);
|
||||
}
|
||||
|
||||
Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
|
||||
return reshape(t, shape);
|
||||
}
|
||||
|
||||
Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
|
||||
return view(t, size);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def(
|
||||
"my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
|
||||
m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
|
||||
m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
|
||||
m.def("my_view(Tensor t, int[] size) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_empty", TORCH_BOX(&my_empty));
|
||||
m.impl("my_flatten", TORCH_BOX(&my_flatten));
|
||||
m.impl("my_reshape", TORCH_BOX(&my_reshape));
|
||||
m.impl("my_view", TORCH_BOX(&my_view));
|
||||
}
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
#include <torch/headeronly/core/Dispatch_v2.h>
|
||||
#include <torch/headeronly/core/TensorAccessor.h>
|
||||
|
||||
template <typename T, size_t N>
|
||||
using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
|
||||
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
#define MAYBE_GLOBAL __global__
|
||||
|
||||
template <typename T, size_t N>
|
||||
using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N, torch::headeronly::RestrictPtrTraits>;
|
||||
|
||||
#else
|
||||
#define MAYBE_GLOBAL
|
||||
#endif
|
||||
|
||||
template <template <typename, size_t> class Accessor, typename scalar_t>
|
||||
MAYBE_GLOBAL void mv_tensor_accessor_kernel(Accessor<scalar_t, 1> resa, Accessor<scalar_t, 2> ma, Accessor<scalar_t, 1> va) {
|
||||
for (int64_t i = 0; i < resa.size(0); i++) {
|
||||
scalar_t val = 0;
|
||||
for (int64_t j = 0; j < ma.size(1); j++) {
|
||||
val += ma[i][j] * va[j];
|
||||
}
|
||||
resa[i] = val;
|
||||
}
|
||||
}
|
||||
@ -227,6 +227,37 @@ def test_tensor_device(t):
|
||||
return torch.ops.libtorch_agnostic.test_tensor_device.default(t)
|
||||
|
||||
|
||||
def get_any_data_ptr(t, mutable) -> int:
|
||||
"""
|
||||
Return data pointer value of the tensor.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
mutable: whether data pointer qualifier is mutable or const
|
||||
|
||||
Returns: int - pointer value
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.get_any_data_ptr.default(t, mutable)
|
||||
|
||||
|
||||
def get_template_any_data_ptr(t, dtype, mutable) -> int:
|
||||
"""
|
||||
Return data pointer value of the tensor iff it has dtype.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
dtype: Input dtype
|
||||
mutable: whether data pointer qualifier is mutable or const
|
||||
|
||||
Returns: int - pointer value
|
||||
|
||||
Raises RuntimeError when t.dtype() != dtype.
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.get_template_any_data_ptr.default(
|
||||
t, dtype, mutable
|
||||
)
|
||||
|
||||
|
||||
def my_pad(t) -> Tensor:
|
||||
"""
|
||||
Pads the input tensor with hardcoded padding parameters.
|
||||
@ -487,3 +518,72 @@ def test_get_num_threads() -> int:
|
||||
Returns: int - the number of threads for the parallel backend
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_get_num_threads.default()
|
||||
|
||||
|
||||
def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
|
||||
"""
|
||||
Creates an empty tensor with the specified size, dtype, device, and pin_memory.
|
||||
|
||||
Args:
|
||||
size: list[int] - size of the tensor to create
|
||||
dtype: ScalarType or None - data type of the tensor
|
||||
device: Device or None - device on which to create the tensor
|
||||
pin_memory: bool or None - whether to use pinned memory
|
||||
|
||||
Returns: Tensor - an uninitialized tensor with the specified properties
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_empty.default(size, dtype, device, pin_memory)
|
||||
|
||||
|
||||
def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
|
||||
"""
|
||||
Flattens the input tensor from start_dim to end_dim into a single dimension.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to flatten
|
||||
start_dim: int - first dimension to flatten (default: 0)
|
||||
end_dim: int - last dimension to flatten (default: -1)
|
||||
|
||||
Returns: Tensor - flattened tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_flatten.default(t, start_dim, end_dim)
|
||||
|
||||
|
||||
def my_reshape(t, shape) -> Tensor:
|
||||
"""
|
||||
Returns a tensor with the same data but different shape.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to reshape
|
||||
shape: list[int] - new shape for the tensor
|
||||
|
||||
Returns: Tensor - reshaped tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_reshape.default(t, shape)
|
||||
|
||||
|
||||
def my_view(t, size) -> Tensor:
|
||||
"""
|
||||
Returns a new tensor with the same data as the input tensor but of a different shape.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to view
|
||||
size: list[int] - new size for the tensor
|
||||
|
||||
Returns: Tensor - tensor with new view
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_view.default(t, size)
|
||||
|
||||
|
||||
def mv_tensor_accessor(m, v) -> Tensor:
|
||||
"""
|
||||
Returns matrix-vector product.
|
||||
|
||||
Args:
|
||||
m: any 2-D Tensor with shape (N, M)
|
||||
v: any 1-D Tensor with shape (M,)
|
||||
|
||||
Returns:
|
||||
a 1-D Tensor with shape (N,)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.mv_tensor_accessor.default(m, v)
|
||||
|
||||
@ -33,16 +33,17 @@ class clean(distutils.command.clean.clean):
|
||||
|
||||
def get_extension():
|
||||
extra_compile_args = {
|
||||
"cxx": ["-fdiagnostics-color=always", "-DTORCH_STABLE_ONLY"],
|
||||
"cxx": ["-fdiagnostics-color=always"],
|
||||
}
|
||||
sources = list(CSRC_DIR.glob("**/*.cpp"))
|
||||
|
||||
extension = CppExtension
|
||||
# allow including <cuda_runtime.h>
|
||||
if torch.cuda.is_available():
|
||||
extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
|
||||
extra_compile_args["nvcc"] = ["-O2"]
|
||||
extension = CUDAExtension
|
||||
|
||||
sources = list(CSRC_DIR.glob("**/*.cpp"))
|
||||
sources.extend(CSRC_DIR.glob("**/*.cu"))
|
||||
|
||||
return [
|
||||
extension(
|
||||
|
||||
@ -14,11 +14,38 @@ from torch.testing._internal.common_utils import (
|
||||
install_cpp_extension,
|
||||
IS_WINDOWS,
|
||||
run_tests,
|
||||
skipIfTorchDynamo,
|
||||
TestCase,
|
||||
xfailIfTorchDynamo,
|
||||
)
|
||||
|
||||
|
||||
def get_supported_dtypes():
|
||||
"""Return a list of dtypes that are supported by torch stable ABI."""
|
||||
return [
|
||||
torch.int8,
|
||||
torch.int16,
|
||||
torch.int32,
|
||||
torch.int64,
|
||||
torch.uint8,
|
||||
torch.uint16,
|
||||
torch.uint32,
|
||||
torch.uint64,
|
||||
torch.bfloat16,
|
||||
torch.float16,
|
||||
torch.float32,
|
||||
torch.float64,
|
||||
torch.float8_e5m2,
|
||||
torch.float8_e4m3fn,
|
||||
torch.float8_e5m2fnuz,
|
||||
torch.float8_e4m3fnuz,
|
||||
torch.complex32,
|
||||
torch.complex64,
|
||||
torch.complex128,
|
||||
torch.bool,
|
||||
]
|
||||
|
||||
|
||||
# TODO: Fix this error in Windows:
|
||||
# LINK : error LNK2001: unresolved external symbol PyInit__C
|
||||
if not IS_WINDOWS:
|
||||
@ -274,6 +301,43 @@ if not IS_WINDOWS:
|
||||
expected0 = torch.narrow(t, dim0, start0, length0)
|
||||
self.assertEqual(out0, expected0)
|
||||
|
||||
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
|
||||
def test_get_any_data_ptr(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.empty(2, 5, device=device, dtype=torch.float32)
|
||||
expected_p = t.data_ptr()
|
||||
|
||||
for mutable in [True, False]:
|
||||
p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
|
||||
self.assertEqual(p, expected_p)
|
||||
|
||||
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
|
||||
def test_get_template_any_data_ptr(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
supported_dtypes = get_supported_dtypes()
|
||||
|
||||
for dtype in supported_dtypes:
|
||||
t = torch.empty(2, 5, device=device, dtype=dtype)
|
||||
expected_p = t.data_ptr()
|
||||
|
||||
for rdtype in supported_dtypes:
|
||||
if dtype == rdtype:
|
||||
for mutable in [True, False]:
|
||||
p = libtorch_agnostic.ops.get_template_any_data_ptr(
|
||||
t, rdtype, mutable
|
||||
)
|
||||
self.assertEqual(p, expected_p)
|
||||
else:
|
||||
for mutable in [True, False]:
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, "expected scalar type.* but found"
|
||||
):
|
||||
libtorch_agnostic.ops.get_template_any_data_ptr(
|
||||
t, rdtype, mutable
|
||||
)
|
||||
|
||||
@onlyCUDA
|
||||
@deviceCountAtLeast(2)
|
||||
def test_device_guard(self, device):
|
||||
@ -525,6 +589,113 @@ if not IS_WINDOWS:
|
||||
expected_num_threads = torch.get_num_threads()
|
||||
self.assertEqual(num_threads, expected_num_threads)
|
||||
|
||||
def test_my_empty(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
deterministic = torch.are_deterministic_algorithms_enabled()
|
||||
try:
|
||||
# set use_deterministic_algorithms to fill uninitialized memory
|
||||
torch.use_deterministic_algorithms(True)
|
||||
|
||||
size = [2, 3]
|
||||
result = libtorch_agnostic.ops.my_empty(size, None, None, None)
|
||||
expected = torch.empty(size)
|
||||
self.assertEqual(result, expected, exact_device=True)
|
||||
|
||||
result_float = libtorch_agnostic.ops.my_empty(
|
||||
size, torch.float32, None, None
|
||||
)
|
||||
expected_float = torch.empty(size, dtype=torch.float32)
|
||||
self.assertEqual(result_float, expected_float, exact_device=True)
|
||||
|
||||
result_with_device = libtorch_agnostic.ops.my_empty(
|
||||
size, torch.float64, device, None
|
||||
)
|
||||
expected_with_device = torch.empty(
|
||||
size, dtype=torch.float64, device=device
|
||||
)
|
||||
self.assertEqual(
|
||||
result_with_device, expected_with_device, exact_device=True
|
||||
)
|
||||
|
||||
if device == "cuda":
|
||||
result_pinned = libtorch_agnostic.ops.my_empty(
|
||||
size, torch.float32, "cpu", True
|
||||
)
|
||||
expected_pinned = torch.empty(
|
||||
size, dtype=torch.float32, device="cpu", pin_memory=True
|
||||
)
|
||||
self.assertEqual(result_pinned, expected_pinned)
|
||||
self.assertTrue(result_pinned.is_pinned())
|
||||
finally:
|
||||
torch.use_deterministic_algorithms(deterministic)
|
||||
|
||||
def test_my_flatten(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3, 4, device=device)
|
||||
result = libtorch_agnostic.ops.my_flatten(t)
|
||||
expected = torch.flatten(t)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
result_start = libtorch_agnostic.ops.my_flatten(t, 1)
|
||||
expected_start = torch.flatten(t, 1)
|
||||
self.assertEqual(result_start, expected_start)
|
||||
|
||||
result_range = libtorch_agnostic.ops.my_flatten(t, 2, -1)
|
||||
expected_range = torch.flatten(t, 2, -1)
|
||||
self.assertEqual(result_range, expected_range)
|
||||
|
||||
def test_my_reshape(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3, 4, device=device)
|
||||
|
||||
result = libtorch_agnostic.ops.my_reshape(t, [6, 4])
|
||||
expected = torch.reshape(t, [6, 4])
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
result_infer = libtorch_agnostic.ops.my_reshape(t, [-1, 4])
|
||||
expected_infer = torch.reshape(t, [-1, 4])
|
||||
self.assertEqual(result_infer, expected_infer)
|
||||
|
||||
result_flat = libtorch_agnostic.ops.my_reshape(t, [-1])
|
||||
expected_flat = torch.reshape(t, [-1])
|
||||
self.assertEqual(result_flat, expected_flat)
|
||||
|
||||
def test_my_view(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3, 4, device=device)
|
||||
|
||||
result = libtorch_agnostic.ops.my_view(t, [6, 4])
|
||||
expected = t.view([6, 4])
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
result_infer = libtorch_agnostic.ops.my_view(t, [-1, 4])
|
||||
expected_infer = t.view([-1, 4])
|
||||
self.assertEqual(result_infer, expected_infer)
|
||||
|
||||
result_flat = libtorch_agnostic.ops.my_view(t, [-1])
|
||||
expected_flat = t.view([-1])
|
||||
self.assertEqual(result_flat, expected_flat)
|
||||
|
||||
def test_mv_tensor_accessor(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
m = torch.rand(3, 5, device=device)
|
||||
v = torch.rand(5, device=device)
|
||||
result = libtorch_agnostic.ops.mv_tensor_accessor(m, v)
|
||||
expected = torch.mv(m, v)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# non-contiguous inputs
|
||||
m = torch.rand(3 * 2, 5 * 3, device=device)[::2, ::3]
|
||||
v = torch.rand(5 * 4, device=device)[::4]
|
||||
result = libtorch_agnostic.ops.mv_tensor_accessor(m, v)
|
||||
expected = torch.mv(m, v)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -4,17 +4,12 @@
|
||||
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
void orCheckFail(
|
||||
const char* func,
|
||||
const char* file,
|
||||
uint32_t line,
|
||||
const char* msg = "");
|
||||
void orCheckFail(const char* func, const char* file, uint32_t line, const char* msg = "");
|
||||
|
||||
#define OPENREG_CHECK(EXPR, ...) \
|
||||
do { \
|
||||
const orError_t __err = EXPR; \
|
||||
if (__err != orSuccess) { \
|
||||
orCheckFail( \
|
||||
__func__, __FILE__, static_cast<uint32_t>(__LINE__), ##__VA_ARGS__); \
|
||||
} \
|
||||
#define OPENREG_CHECK(EXPR, ...) \
|
||||
do { \
|
||||
const orError_t __err = EXPR; \
|
||||
if (C10_UNLIKELY(__err != orSuccess)) { \
|
||||
orCheckFail(__func__, __FILE__, static_cast<uint32_t>(__LINE__), ##__VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
#include <c10/util/Exception.h>
|
||||
#include <include/openreg.h>
|
||||
|
||||
#include "OpenRegException.h"
|
||||
@ -9,21 +10,22 @@ orError_t GetDeviceCount(int* dev_count) {
|
||||
return orGetDeviceCount(dev_count);
|
||||
}
|
||||
|
||||
orError_t GetDevice(c10::DeviceIndex* device) {
|
||||
orError_t GetDevice(DeviceIndex* device) {
|
||||
int tmp_device = -1;
|
||||
auto err = orGetDevice(&tmp_device);
|
||||
*device = static_cast<c10::DeviceIndex>(tmp_device);
|
||||
*device = static_cast<DeviceIndex>(tmp_device);
|
||||
return err;
|
||||
}
|
||||
|
||||
orError_t SetDevice(c10::DeviceIndex device) {
|
||||
// LITERALINCLUDE START: OPENREG SetDevice FUNCTION
|
||||
orError_t SetDevice(DeviceIndex device) {
|
||||
int cur_device = -1;
|
||||
orGetDevice(&cur_device);
|
||||
OPENREG_CHECK(orGetDevice(&cur_device));
|
||||
if (device == cur_device) {
|
||||
return orSuccess;
|
||||
}
|
||||
return orSetDevice(device);
|
||||
}
|
||||
// LITERALINCLUDE END: OPENREG SetDevice FUNCTION
|
||||
|
||||
int device_count_impl() {
|
||||
int count = 0;
|
||||
@ -31,34 +33,37 @@ int device_count_impl() {
|
||||
return count;
|
||||
}
|
||||
|
||||
OPENREG_EXPORT c10::DeviceIndex device_count() noexcept {
|
||||
OPENREG_EXPORT DeviceIndex device_count() noexcept {
|
||||
// initialize number of devices only once
|
||||
static int count = []() {
|
||||
try {
|
||||
auto result = device_count_impl();
|
||||
TORCH_CHECK(
|
||||
result <= std::numeric_limits<c10::DeviceIndex>::max(),
|
||||
result <= std::numeric_limits<DeviceIndex>::max(),
|
||||
"Too many devices, DeviceIndex overflowed");
|
||||
return result;
|
||||
} catch (const c10::Error& ex) {
|
||||
} catch (const Error& ex) {
|
||||
// We don't want to fail, but still log the warning
|
||||
// msg() returns the message without the stack trace
|
||||
TORCH_WARN("Device initialization: ", ex.msg());
|
||||
return 0;
|
||||
}
|
||||
}();
|
||||
return static_cast<c10::DeviceIndex>(count);
|
||||
return static_cast<DeviceIndex>(count);
|
||||
}
|
||||
|
||||
OPENREG_EXPORT c10::DeviceIndex current_device() {
|
||||
c10::DeviceIndex cur_device = -1;
|
||||
GetDevice(&cur_device);
|
||||
OPENREG_EXPORT DeviceIndex current_device() {
|
||||
DeviceIndex cur_device = -1;
|
||||
OPENREG_CHECK(GetDevice(&cur_device));
|
||||
return cur_device;
|
||||
}
|
||||
|
||||
OPENREG_EXPORT void set_device(c10::DeviceIndex device) {
|
||||
SetDevice(device);
|
||||
// LITERALINCLUDE START: OPENREG set_device FUNCTION
|
||||
OPENREG_EXPORT void set_device(DeviceIndex device) {
|
||||
check_device_index(device);
|
||||
OPENREG_CHECK(SetDevice(device));
|
||||
}
|
||||
// LITERALINCLUDE END: OPENREG set_device FUNCTION
|
||||
|
||||
OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device) {
|
||||
int current_device = -1;
|
||||
@ -71,4 +76,8 @@ OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device) {
|
||||
return current_device;
|
||||
}
|
||||
|
||||
OPENREG_EXPORT DeviceIndex maybe_exchange_device(DeviceIndex to_device) {
|
||||
check_device_index(to_device);
|
||||
return ExchangeDevice(to_device);
|
||||
}
|
||||
} // namespace c10::openreg
|
||||
|
||||
@ -9,10 +9,20 @@
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
OPENREG_EXPORT c10::DeviceIndex device_count() noexcept;
|
||||
OPENREG_EXPORT c10::DeviceIndex current_device();
|
||||
OPENREG_EXPORT void set_device(c10::DeviceIndex device);
|
||||
OPENREG_EXPORT DeviceIndex device_count() noexcept;
|
||||
OPENREG_EXPORT DeviceIndex current_device();
|
||||
OPENREG_EXPORT void set_device(DeviceIndex device);
|
||||
OPENREG_EXPORT DeviceIndex maybe_exchange_device(DeviceIndex to_device);
|
||||
|
||||
OPENREG_EXPORT DeviceIndex ExchangeDevice(DeviceIndex device);
|
||||
|
||||
static inline void check_device_index(int64_t device) {
|
||||
TORCH_CHECK(device >= 0 && device < c10::openreg::device_count(),
|
||||
"The device index is out of range. It must be in [0, ",
|
||||
static_cast<int>(c10::openreg::device_count()),
|
||||
"), but got ",
|
||||
static_cast<int>(device),
|
||||
".");
|
||||
}
|
||||
|
||||
} // namespace c10::openreg
|
||||
|
||||
@ -2,6 +2,8 @@
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
// LITERALINCLUDE START: OPENREG GUARD REGISTRATION
|
||||
C10_REGISTER_GUARD_IMPL(PrivateUse1, OpenRegGuardImpl);
|
||||
// LITERALINCLUDE END: OPENREG GUARD REGISTRATION
|
||||
|
||||
} // namespace c10::openreg
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
|
||||
namespace c10::openreg {
|
||||
|
||||
// LITERALINCLUDE START: OPENREG DEVICE MGMT GUARD IMPL EXAMPLE
|
||||
struct OpenRegGuardImpl final : public c10::impl::DeviceGuardImplInterface {
|
||||
static constexpr DeviceType static_type = c10::DeviceType::PrivateUse1;
|
||||
|
||||
@ -58,6 +59,7 @@ struct OpenRegGuardImpl final : public c10::impl::DeviceGuardImplInterface {
|
||||
|
||||
set_device(d.index());
|
||||
}
|
||||
// LITERALINCLUDE END: OPENREG DEVICE MGMT GUARD IMPL EXAMPLE
|
||||
|
||||
/**
|
||||
* Set the current device to c10::Device, without checking for errors
|
||||
|
||||
@ -27,6 +27,10 @@ class TestDevice(TestCase):
|
||||
self.assertEqual(torch.accelerator.current_device_index(), 1)
|
||||
self.assertEqual(torch.accelerator.current_device_index(), device)
|
||||
|
||||
def test_invalid_device_index(self):
|
||||
with self.assertRaisesRegex(RuntimeError, "The device index is out of range"):
|
||||
torch.accelerator.set_device_index(2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -34,18 +34,21 @@ static PyObject* _getDefaultGenerator(PyObject* self, PyObject* arg) {
|
||||
}
|
||||
// LITERALINCLUDE END: OPENREG GET DEFAULT GENERATOR
|
||||
|
||||
// LITERALINCLUDE START: MODULE SET DEVICE HELPER
|
||||
|
||||
PyObject* _setDevice(PyObject* self, PyObject* arg) {
|
||||
HANDLE_TH_ERRORS
|
||||
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to setDevice");
|
||||
auto device = THPUtils_unpackLong(arg);
|
||||
|
||||
auto device = THPUtils_unpackDeviceIndex(arg);
|
||||
torch::utils::device_lazy_init(at::kPrivateUse1);
|
||||
c10::openreg::set_device(static_cast<c10::DeviceIndex>(device));
|
||||
c10::openreg::set_device(device);
|
||||
|
||||
Py_RETURN_NONE;
|
||||
END_HANDLE_TH_ERRORS
|
||||
}
|
||||
|
||||
// LITERALINCLUDE END: MODULE SET DEVICE HELPER
|
||||
|
||||
PyObject* _exchangeDevice(PyObject* self, PyObject* arg) {
|
||||
HANDLE_TH_ERRORS
|
||||
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchangeDevice");
|
||||
|
||||
@ -41,8 +41,13 @@ def current_device():
|
||||
return torch_openreg._C._get_device()
|
||||
|
||||
|
||||
# LITERALINCLUDE START: PYTHON SET DEVICE FUNCTION
|
||||
def set_device(device) -> None:
|
||||
return torch_openreg._C._set_device(device)
|
||||
if device >= 0:
|
||||
torch_openreg._C._set_device(device)
|
||||
|
||||
|
||||
# LITERALINCLUDE END: PYTHON SET DEVICE FUNCTION
|
||||
|
||||
|
||||
def init():
|
||||
|
||||
67
test/cpp_extensions/torch_stable_test_extension/setup.py
Normal file
67
test/cpp_extensions/torch_stable_test_extension/setup.py
Normal file
@ -0,0 +1,67 @@
|
||||
import distutils.command.clean
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
from torch.utils.cpp_extension import BuildExtension, CppExtension
|
||||
|
||||
|
||||
ROOT_DIR = Path(__file__).parent
|
||||
CSRC_DIR = ROOT_DIR / "torch_stable_test" / "csrc"
|
||||
|
||||
|
||||
class clean(distutils.command.clean.clean):
|
||||
def run(self):
|
||||
# Run default behavior first
|
||||
distutils.command.clean.clean.run(self)
|
||||
|
||||
# Remove extension
|
||||
for path in (ROOT_DIR / "torch_stable_test").glob("**/*.so"):
|
||||
path.unlink()
|
||||
# Remove build and dist and egg-info directories
|
||||
dirs = [
|
||||
ROOT_DIR / "build",
|
||||
ROOT_DIR / "dist",
|
||||
ROOT_DIR / "torch_stable_test.egg-info",
|
||||
]
|
||||
for path in dirs:
|
||||
if path.exists():
|
||||
shutil.rmtree(str(path), ignore_errors=True)
|
||||
|
||||
|
||||
def get_extension():
|
||||
extra_compile_args = {
|
||||
"cxx": ["-fdiagnostics-color=always", "-DTORCH_STABLE_ONLY"],
|
||||
}
|
||||
|
||||
sources = list(CSRC_DIR.glob("**/*.cpp"))
|
||||
|
||||
return [
|
||||
CppExtension(
|
||||
"torch_stable_test._C",
|
||||
sources=sorted(str(s) for s in sources),
|
||||
py_limited_api=True,
|
||||
extra_compile_args=extra_compile_args,
|
||||
extra_link_args=[],
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
setup(
|
||||
name="torch_stable_test",
|
||||
version="0.0",
|
||||
author="PyTorch Core Team",
|
||||
description="Test extension to verify TORCH_STABLE_ONLY flag",
|
||||
packages=find_packages(exclude=("test",)),
|
||||
package_data={"torch_stable_test": ["*.dll", "*.dylib", "*.so"]},
|
||||
install_requires=[
|
||||
"torch",
|
||||
],
|
||||
ext_modules=get_extension(),
|
||||
cmdclass={
|
||||
"build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
|
||||
"clean": clean,
|
||||
},
|
||||
options={"bdist_wheel": {"py_limited_api": "cp39"}},
|
||||
)
|
||||
@ -0,0 +1 @@
|
||||
#include <ATen/core/TensorBase.h> // This should trigger the TORCH_STABLE_ONLY error
|
||||
@ -0,0 +1,22 @@
|
||||
# Owner(s): ["module: cpp"]
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from torch.testing._internal.common_utils import (
|
||||
install_cpp_extension,
|
||||
IS_WINDOWS,
|
||||
run_tests,
|
||||
TestCase,
|
||||
)
|
||||
|
||||
|
||||
if not IS_WINDOWS:
|
||||
|
||||
class TestTorchStable(TestCase):
|
||||
def test_setup_fails(self):
|
||||
with self.assertRaisesRegex(RuntimeError, "build failed for cpp extension"):
|
||||
install_cpp_extension(extension_root=Path(__file__).parent.parent)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
@ -22,7 +22,7 @@ void check_all_parameters(
|
||||
|
||||
template<class Result, class... Args>
|
||||
Result get_operator_from_registry_and_execute(const char* op_name, Args&&... args) {
|
||||
auto& ops = torch::jit::getAllOperatorsFor(
|
||||
auto ops = torch::jit::getAllOperatorsFor(
|
||||
torch::jit::Symbol::fromQualString(op_name));
|
||||
TORCH_INTERNAL_ASSERT(ops.size() == 1);
|
||||
|
||||
|
||||
@ -2,23 +2,16 @@
|
||||
|
||||
import copy
|
||||
import math
|
||||
import pathlib
|
||||
import sys
|
||||
from typing import Any
|
||||
|
||||
|
||||
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent
|
||||
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
from tools.flight_recorder.components.builder import build_db
|
||||
from tools.flight_recorder.components.config_manager import JobConfig
|
||||
from tools.flight_recorder.components.types import COLLECTIVES, MatchInfo, MatchState
|
||||
from tools.flight_recorder.components.utils import match_one_event
|
||||
|
||||
|
||||
# Make sure to remove REPO_ROOT after import is done
|
||||
sys.path.remove(str(REPO_ROOT))
|
||||
|
||||
from torch.distributed.flight_recorder.components.builder import build_db
|
||||
from torch.distributed.flight_recorder.components.config_manager import JobConfig
|
||||
from torch.distributed.flight_recorder.components.types import (
|
||||
COLLECTIVES,
|
||||
MatchInfo,
|
||||
MatchState,
|
||||
)
|
||||
from torch.distributed.flight_recorder.components.utils import match_one_event
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
|
||||
|
||||
|
||||
@ -331,6 +331,25 @@ class DistElementwiseOpsTest(DTensorOpTestBase):
|
||||
self.assertEqual(z.placements, (Replicate(),))
|
||||
self.assertEqual(z.to_local(), input)
|
||||
|
||||
def test_inplace_op_partial_to_replicate(self):
|
||||
# test that in-place operations that require redistribution raise an error
|
||||
# to preserve aliasing semantics (issue #163374)
|
||||
device_mesh = self.build_device_mesh()
|
||||
|
||||
input_tensor = torch.tensor(64.0, device=self.device_type)
|
||||
partial_dt = DTensor.from_local(
|
||||
input_tensor, device_mesh, placements=(Partial(),)
|
||||
)
|
||||
|
||||
self.assertTrue(partial_dt.placements[0].is_partial())
|
||||
|
||||
# Inplace ops that require placement changes (Partial -> Replicate) should error
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
"in-place operations that require placement changes are not supported",
|
||||
):
|
||||
partial_dt.clamp_(max=10)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -40,6 +40,7 @@ from torch.testing._internal.common_distributed import (
|
||||
DynamoDistributedSingleProcTestCase,
|
||||
MultiProcessTestCase,
|
||||
requires_accelerator_dist_backend,
|
||||
requires_gloo,
|
||||
skip_if_lt_x_gpu,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
@ -1773,16 +1774,10 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
|
||||
inputs = [x, w, ar_0, ar_1]
|
||||
f(*inputs, **self.get_world_trs())
|
||||
|
||||
def _pass(g):
|
||||
from torch._inductor.fx_passes.bucketing import bucket_all_reduce
|
||||
|
||||
bucket_all_reduce(g.owning_module, lambda _: 2000)
|
||||
|
||||
torch._inductor.config.post_grad_custom_post_pass = _pass
|
||||
|
||||
with torch._inductor.config.patch(
|
||||
{
|
||||
"reorder_for_compute_comm_overlap": False,
|
||||
"bucket_all_reduces_fx": bucket_mode,
|
||||
}
|
||||
):
|
||||
compiled = torch.compile(f)
|
||||
@ -2234,6 +2229,50 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
|
||||
)
|
||||
assert est_ms_nccl > 0
|
||||
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@requires_gloo()
|
||||
def test_regression_use_nccl_estimate_with_gloo(self):
|
||||
# Test checks that using nccl estimator option does not hard fail
|
||||
# with backends that does not support runtime estimations, e.g. gloo
|
||||
store = c10d.FileStore(self.file_name, self.world_size)
|
||||
c10d.init_process_group(
|
||||
backend="gloo", store=store, rank=self.rank, world_size=self.world_size
|
||||
)
|
||||
group = c10d.distributed_c10d._get_default_group()
|
||||
group_name = "default"
|
||||
torch._C._distributed_c10d._register_process_group(
|
||||
group_name, torch.distributed.group.WORLD
|
||||
)
|
||||
group_size = group.size()
|
||||
|
||||
def func(inp, group_size, group_name):
|
||||
ag_0_out = torch.ops._c10d_functional.all_gather_into_tensor(
|
||||
inp, group_size, group_name
|
||||
)
|
||||
ag_0_wait = torch.ops.c10d_functional.wait_tensor(ag_0_out)
|
||||
ag_1_out = torch.ops._c10d_functional.all_gather_into_tensor(
|
||||
ag_0_wait, group_size, group_name
|
||||
)
|
||||
ag_1_wait = torch.ops.c10d_functional.wait_tensor(ag_1_out)
|
||||
return ag_1_wait
|
||||
|
||||
gm = make_fx(func)(torch.ones(4, 4), group_size, group_name)
|
||||
g = gm.graph
|
||||
for n in g.nodes:
|
||||
if is_all_gather_into_tensor(n):
|
||||
from torch._inductor.comm_analysis import (
|
||||
estimate_nccl_collective_runtime_from_fx_node,
|
||||
)
|
||||
|
||||
est_ms = estimate_nccl_collective_runtime_from_fx_node(
|
||||
n, use_nccl_estimator=False
|
||||
)
|
||||
assert est_ms > 0
|
||||
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
|
||||
n, use_nccl_estimator=True
|
||||
)
|
||||
assert est_ms_nccl > 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from torch._dynamo.test_case import run_tests
|
||||
|
||||
@ -208,6 +208,21 @@ class NVSHMEMSymmetricMemoryTest(MultiProcContinuousTest):
|
||||
)
|
||||
self.assertEqual(y, expected)
|
||||
|
||||
def test_get_remote_tensors(self) -> None:
|
||||
"""
|
||||
Get all remote tensors
|
||||
"""
|
||||
self._init_device()
|
||||
group_name = dist.group.WORLD.group_name
|
||||
symm_mem.enable_symm_mem_for_group(group_name)
|
||||
|
||||
my_tensor = symm_mem.empty(1, device=self.device).fill_(self.rank)
|
||||
remote_tensors = torch.ops.symm_mem.get_remote_tensors(my_tensor, group_name)
|
||||
dist.barrier()
|
||||
|
||||
for peer, tensor in enumerate(remote_tensors):
|
||||
self.assertEqual(tensor, peer)
|
||||
|
||||
@skipIfRocm
|
||||
def test_nvshmem_put(self) -> None:
|
||||
self._init_device()
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
# Owner(s): ["module: dynamo"]
|
||||
|
||||
import copy
|
||||
import functools
|
||||
import inspect
|
||||
import os
|
||||
import pickle
|
||||
import unittest
|
||||
from contextlib import contextmanager
|
||||
from unittest.mock import patch
|
||||
|
||||
@ -13,17 +15,22 @@ import torch._inductor.config
|
||||
import torch._inductor.test_case
|
||||
import torch.onnx.operators
|
||||
import torch.utils.cpp_extension
|
||||
from torch._dynamo.aot_compile import ModelInput, SerializableCallable
|
||||
from torch._dynamo.aot_compile import AOTCompiledModel, ModelInput, SerializableCallable
|
||||
from torch._dynamo.exc import PackageError, Unsupported
|
||||
from torch._dynamo.package import DynamoCache
|
||||
from torch._dynamo.precompile_context import PrecompileContext
|
||||
from torch._inductor.runtime.runtime_utils import cache_dir
|
||||
from torch.fx._graph_pickler import GraphPickler
|
||||
from torch.testing._internal.common_utils import instantiate_parametrized_tests
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
TEST_CUDA,
|
||||
)
|
||||
|
||||
|
||||
MY_LAMBDA = lambda x: x + 1 # noqa: E731
|
||||
|
||||
EPS = torch.tensor(1e-7)
|
||||
|
||||
|
||||
class CustomCompiledFunction(torch._dynamo.aot_compile.SerializableCallable):
|
||||
def __init__(self, gm: torch.fx.GraphModule, example_inputs: list[torch.Tensor]):
|
||||
@ -582,6 +589,18 @@ from user code:
|
||||
actual = compiled_fn(fn, *inputs)
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
def test_aot_compile_with_global_tensor(self):
|
||||
def fn(x, y):
|
||||
return x + y + EPS
|
||||
|
||||
def make_inputs():
|
||||
return (torch.randn(3, 4), torch.randn(3, 4))
|
||||
|
||||
compiled_fn = torch.compile(fn, fullgraph=True).aot_compile((make_inputs(), {}))
|
||||
|
||||
test_inputs = make_inputs()
|
||||
self.assertEqual(compiled_fn(*test_inputs), fn(*test_inputs))
|
||||
|
||||
def test_aot_compile_with_default_args(self):
|
||||
def fn(x, y=1):
|
||||
return x + x
|
||||
@ -599,6 +618,92 @@ from user code:
|
||||
actual = compiled_fn(*inputs)
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "requires cuda")
|
||||
def test_aot_compile_with_aoti(self):
|
||||
with torch.device("cuda"):
|
||||
from torch._dynamo.hooks import Hooks
|
||||
|
||||
def fn(x, y):
|
||||
return x + y
|
||||
|
||||
def make_inputs():
|
||||
return (torch.randn(3, 4), torch.randn(3, 4))
|
||||
|
||||
compiled_fn = torch._dynamo.aot_compile.aot_compile_fullgraph(
|
||||
fn,
|
||||
(make_inputs(), {}),
|
||||
Hooks(),
|
||||
torch._TorchCompileAOTInductorWrapper(None, None, None),
|
||||
)
|
||||
|
||||
test_inputs = make_inputs()
|
||||
expected = fn(*test_inputs)
|
||||
actual = compiled_fn(*test_inputs)
|
||||
self.assertEqual(expected, actual)
|
||||
compiled_fn.save_compiled_function(self.path())
|
||||
with open(self.path(), "rb") as f:
|
||||
compiled_fn = torch.compiler.load_compiled_function(f)
|
||||
actual = compiled_fn(*test_inputs)
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "requires cuda")
|
||||
def test_aot_compile_with_aoti_module(self):
|
||||
with torch.device("cuda"):
|
||||
from torch._dynamo.hooks import Hooks
|
||||
|
||||
mod = SimpleLinearModule()
|
||||
|
||||
def make_inputs():
|
||||
return (torch.randn(4, 3),)
|
||||
|
||||
compiled_mod = torch._dynamo.aot_compile.aot_compile_module(
|
||||
mod,
|
||||
[ModelInput(make_inputs(), {}, [])],
|
||||
Hooks(),
|
||||
torch._TorchCompileAOTInductorWrapper(None, None, None),
|
||||
)
|
||||
|
||||
def get_grads(m: torch.nn.Module):
|
||||
return {name: p.grad for name, p in m.named_parameters()}
|
||||
|
||||
original_mod = copy.deepcopy(mod)
|
||||
test_inputs = make_inputs()
|
||||
expected = mod(*test_inputs)
|
||||
expected.sum().backward()
|
||||
expected_grads = get_grads(mod)
|
||||
|
||||
actual = compiled_mod(*test_inputs)
|
||||
self.assertEqual(expected, actual)
|
||||
serialized = compiled_mod.serialize()
|
||||
compiled_fn = AOTCompiledModel.deserialize(original_mod, serialized)
|
||||
actual = compiled_fn(*test_inputs)
|
||||
actual.sum().backward()
|
||||
self.assertEqual(get_grads(original_mod), expected_grads)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "requires cuda")
|
||||
def test_aot_compile_with_aoti_torch_compile(self):
|
||||
with torch.device("cuda"):
|
||||
|
||||
def fn(x, y):
|
||||
return x + y
|
||||
|
||||
def make_inputs():
|
||||
return (torch.randn(3, 4), torch.randn(3, 4))
|
||||
|
||||
compiled_fn = torch.compile(
|
||||
fn, fullgraph=True, options={"use_aoti": True}
|
||||
).aot_compile((make_inputs(), {}))
|
||||
test_inputs = make_inputs()
|
||||
expected = fn(*test_inputs)
|
||||
actual = compiled_fn(*test_inputs)
|
||||
self.assertEqual(expected, actual)
|
||||
compiled_fn.save_compiled_function(self.path())
|
||||
with open(self.path(), "rb") as f:
|
||||
compiled_fn = torch.compiler.load_compiled_function(f)
|
||||
actual = compiled_fn(*test_inputs)
|
||||
self.assertEqual(compiled_fn._artifacts.backend_name, "aotinductor")
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from torch._dynamo.test_case import run_tests
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user