mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-12 06:44:55 +08:00
Compare commits
1 Commits
ciflow/h10
...
docs
| Author | SHA1 | Date | |
|---|---|---|---|
| c93b821875 |
@ -260,8 +260,8 @@ case "$tag" in
|
||||
HALIDE=yes
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-py3.12-pallas)
|
||||
CUDA_VERSION=12.8.1
|
||||
pytorch-linux-jammy-cuda13.0-py3.12-pallas)
|
||||
CUDA_VERSION=13.0.0
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
PALLAS=yes
|
||||
|
||||
@ -8,11 +8,9 @@ from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
try:
|
||||
from collections.abc import Callable # Python 3.11+
|
||||
from typing import Any, Required, TypedDict
|
||||
from typing import Any, Callable, Required, TypedDict # Python 3.11+
|
||||
except ImportError:
|
||||
from collections.abc import Callable
|
||||
from typing import Any, TypedDict
|
||||
from typing import Any, Callable, TypedDict
|
||||
|
||||
from typing_extensions import Required # Fallback for Python <3.11
|
||||
|
||||
|
||||
@ -168,16 +168,14 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/umf/latest/env/vars.sh
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/pti/latest/env/vars.sh
|
||||
# Enable XCCL build
|
||||
export USE_XCCL=1
|
||||
export USE_MPI=0
|
||||
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
|
||||
export USE_KINETO=0
|
||||
export TORCH_XPU_ARCH_LIST=pvc
|
||||
fi
|
||||
|
||||
|
||||
@ -208,8 +208,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/intel/oneapi/pti/latest/env/vars.sh
|
||||
# Check XPU status before testing
|
||||
timeout 30 xpu-smi discovery || true
|
||||
fi
|
||||
@ -339,7 +337,7 @@ test_python() {
|
||||
|
||||
test_python_smoke() {
|
||||
# Smoke tests for H100/B200
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
|
||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
||||
ccb801b88af136454798b945175c4c87e636ac33
|
||||
ca2212438fdd8ce29b66999ed70ed54b0f9372d1
|
||||
|
||||
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
||||
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
|
||||
c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
|
||||
|
||||
13
.github/labeler.yml
vendored
13
.github/labeler.yml
vendored
@ -165,16 +165,3 @@
|
||||
- torch/_inductor/kernel/mm.py
|
||||
- test/inductor/test_max_autotune.py
|
||||
- third_party/fbgemm
|
||||
|
||||
"ciflow/mps":
|
||||
- aten/src/ATen/mps/**
|
||||
- aten/src/ATen/native/mps/**
|
||||
- torch/_inductor/codegen/mps.py
|
||||
- test/test_mps.py
|
||||
- test/inductor/test_mps_basic.py
|
||||
|
||||
"ciflow/h100-symm-mem":
|
||||
- torch/csrc/distributed/c10d/symm_mem/**
|
||||
- torch/distributed/_symmetric_memory/**
|
||||
- test/distributed/**/*mem*
|
||||
- test/distributed/**/*mem*/**
|
||||
|
||||
3
.github/scripts/delete_old_branches.py
vendored
3
.github/scripts/delete_old_branches.py
vendored
@ -1,11 +1,10 @@
|
||||
# Delete old branches
|
||||
import os
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Callable
|
||||
|
||||
from github_utils import gh_fetch_json_dict, gh_graphql
|
||||
from gitutils import GitRepo
|
||||
|
||||
3
.github/scripts/filter_test_configs.py
vendored
3
.github/scripts/filter_test_configs.py
vendored
@ -8,11 +8,10 @@ import re
|
||||
import subprocess
|
||||
import sys
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from enum import Enum
|
||||
from functools import cache
|
||||
from logging import info
|
||||
from typing import Any, Optional
|
||||
from typing import Any, Callable, Optional
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
import yaml
|
||||
|
||||
3
.github/scripts/get_workflow_job_id.py
vendored
3
.github/scripts/get_workflow_job_id.py
vendored
@ -11,8 +11,7 @@ import sys
|
||||
import time
|
||||
import urllib
|
||||
import urllib.parse
|
||||
from collections.abc import Callable
|
||||
from typing import Any, Optional
|
||||
from typing import Any, Callable, Optional
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
|
||||
|
||||
3
.github/scripts/github_utils.py
vendored
3
.github/scripts/github_utils.py
vendored
@ -3,9 +3,8 @@
|
||||
import json
|
||||
import os
|
||||
import warnings
|
||||
from collections.abc import Callable
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, cast, Optional, Union
|
||||
from typing import Any, Callable, cast, Optional, Union
|
||||
from urllib.error import HTTPError
|
||||
from urllib.parse import quote
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
4
.github/scripts/gitutils.py
vendored
4
.github/scripts/gitutils.py
vendored
@ -4,10 +4,10 @@ import os
|
||||
import re
|
||||
import tempfile
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Iterator
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
from typing import Any, cast, Optional, TypeVar, Union
|
||||
from typing import Any, Callable, cast, Optional, TypeVar, Union
|
||||
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
3
.github/scripts/lintrunner.sh
vendored
3
.github/scripts/lintrunner.sh
vendored
@ -34,9 +34,6 @@ python3 torch/utils/data/datapipes/gen_pyi.py
|
||||
# Also check generated pyi files
|
||||
find torch -name '*.pyi' -exec git add --force -- "{}" +
|
||||
|
||||
# Print current environment
|
||||
python3 -m pip freeze
|
||||
|
||||
RC=0
|
||||
# Run lintrunner on all files
|
||||
if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
|
||||
|
||||
4
.github/scripts/trymerge.py
vendored
4
.github/scripts/trymerge.py
vendored
@ -17,12 +17,12 @@ import re
|
||||
import time
|
||||
import urllib.parse
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable, Iterable
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
from functools import cache
|
||||
from pathlib import Path
|
||||
from re import Pattern
|
||||
from typing import Any, cast, NamedTuple, Optional
|
||||
from typing import Any, Callable, cast, NamedTuple, Optional
|
||||
from warnings import warn
|
||||
|
||||
import yaml
|
||||
|
||||
1
.github/workflows/b200-distributed.yml
vendored
1
.github/workflows/b200-distributed.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '10.0'
|
||||
|
||||
1
.github/workflows/b200-symm-mem.yml
vendored
1
.github/workflows/b200-symm-mem.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '10.0'
|
||||
|
||||
2
.github/workflows/docker-builds.yml
vendored
2
.github/workflows/docker-builds.yml
vendored
@ -67,7 +67,7 @@ jobs:
|
||||
pytorch-linux-jammy-py3.10-gcc11,
|
||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
|
||||
pytorch-linux-jammy-py3.12-halide,
|
||||
pytorch-linux-jammy-cuda12.8-py3.12-pallas,
|
||||
pytorch-linux-jammy-cuda13.0-py3.12-pallas,
|
||||
pytorch-linux-jammy-xpu-n-1-py3,
|
||||
pytorch-linux-noble-xpu-n-py3,
|
||||
pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
|
||||
|
||||
1
.github/workflows/h100-distributed.yml
vendored
1
.github/workflows/h100-distributed.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: "linux.c7i.12xlarge"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
|
||||
2
.github/workflows/inductor-rocm-mi200.yml
vendored
2
.github/workflows/inductor-rocm-mi200.yml
vendored
@ -1,4 +1,4 @@
|
||||
name: inductor-rocm-mi200
|
||||
name: inductor-rocm
|
||||
|
||||
on:
|
||||
schedule:
|
||||
|
||||
6
.github/workflows/inductor-unittest.yml
vendored
6
.github/workflows/inductor-unittest.yml
vendored
@ -86,14 +86,14 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
|
||||
build-environment: linux-jammy-py3.12-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-py3.12-pallas
|
||||
cuda-arch-list: '8.9'
|
||||
runner: linux.8xlarge.memory
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
|
||||
{ config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
8
.github/workflows/nightly.yml
vendored
8
.github/workflows/nightly.yml
vendored
@ -5,11 +5,9 @@ on:
|
||||
- cron: 0 0 * * *
|
||||
push:
|
||||
tags:
|
||||
# NOTE: Doc build pipelines should only get triggered on:
|
||||
# Major or minor release candidates builds
|
||||
- v[0-9]+.[0-9]+.0+-rc[0-9]+
|
||||
# Final RC for major, minor and patch releases
|
||||
- v[0-9]+.[0-9]+.[0-9]+
|
||||
# NOTE: Doc build pipelines should only get triggered on release candidate builds
|
||||
# Release candidate tags look like: v1.11.0-rc1
|
||||
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
|
||||
- ciflow/nightly/*
|
||||
workflow_dispatch:
|
||||
|
||||
|
||||
2
.github/workflows/rocm-mi200.yml
vendored
2
.github/workflows/rocm-mi200.yml
vendored
@ -1,4 +1,4 @@
|
||||
name: rocm-mi200
|
||||
name: rocm
|
||||
|
||||
on:
|
||||
push:
|
||||
|
||||
3
.github/workflows/test-b200.yml
vendored
3
.github/workflows/test-b200.yml
vendored
@ -52,6 +52,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '10.0'
|
||||
@ -72,4 +73,4 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
|
||||
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
secrets: inherit
|
||||
secrets: inherit
|
||||
1
.github/workflows/test-h100.yml
vendored
1
.github/workflows/test-h100.yml
vendored
@ -41,6 +41,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -127,7 +127,6 @@ torch/test/
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
|
||||
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
|
||||
torch/version.py
|
||||
torch/_inductor/kernel/vendored_templates/*
|
||||
minifier_launcher.py
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
|
||||
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
|
||||
|
||||
@ -186,8 +186,6 @@ include_patterns = [
|
||||
'aten/src/ATen/native/nested/cuda/*.h',
|
||||
'aten/src/ATen/native/nested/*.cpp',
|
||||
'aten/src/ATen/native/nested/*.h',
|
||||
'aten/src/ATen/xpu/**/*.h',
|
||||
'aten/src/ATen/xpu/**/*.cpp',
|
||||
'c10/**/*.cpp',
|
||||
'c10/**/*.h',
|
||||
'torch/*.h',
|
||||
|
||||
@ -736,44 +736,6 @@ if(NOT DEFINED USE_BLAS)
|
||||
set(USE_BLAS ON)
|
||||
endif()
|
||||
|
||||
# Prioritized Text Linker Optimization
|
||||
if(USE_PRIORITIZED_TEXT_FOR_LD)
|
||||
|
||||
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
|
||||
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
|
||||
|
||||
execute_process(
|
||||
COMMAND ${Python_EXECUTABLE}
|
||||
${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py
|
||||
--filein "${LINKER_SCRIPT_FILE_IN}"
|
||||
--fout "${LINKER_SCRIPT_FILE_OUT}"
|
||||
RESULT_VARIABLE _gen_result
|
||||
OUTPUT_VARIABLE _gen_output
|
||||
ERROR_VARIABLE _gen_error
|
||||
)
|
||||
|
||||
if(NOT _gen_result EQUAL 0)
|
||||
message(FATAL_ERROR
|
||||
"Failed to generate linker script:\n${_gen_output}\n${_gen_error}")
|
||||
endif()
|
||||
|
||||
append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS)
|
||||
append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS)
|
||||
append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS)
|
||||
append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS)
|
||||
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
|
||||
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
|
||||
|
||||
else()
|
||||
if(LINUX AND CPU_AARCH64)
|
||||
message(WARNING [[
|
||||
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
|
||||
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
|
||||
]])
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Build libtorch mobile library, which contains ATen/TH ops and native support
|
||||
# for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
|
||||
if(INTERN_BUILD_MOBILE)
|
||||
@ -1440,6 +1402,9 @@ if(BUILD_JNI)
|
||||
add_subdirectory(android/pytorch_android)
|
||||
endif()
|
||||
|
||||
include(cmake/Summary.cmake)
|
||||
caffe2_print_configuration_summary()
|
||||
|
||||
# Parse custom debug info
|
||||
if(DEFINED USE_CUSTOM_DEBINFO)
|
||||
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
|
||||
@ -1479,5 +1444,56 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
|
||||
DESTINATION "${CMAKE_INSTALL_BINDIR}")
|
||||
endif()
|
||||
|
||||
include(cmake/Summary.cmake)
|
||||
caffe2_print_configuration_summary()
|
||||
if(USE_PRIORITIZED_TEXT_FOR_LD)
|
||||
add_compile_options(
|
||||
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
|
||||
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
|
||||
)
|
||||
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
|
||||
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
|
||||
COMMENT "Generating prioritized text linker files"
|
||||
VERBATIM
|
||||
)
|
||||
|
||||
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
|
||||
|
||||
if(BUILD_PYTHON)
|
||||
set(LINKER_OPT_TARGETS torch_python)
|
||||
endif()
|
||||
|
||||
if(NOT BUILD_LIBTORCHLESS)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
|
||||
if(USE_CUDA)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
|
||||
endif()
|
||||
if(USE_XPU)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
|
||||
endif()
|
||||
if(USE_ROCM)
|
||||
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
|
||||
if(TARGET ${tgt})
|
||||
add_dependencies("${tgt}" generate_linker_script)
|
||||
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
|
||||
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
|
||||
else()
|
||||
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
else()
|
||||
if(LINUX AND CPU_AARCH64)
|
||||
message(WARNING [[
|
||||
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
|
||||
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
|
||||
]])
|
||||
endif()
|
||||
endif()
|
||||
|
||||
2
LICENSE
2
LICENSE
@ -37,7 +37,7 @@ Copyright (c) 2024 Tri Dao.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Arm:
|
||||
Copyright (c) 2021, 2023-2025 Arm Limited and/or its affiliates
|
||||
Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates
|
||||
|
||||
All contributions from Caffe:
|
||||
Copyright(c) 2013, 2014, 2015, the respective contributors
|
||||
|
||||
@ -94,11 +94,6 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
|
||||
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
|
||||
}
|
||||
|
||||
TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
|
||||
c10::DeviceIndex device_index) {
|
||||
const auto device_type = getAccelerator(true).value();
|
||||
return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
|
||||
}
|
||||
} // namespace at::accelerator
|
||||
|
||||
namespace at {
|
||||
|
||||
@ -226,8 +226,8 @@ template <
|
||||
typename B = HostBlock<S>>
|
||||
struct CachingHostAllocatorImpl {
|
||||
virtual ~CachingHostAllocatorImpl() {
|
||||
if (active_) {
|
||||
active_ = false;
|
||||
active_ = false;
|
||||
if (pinned_use_background_threads()) {
|
||||
getBackgroundThreadPool()->waitWorkComplete();
|
||||
}
|
||||
}
|
||||
@ -260,7 +260,6 @@ struct CachingHostAllocatorImpl {
|
||||
if (pinned_use_background_threads()) {
|
||||
// Launch the background thread and process events in a loop.
|
||||
static bool background_thread_flag [[maybe_unused]] = [this] {
|
||||
active_ = true;
|
||||
getBackgroundThreadPool()->run([&]() {
|
||||
while (active_) {
|
||||
process_events();
|
||||
@ -684,9 +683,9 @@ struct CachingHostAllocatorImpl {
|
||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
|
||||
// Indicates whether the event-processing thread pool is active.
|
||||
// Indicates whether the object is active.
|
||||
// Set to false in the destructor to signal background threads to stop.
|
||||
std::atomic<bool> active_{false};
|
||||
std::atomic<bool> active_{true};
|
||||
protected:
|
||||
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
|
||||
};
|
||||
|
||||
@ -245,9 +245,6 @@ class TORCH_API TensorBase {
|
||||
size_t weak_use_count() const noexcept {
|
||||
return impl_.weak_use_count();
|
||||
}
|
||||
bool is_uniquely_owned() const noexcept {
|
||||
return impl_.is_uniquely_owned();
|
||||
}
|
||||
|
||||
std::string toString() const;
|
||||
|
||||
|
||||
@ -55,6 +55,14 @@ struct numeric_limits<int8_t> {
|
||||
static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<uint16_t> {
|
||||
static inline __host__ __device__ uint16_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint16_t max() { return UINT16_MAX; }
|
||||
static inline __host__ __device__ uint16_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint16_t upper_bound() { return UINT16_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<int16_t> {
|
||||
static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
|
||||
@ -63,6 +71,14 @@ struct numeric_limits<int16_t> {
|
||||
static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<uint32_t> {
|
||||
static inline __host__ __device__ uint32_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint32_t max() { return UINT32_MAX; }
|
||||
static inline __host__ __device__ uint32_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint32_t upper_bound() { return UINT32_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<int32_t> {
|
||||
static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
|
||||
@ -71,6 +87,21 @@ struct numeric_limits<int32_t> {
|
||||
static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<uint64_t> {
|
||||
#ifdef _MSC_VER
|
||||
static inline __host__ __device__ uint64_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint64_t max() { return _UI64_MAX; }
|
||||
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint64_t upper_bound() { return _UI64_MAX; }
|
||||
#else
|
||||
static inline __host__ __device__ uint64_t lowest() { return 0; }
|
||||
static inline __host__ __device__ uint64_t max() { return UINT64_MAX; }
|
||||
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
|
||||
static inline __host__ __device__ uint64_t upper_bound() { return UINT64_MAX; }
|
||||
#endif
|
||||
};
|
||||
|
||||
template <>
|
||||
struct numeric_limits<int64_t> {
|
||||
#ifdef _MSC_VER
|
||||
|
||||
@ -157,8 +157,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
|
||||
DispatchKey::Negative,
|
||||
DispatchKey::Conjugate,
|
||||
DispatchKey::XLA,
|
||||
DispatchKey::XPU,
|
||||
DispatchKey::HPU,
|
||||
DispatchKey::CUDA,
|
||||
DispatchKey::CPU,
|
||||
DispatchKey::PrivateUse1,
|
||||
|
||||
@ -440,7 +440,7 @@ bool MPSHeapAllocatorImpl::release_cached_buffers() {
|
||||
// we need to release the lock temporarily as synchronizing may cause deadlock with completion handlers.
|
||||
m_mutex.unlock();
|
||||
auto stream = getDefaultMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
dispatch_sync(stream->queue(), ^() {
|
||||
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
||||
});
|
||||
m_mutex.lock();
|
||||
|
||||
@ -110,9 +110,6 @@ class TORCH_API MPSStream {
|
||||
return _stream;
|
||||
}
|
||||
|
||||
MTLBuffer_t getErrorBuffer();
|
||||
void checkLastError();
|
||||
|
||||
private:
|
||||
Stream _stream;
|
||||
MTLCommandQueue_t _commandQueue = nil;
|
||||
@ -124,8 +121,6 @@ class TORCH_API MPSStream {
|
||||
dispatch_queue_t _serialQueue = nullptr;
|
||||
// CommitAndContinue is enabled by default
|
||||
bool _enableCommitAndContinue = true;
|
||||
// Buffer that contains last raised error
|
||||
MTLBuffer_t _errorBuffer = nil;
|
||||
|
||||
// use synchronize() to access any of these commit functions outside MPSStream
|
||||
void commit();
|
||||
@ -160,7 +155,4 @@ class TORCH_API MPSStreamImpl {
|
||||
MPSStreamImpl();
|
||||
};
|
||||
|
||||
#ifdef __OBJC__
|
||||
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
|
||||
#endif
|
||||
} // namespace at::mps
|
||||
|
||||
@ -3,13 +3,13 @@
|
||||
#include <ATen/mps/MPSAllocatorInterface.h>
|
||||
#include <ATen/mps/MPSProfiler.h>
|
||||
#include <ATen/mps/MPSStream.h>
|
||||
#include <c10/metal/error.h>
|
||||
|
||||
@interface MPSGraphExecutionDescriptor ()
|
||||
@property(readwrite, atomic) BOOL enableCommitAndContinue;
|
||||
@end
|
||||
|
||||
namespace at::mps {
|
||||
|
||||
//-----------------------------------------------------------------
|
||||
// MPSStream
|
||||
//-----------------------------------------------------------------
|
||||
@ -30,10 +30,6 @@ MPSStream::MPSStream(Stream stream) : _stream(stream) {
|
||||
// Choose level which optimizes for GPU
|
||||
_compilationDescriptor.optimizationLevel = MPSGraphOptimizationLevel0;
|
||||
_executionDescriptor.compilationDescriptor = _compilationDescriptor;
|
||||
|
||||
_errorBuffer = [MPSDevice::getInstance()->device() newBufferWithLength:sizeof(c10::metal::ErrorMessages)
|
||||
options:MTLResourceStorageModeShared];
|
||||
std::memset([_errorBuffer contents], 0, 1024);
|
||||
}
|
||||
|
||||
MPSStream::~MPSStream() {
|
||||
@ -42,8 +38,6 @@ MPSStream::~MPSStream() {
|
||||
[_executionDescriptor release];
|
||||
[_compilationDescriptor release];
|
||||
_executionDescriptor = nil;
|
||||
[_errorBuffer release];
|
||||
_errorBuffer = nil;
|
||||
_compilationDescriptor = nil;
|
||||
|
||||
assert(_commandBuffer == nil);
|
||||
@ -110,7 +104,6 @@ void MPSStream::commitAndWait() {
|
||||
[_prevCommandBuffer waitUntilCompleted];
|
||||
[_prevCommandBuffer release];
|
||||
_prevCommandBuffer = nil;
|
||||
checkLastError();
|
||||
}
|
||||
|
||||
if (_commandBuffer) {
|
||||
@ -118,7 +111,6 @@ void MPSStream::commitAndWait() {
|
||||
[_commandBuffer waitUntilCompleted];
|
||||
[_commandBuffer release];
|
||||
_commandBuffer = nil;
|
||||
checkLastError();
|
||||
}
|
||||
}
|
||||
|
||||
@ -161,7 +153,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
|
||||
if (length == 0) {
|
||||
return;
|
||||
}
|
||||
dispatch_sync_with_rethrow(_serialQueue, ^() {
|
||||
dispatch_sync(_serialQueue, ^() {
|
||||
@autoreleasepool {
|
||||
endKernelCoalescing();
|
||||
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
||||
@ -191,7 +183,7 @@ void MPSStream::copy(id<MTLBuffer> srcBuffer,
|
||||
size_t dstOffset,
|
||||
uint64_t profileId,
|
||||
SyncType syncType) {
|
||||
dispatch_sync_with_rethrow(_serialQueue, ^() {
|
||||
dispatch_sync(_serialQueue, ^() {
|
||||
@autoreleasepool {
|
||||
endKernelCoalescing();
|
||||
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
||||
@ -244,7 +236,7 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
|
||||
auto& profiler = getMPSProfiler();
|
||||
const bool isGraphProfilingEnabled = profiler.isOperationProfilingEnabled();
|
||||
|
||||
dispatch_sync_with_rethrow(_serialQueue, ^() {
|
||||
dispatch_sync(_serialQueue, ^() {
|
||||
endKernelCoalescing();
|
||||
if (isGraphProfilingEnabled) {
|
||||
// this function call is only relevant for interval-based Signposts
|
||||
@ -274,24 +266,6 @@ void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDicti
|
||||
});
|
||||
}
|
||||
|
||||
id<MTLBuffer> MPSStream::getErrorBuffer() {
|
||||
return _errorBuffer;
|
||||
}
|
||||
|
||||
void MPSStream::checkLastError() {
|
||||
auto msgs = reinterpret_cast<c10::metal::ErrorMessages*>([_errorBuffer contents]);
|
||||
const auto& msg = msgs->msg[0];
|
||||
if (!msgs) {
|
||||
return;
|
||||
}
|
||||
unsigned int count = 0;
|
||||
std::swap(count, msgs->count);
|
||||
if (!count) {
|
||||
return;
|
||||
}
|
||||
throw c10::AcceleratorError({msg.func, msg.file, msg.line}, 1, msg.message);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------
|
||||
// MPSStreamImpl
|
||||
//-----------------------------------------------------------------
|
||||
@ -315,19 +289,4 @@ MPSStream* getDefaultMPSStream() {
|
||||
return MPSStreamImpl::getInstance();
|
||||
}
|
||||
|
||||
// Helper methods
|
||||
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
|
||||
__block std::optional<std::exception_ptr> block_exception;
|
||||
dispatch_sync(queue, ^() {
|
||||
try {
|
||||
block();
|
||||
} catch (...) {
|
||||
block_exception = std::current_exception();
|
||||
}
|
||||
});
|
||||
if (block_exception) {
|
||||
std::rethrow_exception(*block_exception);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace at::mps
|
||||
|
||||
@ -23,7 +23,6 @@
|
||||
#include <ATen/ops/_aminmax_native.h>
|
||||
#include <ATen/ops/_assert_async_native.h>
|
||||
#include <ATen/ops/_assert_scalar_native.h>
|
||||
#include <ATen/ops/_async_error_native.h>
|
||||
#include <ATen/ops/_functional_assert_async_native.h>
|
||||
#include <ATen/ops/_functional_assert_scalar_native.h>
|
||||
#include <ATen/ops/_make_per_tensor_quantized_tensor.h>
|
||||
@ -480,14 +479,6 @@ Tensor isfinite(const Tensor& self) {
|
||||
});
|
||||
}
|
||||
|
||||
void _async_error(std::string_view msg) {
|
||||
TORCH_CHECK(0, msg);
|
||||
}
|
||||
|
||||
void _async_error_meta(std::string_view msg) {
|
||||
// Do NOT error, it's an async error!
|
||||
}
|
||||
|
||||
void _assert_async_cpu(const Tensor& self) {
|
||||
TORCH_CHECK(
|
||||
native::is_nonzero(self),
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include <ATen/native/ReduceOpsUtils.h>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
@ -78,12 +79,12 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) {
|
||||
reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
|
||||
[=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
|
||||
AT_DISPATCH_V2(input.scalar_type(), "min_all", AT_WRAP([&] {
|
||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||
reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
|
||||
[=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
|
||||
[=](Vec a, Vec b) -> Vec { return minimum(a, b); });
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
||||
}
|
||||
}
|
||||
|
||||
@ -103,12 +104,12 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) {
|
||||
reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
|
||||
[=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
|
||||
AT_DISPATCH_V2(input.scalar_type(), "max_all", AT_WRAP([&] {
|
||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||
reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
|
||||
[=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
|
||||
[=](Vec a, Vec b) -> Vec { return maximum(a, b); });
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
||||
}
|
||||
}
|
||||
|
||||
@ -199,7 +200,7 @@ void aminmax_allreduce_kernel(
|
||||
}
|
||||
);
|
||||
} else {
|
||||
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
|
||||
AT_DISPATCH_V2(input.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
|
||||
using Vec = Vectorized<opmath_type<scalar_t>>;
|
||||
using scalar_t_pair = std::pair<scalar_t, scalar_t>;
|
||||
reduce_all_impl_vec_two_outputs<scalar_t>(
|
||||
@ -214,7 +215,7 @@ void aminmax_allreduce_kernel(
|
||||
[=](Vec a, Vec b) -> Vec { return minimum(a, b); },
|
||||
[=](Vec a, Vec b) -> Vec { return maximum(a, b); }
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <ATen/cpu/vec/vec.h>
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
@ -347,34 +348,35 @@ struct MinValuesOps: public at::native::MinOps<scalar_t> {
|
||||
};
|
||||
|
||||
void min_values_kernel_impl(TensorIterator& iter) {
|
||||
if (iter.dtype() == kLong) {
|
||||
// This case is special because of Vectorized<int64_t> does not
|
||||
// handle upper_bound<int64_t>().
|
||||
// See: https://github.com/pytorch/pytorch/issues/43254
|
||||
using scalar_t = int64_t;
|
||||
binary_kernel_reduce(
|
||||
iter,
|
||||
MinValuesOps<scalar_t>{},
|
||||
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
|
||||
// This case is special because of Vectorized<int64_t> does not
|
||||
// handle upper_bound<int64_t>().
|
||||
// See: https://github.com/pytorch/pytorch/issues/43254
|
||||
if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
|
||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
|
||||
binary_kernel_reduce(
|
||||
iter,
|
||||
MinValuesOps<scalar_t>{},
|
||||
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
|
||||
}), kLong, kUInt64);
|
||||
return;
|
||||
}
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
|
||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
|
||||
binary_kernel_reduce_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
|
||||
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
|
||||
static_cast<double>(upper_bound<scalar_t>()));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void max_values_kernel_impl(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
|
||||
AT_DISPATCH_V2(iter.dtype(), "max_values_cpu", AT_WRAP([&iter] {
|
||||
binary_kernel_reduce_vec(
|
||||
iter,
|
||||
[](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
|
||||
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
|
||||
lower_bound<scalar_t>());
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void argmax_kernel_impl(TensorIterator &iter) {
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
@ -106,7 +107,7 @@ void min_kernel_impl(
|
||||
bool keepdim) {
|
||||
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
|
||||
AT_DISPATCH_V2(self.scalar_type(), "min_cpu", AT_WRAP([&] {
|
||||
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
||||
scalar_t* result_data, int64_t* indice_data,
|
||||
const scalar_t* self_data, auto self_dim_stride) {
|
||||
@ -128,7 +129,7 @@ void min_kernel_impl(
|
||||
*indice_data = index;
|
||||
}
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
|
||||
}
|
||||
|
||||
void max_kernel_impl(
|
||||
@ -139,7 +140,7 @@ void max_kernel_impl(
|
||||
bool keepdim) {
|
||||
int64_t self_dim_size = ensure_nonempty_size(self, dim);
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
|
||||
AT_DISPATCH_V2(self.scalar_type(), "max_cpu", AT_WRAP([&] {
|
||||
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
|
||||
scalar_t* result_data, int64_t* indice_data,
|
||||
const scalar_t* self_data, auto self_dim_stride) {
|
||||
@ -161,7 +162,7 @@ void max_kernel_impl(
|
||||
*indice_data = index;
|
||||
}
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
|
||||
}
|
||||
|
||||
void aminmax_kernel(
|
||||
@ -186,7 +187,7 @@ void aminmax_kernel(
|
||||
return;
|
||||
}
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
|
||||
AT_DISPATCH_V2(self.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
|
||||
compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
|
||||
scalar_t* min_result_data, scalar_t* max_result_data,
|
||||
const scalar_t* self_data, auto self_dim_stride) {
|
||||
@ -209,7 +210,7 @@ void aminmax_kernel(
|
||||
*max_result_data = max_number;
|
||||
}
|
||||
);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half);
|
||||
}
|
||||
|
||||
void where_kernel_impl(TensorIterator &iter) {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#define TORCH_ASSERT_NO_OPERATORS
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
#include <ATen/native/ReduceAllOps.h>
|
||||
@ -28,22 +29,22 @@ void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
|
||||
}
|
||||
|
||||
void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
|
||||
AT_DISPATCH_V2(
|
||||
iter.input_dtype(), "aminmax_all_cuda", AT_WRAP([&] {
|
||||
_min_max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void aminmax_launch_kernel(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
|
||||
AT_DISPATCH_V2(
|
||||
iter.input_dtype(), "aminmax_cuda", AT_WRAP([&]() {
|
||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||
iter,
|
||||
MinMaxOps<scalar_t, scalar_t, int32_t>{},
|
||||
thrust::pair<scalar_t, scalar_t>(
|
||||
at::numeric_limits<scalar_t>::upper_bound(),
|
||||
at::numeric_limits<scalar_t>::lower_bound()));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#define TORCH_ASSERT_NO_OPERATORS
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
#include <ATen/native/ReduceAllOps.h>
|
||||
@ -33,27 +34,27 @@ void max_values_kernel_cuda_impl(TensorIterator& iter) {
|
||||
}
|
||||
|
||||
void max_values_kernel_cuda(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
|
||||
AT_DISPATCH_V2(
|
||||
iter.dtype(), "max_values_cuda", AT_WRAP([&]() {
|
||||
max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void max_launch_kernel(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(
|
||||
kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
|
||||
AT_DISPATCH_V2(
|
||||
iter.input_dtype(), "max_cuda", AT_WRAP([&]() {
|
||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||
iter,
|
||||
MaxOps<scalar_t>{},
|
||||
thrust::pair<scalar_t, int64_t>(
|
||||
at::numeric_limits<scalar_t>::lower_bound(), 0));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void max_all_launch_kernel(TensorIterator &iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
|
||||
AT_DISPATCH_V2(iter.input_dtype(), "max_all_cuda", AT_WRAP([&] {
|
||||
max_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)
|
||||
|
||||
@ -12,6 +12,7 @@
|
||||
#include <ATen/NumericUtils.h>
|
||||
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/NumericUtils.h>
|
||||
#include <ATen/cuda/NumericLimits.cuh>
|
||||
|
||||
@ -33,24 +34,24 @@ void min_values_kernel_cuda_impl(TensorIterator& iter) {
|
||||
}
|
||||
|
||||
void min_values_kernel_cuda(TensorIterator& iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
|
||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
|
||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void min_launch_kernel(TensorIterator &iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
|
||||
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
|
||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
||||
iter,
|
||||
MinOps<scalar_t>{},
|
||||
thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
void min_all_launch_kernel(TensorIterator &iter) {
|
||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
|
||||
AT_DISPATCH_V2(iter.input_dtype(), "min_all_cuda", AT_WRAP([&] {
|
||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
||||
});
|
||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
|
||||
}
|
||||
|
||||
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)
|
||||
|
||||
@ -40,6 +40,8 @@ using namespace at::mps;
|
||||
|
||||
namespace at::native::mps {
|
||||
|
||||
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)());
|
||||
|
||||
struct MPSScalar {
|
||||
id<MTLBuffer> getMTLBuffer() const {
|
||||
return __builtin_bit_cast(id<MTLBuffer>, buffer.get());
|
||||
|
||||
@ -53,6 +53,21 @@
|
||||
@end
|
||||
|
||||
namespace at::native::mps {
|
||||
|
||||
void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()) {
|
||||
__block std::optional<std::exception_ptr> block_exception;
|
||||
dispatch_sync(queue, ^() {
|
||||
try {
|
||||
block();
|
||||
} catch (...) {
|
||||
block_exception = std::current_exception();
|
||||
}
|
||||
});
|
||||
if (block_exception) {
|
||||
std::rethrow_exception(*block_exception);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes distance from lowest to highest element offset in given tensor.
|
||||
*/
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
#include <c10/metal/atomic.h>
|
||||
#include <c10/metal/error.h>
|
||||
#include <c10/metal/indexing.h>
|
||||
#include <metal_stdlib>
|
||||
|
||||
@ -32,24 +31,10 @@ OffsetT index_apply_indices(
|
||||
constant IndexAB* indices,
|
||||
constant int64_t* sizes,
|
||||
constant int64_t* strides,
|
||||
uint num_indices,
|
||||
thread bool& error,
|
||||
device ErrorMessages* error_buf) {
|
||||
uint num_indices) {
|
||||
OffsetT rc = offs.x;
|
||||
for (uint i = 0; i < num_indices; i++) {
|
||||
auto idx = indices[i].indexArray[offs.y];
|
||||
if (idx < -sizes[i] || idx >= sizes[i]) {
|
||||
TORCH_REPORT_ERROR(
|
||||
error_buf,
|
||||
"index ",
|
||||
idx,
|
||||
" is out of bounds for dimension ",
|
||||
i,
|
||||
" with size ",
|
||||
sizes[i]);
|
||||
error = true;
|
||||
break;
|
||||
}
|
||||
if (idx < 0) {
|
||||
idx += sizes[i];
|
||||
}
|
||||
@ -70,7 +55,6 @@ kernel void index_select(
|
||||
constant int64_t* index_sizes,
|
||||
constant int64_t* index_strides,
|
||||
constant uint4& ndim_nindices_numel,
|
||||
device ErrorMessages* error_buffer,
|
||||
uint thread_index [[thread_position_in_grid]]) {
|
||||
const auto ndim = ndim_nindices_numel.x;
|
||||
const auto num_indices = ndim_nindices_numel.y;
|
||||
@ -81,19 +65,8 @@ kernel void index_select(
|
||||
indices_strides,
|
||||
ndim,
|
||||
thread_index);
|
||||
bool error = false;
|
||||
auto input_offs = index_apply_indices<OffsetT>(
|
||||
offs.yz,
|
||||
indices,
|
||||
index_sizes,
|
||||
index_strides,
|
||||
num_indices,
|
||||
error,
|
||||
error_buffer);
|
||||
if (error) {
|
||||
output[offs.x / sizeof(T)] = 0;
|
||||
return;
|
||||
}
|
||||
offs.yz, indices, index_sizes, index_strides, num_indices);
|
||||
output[offs.x / sizeof(T)] = input[input_offs / sizeof(T)];
|
||||
}
|
||||
|
||||
@ -109,9 +82,7 @@ inline void index_put_impl(
|
||||
constant int64_t* index_sizes,
|
||||
constant int64_t* index_strides,
|
||||
constant uint4& ndim_nindices_numel,
|
||||
device ErrorMessages* error_buffer,
|
||||
uint thread_index) {
|
||||
bool error = false;
|
||||
const auto ndim = ndim_nindices_numel.x;
|
||||
const auto num_indices = ndim_nindices_numel.y;
|
||||
const auto offs = index_get_offsets(
|
||||
@ -122,16 +93,7 @@ inline void index_put_impl(
|
||||
ndim,
|
||||
thread_index);
|
||||
auto output_offs = index_apply_indices<OffsetT>(
|
||||
offs.xz,
|
||||
indices,
|
||||
index_sizes,
|
||||
index_strides,
|
||||
num_indices,
|
||||
error,
|
||||
error_buffer);
|
||||
if (error) {
|
||||
return;
|
||||
}
|
||||
offs.xz, indices, index_sizes, index_strides, num_indices);
|
||||
output[output_offs / sizeof(T)] = input[offs.y / sizeof(T)];
|
||||
}
|
||||
|
||||
@ -147,7 +109,6 @@ kernel void index_put(
|
||||
constant int64_t* index_sizes,
|
||||
constant int64_t* index_strides,
|
||||
constant uint4& ndim_nindices_numel,
|
||||
device ErrorMessages* error_buffer,
|
||||
uint thread_index [[thread_position_in_grid]]) {
|
||||
index_put_impl(
|
||||
output,
|
||||
@ -160,7 +121,6 @@ kernel void index_put(
|
||||
index_sizes,
|
||||
index_strides,
|
||||
ndim_nindices_numel,
|
||||
error_buffer,
|
||||
thread_index);
|
||||
}
|
||||
|
||||
@ -176,7 +136,6 @@ kernel void index_put_serial(
|
||||
constant int64_t* index_sizes,
|
||||
constant int64_t* index_strides,
|
||||
constant uint4& ndim_nindices_numel,
|
||||
device ErrorMessages* error_buffer,
|
||||
uint thread_index [[thread_position_in_grid]]) {
|
||||
(void)thread_index; // Suppress unused vairable varning
|
||||
for (uint idx = 0; idx < ndim_nindices_numel.z; ++idx) {
|
||||
@ -191,7 +150,6 @@ kernel void index_put_serial(
|
||||
index_sizes,
|
||||
index_strides,
|
||||
ndim_nindices_numel,
|
||||
error_buffer,
|
||||
idx);
|
||||
}
|
||||
}
|
||||
@ -208,7 +166,6 @@ kernel void index_put_accumulate(
|
||||
constant int64_t* index_sizes,
|
||||
constant int64_t* index_strides,
|
||||
constant uint4& ndim_nindices_numel,
|
||||
device ErrorMessages* error_buffer,
|
||||
uint thread_index [[thread_position_in_grid]]) {
|
||||
const auto ndim = ndim_nindices_numel.x;
|
||||
const auto num_indices = ndim_nindices_numel.y;
|
||||
@ -219,18 +176,8 @@ kernel void index_put_accumulate(
|
||||
indices_strides,
|
||||
ndim,
|
||||
thread_index);
|
||||
bool error = false;
|
||||
auto output_offs = index_apply_indices<OffsetT>(
|
||||
offs.xz,
|
||||
indices,
|
||||
index_sizes,
|
||||
index_strides,
|
||||
num_indices,
|
||||
error,
|
||||
error_buffer);
|
||||
if (error) {
|
||||
return;
|
||||
}
|
||||
offs.xz, indices, index_sizes, index_strides, num_indices);
|
||||
AtomicType<T>::atomic_add(
|
||||
reinterpret_cast<device AtomicType_t<T>*>(output),
|
||||
output_offs / sizeof(T),
|
||||
@ -250,7 +197,6 @@ kernel void index_put_accumulate(
|
||||
constant int64_t* index_sizes, \
|
||||
constant int64_t* index_strides, \
|
||||
constant uint4& ndim_nindices_numel, \
|
||||
device ErrorMessages* error_buffer, \
|
||||
uint thread_index [[thread_position_in_grid]])
|
||||
|
||||
#define REGISTER_INDEX_OP_ALL_DTYPES(OP_NAME) \
|
||||
|
||||
@ -220,7 +220,7 @@ Tensor _embedding_bag_dense_backward_mps(const Tensor& output_grad,
|
||||
auto num_threads = (params.mode == EmbeddingBagMode::MAX) ? output_grad.numel() : num_indices * params.feature_size;
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
|
||||
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_backward_{}_{}",
|
||||
@ -273,7 +273,7 @@ Tensor _embedding_bag_per_sample_weights_backward_mps(const Tensor& output_grad,
|
||||
auto num_threads = num_indices * feature_size;
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
id<MTLComputeCommandEncoder> computeEncoder = stream->commandEncoder();
|
||||
auto pipeline_state = lib.getPipelineStateForFunc(fmt::format("embedding_bag_per_sample_weights_backward_{}_{}",
|
||||
|
||||
@ -179,8 +179,7 @@ static void dispatch_index_kernel(TensorIteratorBase& iter,
|
||||
iter.strides(2),
|
||||
index_size,
|
||||
index_stride,
|
||||
ndim_nindiees,
|
||||
mpsStream->getErrorBuffer());
|
||||
ndim_nindiees);
|
||||
mtl_dispatch1DJob(computeEncoder, indexSelectPSO, serial ? 1 : iter.numel());
|
||||
});
|
||||
}
|
||||
@ -300,7 +299,7 @@ static Tensor& nonzero_out_native_mps(const Tensor& self, Tensor& out_) {
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
using CachedGraph = MPSUnaryCachedGraph;
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
dispatch_sync(stream->queue(), ^() {
|
||||
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
||||
});
|
||||
int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
|
||||
@ -385,7 +384,7 @@ Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) {
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
using CachedGraph = MPSUnaryCachedGraph;
|
||||
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
dispatch_sync(stream->queue(), ^() {
|
||||
stream->synchronize(SyncType::COMMIT_AND_WAIT);
|
||||
});
|
||||
int64_t total_nonzero = at::count_nonzero(self).item<int64_t>();
|
||||
|
||||
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
|
||||
@autoreleasepool {
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
// which kernel variant to use based on the normalized axis N size
|
||||
const int N_READS = 4;
|
||||
auto metalType = mps::scalarToMetalTypeString(input);
|
||||
|
||||
@ -192,11 +192,6 @@
|
||||
CompositeExplicitAutograd: _assert_tensor_metadata
|
||||
Meta: _assert_tensor_metadata_meta_symint
|
||||
|
||||
- func: _async_error(str msg) -> ()
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _async_error
|
||||
Meta: _async_error_meta
|
||||
|
||||
- func: _print(str s) -> ()
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _print
|
||||
@ -4297,7 +4292,6 @@
|
||||
dispatch:
|
||||
SparseCPU: sparse_sparse_matmul_cpu
|
||||
SparseCUDA: sparse_sparse_matmul_cuda
|
||||
SparseMPS: sparse_sparse_matmul_mps
|
||||
autogen: _sparse_sparse_matmul.out
|
||||
|
||||
- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||
@ -4389,7 +4383,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: mv
|
||||
SparseCPU, SparseCUDA, SparseMPS: mv_sparse
|
||||
SparseCPU, SparseCUDA: mv_sparse
|
||||
|
||||
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
@ -9838,7 +9832,7 @@
|
||||
structured_delegate: erfinv.out
|
||||
variants: method, function
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
|
||||
SparseCPU, SparseCUDA: erfinv_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
|
||||
tags: pointwise
|
||||
|
||||
@ -9847,7 +9841,7 @@
|
||||
structured_delegate: erfinv.out
|
||||
variants: method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
|
||||
SparseCPU, SparseCUDA: erfinv_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
|
||||
tags: pointwise
|
||||
|
||||
@ -9857,7 +9851,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: erfinv_out
|
||||
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
|
||||
SparseCPU, SparseCUDA: erfinv_sparse_out
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
|
||||
tags: pointwise
|
||||
|
||||
|
||||
@ -10,10 +10,6 @@
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_coalesce_native.h>
|
||||
#include <ATen/ops/repeat_interleave_native.h>
|
||||
#include <ATen/ops/cumsum.h>
|
||||
#include <ATen/ops/_sparse_sparse_matmul_native.h>
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/add_native.h>
|
||||
@ -892,114 +888,5 @@ static void sparse_mask_intersection_out_mps_kernel(
|
||||
/*coalesce_mask=*/false);
|
||||
}
|
||||
|
||||
Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
|
||||
TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
|
||||
"sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
|
||||
TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
|
||||
"sparse_sparse_matmul_mps: both inputs must be on MPS device");
|
||||
TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
|
||||
"sparse_sparse_matmul_mps: both inputs must be 2D matrices");
|
||||
TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
|
||||
"sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
|
||||
TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
|
||||
"mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
|
||||
TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
|
||||
"sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
|
||||
" does not match mat2 dtype ", mat2_.scalar_type());
|
||||
|
||||
const auto device = mat1_.device();
|
||||
|
||||
auto A = mat1_.coalesce();
|
||||
auto B = mat2_.coalesce();
|
||||
|
||||
const auto I = A.size(0);
|
||||
const auto K = A.size(1);
|
||||
const auto N = B.size(1);
|
||||
|
||||
const auto nnzA = A._nnz();
|
||||
const auto nnzB = B._nnz();
|
||||
|
||||
// Early empty result, return an empty, coalesced tensor
|
||||
if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
|
||||
auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
|
||||
auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
|
||||
auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
|
||||
out._coalesced_(true);
|
||||
return out;
|
||||
}
|
||||
|
||||
const auto computeDtype = at::result_type(mat1_, mat2_);
|
||||
|
||||
auto A_idx = A._indices().contiguous();
|
||||
auto A_val = A._values().to(computeDtype).contiguous();
|
||||
auto A_i = A_idx.select(0, 0).contiguous();
|
||||
auto A_k = A_idx.select(0, 1).contiguous();
|
||||
|
||||
auto B_idx = B._indices().contiguous();
|
||||
auto B_val = B._values().to(computeDtype).contiguous();
|
||||
auto B_k = B_idx.select(0, 0).contiguous();
|
||||
auto B_j = B_idx.select(0, 1).contiguous();
|
||||
|
||||
// csr-style row pointers for B by k (the shared dimension)
|
||||
Tensor row_ptr_B;
|
||||
{
|
||||
auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
|
||||
row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
|
||||
build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
|
||||
}
|
||||
|
||||
auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
|
||||
auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
|
||||
auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
|
||||
|
||||
auto counts = deg_B.index_select(0, A_k);
|
||||
|
||||
const int64_t P = counts.sum().item<int64_t>();
|
||||
if (P == 0) {
|
||||
auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
|
||||
auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
|
||||
auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
|
||||
out._coalesced_(true);
|
||||
return out;
|
||||
}
|
||||
|
||||
auto group_ids = repeat_interleave_mps(counts);
|
||||
|
||||
// exclusive cumsum of counts
|
||||
auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
|
||||
auto offsets_gather = offsets.index_select(0, group_ids);
|
||||
auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
|
||||
|
||||
// Map each output element to its source B row and position
|
||||
auto k_per_out = A_k.index_select(0, group_ids);
|
||||
auto start_in_B = row_ptr_B.index_select(0, k_per_out);
|
||||
auto seg_index = start_in_B.add(within);
|
||||
|
||||
// Assemble candidate coo pairs and values
|
||||
auto i_out = A_i.index_select(0, group_ids).contiguous();
|
||||
auto j_out = B_j.index_select(0, seg_index).contiguous();
|
||||
auto vA_out = A_val.index_select(0, group_ids).contiguous();
|
||||
auto vB_out = B_val.index_select(0, seg_index).contiguous();
|
||||
auto v_out = vA_out.mul(vB_out);
|
||||
|
||||
// build (2, P) indices
|
||||
auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
|
||||
out_indices.select(0, 0).copy_(i_out);
|
||||
out_indices.select(0, 1).copy_(j_out);
|
||||
|
||||
auto result = _sparse_coo_tensor_unsafe(
|
||||
out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
|
||||
|
||||
result = result.coalesce();
|
||||
|
||||
if (result.scalar_type() != mat1_.scalar_type()) {
|
||||
auto cast_vals = result._values().to(mat1_.scalar_type());
|
||||
auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
|
||||
out._coalesced_(true);
|
||||
return out;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
|
||||
} // namespace at::native
|
||||
@ -10,13 +10,6 @@
|
||||
...
|
||||
}
|
||||
|
||||
{
|
||||
ignore_empty_generic_uninitialised_conditional_jump
|
||||
Memcheck:Cond
|
||||
fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
|
||||
...
|
||||
}
|
||||
|
||||
{
|
||||
Cond_cuda
|
||||
Memcheck:Cond
|
||||
|
||||
@ -952,7 +952,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
|
||||
first_fields.append(kwargs["tag"])
|
||||
headers = first_headers + ["speedup", "abs_latency"]
|
||||
row = first_fields + [float(speedup), median[1] * 1000]
|
||||
msg = f"{median[0] * 1000} ms, {median[1] * 1000} ms, {speedup:.3f}x"
|
||||
msg = f"{speedup:.3f}x"
|
||||
if args.baseline:
|
||||
headers.extend(
|
||||
[
|
||||
@ -1010,7 +1010,7 @@ def latency_experiment_summary(suite_name, args, model, timings, **kwargs):
|
||||
# Hypothetically you can use this from other places, but it's currently
|
||||
# inaccessible, and when this assert fails you need to update the
|
||||
# event_name here to account for the other cases you are using this
|
||||
assert any([args.quantization, args.optimus])
|
||||
assert args.quantization is not None
|
||||
output_signpost(
|
||||
dict(zip(headers, row)),
|
||||
args,
|
||||
@ -2587,9 +2587,6 @@ class BenchmarkRunner:
|
||||
**experiment_kwargs,
|
||||
)
|
||||
|
||||
# reset dynamo
|
||||
torch._dynamo.reset()
|
||||
|
||||
if self.args.export_aot_inductor:
|
||||
optimized_model_iter_fn = optimize_ctx
|
||||
else:
|
||||
@ -2953,7 +2950,7 @@ class BenchmarkRunner:
|
||||
status = self.check_tolerance(name, model, example_inputs, optimize_ctx)
|
||||
print(status)
|
||||
elif self.args.performance:
|
||||
if self.args.backend in ["torchao", "optimus"]:
|
||||
if self.args.backend == "torchao":
|
||||
status = self.run_performance_test_non_alternate(
|
||||
name, model, example_inputs, optimize_ctx, experiment, tag
|
||||
)
|
||||
@ -3529,12 +3526,6 @@ def parse_args(args=None):
|
||||
action="store_true",
|
||||
help="Measure speedup with TorchInductor",
|
||||
)
|
||||
group.add_argument(
|
||||
"--optimus",
|
||||
choices=["vertical_opt", "horizontal_opt", "all"],
|
||||
default=None,
|
||||
help="Measure speedup of Optimus with TorchInductor baseline",
|
||||
)
|
||||
group.add_argument(
|
||||
"--quantization",
|
||||
choices=[
|
||||
@ -3792,9 +3783,6 @@ def run(runner, args, original_dir=None):
|
||||
if args.inductor:
|
||||
assert args.backend is None
|
||||
args.backend = "inductor"
|
||||
if args.optimus:
|
||||
assert args.backend is None
|
||||
args.backend = "optimus"
|
||||
if args.quantization:
|
||||
assert args.backend is None
|
||||
args.backend = "torchao"
|
||||
@ -4079,22 +4067,10 @@ def run(runner, args, original_dir=None):
|
||||
|
||||
runner.model_iter_fn = model_iter_fn_and_mark_step
|
||||
optimize_ctx = torchao_optimize_ctx(args.quantization)
|
||||
elif args.backend == "optimus":
|
||||
from .optimus import get_baseline_ctx, get_optimus_optimize_ctx
|
||||
|
||||
baseline_ctx = get_baseline_ctx(
|
||||
nopython=args.nopython, inductor_compile_mode=args.inductor_compile_mode
|
||||
)
|
||||
runner.model_iter_fn = baseline_ctx(runner.model_iter_fn)
|
||||
optimize_ctx = get_optimus_optimize_ctx(
|
||||
args.optimus, args.nopython, args.inductor_compile_mode
|
||||
)
|
||||
else:
|
||||
optimize_ctx = torch._dynamo.optimize(args.backend, nopython=args.nopython)
|
||||
experiment = (
|
||||
speedup_experiment
|
||||
if args.backend not in ["torchao", "optimus"]
|
||||
else latency_experiment
|
||||
speedup_experiment if args.backend != "torchao" else latency_experiment
|
||||
)
|
||||
if args.accuracy:
|
||||
output_filename = f"accuracy_{args.backend}.csv"
|
||||
@ -4115,12 +4091,7 @@ def run(runner, args, original_dir=None):
|
||||
if args.only in runner.disable_cudagraph_models:
|
||||
args.disable_cudagraphs = True
|
||||
|
||||
if (
|
||||
args.inductor
|
||||
or args.backend == "inductor"
|
||||
or args.export_aot_inductor
|
||||
or args.backend == "optimus"
|
||||
):
|
||||
if args.inductor or args.backend == "inductor" or args.export_aot_inductor:
|
||||
inductor_config.triton.cudagraphs = not args.disable_cudagraphs
|
||||
inductor_config.triton.persistent_reductions = (
|
||||
not args.disable_persistent_reductions
|
||||
|
||||
@ -1,62 +0,0 @@
|
||||
import functools
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def get_baseline_ctx(nopython, inductor_compile_mode):
|
||||
return functools.partial(
|
||||
torch.compile,
|
||||
backend="inductor",
|
||||
fullgraph=nopython,
|
||||
mode=inductor_compile_mode,
|
||||
)
|
||||
|
||||
|
||||
def get_optimus_optimize_ctx(config, nopython, inductor_compile_mode):
|
||||
if config == "vertical_opt":
|
||||
optimus_inductor_config = {
|
||||
"pre_grad_fusion_options": {
|
||||
"normalization_pass": {},
|
||||
"merge_splits_pass": {},
|
||||
"split_cat_pass": {},
|
||||
"unbind_stack_pass": {},
|
||||
"unbind_cat_to_view_pass": {},
|
||||
}
|
||||
}
|
||||
elif config == "horizontal_opt":
|
||||
optimus_inductor_config = {
|
||||
"pre_grad_fusion_options": {
|
||||
"normalization_pass": {},
|
||||
"batch_linear": {},
|
||||
"batch_layernorm": {},
|
||||
},
|
||||
}
|
||||
elif config == "all":
|
||||
optimus_inductor_config = {
|
||||
"pre_grad_fusion_options": {
|
||||
"normalization_pass": {},
|
||||
"batch_linear": {},
|
||||
"batch_layernorm": {},
|
||||
"merge_splits_pass": {},
|
||||
"split_cat_pass": {},
|
||||
"unbind_stack_pass": {},
|
||||
"unbind_cat_to_view_pass": {},
|
||||
},
|
||||
}
|
||||
else:
|
||||
raise RuntimeError(f"Unknown optimus config: {config}")
|
||||
|
||||
def _inner(fn):
|
||||
if "pre_grad_fusion_options" in optimus_inductor_config:
|
||||
torch._inductor.config.pre_grad_fusion_options = optimus_inductor_config[
|
||||
"pre_grad_fusion_options"
|
||||
]
|
||||
if "post_grad_fusion_options" in optimus_inductor_config:
|
||||
torch._inductor.config.post_grad_fusion_options = optimus_inductor_config[
|
||||
"post_grad_fusion_options"
|
||||
]
|
||||
return torch.compile(
|
||||
fn, backend="inductor", fullgraph=nopython, mode=inductor_compile_mode
|
||||
)
|
||||
|
||||
return _inner
|
||||
@ -2,7 +2,6 @@ import csv
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# This script takes the logs produced by the benchmark scripts (e.g.,
|
||||
@ -16,7 +15,8 @@ from pathlib import Path
|
||||
# This script is not very well written, feel free to rewrite it as necessary
|
||||
|
||||
assert len(sys.argv) == 2
|
||||
full_log = Path(sys.argv[1]).read_text()
|
||||
|
||||
full_log = open(sys.argv[1]).read()
|
||||
|
||||
# If the log contains a gist URL, extract it so we can include it in the CSV
|
||||
gist_url = ""
|
||||
|
||||
@ -484,106 +484,24 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,False,50.954394,0.000000
|
||||
PyTorch,sum,sum_R256_V512_dim0_contiguousFalse_cpu,short,False,57.957757,0.000000
|
||||
PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,False,53.592068,0.000000
|
||||
PyTorch,sum,sum_R256_V512_dim1_contiguousFalse_cpu,short,False,51.339726,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.927,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.261,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.351,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.177,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,6.333,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,6.588,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,8.117,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,9.358,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,7.844,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,8.097,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.159,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.926,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.192,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.276,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,6.461,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,6.524,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,8.136,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.854,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,6.446,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,6.829,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.088,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.059,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.922,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.263,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,6.330,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,6.688,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,8.176,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.959,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,6.430,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,6.818,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.350,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.193,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.922,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.263,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,6.525,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,7.960,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.801,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,6.594,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,7.089,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.498,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.358,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.390,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.415,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.925,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,6.657,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,7.954,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.930,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,6.737,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,6.948,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.757,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.402,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.550,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.518,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,6.766,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.929,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,8.557,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,9.045,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,7.672,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,7.276,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,6.414,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,7.736,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,7.889,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,8.170,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,7.783,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,7.743,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.927,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,7.018,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,8.428,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,6.767,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.479,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,7.827,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.450,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.320,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,6.385,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,8.119,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,8.063,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.925,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,8.629,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,6.638,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.425,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.803,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.502,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.429,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,6.549,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,7.749,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,7.301,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.682,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.930,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,6.738,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,6.798,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,6.506,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,6.494,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,6.668,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,6.696,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,7.115,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.910,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.410,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,6.868,0.000000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.924,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,False,7.040985,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,False,7.168604,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,False,7.434442,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,False,7.078318,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,False,7.426670,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,False,7.679027,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,False,7.281365,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,False,7.682783,0.000000
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,False,8.381938,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,False,7.039854,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,False,7.399855,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,False,7.715193,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,False,7.255140,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,False,7.753522,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,False,8.364281,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,False,7.476377,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,False,8.458564,0.000000
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,False,9.391939,0.000000
|
||||
PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.float32,short,False,4.461410,0.000000
|
||||
PyTorch,addcmul,addcmul_M1_N2_cpu_dtypetorch.bfloat16,short,False,4.560082,0.000000
|
||||
PyTorch,addcmul,addcmul_M32_N64_cpu_dtypetorch.float32,short,False,5.141248,0.000000
|
||||
|
||||
|
@ -4,84 +4,74 @@ import torch
|
||||
|
||||
|
||||
tensor_conversion_short_configs = op_bench.cross_product_configs(
|
||||
M=[32],
|
||||
N=[128],
|
||||
M=(
|
||||
8,
|
||||
16,
|
||||
32,
|
||||
),
|
||||
N=(
|
||||
16,
|
||||
64,
|
||||
128,
|
||||
),
|
||||
device=["cpu", "cuda"],
|
||||
dtype_one=[
|
||||
torch.bool,
|
||||
torch.uint8,
|
||||
torch.int8,
|
||||
torch.int16,
|
||||
torch.int32,
|
||||
torch.int64,
|
||||
torch.half,
|
||||
torch.bfloat16,
|
||||
torch.float,
|
||||
torch.double,
|
||||
],
|
||||
dtype_two=[
|
||||
torch.bool,
|
||||
torch.uint8,
|
||||
torch.int8,
|
||||
torch.int16,
|
||||
torch.int32,
|
||||
torch.int64,
|
||||
torch.half,
|
||||
torch.bfloat16,
|
||||
torch.float,
|
||||
torch.double,
|
||||
],
|
||||
tags=["short"],
|
||||
)
|
||||
|
||||
tensor_conversion_long_configs = op_bench.cross_product_configs(
|
||||
M=[1024],
|
||||
N=[1024],
|
||||
M=(
|
||||
64,
|
||||
128,
|
||||
256,
|
||||
512,
|
||||
),
|
||||
N=(
|
||||
256,
|
||||
512,
|
||||
1024,
|
||||
2048,
|
||||
),
|
||||
device=["cpu", "cuda"],
|
||||
dtype_one=[
|
||||
torch.bool,
|
||||
torch.uint8,
|
||||
torch.int8,
|
||||
torch.int16,
|
||||
torch.int32,
|
||||
torch.int64,
|
||||
torch.half,
|
||||
torch.bfloat16,
|
||||
torch.float,
|
||||
torch.double,
|
||||
],
|
||||
dtype_two=[
|
||||
torch.bool,
|
||||
torch.uint8,
|
||||
torch.int8,
|
||||
torch.int16,
|
||||
torch.int32,
|
||||
torch.int64,
|
||||
torch.half,
|
||||
torch.bfloat16,
|
||||
torch.float,
|
||||
torch.double,
|
||||
],
|
||||
tags=["long"],
|
||||
)
|
||||
|
||||
|
||||
class TensorConversionBenchmark(op_bench.TorchBenchmarkBase):
|
||||
def init(self, M, N, dtype_one, dtype_two, device):
|
||||
class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
|
||||
def init(self, M, N, device):
|
||||
self.inputs = {
|
||||
"input": torch.rand(
|
||||
M, N, device=device, requires_grad=False, dtype=torch.float
|
||||
).to(dtype=dtype_one)
|
||||
)
|
||||
}
|
||||
self.dtype_one = dtype_one
|
||||
self.dtype_two = dtype_two
|
||||
|
||||
def forward(self, input):
|
||||
return input.to(dtype=self.dtype_two)
|
||||
return input.to(torch.half)
|
||||
|
||||
|
||||
op_bench.generate_pt_test(tensor_conversion_short_configs, TensorConversionBenchmark)
|
||||
op_bench.generate_pt_test(tensor_conversion_long_configs, TensorConversionBenchmark)
|
||||
class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase):
|
||||
def init(self, M, N, device):
|
||||
self.inputs = {
|
||||
"input": torch.rand(
|
||||
M, N, device=device, requires_grad=False, dtype=torch.half
|
||||
)
|
||||
}
|
||||
|
||||
def forward(self, input):
|
||||
return input.to(torch.float)
|
||||
|
||||
|
||||
op_bench.generate_pt_test(
|
||||
tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark
|
||||
)
|
||||
op_bench.generate_pt_test(
|
||||
tensor_conversion_long_configs, FloatToHalfTensorConversionBenchmark
|
||||
)
|
||||
op_bench.generate_pt_test(
|
||||
tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark
|
||||
)
|
||||
op_bench.generate_pt_test(
|
||||
tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
op_bench.benchmark_runner.main()
|
||||
|
||||
@ -349,106 +349,24 @@ PyTorch,sum,sum_R256_V512_dim0_contiguousTrue_cpu,short,FALSE,12.5841
|
||||
PyTorch,sum,sum_R256_V512_dim0_contiguousFALSE_cpu,short,FALSE,20.8765
|
||||
PyTorch,sum,sum_R256_V512_dim1_contiguousTrue_cpu,short,FALSE,15.4414
|
||||
PyTorch,sum,sum_R256_V512_dim1_contiguousFALSE_cpu,short,FALSE,15.3287
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,0.797
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.uint8,short,False,6.071
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int8,short,False,6.031
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int16,short,False,6.243
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int32,short,False,7.231
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.int64,short,False,7.791
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float16,short,False,12.661
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.bfloat16,short,False,11.225
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float32,short,False,9.772
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bool_dtype_twotorch.float64,short,False,9.872
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bool,short,False,6.033
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,0.781
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int8,short,False,6.060
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int16,short,False,6.180
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.258
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.int64,short,False,7.758
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float16,short,False,10.504
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.bfloat16,short,False,6.749
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float32,short,False,7.679
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.uint8_dtype_twotorch.float64,short,False,7.797
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bool,short,False,6.019
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.uint8,short,False,6.079
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int8,short,False,0.785
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int16,short,False,6.188
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int32,short,False,7.288
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.int64,short,False,7.770
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float16,short,False,10.466
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.bfloat16,short,False,6.676
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float32,short,False,7.736
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int8_dtype_twotorch.float64,short,False,7.780
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bool,short,False,6.130
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.uint8,short,False,6.221
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int8,short,False,6.101
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int16,short,False,0.791
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int32,short,False,6.254
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.int64,short,False,7.733
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float16,short,False,10.562
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.bfloat16,short,False,6.704
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float32,short,False,7.819
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int16_dtype_twotorch.float64,short,False,8.276
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bool,short,False,6.361
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,6.364
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int8,short,False,6.309
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int16,short,False,6.362
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,0.791
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.int64,short,False,7.746
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float16,short,False,9.462
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.bfloat16,short,False,6.678
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float32,short,False,7.827
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int32_dtype_twotorch.float64,short,False,8.200
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bool,short,False,6.925
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.uint8,short,False,6.947
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int8,short,False,6.962
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int16,short,False,6.906
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int32,short,False,7.664
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.int64,short,False,0.782
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float16,short,False,10.528
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.bfloat16,short,False,10.123
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float32,short,False,9.234
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.int64_dtype_twotorch.float64,short,False,8.694
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bool,short,False,12.653
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.uint8,short,False,9.348
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int8,short,False,8.774
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int16,short,False,9.063
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int32,short,False,10.012
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.int64,short,False,13.641
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float16,short,False,0.788
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.bfloat16,short,False,13.757
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float32,short,False,7.170
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float16_dtype_twotorch.float64,short,False,12.511
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bool,short,False,6.516
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.uint8,short,False,8.539
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int8,short,False,6.483
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int16,short,False,6.468
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int32,short,False,7.752
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.int64,short,False,9.868
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float16,short,False,10.556
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.bfloat16,short,False,0.792
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float32,short,False,7.577
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.bfloat16_dtype_twotorch.float64,short,False,8.267
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bool,short,False,6.819
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.uint8,short,False,7.715
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int8,short,False,6.754
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int16,short,False,6.825
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int32,short,False,7.790
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.int64,short,False,9.219
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float16,short,False,5.977
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.bfloat16,short,False,7.069
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float32,short,False,0.794
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float32_dtype_twotorch.float64,short,False,8.301
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bool,short,False,7.401
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.uint8,short,False,7.843
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int8,short,False,7.117
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int16,short,False,7.170
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int32,short,False,8.000
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.int64,short,False,9.284
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float16,short,False,7.179
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.bfloat16,short,False,7.645
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float32,short,False,7.988
|
||||
PyTorch,TensorConversionBenchmark,TensorConversionBenchmark_M32_N128_cpu_dtype_onetorch.float64_dtype_twotorch.float64,short,False,0.792
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0499
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3229
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4418
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.0868
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4495
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5578
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.2631
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5646
|
||||
PyTorch,FloatToHalfTensorConversionBenchmark,FloatToHalfTensorConversionBenchmark_M32_N128_cpu,short,FALSE,5.7898
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N16_cpu,short,FALSE,5.0228
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N64_cpu,short,FALSE,5.3692
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M8_N128_cpu,short,FALSE,5.4006
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N16_cpu,short,FALSE,5.1107
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N64_cpu,short,FALSE,5.4119
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M16_N128_cpu,short,FALSE,5.5583
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N16_cpu,short,FALSE,5.3818
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N64_cpu,short,FALSE,5.5742
|
||||
PyTorch,HalfToFloatTensorConversionBenchmark,HalfToFloatTensorConversionBenchmark_M32_N128_cpu,short,FALSE,6.8414
|
||||
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.quint8",short,FALSE,9.4657
|
||||
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint8",short,FALSE,9.4625
|
||||
PyTorch,relu,"relu_dims(3,4,5)_contigFALSE_inplaceFALSE_dtypetorch.qint32",short,FALSE,9.4165
|
||||
|
||||
|
@ -52,18 +52,19 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
|
||||
start.record()
|
||||
coo.matmul(mat)
|
||||
stop.record()
|
||||
|
||||
times.append(start.elapsed_time(stop))
|
||||
|
||||
coo_mean_time = sum(times) / len(times)
|
||||
coo_mean_time = sum(times) / len(times)
|
||||
|
||||
times = []
|
||||
for _ in range(test_count):
|
||||
start.record()
|
||||
csr.matmul(mat)
|
||||
stop.record()
|
||||
times.append(start.elapsed_time(stop))
|
||||
times = []
|
||||
for _ in range(test_count):
|
||||
start.record()
|
||||
csr.matmul(mat)
|
||||
stop.record()
|
||||
times.append(start.elapsed_time(stop))
|
||||
|
||||
csr_mean_time = sum(times) / len(times)
|
||||
csr_mean_time = sum(times) / len(times)
|
||||
|
||||
return coo_mean_time, csr_mean_time
|
||||
|
||||
@ -83,13 +84,10 @@ if __name__ == "__main__":
|
||||
|
||||
if args.outfile == "stdout":
|
||||
outfile = sys.stdout
|
||||
need_close = False
|
||||
elif args.outfile == "stderr":
|
||||
outfile = sys.stderr
|
||||
need_close = False
|
||||
else:
|
||||
outfile = open(args.outfile, "a")
|
||||
need_close = True
|
||||
|
||||
test_count = args.test_count
|
||||
m = args.m
|
||||
@ -150,5 +148,3 @@ if __name__ == "__main__":
|
||||
time,
|
||||
file=outfile,
|
||||
)
|
||||
if need_close:
|
||||
outfile.close()
|
||||
|
||||
@ -82,13 +82,10 @@ if __name__ == "__main__":
|
||||
|
||||
if args.outfile == "stdout":
|
||||
outfile = sys.stdout
|
||||
need_close = False
|
||||
elif args.outfile == "stderr":
|
||||
outfile = sys.stderr
|
||||
need_close = False
|
||||
else:
|
||||
outfile = open(args.outfile, "a")
|
||||
need_close = True
|
||||
|
||||
test_count = args.test_count
|
||||
m = args.m
|
||||
@ -135,5 +132,3 @@ if __name__ == "__main__":
|
||||
time_csr,
|
||||
file=outfile,
|
||||
)
|
||||
if need_close:
|
||||
outfile.close()
|
||||
|
||||
@ -179,13 +179,10 @@ if __name__ == "__main__":
|
||||
|
||||
if args.outfile == "stdout":
|
||||
outfile = sys.stdout
|
||||
need_close = False
|
||||
elif args.outfile == "stderr":
|
||||
outfile = sys.stderr
|
||||
need_close = False
|
||||
else:
|
||||
outfile = open(args.outfile, "a")
|
||||
need_close = True
|
||||
|
||||
ops = args.ops.split(",")
|
||||
|
||||
@ -437,5 +434,3 @@ if __name__ == "__main__":
|
||||
if op not in {"bsr_scatter_mm6", "bsr_dense_mm_with_meta"}:
|
||||
# Break on operations that do not consume parameters
|
||||
break
|
||||
if need_close:
|
||||
outfile.close()
|
||||
|
||||
@ -96,10 +96,6 @@ struct C10_API DeviceAllocator : public c10::Allocator {
|
||||
|
||||
// Resets peak memory usage statistics for the specified device
|
||||
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
|
||||
|
||||
// Return the free memory size and total memory size in bytes for the
|
||||
// specified device.
|
||||
virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) = 0;
|
||||
};
|
||||
|
||||
// This function is used to get the DeviceAllocator for a specific device type
|
||||
|
||||
@ -44,7 +44,7 @@ struct C10_API SafePyObject {
|
||||
(*other.pyinterpreter_)->incref(other.data_);
|
||||
}
|
||||
if (data_ != nullptr) {
|
||||
(*pyinterpreter_)->decref(data_);
|
||||
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
|
||||
}
|
||||
data_ = other.data_;
|
||||
pyinterpreter_ = other.pyinterpreter_;
|
||||
@ -53,7 +53,7 @@ struct C10_API SafePyObject {
|
||||
|
||||
~SafePyObject() {
|
||||
if (data_ != nullptr) {
|
||||
(*pyinterpreter_)->decref(data_);
|
||||
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -48,30 +48,6 @@ void warnDeprecatedDataPtr() {
|
||||
TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
|
||||
}
|
||||
|
||||
void StorageImpl::incref_pyobject() const {
|
||||
// Because intrusive_ptr incref uses relaxed memory order, we need to
|
||||
// do an acquire fence to ensure that the kHasPyObject bit was
|
||||
// observed before the load of the PyObject* below.
|
||||
// NB: This is a no-op on x86/x86-64
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
|
||||
}
|
||||
|
||||
void StorageImpl::decref_pyobject() const {
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
|
||||
}
|
||||
|
||||
bool StorageImpl::try_incref_pyobject() const {
|
||||
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
|
||||
if (C10_UNLIKELY(!interp)) {
|
||||
return false;
|
||||
}
|
||||
return (*interp)->try_incref(pyobj_slot_);
|
||||
}
|
||||
|
||||
void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
|
||||
// Allowlist verification.
|
||||
// Only if the devicetype is in the allowlist,
|
||||
|
||||
@ -105,12 +105,6 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
|
||||
data_ptr_.clear();
|
||||
}
|
||||
|
||||
void incref_pyobject() const override final;
|
||||
|
||||
void decref_pyobject() const override final;
|
||||
|
||||
bool try_incref_pyobject() const override final;
|
||||
|
||||
size_t nbytes() const {
|
||||
// OK to do this instead of maybe_as_int as nbytes is guaranteed positive
|
||||
TORCH_CHECK(!size_bytes_is_heap_allocated_);
|
||||
@ -376,14 +370,4 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
|
||||
bool resizable,
|
||||
std::optional<at::Device> device_opt);
|
||||
|
||||
namespace detail {
|
||||
template <class T>
|
||||
struct TargetTraits<
|
||||
T,
|
||||
std::enable_if_t<
|
||||
std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
} // namespace c10
|
||||
|
||||
@ -277,6 +277,7 @@ void TensorImpl::release_resources() {
|
||||
if (storage_) {
|
||||
storage_ = {};
|
||||
}
|
||||
pyobj_slot_.maybe_destroy_pyobj();
|
||||
}
|
||||
|
||||
#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
|
||||
@ -988,30 +989,6 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
|
||||
}
|
||||
}
|
||||
|
||||
void TensorImpl::incref_pyobject() const {
|
||||
// Because intrusive_ptr incref uses relaxed memory order, we need to
|
||||
// do an acquire fence to ensure that the kHasPyObject bit was
|
||||
// observed before the load of the PyObject* below.
|
||||
// NB: This is a no-op on x86/x86-64
|
||||
std::atomic_thread_fence(std::memory_order_acquire);
|
||||
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
|
||||
}
|
||||
|
||||
void TensorImpl::decref_pyobject() const {
|
||||
PyObject* obj = pyobj_slot_.load_pyobj();
|
||||
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
|
||||
}
|
||||
|
||||
bool TensorImpl::try_incref_pyobject() const {
|
||||
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
|
||||
if (C10_UNLIKELY(!interp)) {
|
||||
return false;
|
||||
}
|
||||
return (*interp)->try_incref(pyobj_slot_);
|
||||
}
|
||||
|
||||
namespace impl {
|
||||
|
||||
namespace {
|
||||
|
||||
@ -2176,12 +2176,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
|
||||
return &pyobj_slot_;
|
||||
}
|
||||
|
||||
void incref_pyobject() const override final;
|
||||
|
||||
void decref_pyobject() const override final;
|
||||
|
||||
bool try_incref_pyobject() const override final;
|
||||
|
||||
private:
|
||||
// See NOTE [std::optional operator usage in CUDA]
|
||||
// We probably don't want to expose this publicly until
|
||||
@ -3083,17 +3077,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
|
||||
friend class C10_TensorImpl_Size_Check_Dummy_Class;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <class T>
|
||||
struct TargetTraits<
|
||||
T,
|
||||
std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Note [TensorImpl size constraints]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Changed the size of TensorImpl? If the size went down, good for
|
||||
|
||||
@ -11,11 +11,8 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
|
||||
|
||||
void incref(PyObject* pyobj) const override {} // do nothing
|
||||
|
||||
void decref(PyObject* pyobj) const override {} // do nothing
|
||||
|
||||
bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
|
||||
return false;
|
||||
}
|
||||
void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
|
||||
} // do nothing
|
||||
|
||||
#define PANIC(m) \
|
||||
TORCH_INTERNAL_ASSERT( \
|
||||
@ -23,10 +20,6 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
|
||||
"attempted to call " #m \
|
||||
" on a Tensor with nontrivial PyObject after corresponding interpreter died")
|
||||
|
||||
size_t refcnt(PyObject* pyobj) const override {
|
||||
PANIC(refcnt);
|
||||
}
|
||||
|
||||
c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
|
||||
PANIC(detach);
|
||||
}
|
||||
|
||||
@ -18,9 +18,6 @@ namespace c10 {
|
||||
struct IValue;
|
||||
class OperatorHandle;
|
||||
struct TensorImpl;
|
||||
namespace impl {
|
||||
struct PyObjectSlot;
|
||||
} // namespace impl
|
||||
} // namespace c10
|
||||
|
||||
namespace torch::jit {
|
||||
@ -129,12 +126,9 @@ struct C10_API PyInterpreterVTable {
|
||||
|
||||
// Run Py_INCREF on a PyObject.
|
||||
virtual void incref(PyObject* pyobj) const = 0;
|
||||
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call.
|
||||
virtual void decref(PyObject* pyobj) const = 0;
|
||||
// Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
|
||||
virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
|
||||
// Run Py_REFCNT on a PyObject.
|
||||
virtual size_t refcnt(PyObject* pyobj) const = 0;
|
||||
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call
|
||||
// See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
|
||||
virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;
|
||||
|
||||
// Perform a detach by deferring to the __torch_dispatch__ implementation of
|
||||
// detach, which will also arrange for the PyObject to get copied in this
|
||||
|
||||
56
c10/core/impl/PyObjectSlot.cpp
Normal file
56
c10/core/impl/PyObjectSlot.cpp
Normal file
@ -0,0 +1,56 @@
|
||||
#include <c10/core/impl/PyObjectSlot.h>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
|
||||
PyObjectSlot::~PyObjectSlot() {
|
||||
maybe_destroy_pyobj();
|
||||
}
|
||||
|
||||
void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
if (owns_pyobj()) {
|
||||
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
|
||||
(*pyobj_interpreter_.load(std::memory_order_acquire))
|
||||
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
|
||||
// NB: this destructor can only be entered when there are no
|
||||
// references to this C++ object (obviously), NOR any references
|
||||
// to the PyObject (if there are references to the PyObject,
|
||||
// then the PyObject holds an owning reference to the tensor).
|
||||
// So it is OK to clear pyobj_ here as it is impossible for it to
|
||||
// be used again (modulo weak reference races)
|
||||
pyobj_ = nullptr; // for safety
|
||||
}
|
||||
}
|
||||
|
||||
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return reinterpret_cast<PyObject*>(
|
||||
reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
|
||||
}
|
||||
|
||||
bool PyObjectSlot::owns_pyobj() {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
return reinterpret_cast<uintptr_t>(pyobj_) & 1;
|
||||
}
|
||||
|
||||
void PyObjectSlot::set_owns_pyobj(bool b) {
|
||||
// NOLINTNEXTLINE(performance-no-int-to-ptr)
|
||||
pyobj_ = reinterpret_cast<PyObject*>(
|
||||
reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
|
||||
}
|
||||
|
||||
} // namespace c10::impl
|
||||
@ -8,70 +8,117 @@
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace torch::utils {
|
||||
class PyObjectPreservation;
|
||||
}
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
struct C10_API PyObjectSlot {
|
||||
public:
|
||||
PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
PyObjectSlot();
|
||||
|
||||
~PyObjectSlot();
|
||||
|
||||
void maybe_destroy_pyobj();
|
||||
|
||||
// Associate the TensorImpl with the specified PyObject, and, if necessary,
|
||||
// also tag the interpreter.
|
||||
//
|
||||
// NB: This lives in a header so that we can inline away the switch on status
|
||||
//
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
|
||||
// Query the PyObject interpreter. This may return null if there is no
|
||||
// interpreter.
|
||||
PyInterpreter* pyobj_interpreter() const {
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
// interpreter. This is racy!
|
||||
PyInterpreter* pyobj_interpreter();
|
||||
|
||||
PyObject* _unchecked_untagged_pyobj() const;
|
||||
|
||||
// Test the interpreter tag. If tagged for the current interpreter, return
|
||||
// a non-nullopt (but possibly null) PyObject. If (possibly) untagged,
|
||||
// returns a nullopt. If it is definitely invalid, raises an error.
|
||||
//
|
||||
// If `ignore_hermetic_tls` is false and this function is called from a
|
||||
// hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
|
||||
// nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
|
||||
// context is ignored, allowing you to check the interpreter tag of a
|
||||
// nonhermetic PyObject from within a hermetic context. This is necessary
|
||||
// because there are some cases where the deallocator function of a
|
||||
// nonhermetic PyObject is called from within a hermetic context, so it must
|
||||
// be properly treated as a nonhermetic PyObject.
|
||||
//
|
||||
// NB: this lives in header so that we can avoid actually creating the
|
||||
// std::optional
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
}
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const {
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
interpreter, "cannot access PyObject for Tensor - no interpreter set");
|
||||
return *interpreter;
|
||||
}
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
|
||||
PyObject* load_pyobj() const {
|
||||
return pyobj_.load(std::memory_order_acquire);
|
||||
}
|
||||
bool owns_pyobj();
|
||||
|
||||
bool has_unique_reference() const {
|
||||
PyObject* pyobj = load_pyobj();
|
||||
return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
pyobj_.store(nullptr, std::memory_order_relaxed);
|
||||
pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
// Non thread-safe swap
|
||||
void swap(PyObjectSlot& other) noexcept {
|
||||
PyInterpreter* tmp_interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_relaxed);
|
||||
pyobj_interpreter_.store(
|
||||
other.pyobj_interpreter_.load(std::memory_order_relaxed),
|
||||
std::memory_order_relaxed);
|
||||
other.pyobj_interpreter_.store(tmp_interpreter, std::memory_order_relaxed);
|
||||
|
||||
PyObject* tmp_pyobj = pyobj_.load(std::memory_order_relaxed);
|
||||
pyobj_.store(
|
||||
other.pyobj_.load(std::memory_order_relaxed),
|
||||
std::memory_order_relaxed);
|
||||
other.pyobj_.store(tmp_pyobj, std::memory_order_relaxed);
|
||||
}
|
||||
void set_owns_pyobj(bool b);
|
||||
|
||||
private:
|
||||
// This is now always the global interpreter if the PyObject is set.
|
||||
// Maybe we can remove this field some day...
|
||||
// This field contains the interpreter tag for this object. See
|
||||
// Note [Python interpreter tag] for general context
|
||||
//
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// What memory_order do we need when accessing this atomic? We don't
|
||||
// need a single total modification order (as provided by
|
||||
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
|
||||
// transition from -1 to some positive integer and never changes afterwards.
|
||||
// Because there is only one modification, it trivially already has a total
|
||||
// modification order (e.g., we don't need fences or locked instructions on
|
||||
// x86)
|
||||
//
|
||||
// In fact, one could make a reasonable argument that relaxed reads are OK,
|
||||
// due to the presence of external locking (GIL) to ensure that interactions
|
||||
// with other data structures are still correctly synchronized, so that
|
||||
// we fall in the "Single-Location Data Structures" case as described in
|
||||
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
|
||||
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
|
||||
// as I get the same assembly in both cases. So I just use the more
|
||||
// conservative acquire (which will impede compiler optimizations but I don't
|
||||
// care)
|
||||
std::atomic<PyInterpreter*> pyobj_interpreter_;
|
||||
|
||||
// The PyObject representing this Tensor or nullptr. Ownership is managed
|
||||
// by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
|
||||
// reference is already dead.
|
||||
std::atomic<PyObject*> pyobj_;
|
||||
|
||||
friend class torch::utils::PyObjectPreservation;
|
||||
// This field contains a reference to a PyObject representing this Tensor.
|
||||
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
|
||||
// PyObject for it and set this field. This field does not have to be
|
||||
// protected by an atomic as it is only allowed to be accessed when you hold
|
||||
// the GIL, or during destruction of the tensor.
|
||||
//
|
||||
// When a PyObject dies, you are obligated to clear this field
|
||||
// (otherwise, you will try to use-after-free the pyobj); this currently
|
||||
// occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
|
||||
//
|
||||
// NB: Ordinarily, this should not be a strong reference, as if the
|
||||
// PyObject owns the Tensor, this would create a reference cycle.
|
||||
// However, sometimes this ownership flips. To track who owns
|
||||
// who, this has a single pointer tag indicating whether or not the
|
||||
// C++ object owns the PyObject (the common case, zero, means PyObject
|
||||
// owns the C++ object); see _unchecked_untagged_pyobj for raw access
|
||||
// or check_pyobj for checked access. See references to PyObject
|
||||
// resurrection in torch/csrc/autograd/python_variable.cpp
|
||||
PyObject* pyobj_;
|
||||
};
|
||||
|
||||
} // namespace c10::impl
|
||||
|
||||
@ -345,13 +345,6 @@ class CUDAAllocator : public DeviceAllocator {
|
||||
c10::DeviceIndex device,
|
||||
std::shared_ptr<AllocatorState> pps) = 0;
|
||||
virtual std::string name() = 0;
|
||||
std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
|
||||
c10::DeviceGuard device_guard({at::kCUDA, device});
|
||||
size_t free = 0;
|
||||
size_t total = 0;
|
||||
C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
|
||||
return {free, total};
|
||||
}
|
||||
};
|
||||
|
||||
// Allocator object, statically initialized
|
||||
|
||||
@ -1,111 +0,0 @@
|
||||
#pragma once
|
||||
#include <c10/metal/common.h>
|
||||
|
||||
namespace c10 {
|
||||
namespace metal {
|
||||
C10_METAL_CONSTEXPR unsigned error_message_count = 30;
|
||||
struct ErrorMessage {
|
||||
char file[128];
|
||||
char func[128];
|
||||
char message[250];
|
||||
unsigned int line;
|
||||
};
|
||||
|
||||
struct ErrorMessages {
|
||||
#ifdef __METAL__
|
||||
::metal::atomic<unsigned int> count;
|
||||
#else
|
||||
unsigned int count;
|
||||
#endif
|
||||
ErrorMessage msg[error_message_count];
|
||||
};
|
||||
|
||||
#ifdef __METAL__
|
||||
namespace detail {
|
||||
static uint strncpy(device char* dst, constant const char* src, unsigned len) {
|
||||
uint i = 0;
|
||||
while (src[i] != 0 && i < len - 1) {
|
||||
dst[i] = src[i];
|
||||
i++;
|
||||
}
|
||||
dst[i] = 0;
|
||||
return i;
|
||||
}
|
||||
|
||||
inline uint print_arg(
|
||||
device char* ptr,
|
||||
unsigned len,
|
||||
constant const char* arg) {
|
||||
return strncpy(ptr, arg, len);
|
||||
}
|
||||
|
||||
// Returns number length as string in base10
|
||||
static inline uint base10_length(long num) {
|
||||
uint rc = 1;
|
||||
if (num < 0) {
|
||||
num = -num;
|
||||
rc += 1;
|
||||
}
|
||||
while (num > 9) {
|
||||
num /= 10;
|
||||
rc++;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
// Converts signed integer to string
|
||||
inline uint print_arg(device char* ptr, unsigned len, long arg) {
|
||||
const auto arg_len = base10_length(arg);
|
||||
if (arg_len >= len)
|
||||
return 0;
|
||||
if (arg < 0) {
|
||||
ptr[0] = '-';
|
||||
arg = -arg;
|
||||
}
|
||||
uint idx = 1;
|
||||
do {
|
||||
ptr[arg_len - idx] = '0' + (arg % 10);
|
||||
arg /= 10;
|
||||
idx++;
|
||||
} while (arg > 0);
|
||||
ptr[arg_len] = 0;
|
||||
return arg_len;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void print_args(device char* ptr, unsigned len, T arg) {
|
||||
print_arg(ptr, len, arg);
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
inline void print_args(device char* ptr, unsigned len, T arg, Args... args) {
|
||||
const auto rc = print_arg(ptr, len, arg);
|
||||
print_args(ptr + rc, len - rc, args...);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename... Args>
|
||||
static void report_error(
|
||||
device ErrorMessages* msgs,
|
||||
constant const char* file,
|
||||
int line,
|
||||
constant const char* func,
|
||||
Args... args) {
|
||||
const auto idx =
|
||||
atomic_fetch_add_explicit(&msgs->count, 1, ::metal::memory_order_relaxed);
|
||||
if (idx >= error_message_count) {
|
||||
return;
|
||||
}
|
||||
device auto* msg = &msgs->msg[idx];
|
||||
detail::strncpy(msg->file, file, 128);
|
||||
detail::strncpy(msg->func, func, 128);
|
||||
detail::print_args(msg->message, 250, args...);
|
||||
msg->line = line;
|
||||
}
|
||||
|
||||
#define TORCH_REPORT_ERROR(buf, ...) \
|
||||
::c10::metal::report_error(buf, __FILE__, __LINE__, __func__, __VA_ARGS__)
|
||||
#endif
|
||||
} // namespace metal
|
||||
} // namespace c10
|
||||
@ -1,8 +1,9 @@
|
||||
#include <c10/test/util/Macros.h>
|
||||
#include <c10/util/Metaprogramming.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/headeronly/util/Metaprogramming.h>
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace torch::headeronly::guts;
|
||||
using namespace c10::guts;
|
||||
|
||||
// NOLINTBEGIN(modernize*, cppcoreguidelines-special-member-functions)
|
||||
namespace {
|
||||
@ -64,15 +65,6 @@ static_assert(
|
||||
typename make_function_traits_t<void, typelist::typelist<int, float>>::
|
||||
func_type>::value,
|
||||
"");
|
||||
|
||||
struct Functor final {
|
||||
std::string operator()(int64_t a, float b) const;
|
||||
};
|
||||
static_assert(
|
||||
std::is_same<
|
||||
std::string(int64_t, float),
|
||||
typename infer_function_traits_t<Functor>::func_type>::value,
|
||||
"");
|
||||
} // namespace test_function_traits
|
||||
|
||||
struct MovableOnly {
|
||||
@ -1,8 +1,8 @@
|
||||
#include <c10/util/TypeList.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/headeronly/util/TypeList.h>
|
||||
#include <memory>
|
||||
|
||||
using namespace torch::headeronly::guts::typelist;
|
||||
using namespace c10::guts::typelist;
|
||||
// NOLINTBEGIN(modernize-unary-static-assert)
|
||||
namespace test_size {
|
||||
class MyClass {};
|
||||
@ -1,7 +1,7 @@
|
||||
#include <c10/util/TypeTraits.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include <torch/headeronly/util/TypeTraits.h>
|
||||
|
||||
using namespace torch::headeronly::guts;
|
||||
using namespace c10::guts;
|
||||
|
||||
// NOLINTBEGIN(modernize-unary-static-assert)
|
||||
namespace {
|
||||
1
c10/util/Metaprogramming.cpp
Normal file
1
c10/util/Metaprogramming.cpp
Normal file
@ -0,0 +1 @@
|
||||
#include <c10/util/Metaprogramming.h>
|
||||
@ -1 +1,224 @@
|
||||
#include <torch/headeronly/util/Metaprogramming.h>
|
||||
#pragma once
|
||||
|
||||
#include <c10/util/TypeList.h>
|
||||
#include <type_traits>
|
||||
|
||||
namespace c10::guts {
|
||||
|
||||
/**
|
||||
* Access information about result type or arguments from a function type.
|
||||
* Example:
|
||||
* using A = function_traits<int (float, double)>::return_type // A == int
|
||||
* using A = function_traits<int (float, double)>::parameter_types::tuple_type
|
||||
* // A == tuple<float, double>
|
||||
*/
|
||||
template <class Func>
|
||||
struct function_traits {
|
||||
static_assert(
|
||||
!std::is_same_v<Func, Func>,
|
||||
"In function_traits<Func>, Func must be a plain function type.");
|
||||
};
|
||||
template <class Result, class... Args>
|
||||
struct function_traits<Result(Args...)> {
|
||||
using func_type = Result(Args...);
|
||||
using return_type = Result;
|
||||
using parameter_types = typelist::typelist<Args...>;
|
||||
static constexpr auto number_of_parameters = sizeof...(Args);
|
||||
};
|
||||
|
||||
/**
|
||||
* infer_function_traits: creates a `function_traits` type for a simple
|
||||
* function (pointer) or functor (lambda/struct). Currently does not support
|
||||
* class methods.
|
||||
*/
|
||||
|
||||
template <typename Functor>
|
||||
struct infer_function_traits {
|
||||
using type = function_traits<
|
||||
c10::guts::detail::strip_class_t<decltype(&Functor::operator())>>;
|
||||
};
|
||||
|
||||
template <typename Result, typename... Args>
|
||||
struct infer_function_traits<Result (*)(Args...)> {
|
||||
using type = function_traits<Result(Args...)>;
|
||||
};
|
||||
|
||||
template <typename Result, typename... Args>
|
||||
struct infer_function_traits<Result(Args...)> {
|
||||
using type = function_traits<Result(Args...)>;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using infer_function_traits_t = typename infer_function_traits<T>::type;
|
||||
|
||||
/**
|
||||
* make_function_traits: creates a `function_traits` type given a Return type
|
||||
* and a typelist of Argument types
|
||||
*
|
||||
* Example:
|
||||
* bool f(int, int);
|
||||
*
|
||||
* infer_function_traits_t<f> == make_function_traits_t<bool,
|
||||
* typelist::typelist<int, int>>
|
||||
*/
|
||||
template <typename Result, typename ArgList>
|
||||
struct make_function_traits {
|
||||
static_assert(
|
||||
false_t<ArgList>::value,
|
||||
"In guts::make_function_traits<Result, TypeList>, the ArgList argument must be typelist<...>.");
|
||||
};
|
||||
|
||||
template <typename Result, typename... Args>
|
||||
struct make_function_traits<Result, typelist::typelist<Args...>> {
|
||||
using type = function_traits<Result(Args...)>;
|
||||
};
|
||||
|
||||
template <typename Result, typename ArgList>
|
||||
using make_function_traits_t =
|
||||
typename make_function_traits<Result, ArgList>::type;
|
||||
|
||||
/**
|
||||
* make_offset_index_sequence<Start, N>
|
||||
* Like make_index_sequence<N>, but starting from Start instead of 0.
|
||||
*
|
||||
* Example:
|
||||
* make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12>
|
||||
*/
|
||||
template <size_t Start, size_t N, size_t... Is>
|
||||
struct make_offset_index_sequence_impl
|
||||
: make_offset_index_sequence_impl<Start, N - 1, Start + N - 1, Is...> {
|
||||
static_assert(
|
||||
static_cast<int>(Start) >= 0,
|
||||
"make_offset_index_sequence: Start < 0");
|
||||
static_assert(static_cast<int>(N) >= 0, "make_offset_index_sequence: N < 0");
|
||||
};
|
||||
|
||||
template <size_t Start, size_t... Is>
|
||||
struct make_offset_index_sequence_impl<Start, 0, Is...> {
|
||||
typedef std::index_sequence<Is...> type;
|
||||
};
|
||||
|
||||
template <size_t Start, size_t N>
|
||||
using make_offset_index_sequence =
|
||||
typename make_offset_index_sequence_impl<Start, N>::type;
|
||||
|
||||
/**
|
||||
* Use tuple_elements to extract a position-indexed subset of elements
|
||||
* from the argument tuple into a result tuple.
|
||||
*
|
||||
* Example:
|
||||
* std::tuple<int, const char*, double> t = std::make_tuple(0, "HEY", 2.0);
|
||||
* std::tuple<int, double> result = tuple_elements(t, std::index_sequence<0,
|
||||
* 2>());
|
||||
*/
|
||||
template <class Tuple, size_t... Is>
|
||||
constexpr auto tuple_elements(Tuple t, std::index_sequence<Is...> /*unused*/) {
|
||||
return std::tuple<std::tuple_element_t<Is, Tuple>...>(std::get<Is>(t)...);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use tuple_take to extract the first or last n elements from the argument
|
||||
* tuple into a result tuple.
|
||||
*
|
||||
* Example:
|
||||
* std::tuple<int, const char*, double> t = std::make_tuple(0, "HEY", 2.0);
|
||||
* std::tuple<int, const char*> first_two = tuple_take<decltype(t), 2>(t);
|
||||
* std::tuple<const char*, double> last_two = tuple_take<decltype(t), -2>(t);
|
||||
*/
|
||||
template <class Tuple, int N, class Enable = void>
|
||||
struct TupleTake {};
|
||||
|
||||
template <class Tuple, int N>
|
||||
struct TupleTake<Tuple, N, std::enable_if_t<N >= 0, void>> {
|
||||
static auto call(Tuple t) {
|
||||
constexpr size_t size = std::tuple_size<Tuple>();
|
||||
static_assert(N <= size, "tuple_take: N > size");
|
||||
return tuple_elements(t, std::make_index_sequence<N>{});
|
||||
}
|
||||
};
|
||||
|
||||
template <class Tuple, int N>
|
||||
struct TupleTake < Tuple,
|
||||
N, std::enable_if_t<N<0, void>> {
|
||||
static auto call(Tuple t) {
|
||||
constexpr size_t size = std::tuple_size<Tuple>();
|
||||
static_assert(-N <= size, "tuple_take: -N > size");
|
||||
return tuple_elements(t, make_offset_index_sequence<size + N, -N>{});
|
||||
}
|
||||
};
|
||||
|
||||
template <class Tuple, int N>
|
||||
auto tuple_take(Tuple t) {
|
||||
return TupleTake<Tuple, N>::call(t);
|
||||
}
|
||||
|
||||
/**
|
||||
* Use tuple_slice to extract a contiguous subtuple from the argument.
|
||||
*
|
||||
* Example:
|
||||
* std::tuple<int, const char*, double, bool> t = std::make_tuple(0,
|
||||
* "HEY", 2.0, false); std::tuple<int, const char*> middle_two =
|
||||
* tuple_slice<decltype(t), 1, 2>(t);
|
||||
*/
|
||||
template <class Tuple, size_t Start, size_t N>
|
||||
constexpr auto tuple_slice(Tuple t) {
|
||||
constexpr size_t size = std::tuple_size<Tuple>();
|
||||
static_assert(Start + N <= size, "tuple_slice: Start + N > size");
|
||||
return tuple_elements(t, make_offset_index_sequence<Start, N>{});
|
||||
}
|
||||
|
||||
/**
|
||||
* Use tuple_map to run a mapping function over a tuple to get a new tuple.
|
||||
*
|
||||
* Example 1:
|
||||
* auto result = tuple_map(std::tuple<int32_t, int32_t, int32_t>(3, 4, 5), []
|
||||
* (int32_t a) -> int16_t {return a+1;});
|
||||
* // result == std::tuple<int16_t, int16_t, int16_t>(4, 5, 6)
|
||||
*
|
||||
* Example 2:
|
||||
* struct Mapper {
|
||||
* std::string operator()(int32_t a) const {
|
||||
* return std::to_string(a);
|
||||
* }
|
||||
* int64_t operator()(const std::string& a) const {
|
||||
* return atoi(a.c_str());
|
||||
* }
|
||||
* };
|
||||
* auto result = tuple_map(std::tuple<int32_t, std::string>(3, "4"),
|
||||
* Mapper());
|
||||
* // result == std::tuple<std::string, int64_t>("3", 4)
|
||||
*
|
||||
* Example 3:
|
||||
* struct A final {
|
||||
* int32_t func() {
|
||||
* return 5;
|
||||
* }
|
||||
* };
|
||||
* struct B final {
|
||||
* std::string func() {
|
||||
* return "5";
|
||||
* }
|
||||
* };
|
||||
* auto result = tuple_map(std::make_tuple(A(), B()), [] (auto a) { return
|
||||
* a.func(); });
|
||||
* // result == std::tuple<int32_t, std::string>(5, "5");
|
||||
*/
|
||||
namespace detail {
|
||||
template <class Mapper, class... Args, size_t... Indices>
|
||||
auto tuple_map(
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
|
||||
std::tuple<Args...>&& tuple,
|
||||
const Mapper& mapper,
|
||||
std::index_sequence<Indices...> /*unused*/) {
|
||||
return std::tuple<decltype(mapper(std::forward<Args>(std::get<Indices>(
|
||||
tuple))))...>(mapper(std::forward<Args>(std::get<Indices>(tuple)))...);
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
template <class Mapper, class... Args>
|
||||
auto tuple_map(std::tuple<Args...>&& tuple, const Mapper& mapper) {
|
||||
return detail::tuple_map(
|
||||
std::move(tuple), mapper, std::index_sequence_for<Args...>());
|
||||
}
|
||||
|
||||
} // namespace c10::guts
|
||||
|
||||
@ -1 +1,515 @@
|
||||
#include <torch/headeronly/util/TypeList.h>
|
||||
#pragma once
|
||||
|
||||
#include <c10/util/TypeTraits.h>
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
namespace c10::guts {
|
||||
|
||||
template <class... T>
|
||||
struct false_t : std::false_type {};
|
||||
template <template <class> class... T>
|
||||
struct false_higher_t : std::false_type {};
|
||||
|
||||
namespace typelist {
|
||||
|
||||
/**
|
||||
* Type holding a list of types for compile time type computations
|
||||
*/
|
||||
template <class... Items>
|
||||
struct typelist final {
|
||||
public:
|
||||
typelist() = delete; // not for instantiation
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns the number of types in a typelist
|
||||
* Example:
|
||||
* 3 == size<typelist<int, int, double>>::value
|
||||
*/
|
||||
template <class TypeList>
|
||||
struct size final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::size<T>, T must be typelist<...>.");
|
||||
};
|
||||
template <class... Types>
|
||||
struct size<typelist<Types...>> final {
|
||||
static constexpr size_t value = sizeof...(Types);
|
||||
};
|
||||
|
||||
/**
|
||||
* Transforms a list of types into a tuple holding these types.
|
||||
* Example:
|
||||
* std::tuple<int, string> == to_tuple_t<typelist<int, string>>
|
||||
*/
|
||||
template <class TypeList>
|
||||
struct to_tuple final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::to_tuple<T>, T must be typelist<...>.");
|
||||
};
|
||||
template <class... Types>
|
||||
struct to_tuple<typelist<Types...>> final {
|
||||
using type = std::tuple<Types...>;
|
||||
};
|
||||
template <class TypeList>
|
||||
using to_tuple_t = typename to_tuple<TypeList>::type;
|
||||
|
||||
/**
|
||||
* Creates a typelist containing the types of a given tuple.
|
||||
* Example:
|
||||
* typelist<int, string> == from_tuple_t<std::tuple<int, string>>
|
||||
*/
|
||||
template <class Tuple>
|
||||
struct from_tuple final {
|
||||
static_assert(
|
||||
false_t<Tuple>::value,
|
||||
"In typelist::from_tuple<T>, T must be std::tuple<...>.");
|
||||
};
|
||||
template <class... Types>
|
||||
struct from_tuple<std::tuple<Types...>> final {
|
||||
using type = typelist<Types...>;
|
||||
};
|
||||
template <class Tuple>
|
||||
using from_tuple_t = typename from_tuple<Tuple>::type;
|
||||
|
||||
/**
|
||||
* Concatenates multiple type lists.
|
||||
* Example:
|
||||
* typelist<int, string, int> == concat_t<typelist<int, string>,
|
||||
* typelist<int>>
|
||||
*/
|
||||
template <class... TypeLists>
|
||||
struct concat final {
|
||||
static_assert(
|
||||
false_t<TypeLists...>::value,
|
||||
"In typelist::concat<T1, ...>, the T arguments each must be typelist<...>.");
|
||||
};
|
||||
template <class... Head1Types, class... Head2Types, class... TailLists>
|
||||
struct concat<typelist<Head1Types...>, typelist<Head2Types...>, TailLists...>
|
||||
final {
|
||||
using type =
|
||||
typename concat<typelist<Head1Types..., Head2Types...>, TailLists...>::
|
||||
type;
|
||||
};
|
||||
template <class... HeadTypes>
|
||||
struct concat<typelist<HeadTypes...>> final {
|
||||
using type = typelist<HeadTypes...>;
|
||||
};
|
||||
template <>
|
||||
struct concat<> final {
|
||||
using type = typelist<>;
|
||||
};
|
||||
template <class... TypeLists>
|
||||
using concat_t = typename concat<TypeLists...>::type;
|
||||
|
||||
/**
|
||||
* Filters the types in a type list by a type trait.
|
||||
* Examples:
|
||||
* typelist<int&, const string&&> == filter_t<std::is_reference,
|
||||
* typelist<void, string, int&, bool, const string&&, int>>
|
||||
*/
|
||||
template <template <class> class Condition, class TypeList>
|
||||
struct filter final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::filter<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||
};
|
||||
template <template <class> class Condition, class Head, class... Tail>
|
||||
struct filter<Condition, typelist<Head, Tail...>> final {
|
||||
static_assert(
|
||||
is_type_condition<Condition>::value,
|
||||
"In typelist::filter<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||
using type = std::conditional_t<
|
||||
Condition<Head>::value,
|
||||
concat_t<
|
||||
typelist<Head>,
|
||||
typename filter<Condition, typelist<Tail...>>::type>,
|
||||
typename filter<Condition, typelist<Tail...>>::type>;
|
||||
};
|
||||
template <template <class> class Condition>
|
||||
struct filter<Condition, typelist<>> final {
|
||||
static_assert(
|
||||
is_type_condition<Condition>::value,
|
||||
"In typelist::filter<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||
using type = typelist<>;
|
||||
};
|
||||
template <template <class> class Condition, class TypeList>
|
||||
using filter_t = typename filter<Condition, TypeList>::type;
|
||||
|
||||
/**
|
||||
* Counts how many types in the list fulfill a type trait
|
||||
* Examples:
|
||||
* 2 == count_if<std::is_reference, typelist<void, string, int&, bool, const
|
||||
* string&&, int>>
|
||||
*/
|
||||
template <template <class> class Condition, class TypeList>
|
||||
struct count_if final {
|
||||
static_assert(
|
||||
is_type_condition<Condition>::value,
|
||||
"In typelist::count_if<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||
static_assert(
|
||||
is_instantiation_of<typelist, TypeList>::value,
|
||||
"In typelist::count_if<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||
// TODO Direct implementation might be faster
|
||||
static constexpr size_t value = size<filter_t<Condition, TypeList>>::value;
|
||||
};
|
||||
|
||||
/**
|
||||
* Checks if a typelist contains a certain type.
|
||||
* Examples:
|
||||
* contains<typelist<int, string>, string> == true_type
|
||||
* contains<typelist<int, string>, double> == false_type
|
||||
*/
|
||||
namespace detail {
|
||||
template <class TypeList, class Type, class Enable = void>
|
||||
struct contains {};
|
||||
template <class Type>
|
||||
struct contains<typelist<>, Type, void> : std::false_type {};
|
||||
template <class Type, class Head, class... Tail>
|
||||
struct contains<
|
||||
typelist<Head, Tail...>,
|
||||
Type,
|
||||
std::enable_if_t<std::is_same_v<Head, Type>>> : std::true_type {};
|
||||
template <class Type, class Head, class... Tail>
|
||||
struct contains<
|
||||
typelist<Head, Tail...>,
|
||||
Type,
|
||||
std::enable_if_t<!std::is_same_v<Head, Type>>>
|
||||
: contains<typelist<Tail...>, Type> {};
|
||||
} // namespace detail
|
||||
template <class TypeList, class Type>
|
||||
using contains = typename detail::contains<TypeList, Type>::type;
|
||||
|
||||
/**
|
||||
* Returns true iff the type trait is true for all types in the type list
|
||||
* Examples:
|
||||
* true == all<std::is_reference, typelist<int&, const float&&, const
|
||||
* MyClass&>>::value false == all<std::is_reference, typelist<int&, const
|
||||
* float&&, MyClass>>::value
|
||||
*/
|
||||
template <template <class> class Condition, class TypeList>
|
||||
struct all {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::all<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||
};
|
||||
template <template <class> class Condition, class... Types>
|
||||
struct all<Condition, typelist<Types...>>
|
||||
: std::conjunction<Condition<Types>...> {
|
||||
static_assert(
|
||||
is_type_condition<Condition>::value,
|
||||
"In typelist::all<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns true iff the type trait is true for any type in the type list
|
||||
* Examples:
|
||||
* true == true_for_any_type<std::is_reference, typelist<int, const
|
||||
* float&&, const MyClass>>::value false ==
|
||||
* true_for_any_type<std::is_reference, typelist<int, const float,
|
||||
* MyClass>>::value
|
||||
*/
|
||||
template <template <class> class Condition, class TypeList>
|
||||
struct true_for_any_type final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::true_for_any_type<Condition, TypeList>, the TypeList argument must be typelist<...>.");
|
||||
};
|
||||
template <template <class> class Condition, class... Types>
|
||||
struct true_for_any_type<Condition, typelist<Types...>> final
|
||||
: std::disjunction<Condition<Types>...> {
|
||||
static_assert(
|
||||
is_type_condition<Condition>::value,
|
||||
"In typelist::true_for_any_type<Condition, TypeList>, the Condition argument must be a condition type trait, i.e. have a static constexpr bool ::value member.");
|
||||
};
|
||||
|
||||
/**
|
||||
* Maps types of a type list using a type trait
|
||||
* Example:
|
||||
* typelist<int&, double&, string&> == map_t<std::add_lvalue_reference_t,
|
||||
* typelist<int, double, string>>
|
||||
*/
|
||||
template <template <class> class Mapper, class TypeList>
|
||||
struct map final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::map<Mapper, TypeList>, the TypeList argument must be typelist<...>.");
|
||||
};
|
||||
template <template <class> class Mapper, class... Types>
|
||||
struct map<Mapper, typelist<Types...>> final {
|
||||
using type = typelist<Mapper<Types>...>;
|
||||
};
|
||||
template <template <class> class Mapper, class TypeList>
|
||||
using map_t = typename map<Mapper, TypeList>::type;
|
||||
|
||||
/**
|
||||
* Returns the first element of a type list.
|
||||
* Example:
|
||||
* int == head_t<typelist<int, string>>
|
||||
*/
|
||||
template <class TypeList>
|
||||
struct head final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::head<T>, the T argument must be typelist<...>.");
|
||||
};
|
||||
template <class Head, class... Tail>
|
||||
struct head<typelist<Head, Tail...>> final {
|
||||
using type = Head;
|
||||
};
|
||||
template <class TypeList>
|
||||
using head_t = typename head<TypeList>::type;
|
||||
|
||||
/**
|
||||
* Returns the first element of a type list, or the specified default if the
|
||||
* type list is empty. Example: int == head_t<bool, typelist<int, string>>
|
||||
* bool == head_t<bool, typelist<>>
|
||||
*/
|
||||
template <class Default, class TypeList>
|
||||
struct head_with_default final {
|
||||
using type = Default;
|
||||
};
|
||||
template <class Default, class Head, class... Tail>
|
||||
struct head_with_default<Default, typelist<Head, Tail...>> final {
|
||||
using type = Head;
|
||||
};
|
||||
template <class Default, class TypeList>
|
||||
using head_with_default_t = typename head_with_default<Default, TypeList>::type;
|
||||
|
||||
/**
|
||||
* Returns the N-th element of a type list.
|
||||
* Example:
|
||||
* int == element_t<1, typelist<float, int, char>>
|
||||
*/
|
||||
|
||||
/// Base template.
|
||||
template <size_t Index, class TypeList>
|
||||
struct element final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::element<T>, the T argument must be typelist<...>.");
|
||||
};
|
||||
|
||||
/// Successful case, we have reached the zero index and can "return" the head
|
||||
/// type.
|
||||
template <class Head, class... Tail>
|
||||
struct element<0, typelist<Head, Tail...>> {
|
||||
using type = Head;
|
||||
};
|
||||
|
||||
/// Error case, we have an index but ran out of types! It will only be selected
|
||||
/// if `Ts...` is actually empty!
|
||||
template <size_t Index, class... Ts>
|
||||
struct element<Index, typelist<Ts...>> {
|
||||
static_assert(
|
||||
Index < sizeof...(Ts),
|
||||
"Index is out of bounds in typelist::element");
|
||||
};
|
||||
|
||||
/// Shave off types until we hit the <0, Head, Tail...> or <Index> case.
|
||||
template <size_t Index, class Head, class... Tail>
|
||||
struct element<Index, typelist<Head, Tail...>>
|
||||
: element<Index - 1, typelist<Tail...>> {};
|
||||
|
||||
/// Convenience alias.
|
||||
template <size_t Index, class TypeList>
|
||||
using element_t = typename element<Index, TypeList>::type;
|
||||
|
||||
/**
|
||||
* Returns the last element of a type list.
|
||||
* Example:
|
||||
* int == last_t<typelist<int, string>>
|
||||
*/
|
||||
template <class TypeList>
|
||||
struct last final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::last<T>, the T argument must be typelist<...>.");
|
||||
};
|
||||
template <class Head, class... Tail>
|
||||
struct last<typelist<Head, Tail...>> final {
|
||||
using type = typename last<typelist<Tail...>>::type;
|
||||
};
|
||||
template <class Head>
|
||||
struct last<typelist<Head>> final {
|
||||
using type = Head;
|
||||
};
|
||||
template <class TypeList>
|
||||
using last_t = typename last<TypeList>::type;
|
||||
static_assert(std::is_same_v<int, last_t<typelist<double, float, int>>>);
|
||||
|
||||
/**
|
||||
* Take/drop a number of arguments from a typelist.
|
||||
* Example:
|
||||
* typelist<int, string> == take_t<typelist<int, string, bool>, 2>
|
||||
* typelist<bool> == drop_t<typelist<int, string, bool>, 2>
|
||||
*/
|
||||
namespace detail {
|
||||
template <class TypeList, size_t offset, class IndexSequence>
|
||||
struct take_elements final {};
|
||||
|
||||
template <class TypeList, size_t offset, size_t... Indices>
|
||||
struct take_elements<TypeList, offset, std::index_sequence<Indices...>> final {
|
||||
using type = typelist<typename element<offset + Indices, TypeList>::type...>;
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <class TypeList, size_t num>
|
||||
struct take final {
|
||||
static_assert(
|
||||
is_instantiation_of<typelist, TypeList>::value,
|
||||
"In typelist::take<T, num>, the T argument must be typelist<...>.");
|
||||
static_assert(
|
||||
num <= size<TypeList>::value,
|
||||
"Tried to typelist::take more elements than there are in the list");
|
||||
using type = typename detail::
|
||||
take_elements<TypeList, 0, std::make_index_sequence<num>>::type;
|
||||
};
|
||||
template <class TypeList, size_t num>
|
||||
using take_t = typename take<TypeList, num>::type;
|
||||
|
||||
template <class TypeList, size_t num>
|
||||
struct drop final {
|
||||
static_assert(
|
||||
is_instantiation_of<typelist, TypeList>::value,
|
||||
"In typelist::drop<T, num>, the T argument must be typelist<...>.");
|
||||
static_assert(
|
||||
num <= size<TypeList>::value,
|
||||
"Tried to typelist::drop more elements than there are in the list");
|
||||
using type = typename detail::take_elements<
|
||||
TypeList,
|
||||
num,
|
||||
std::make_index_sequence<size<TypeList>::value - num>>::type;
|
||||
};
|
||||
template <class TypeList, size_t num>
|
||||
using drop_t = typename drop<TypeList, num>::type;
|
||||
|
||||
/**
|
||||
* Like drop, but returns an empty list rather than an assertion error if `num`
|
||||
* is larger than the size of the TypeList.
|
||||
* Example:
|
||||
* typelist<> == drop_if_nonempty_t<typelist<string, bool>, 2>
|
||||
* typelist<> == drop_if_nonempty_t<typelist<int, string, bool>, 3>
|
||||
*/
|
||||
template <class TypeList, size_t num>
|
||||
struct drop_if_nonempty final {
|
||||
static_assert(
|
||||
is_instantiation_of<typelist, TypeList>::value,
|
||||
"In typelist::drop<T, num>, the T argument must be typelist<...>.");
|
||||
using type = typename detail::take_elements<
|
||||
TypeList,
|
||||
std::min(num, size<TypeList>::value),
|
||||
std::make_index_sequence<
|
||||
size<TypeList>::value - std::min(num, size<TypeList>::value)>>::type;
|
||||
};
|
||||
template <class TypeList, size_t num>
|
||||
using drop_if_nonempty_t = typename drop_if_nonempty<TypeList, num>::type;
|
||||
|
||||
/**
|
||||
* Reverses a typelist.
|
||||
* Example:
|
||||
* typelist<int, string> == reverse_t<typelist<string, int>>
|
||||
*/
|
||||
template <class TypeList>
|
||||
struct reverse final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::reverse<T>, the T argument must be typelist<...>.");
|
||||
};
|
||||
template <class Head, class... Tail>
|
||||
struct reverse<typelist<Head, Tail...>> final {
|
||||
using type =
|
||||
concat_t<typename reverse<typelist<Tail...>>::type, typelist<Head>>;
|
||||
};
|
||||
template <>
|
||||
struct reverse<typelist<>> final {
|
||||
using type = typelist<>;
|
||||
};
|
||||
template <class TypeList>
|
||||
using reverse_t = typename reverse<TypeList>::type;
|
||||
|
||||
/**
|
||||
* Find the index of the first type in a typelist fulfilling a type trait
|
||||
* condition. Example:
|
||||
*
|
||||
* 2 == find_if<typelist<char, int, char&, int&>, std::is_reference>::value
|
||||
*/
|
||||
template <class TypeList, template <class> class Condition, class Enable = void>
|
||||
struct find_if final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::find_if<TypeList, Condition>, the TypeList argument must be typelist<...>.");
|
||||
};
|
||||
template <template <class> class Condition>
|
||||
struct find_if<typelist<>, Condition, void> final {
|
||||
static_assert(
|
||||
false_higher_t<Condition>::value,
|
||||
"In typelist::find_if<Type/List, Condition>, didn't find any type fulfilling the Condition.");
|
||||
};
|
||||
template <class Head, class... Tail, template <class> class Condition>
|
||||
struct find_if<
|
||||
typelist<Head, Tail...>,
|
||||
Condition,
|
||||
std::enable_if_t<Condition<Head>::value>>
|
||||
final {
|
||||
static constexpr size_t value = 0;
|
||||
};
|
||||
template <class Head, class... Tail, template <class> class Condition>
|
||||
struct find_if<
|
||||
typelist<Head, Tail...>,
|
||||
Condition,
|
||||
std::enable_if_t<!Condition<Head>::value>>
|
||||
final {
|
||||
static constexpr size_t value =
|
||||
1 + find_if<typelist<Tail...>, Condition>::value;
|
||||
};
|
||||
|
||||
/**
|
||||
* Maps a list of types into a list of values.
|
||||
* Examples:
|
||||
* // Example 1
|
||||
* auto sizes =
|
||||
* map_types_to_values<typelist<int64_t, bool, uint32_t>>(
|
||||
* [] (auto t) { return sizeof(decltype(t)::type); }
|
||||
* );
|
||||
* // sizes == std::tuple<size_t, size_t, size_t>{8, 1, 4}
|
||||
*
|
||||
* // Example 2
|
||||
* auto shared_ptrs =
|
||||
* map_types_to_values<typelist<int, double>>(
|
||||
* [] (auto t) { return make_shared<typename decltype(t)::type>(); }
|
||||
* );
|
||||
* // shared_ptrs == std::tuple<shared_ptr<int>, shared_ptr<double>>()
|
||||
*/
|
||||
namespace detail {
|
||||
template <class T>
|
||||
struct type_ final {
|
||||
using type = T;
|
||||
};
|
||||
template <class TypeList>
|
||||
struct map_types_to_values final {
|
||||
static_assert(
|
||||
false_t<TypeList>::value,
|
||||
"In typelist::map_types_to_values<T>, the T argument must be typelist<...>.");
|
||||
};
|
||||
template <class... Types>
|
||||
struct map_types_to_values<typelist<Types...>> final {
|
||||
template <class Func>
|
||||
static auto call(Func&& func) {
|
||||
return std::tuple{std::forward<Func>(func)(type_<Types>())...};
|
||||
}
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <class TypeList, class Func>
|
||||
auto map_types_to_values(Func&& func) {
|
||||
return detail::map_types_to_values<TypeList>::call(std::forward<Func>(func));
|
||||
}
|
||||
|
||||
} // namespace typelist
|
||||
} // namespace c10::guts
|
||||
|
||||
@ -1 +1,151 @@
|
||||
#include <torch/headeronly/util/TypeTraits.h>
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <type_traits>
|
||||
|
||||
namespace c10::guts {
|
||||
|
||||
/**
|
||||
* is_equality_comparable<T> is true_type iff the equality operator is defined
|
||||
* for T.
|
||||
*/
|
||||
template <class T, class Enable = void>
|
||||
struct is_equality_comparable : std::false_type {};
|
||||
template <class T>
|
||||
struct is_equality_comparable<
|
||||
T,
|
||||
std::void_t<decltype(std::declval<T&>() == std::declval<T&>())>>
|
||||
: std::true_type {};
|
||||
template <class T>
|
||||
using is_equality_comparable_t = typename is_equality_comparable<T>::type;
|
||||
|
||||
/**
|
||||
* is_hashable<T> is true_type iff std::hash is defined for T
|
||||
*/
|
||||
template <class T, class Enable = void>
|
||||
struct is_hashable : std::false_type {};
|
||||
template <class T>
|
||||
struct is_hashable<T, std::void_t<decltype(std::hash<T>()(std::declval<T&>()))>>
|
||||
: std::true_type {};
|
||||
template <class T>
|
||||
using is_hashable_t = typename is_hashable<T>::type;
|
||||
|
||||
/**
|
||||
* is_function_type<T> is true_type iff T is a plain function type (i.e.
|
||||
* "Result(Args...)")
|
||||
*/
|
||||
template <class T>
|
||||
struct is_function_type : std::false_type {};
|
||||
template <class Result, class... Args>
|
||||
struct is_function_type<Result(Args...)> : std::true_type {};
|
||||
template <class T>
|
||||
using is_function_type_t = typename is_function_type<T>::type;
|
||||
|
||||
/**
|
||||
* is_instantiation_of<T, I> is true_type iff I is a template instantiation of T
|
||||
* (e.g. vector<int> is an instantiation of vector) Example:
|
||||
* is_instantiation_of_t<vector, vector<int>> // true
|
||||
* is_instantiation_of_t<pair, pair<int, string>> // true
|
||||
* is_instantiation_of_t<vector, pair<int, string>> // false
|
||||
*/
|
||||
template <template <class...> class Template, class T>
|
||||
struct is_instantiation_of : std::false_type {};
|
||||
template <template <class...> class Template, class... Args>
|
||||
struct is_instantiation_of<Template, Template<Args...>> : std::true_type {};
|
||||
template <template <class...> class Template, class T>
|
||||
using is_instantiation_of_t = typename is_instantiation_of<Template, T>::type;
|
||||
|
||||
namespace detail {
|
||||
/**
|
||||
* strip_class: helper to remove the class type from pointers to `operator()`.
|
||||
*/
|
||||
|
||||
template <typename T>
|
||||
struct strip_class {};
|
||||
template <typename Class, typename Result, typename... Args>
|
||||
struct strip_class<Result (Class::*)(Args...)> {
|
||||
using type = Result(Args...);
|
||||
};
|
||||
template <typename Class, typename Result, typename... Args>
|
||||
struct strip_class<Result (Class::*)(Args...) const> {
|
||||
using type = Result(Args...);
|
||||
};
|
||||
template <typename T>
|
||||
using strip_class_t = typename strip_class<T>::type;
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
* Evaluates to true_type, iff the given class is a Functor
|
||||
* (i.e. has a call operator with some set of arguments)
|
||||
*/
|
||||
|
||||
template <class Functor, class Enable = void>
|
||||
struct is_functor : std::false_type {};
|
||||
template <class Functor>
|
||||
struct is_functor<
|
||||
Functor,
|
||||
std::enable_if_t<is_function_type<
|
||||
detail::strip_class_t<decltype(&Functor::operator())>>::value>>
|
||||
: std::true_type {};
|
||||
|
||||
/**
|
||||
* lambda_is_stateless<T> is true iff the lambda type T is stateless
|
||||
* (i.e. does not have a closure).
|
||||
* Example:
|
||||
* auto stateless_lambda = [] (int a) {return a;};
|
||||
* lambda_is_stateless<decltype(stateless_lambda)> // true
|
||||
* auto stateful_lambda = [&] (int a) {return a;};
|
||||
* lambda_is_stateless<decltype(stateful_lambda)> // false
|
||||
*/
|
||||
namespace detail {
|
||||
template <class LambdaType, class FuncType>
|
||||
struct is_stateless_lambda__ final {
|
||||
static_assert(
|
||||
!std::is_same_v<LambdaType, LambdaType>,
|
||||
"Base case shouldn't be hit");
|
||||
};
|
||||
// implementation idea: According to the C++ standard, stateless lambdas are
|
||||
// convertible to function pointers
|
||||
template <class LambdaType, class C, class Result, class... Args>
|
||||
struct is_stateless_lambda__<LambdaType, Result (C::*)(Args...) const>
|
||||
: std::is_convertible<LambdaType, Result (*)(Args...)> {};
|
||||
template <class LambdaType, class C, class Result, class... Args>
|
||||
struct is_stateless_lambda__<LambdaType, Result (C::*)(Args...)>
|
||||
: std::is_convertible<LambdaType, Result (*)(Args...)> {};
|
||||
|
||||
// case where LambdaType is not even a functor
|
||||
template <class LambdaType, class Enable = void>
|
||||
struct is_stateless_lambda_ final : std::false_type {};
|
||||
// case where LambdaType is a functor
|
||||
template <class LambdaType>
|
||||
struct is_stateless_lambda_<
|
||||
LambdaType,
|
||||
std::enable_if_t<is_functor<LambdaType>::value>>
|
||||
: is_stateless_lambda__<LambdaType, decltype(&LambdaType::operator())> {};
|
||||
} // namespace detail
|
||||
template <class T>
|
||||
using is_stateless_lambda = detail::is_stateless_lambda_<std::decay_t<T>>;
|
||||
|
||||
/**
|
||||
* is_type_condition<C> is true_type iff C<...> is a type trait representing a
|
||||
* condition (i.e. has a constexpr static bool ::value member) Example:
|
||||
* is_type_condition<std::is_reference> // true
|
||||
*/
|
||||
template <template <class> class C, class Enable = void>
|
||||
struct is_type_condition : std::false_type {};
|
||||
template <template <class> class C>
|
||||
struct is_type_condition<
|
||||
C,
|
||||
std::enable_if_t<
|
||||
std::is_same_v<bool, std::remove_cv_t<decltype(C<int>::value)>>>>
|
||||
: std::true_type {};
|
||||
|
||||
/**
|
||||
* is_fundamental<T> is true_type iff the lambda type T is a fundamental type
|
||||
* (that is, arithmetic type, void, or nullptr_t). Example: is_fundamental<int>
|
||||
* // true We define it here to resolve a MSVC bug. See
|
||||
* https://github.com/pytorch/pytorch/issues/30932 for details.
|
||||
*/
|
||||
template <class T>
|
||||
struct is_fundamental : std::is_fundamental<T> {};
|
||||
} // namespace c10::guts
|
||||
|
||||
@ -12,10 +12,6 @@ template <typename, typename...>
|
||||
class class_;
|
||||
}
|
||||
|
||||
namespace torch::utils {
|
||||
class PyObjectPreservation;
|
||||
}
|
||||
|
||||
namespace c10 {
|
||||
class intrusive_ptr_target;
|
||||
namespace raw {
|
||||
@ -37,8 +33,6 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
|
||||
constexpr uint64_t kReferenceCountOne = 1;
|
||||
constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
|
||||
constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
|
||||
// Indicates whether the object has a PyObject wrapper.
|
||||
constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);
|
||||
|
||||
template <class TTarget>
|
||||
struct intrusive_target_default_null_type final {
|
||||
@ -61,11 +55,7 @@ inline uint32_t refcount(uint64_t combined_refcount) {
|
||||
}
|
||||
|
||||
inline uint32_t weakcount(uint64_t combined_refcount) {
|
||||
return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
|
||||
}
|
||||
|
||||
inline bool has_pyobject(uint64_t combined_refcount) {
|
||||
return (combined_refcount & kHasPyObject) != 0;
|
||||
return static_cast<uint32_t>(combined_refcount >> 32);
|
||||
}
|
||||
|
||||
// The only requirement for refcount increment is that it happens-before
|
||||
@ -76,6 +66,12 @@ inline uint64_t atomic_combined_refcount_increment(
|
||||
return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
|
||||
}
|
||||
|
||||
inline uint32_t atomic_refcount_increment(
|
||||
std::atomic<uint64_t>& combined_refcount) {
|
||||
return detail::refcount(atomic_combined_refcount_increment(
|
||||
combined_refcount, kReferenceCountOne));
|
||||
}
|
||||
|
||||
inline uint32_t atomic_weakcount_increment(
|
||||
std::atomic<uint64_t>& combined_refcount) {
|
||||
return detail::weakcount(atomic_combined_refcount_increment(
|
||||
@ -103,11 +99,6 @@ inline uint32_t atomic_weakcount_decrement(
|
||||
combined_refcount, kWeakReferenceCountOne));
|
||||
}
|
||||
|
||||
template <class T, class = void>
|
||||
struct TargetTraits {
|
||||
static constexpr bool can_have_pyobject = false;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/**
|
||||
@ -164,23 +155,6 @@ class C10_API intrusive_ptr_target {
|
||||
// we can atomically operate on both at the same time for performance
|
||||
// and defined behaviors.
|
||||
//
|
||||
// Note [PyObject preservation for Tensor and Storages]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// intrusive_ptr has special support for preserving PyObject wrappers
|
||||
// for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
|
||||
// the combined_refcount_ is used to indicate whether the object has a
|
||||
// PyObject wrapper.
|
||||
//
|
||||
// - The PyObject, if it exists, holds a strong reference to the
|
||||
// intrusive_ptr_target.
|
||||
//
|
||||
// - When the refcount goes from 1 to 2, we incref the PyObject.
|
||||
//
|
||||
// - When the refcount goes from 2 to 1, we decref the PyObject.
|
||||
//
|
||||
// In other words, the intrusive_ptr keeps the PyObject alive as long as there
|
||||
// are other C++ references to the intrusive_ptr_target.
|
||||
|
||||
mutable std::atomic<uint64_t> combined_refcount_;
|
||||
static_assert(sizeof(std::atomic<uint64_t>) == 8);
|
||||
static_assert(alignof(std::atomic<uint64_t>) == 8);
|
||||
@ -198,8 +172,6 @@ class C10_API intrusive_ptr_target {
|
||||
template <typename T>
|
||||
friend struct ExclusivelyOwnedTensorTraits;
|
||||
|
||||
friend class torch::utils::PyObjectPreservation;
|
||||
|
||||
protected:
|
||||
// protected destructor. We never want to destruct intrusive_ptr_target*
|
||||
// directly.
|
||||
@ -283,16 +255,6 @@ class C10_API intrusive_ptr_target {
|
||||
*/
|
||||
virtual void release_resources() {}
|
||||
|
||||
/**
|
||||
* These two methods are called when the refcount transitions between one
|
||||
* and two and the object has a PyObject wrapper.
|
||||
*/
|
||||
virtual void incref_pyobject() const {}
|
||||
virtual void decref_pyobject() const {}
|
||||
virtual bool try_incref_pyobject() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
|
||||
return detail::refcount(combined_refcount_.load(order));
|
||||
}
|
||||
@ -303,15 +265,6 @@ class C10_API intrusive_ptr_target {
|
||||
}
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
template <>
|
||||
struct TargetTraits<c10::intrusive_ptr_target> {
|
||||
// A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
|
||||
// or StorageImpl, so we have to allow for PyObject support.
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
} // namespace detail
|
||||
|
||||
template <class TTarget, class NullType>
|
||||
class weak_intrusive_ptr;
|
||||
|
||||
@ -361,34 +314,18 @@ class intrusive_ptr final {
|
||||
|
||||
void retain_() {
|
||||
if (target_ != NullType::singleton()) {
|
||||
uint64_t combined = detail::atomic_combined_refcount_increment(
|
||||
target_->combined_refcount_, detail::kReferenceCountOne);
|
||||
uint32_t new_refcount = detail::refcount(combined);
|
||||
uint32_t new_refcount =
|
||||
detail::atomic_refcount_increment(target_->combined_refcount_);
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
new_refcount != 1,
|
||||
"intrusive_ptr: Cannot increase refcount after it reached zero.");
|
||||
|
||||
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
// If the refcount transitioned from 1 to 2, we need to incref the
|
||||
// PyObject. In other words, we need to ensure that the PyObject stays
|
||||
// alive now that we have a C++ reference to this object in addition to
|
||||
// the PyObject itself.
|
||||
if (C10_UNLIKELY(
|
||||
detail::has_pyobject(combined) &&
|
||||
detail::refcount(combined) == 2)) {
|
||||
target_->incref_pyobject();
|
||||
}
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
!detail::has_pyobject(combined),
|
||||
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reset_() noexcept {
|
||||
if (target_ != NullType::singleton()) {
|
||||
if (is_uniquely_owned()) {
|
||||
if (target_->combined_refcount_.load(std::memory_order_acquire) ==
|
||||
detail::kUniqueRef) {
|
||||
// Both counts are 1, so there are no weak references and
|
||||
// we are releasing the last strong reference. No other
|
||||
// threads can observe the effects of this target_ deletion
|
||||
@ -400,10 +337,9 @@ class intrusive_ptr final {
|
||||
|
||||
auto combined_refcount = detail::atomic_combined_refcount_decrement(
|
||||
target_->combined_refcount_, detail::kReferenceCountOne);
|
||||
uint32_t new_refcount = detail::refcount(combined_refcount);
|
||||
bool has_pyobject = detail::has_pyobject(combined_refcount);
|
||||
if (new_refcount == 0) {
|
||||
bool should_delete = detail::weakcount(combined_refcount) == 1;
|
||||
if (detail::refcount(combined_refcount) == 0) {
|
||||
bool should_delete =
|
||||
(combined_refcount == detail::kWeakReferenceCountOne);
|
||||
// See comment above about weakcount. As long as refcount>0,
|
||||
// weakcount is one larger than the actual number of weak references.
|
||||
// So we need to decrement it here.
|
||||
@ -420,18 +356,6 @@ class intrusive_ptr final {
|
||||
if (should_delete) {
|
||||
delete target_;
|
||||
}
|
||||
} else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
// If the refcount transitioned from 2 to 1, we need to decref the
|
||||
// PyObject. In other words, we don't want to keep the PyObject alive if
|
||||
// there are no C++ references to this object other than the PyObject
|
||||
// itself.
|
||||
if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
|
||||
target_->decref_pyobject();
|
||||
}
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
!has_pyobject,
|
||||
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -598,16 +522,6 @@ class intrusive_ptr final {
|
||||
return use_count() == 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Stronger than unique() in that it must not have any weakrefs as well.
|
||||
*/
|
||||
bool is_uniquely_owned() const noexcept {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
|
||||
uint64_t combined =
|
||||
target_->combined_refcount_.load(std::memory_order_acquire);
|
||||
return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an owning (!) pointer to the underlying object and makes the
|
||||
* intrusive_ptr instance invalid. That means the refcount is not decreased.
|
||||
@ -1018,7 +932,6 @@ class weak_intrusive_ptr final {
|
||||
if (target_ == NullType::singleton()) {
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
} else {
|
||||
bool increfed = false;
|
||||
auto combined_refcount =
|
||||
target_->combined_refcount_.load(std::memory_order_relaxed);
|
||||
do {
|
||||
@ -1027,31 +940,12 @@ class weak_intrusive_ptr final {
|
||||
// Return nullptr.
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
}
|
||||
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
if (detail::has_pyobject(combined_refcount) &&
|
||||
detail::refcount(combined_refcount) == 1 && !increfed) {
|
||||
// Object has a python wrapper with no other C++ references.
|
||||
// We need to to incref the Python object before we acquire a
|
||||
// strong reference to the C++ object to avoid a situation
|
||||
// where the Python object is deallocated concurrently.
|
||||
if (!target_->try_incref_pyobject()) {
|
||||
return intrusive_ptr<TTarget, NullType>();
|
||||
}
|
||||
increfed = true;
|
||||
}
|
||||
}
|
||||
} while (!target_->combined_refcount_.compare_exchange_weak(
|
||||
combined_refcount,
|
||||
combined_refcount + detail::kReferenceCountOne,
|
||||
std::memory_order_acquire,
|
||||
std::memory_order_relaxed));
|
||||
|
||||
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
|
||||
if (increfed && detail::refcount(combined_refcount) != 1) {
|
||||
target_->decref_pyobject();
|
||||
}
|
||||
}
|
||||
|
||||
return intrusive_ptr<TTarget, NullType>(
|
||||
target_, raw::DontIncreaseRefcount{});
|
||||
}
|
||||
@ -1166,14 +1060,7 @@ namespace intrusive_ptr {
|
||||
// NullType::singleton to this function
|
||||
inline void incref(intrusive_ptr_target* self) {
|
||||
if (self) {
|
||||
uint64_t combined = detail::atomic_combined_refcount_increment(
|
||||
self->combined_refcount_, detail::kReferenceCountOne);
|
||||
|
||||
if (C10_UNLIKELY(
|
||||
detail::has_pyobject(combined) &&
|
||||
detail::refcount(combined) == 2)) {
|
||||
self->incref_pyobject();
|
||||
}
|
||||
detail::atomic_refcount_increment(self->combined_refcount_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -926,14 +926,15 @@ class DeviceCachingAllocator {
|
||||
(release_cached_blocks() && alloc_block(params, true));
|
||||
}
|
||||
if (!block_found) {
|
||||
const auto& raw_device = c10::xpu::get_raw_device(device);
|
||||
const auto device_total =
|
||||
raw_device.get_info<sycl::info::device::global_mem_size>();
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device);
|
||||
auto device_total = device_prop.global_mem_size;
|
||||
// Estimate the available device memory when the SYCL runtime does not
|
||||
// support the corresponding aspect (ext_intel_free_memory).
|
||||
size_t device_free = device_total -
|
||||
size_t device_free = device_prop.global_mem_size -
|
||||
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
|
||||
.current;
|
||||
auto& raw_device = c10::xpu::get_raw_device(device);
|
||||
// TODO: Remove the aspect check once the SYCL runtime bug is fixed on
|
||||
// affected devices.
|
||||
if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
|
||||
@ -1051,37 +1052,21 @@ class DeviceCachingAllocator {
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t> getMemoryInfo() {
|
||||
const auto& device = c10::xpu::get_raw_device(device_index);
|
||||
const size_t total = device.get_info<sycl::info::device::global_mem_size>();
|
||||
TORCH_CHECK(
|
||||
device.has(sycl::aspect::ext_intel_free_memory),
|
||||
"The device (",
|
||||
device.get_info<sycl::info::device::name>(),
|
||||
") doesn't support querying the available free memory. ",
|
||||
"You can file an issue at https://github.com/pytorch/pytorch/issues ",
|
||||
"to help us prioritize its implementation.");
|
||||
const size_t free =
|
||||
device.get_info<sycl::ext::intel::info::device::free_memory>();
|
||||
return {free, total};
|
||||
}
|
||||
|
||||
double getMemoryFraction() {
|
||||
if (!set_fraction) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
const auto device_total =
|
||||
xpu::get_raw_device(device_index)
|
||||
.get_info<sycl::info::device::global_mem_size>();
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
return static_cast<double>(allowed_memory_maximum) /
|
||||
static_cast<double>(device_total);
|
||||
static_cast<double>(device_prop.global_mem_size);
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction) {
|
||||
const auto device_total =
|
||||
xpu::get_raw_device(device_index)
|
||||
.get_info<sycl::info::device::global_mem_size>();
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
auto device_total = device_prop.global_mem_size;
|
||||
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
|
||||
set_fraction = true;
|
||||
}
|
||||
@ -1255,11 +1240,6 @@ class XPUAllocator : public DeviceAllocator {
|
||||
c10::xpu::get_raw_device(dev_to_access));
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t> getMemoryInfo(DeviceIndex device) override {
|
||||
assertValidDevice(device);
|
||||
return device_allocators[device]->getMemoryInfo();
|
||||
}
|
||||
|
||||
double getMemoryFraction(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
return device_allocators[device]->getMemoryFraction();
|
||||
|
||||
@ -478,7 +478,6 @@ function(torch_update_find_cuda_flags)
|
||||
endfunction()
|
||||
|
||||
include(CheckCXXCompilerFlag)
|
||||
include(CheckCCompilerFlag)
|
||||
include(CheckLinkerFlag)
|
||||
|
||||
##############################################################################
|
||||
@ -502,24 +501,6 @@ function(append_cxx_flag_if_supported flag outputvar)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(append_c_flag_if_supported flag outputvar)
|
||||
string(TOUPPER "HAS${flag}" _FLAG_NAME)
|
||||
string(REGEX REPLACE "[=-]" "_" _FLAG_NAME "${_FLAG_NAME}")
|
||||
|
||||
# GCC silences unknown -Wno-XXX flags, so test the corresponding -WXXX.
|
||||
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
|
||||
string(REGEX REPLACE "^Wno-" "W" new_flag "${flag}")
|
||||
else()
|
||||
set(new_flag "${flag}")
|
||||
endif()
|
||||
|
||||
check_c_compiler_flag("${new_flag}" ${_FLAG_NAME})
|
||||
if(${_FLAG_NAME})
|
||||
string(APPEND ${outputvar} " ${flag}")
|
||||
set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
function(target_compile_options_if_supported target flag)
|
||||
set(_compile_options "")
|
||||
append_cxx_flag_if_supported("${flag}" _compile_options)
|
||||
|
||||
@ -40,7 +40,6 @@
|
||||
:nosignatures:
|
||||
|
||||
empty_cache
|
||||
get_memory_info
|
||||
max_memory_allocated
|
||||
max_memory_reserved
|
||||
memory_allocated
|
||||
|
||||
@ -1,21 +0,0 @@
|
||||
# torch.mtia.mtia_graph
|
||||
|
||||
The MTIA backend is implemented out of the tree, only interfaces are defined here.
|
||||
|
||||
```{eval-rst}
|
||||
.. automodule:: torch.mtia.mtia_graph
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. currentmodule:: torch.mtia.mtia_graph
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: MTIAGraph
|
||||
:members:
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autoclass:: graph
|
||||
:members:
|
||||
```
|
||||
@ -29,7 +29,6 @@ mps
|
||||
xpu
|
||||
mtia
|
||||
mtia.memory
|
||||
mtia.mtia_graph
|
||||
meta
|
||||
torch.backends <backends>
|
||||
torch.export <export>
|
||||
|
||||
@ -172,9 +172,9 @@ ignore = [
|
||||
"SIM102", "SIM103", "SIM112", # flake8-simplify code styles
|
||||
"SIM105", # these ignores are from flake8-simplify. please fix or ignore with commented reason
|
||||
"SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression
|
||||
"SIM110", # Checks for for loops that can be replaced with a builtin function, like any or all.
|
||||
"SIM110",
|
||||
"SIM114", # Combine `if` branches using logical `or` operator
|
||||
"SIM115", # Checks for cases where files are opened without using a context manager.
|
||||
"SIM115",
|
||||
"SIM116", # Disable Use a dictionary instead of consecutive `if` statements
|
||||
"SIM117",
|
||||
"SIM118",
|
||||
@ -184,6 +184,7 @@ ignore = [
|
||||
"TC006",
|
||||
# TODO: Remove Python-3.10 specific suppressions
|
||||
"B905",
|
||||
"UP035",
|
||||
]
|
||||
select = [
|
||||
"B",
|
||||
@ -260,7 +261,6 @@ select = [
|
||||
"TRY401", # verbose-log-message
|
||||
"UP",
|
||||
"YTT",
|
||||
"S101",
|
||||
]
|
||||
|
||||
[tool.ruff.lint.pyupgrade]
|
||||
@ -340,39 +340,6 @@ keep-runtime-typing = true
|
||||
"tools/linter/**" = [
|
||||
"LOG015" # please fix
|
||||
]
|
||||
"benchmarks/**" = [
|
||||
"S101"
|
||||
]
|
||||
"test/**" = [
|
||||
"S101"
|
||||
]
|
||||
"torchgen/**" = [
|
||||
"S101"
|
||||
]
|
||||
"torch/**" = [
|
||||
"S101"
|
||||
]
|
||||
"tools/**" = [
|
||||
"S101"
|
||||
]
|
||||
"setup.py" = [
|
||||
"S101"
|
||||
]
|
||||
"functorch/**" = [
|
||||
"S101"
|
||||
]
|
||||
"docs/**" = [
|
||||
"S101"
|
||||
]
|
||||
"android/**" = [
|
||||
"S101"
|
||||
]
|
||||
".github/**" = [
|
||||
"S101"
|
||||
]
|
||||
".ci/**" = [
|
||||
"S101"
|
||||
]
|
||||
|
||||
[tool.codespell]
|
||||
ignore-words = "tools/linter/dictionary.txt"
|
||||
|
||||
33
setup.py
33
setup.py
@ -630,37 +630,6 @@ def mirror_files_into_torchgen() -> None:
|
||||
raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")
|
||||
|
||||
|
||||
def mirror_inductor_external_kernels() -> None:
|
||||
"""
|
||||
Copy external kernels into Inductor so they are importable.
|
||||
"""
|
||||
paths = [
|
||||
(
|
||||
CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
|
||||
CWD
|
||||
/ "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
|
||||
),
|
||||
]
|
||||
for new_path, orig_path in paths:
|
||||
# Create the dirs involved in new_path if they don't exist
|
||||
if not new_path.exists():
|
||||
new_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Copy the files from the orig location to the new location
|
||||
if orig_path.is_file():
|
||||
shutil.copyfile(orig_path, new_path)
|
||||
continue
|
||||
if orig_path.is_dir():
|
||||
if new_path.exists():
|
||||
# copytree fails if the tree exists already, so remove it.
|
||||
shutil.rmtree(new_path)
|
||||
shutil.copytree(orig_path, new_path)
|
||||
continue
|
||||
raise RuntimeError(
|
||||
"Check the file paths in `mirror_inductor_external_kernels()`"
|
||||
)
|
||||
|
||||
|
||||
# ATTENTION: THIS IS AI SLOP
|
||||
def extract_variant_from_version(version: str) -> str:
|
||||
"""Extract variant from version string, defaulting to 'cpu'."""
|
||||
@ -1646,7 +1615,6 @@ def main() -> None:
|
||||
mirror_files_into_torchgen()
|
||||
if RUN_BUILD_DEPS:
|
||||
build_deps()
|
||||
mirror_inductor_external_kernels()
|
||||
|
||||
(
|
||||
ext_modules,
|
||||
@ -1681,7 +1649,6 @@ def main() -> None:
|
||||
"_inductor/codegen/aoti_runtime/*.cpp",
|
||||
"_inductor/script.ld",
|
||||
"_inductor/kernel/flex/templates/*.jinja",
|
||||
"_inductor/kernel/templates/*.jinja",
|
||||
"_export/serde/*.yaml",
|
||||
"_export/serde/*.thrift",
|
||||
"share/cmake/ATen/*.cmake",
|
||||
|
||||
@ -208,7 +208,7 @@ class _BaseDataSparsiferTestCase(TestCase):
|
||||
assert len(sparsifier1.data_groups) == len(sparsifier2.data_groups)
|
||||
|
||||
state1 = state_dict1["state"]
|
||||
for name in state1:
|
||||
for name in state1.keys():
|
||||
# compare mask
|
||||
assert name in sparsifier2.state
|
||||
assert "mask" in sparsifier2.state[name]
|
||||
|
||||
@ -119,7 +119,7 @@ class TestBaseSparsifier(TestCase):
|
||||
for idx in range(len(sparsifier0.groups)):
|
||||
mg0 = sparsifier0.groups[idx]
|
||||
mg1 = sparsifier1.groups[idx]
|
||||
for key in mg0:
|
||||
for key in mg0.keys():
|
||||
assert key in mg1
|
||||
if key == "module":
|
||||
# We cannot compare modules as they are different
|
||||
|
||||
@ -17,11 +17,8 @@ set(AOTI_ABI_CHECK_TEST_SRCS
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_metaprogramming.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_scalartype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_typelist.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_typetraits.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_vec.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_vec_half.cpp
|
||||
)
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <torch/csrc/stable/accelerator.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
@ -529,149 +528,6 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("make_tensor_clones_and_call_foreach", &boxed_make_tensor_clones_and_call_foreach);
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::Tensor device method
|
||||
|
||||
torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
|
||||
return tensor.device();
|
||||
}
|
||||
|
||||
void boxed_test_tensor_device(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::Device res = test_tensor_device(
|
||||
torch::stable::detail::to<torch::stable::Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::Device
|
||||
|
||||
torch::stable::Device test_device_constructor(
|
||||
bool is_cuda,
|
||||
torch::stable::DeviceIndex index,
|
||||
bool use_str) {
|
||||
using torch::stable::Device;
|
||||
using torch::stable::DeviceType;
|
||||
|
||||
if (use_str) {
|
||||
std::string device_str;
|
||||
if (is_cuda) {
|
||||
device_str = "cuda:" + std::to_string(index);
|
||||
} else {
|
||||
device_str = "cpu";
|
||||
}
|
||||
return Device(device_str);
|
||||
} else {
|
||||
if (is_cuda) {
|
||||
return Device(DeviceType::CUDA, index);
|
||||
} else {
|
||||
return Device(DeviceType::CPU);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void boxed_test_device_constructor(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::Device res = test_device_constructor(
|
||||
torch::stable::detail::to<bool>(stack[0]),
|
||||
torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]),
|
||||
torch::stable::detail::to<bool>(stack[2]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
|
||||
return d1 == d2;
|
||||
}
|
||||
|
||||
void boxed_test_device_equality(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
bool res = test_device_equality(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]),
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[1]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
torch::stable::Device test_device_set_index(
|
||||
torch::stable::Device device,
|
||||
torch::stable::DeviceIndex index) {
|
||||
device.set_index(index);
|
||||
return device;
|
||||
}
|
||||
|
||||
void boxed_test_device_set_index(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::Device res = test_device_set_index(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]),
|
||||
torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
|
||||
return device.index();
|
||||
}
|
||||
|
||||
void boxed_test_device_index(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::DeviceIndex res = test_device_index(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool test_device_is_cuda(torch::stable::Device device) {
|
||||
return device.is_cuda();
|
||||
}
|
||||
|
||||
void boxed_test_device_is_cuda(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
bool res = test_device_is_cuda(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool test_device_is_cpu(torch::stable::Device device) {
|
||||
return device.is_cpu();
|
||||
}
|
||||
|
||||
void boxed_test_device_is_cpu(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
bool res = test_device_is_cpu(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("test_tensor_device(Tensor t) -> Device");
|
||||
m.def(
|
||||
"test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
|
||||
m.def("test_device_equality(Device d1, Device d2) -> bool");
|
||||
m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
|
||||
m.def("test_device_index(Device device) -> DeviceIndex");
|
||||
m.def("test_device_is_cuda(Device device) -> bool");
|
||||
m.def("test_device_is_cpu(Device device) -> bool");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_tensor_device", &boxed_test_tensor_device);
|
||||
m.impl("test_device_constructor", &boxed_test_device_constructor);
|
||||
m.impl("test_device_equality", &boxed_test_device_equality);
|
||||
m.impl("test_device_set_index", &boxed_test_device_set_index);
|
||||
m.impl("test_device_index", &boxed_test_device_index);
|
||||
m.impl("test_device_is_cuda", &boxed_test_device_is_cuda);
|
||||
m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::accelerator APIs
|
||||
|
||||
#ifdef LAE_USE_CUDA
|
||||
@ -761,66 +617,3 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
}
|
||||
|
||||
#endif // LAE_USE_CUDA
|
||||
|
||||
Tensor test_parallel_for(int64_t size, int64_t grain_size) {
|
||||
AtenTensorHandle tensor_handle;
|
||||
int64_t stride = 1;
|
||||
|
||||
aoti_torch_empty_strided(
|
||||
1,
|
||||
&size,
|
||||
&stride,
|
||||
aoti_torch_dtype_int64(),
|
||||
aoti_torch_device_type_cpu(),
|
||||
0,
|
||||
&tensor_handle);
|
||||
|
||||
Tensor tensor(tensor_handle);
|
||||
int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
|
||||
|
||||
torch::stable::zero_(tensor);
|
||||
|
||||
// Use parallel_for to fill each element with its index
|
||||
// If using a parallel path, the thread id is encoded in the upper 32 bits
|
||||
torch::stable::parallel_for(
|
||||
0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
|
||||
for (auto i = begin; i < end; i++) {
|
||||
STD_TORCH_CHECK(i <= UINT32_MAX);
|
||||
uint32_t thread_id;
|
||||
torch_get_thread_idx(&thread_id);
|
||||
data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
|
||||
}
|
||||
});
|
||||
|
||||
return tensor;
|
||||
}
|
||||
|
||||
void boxed_test_parallel_for(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
Tensor res = test_parallel_for(to<int64_t>(stack[0]), to<int64_t>(stack[1]));
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
uint32_t test_get_num_threads() {
|
||||
return torch::stable::get_num_threads();
|
||||
}
|
||||
|
||||
void boxed_test_get_num_threads(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
uint32_t res = test_get_num_threads();
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("test_parallel_for(int size, int grain_size) -> Tensor");
|
||||
m.def("test_get_num_threads() -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_parallel_for", &boxed_test_parallel_for);
|
||||
m.impl("test_get_num_threads", &boxed_test_get_num_threads);
|
||||
}
|
||||
|
||||
@ -215,18 +215,6 @@ def test_default_constructor(defined) -> bool:
|
||||
return torch.ops.libtorch_agnostic.test_default_constructor.default(defined)
|
||||
|
||||
|
||||
def test_tensor_device(t):
|
||||
"""
|
||||
Tests Tensor device() method.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to get device from
|
||||
|
||||
Returns: Device - device of the tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_tensor_device.default(t)
|
||||
|
||||
|
||||
def my_pad(t) -> Tensor:
|
||||
"""
|
||||
Pads the input tensor with hardcoded padding parameters.
|
||||
@ -387,103 +375,3 @@ def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
|
||||
return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
|
||||
t1, t2
|
||||
)
|
||||
|
||||
|
||||
def test_device_constructor(is_cuda, index, use_str):
|
||||
"""
|
||||
Tests creating a Device from DeviceType and index, or from a string.
|
||||
|
||||
Args:
|
||||
is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
|
||||
index: int - device index
|
||||
use_str: bool - if True, constructs from string; if False, constructs from DeviceType
|
||||
|
||||
Returns: Device - A device with the specified type and index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_constructor.default(
|
||||
is_cuda, index, use_str
|
||||
)
|
||||
|
||||
|
||||
def test_device_equality(d1, d2) -> bool:
|
||||
"""
|
||||
Tests Device equality operator.
|
||||
|
||||
Args:
|
||||
d1: Device - first device
|
||||
d2: Device - second device
|
||||
|
||||
Returns: bool - True if devices are equal
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_equality.default(d1, d2)
|
||||
|
||||
|
||||
def test_device_set_index(device, index):
|
||||
"""
|
||||
Tests Device set_index() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to modify
|
||||
index: int - new device index
|
||||
|
||||
Returns: Device - device with updated index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_set_index.default(device, index)
|
||||
|
||||
|
||||
def test_device_index(device) -> int:
|
||||
"""
|
||||
Tests Device index() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to query
|
||||
|
||||
Returns: int - device index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_index.default(device)
|
||||
|
||||
|
||||
def test_device_is_cuda(device) -> bool:
|
||||
"""
|
||||
Tests Device is_cuda() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to check
|
||||
|
||||
Returns: bool - True if device is CUDA
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_is_cuda.default(device)
|
||||
|
||||
|
||||
def test_device_is_cpu(device) -> bool:
|
||||
"""
|
||||
Tests Device is_cpu() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to check
|
||||
|
||||
Returns: bool - True if device is CPU
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_is_cpu.default(device)
|
||||
|
||||
|
||||
def test_parallel_for(size, grain_size) -> Tensor:
|
||||
"""
|
||||
Tests the parallel_for functionality by using it to fill a tensor with indices.
|
||||
Args:
|
||||
size: int - size of the tensor to create
|
||||
grain_size: int - grain size for parallel_for
|
||||
Returns: Tensor - a 1D int64 tensor where each element contains its index
|
||||
(if multiple threads are used the threadid will be encoded in the upper 32 bits)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_parallel_for.default(size, grain_size)
|
||||
|
||||
|
||||
def test_get_num_threads() -> int:
|
||||
"""
|
||||
Tests the get_num_threads functionality by returning the number of threads
|
||||
for the parallel backend.
|
||||
|
||||
Returns: int - the number of threads for the parallel backend
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_get_num_threads.default()
|
||||
|
||||
@ -418,113 +418,6 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(result[0], t1 * t1)
|
||||
self.assertEqual(result[1], t2 * t2)
|
||||
|
||||
@onlyCUDA
|
||||
def test_device(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
cuda_device = libtorch_agnostic.ops.test_device_constructor(
|
||||
is_cuda=True, index=1, use_str=False
|
||||
)
|
||||
self.assertEqual(cuda_device, torch.device("cuda:1"))
|
||||
cuda_device = libtorch_agnostic.ops.test_device_constructor(
|
||||
is_cuda=True, index=1, use_str=True
|
||||
)
|
||||
self.assertEqual(cuda_device, torch.device("cuda:1"))
|
||||
|
||||
self.assertEqual(libtorch_agnostic.ops.test_device_index(cuda_device), 1)
|
||||
self.assertTrue(
|
||||
libtorch_agnostic.ops.test_device_equality(
|
||||
cuda_device, torch.device("cuda:1")
|
||||
)
|
||||
)
|
||||
self.assertFalse(
|
||||
libtorch_agnostic.ops.test_device_equality(
|
||||
cuda_device, torch.device("cuda:0")
|
||||
)
|
||||
)
|
||||
self.assertFalse(libtorch_agnostic.ops.test_device_is_cpu(cuda_device))
|
||||
self.assertTrue(libtorch_agnostic.ops.test_device_is_cuda(cuda_device))
|
||||
|
||||
cuda_0_device = libtorch_agnostic.ops.test_device_set_index(cuda_device, 0)
|
||||
self.assertEqual(cuda_0_device, torch.device("cuda:0"))
|
||||
|
||||
cpu_device = libtorch_agnostic.ops.test_device_constructor(False, 0, False)
|
||||
self.assertEqual(cpu_device, torch.device("cpu"))
|
||||
self.assertTrue(
|
||||
libtorch_agnostic.ops.test_device_equality(
|
||||
cpu_device, torch.device("cpu")
|
||||
)
|
||||
)
|
||||
self.assertTrue(libtorch_agnostic.ops.test_device_is_cpu(cpu_device))
|
||||
self.assertFalse(libtorch_agnostic.ops.test_device_is_cuda(cpu_device))
|
||||
self.assertFalse(
|
||||
libtorch_agnostic.ops.test_device_equality(cpu_device, cuda_device)
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, "Device index 129 is out of range for int8_t"
|
||||
):
|
||||
libtorch_agnostic.ops.test_device_constructor(
|
||||
is_cuda=True, index=129, use_str=False
|
||||
)
|
||||
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, "Device index 129 is out of range for int8_t"
|
||||
):
|
||||
libtorch_agnostic.ops.test_device_set_index(cuda_device, 129)
|
||||
|
||||
@onlyCUDA
|
||||
@deviceCountAtLeast(2)
|
||||
def test_tensor_device(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3)
|
||||
self.assertEqual(libtorch_agnostic.ops.test_tensor_device(t), t.device)
|
||||
|
||||
t_cuda = torch.randn(2, 3, device="cuda")
|
||||
self.assertEqual(
|
||||
libtorch_agnostic.ops.test_tensor_device(t_cuda), t_cuda.device
|
||||
)
|
||||
|
||||
t_cuda_1 = torch.randn(2, 3, device="cuda:1")
|
||||
self.assertEqual(
|
||||
libtorch_agnostic.ops.test_tensor_device(t_cuda_1), t_cuda_1.device
|
||||
)
|
||||
|
||||
@onlyCPU
|
||||
# TODO: Debug this:
|
||||
# Dynamo failed to run FX node with fake tensors:
|
||||
# call_function libtorch_agnostic.test_parallel_for.default(*(100, 10), **{}):
|
||||
# got RuntimeError('libtorch_agnostic::test_parallel_for() expected at most
|
||||
# 2 argument(s) but received 3 argument(s).
|
||||
# Declaration: libtorch_agnostic::test_parallel_for(int size, int grain_size) -> Tensor')
|
||||
@xfailIfTorchDynamo
|
||||
def test_parallel_for(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
num_threads = torch.get_num_threads()
|
||||
size = 100
|
||||
grain_size = 10
|
||||
expected_num_threads_used = min(
|
||||
(size + grain_size - 1) // grain_size, num_threads
|
||||
)
|
||||
|
||||
result = libtorch_agnostic.ops.test_parallel_for(size, grain_size)
|
||||
result_thread_ids = torch.unique(torch.bitwise_right_shift(result, 32))
|
||||
result_values = torch.bitwise_and(result, 0xFFFFFFFF)
|
||||
expected = torch.arange(size, dtype=torch.int64)
|
||||
|
||||
self.assertEqual(result_values, expected)
|
||||
self.assertEqual(result_thread_ids, torch.arange(expected_num_threads_used))
|
||||
|
||||
@onlyCPU
|
||||
def test_get_num_threads(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
num_threads = libtorch_agnostic.ops.test_get_num_threads()
|
||||
expected_num_threads = torch.get_num_threads()
|
||||
self.assertEqual(num_threads, expected_num_threads)
|
||||
|
||||
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# Owner(s): ["module: unknown"]
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from backend import get_custom_backend_library_path, Model, to_custom_backend
|
||||
@ -40,11 +41,14 @@ class TestCustomBackend(TestCase):
|
||||
self.test_execute()
|
||||
|
||||
# Save and load.
|
||||
with tempfile.NamedTemporaryFile() as f:
|
||||
f = tempfile.NamedTemporaryFile(delete=False)
|
||||
try:
|
||||
f.close()
|
||||
torch.jit.save(self.model, f.name)
|
||||
loaded = torch.jit.load(f.name)
|
||||
self.model = loaded
|
||||
finally:
|
||||
os.unlink(f.name)
|
||||
self.model = loaded
|
||||
|
||||
# Test execution again.
|
||||
self.test_execute()
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# Owner(s): ["module: unknown"]
|
||||
|
||||
import os.path
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
@ -143,13 +144,16 @@ def forward(self, arg0_1):
|
||||
# Ideally we would like to not have to manually delete the file, but NamedTemporaryFile
|
||||
# opens the file, and it cannot be opened multiple times in Windows. To support Windows,
|
||||
# close the file after creation and try to remove it manually.
|
||||
with tempfile.NamedTemporaryFile() as file:
|
||||
file = tempfile.NamedTemporaryFile(delete=False)
|
||||
try:
|
||||
file.close()
|
||||
model.save(file.name)
|
||||
loaded = torch.jit.load(file.name)
|
||||
finally:
|
||||
os.unlink(file.name)
|
||||
|
||||
output = loaded.forward(torch.ones(5))
|
||||
self.assertTrue(output.allclose(torch.ones(5) + 1))
|
||||
output = loaded.forward(torch.ones(5))
|
||||
self.assertTrue(output.allclose(torch.ones(5) + 1))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
# Owner(s): ["module: fsdp"]
|
||||
import functools
|
||||
import os
|
||||
import unittest
|
||||
import unittest.mock
|
||||
|
||||
import torch.distributed as dist
|
||||
from torch._dynamo.test_case import run_tests
|
||||
@ -37,9 +37,9 @@ import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
from torch.distributed.fsdp import fully_shard
|
||||
logger = logging.getLogger("torch.distributed.fsdp.fully_shard")
|
||||
logger = logging.getLogger("torch.distributed._composable.fsdp")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
device = '{device_type.type}'
|
||||
device = {device_type.type}
|
||||
torch.manual_seed(0)
|
||||
model = nn.Sequential(*[nn.Linear(4, 4, device=device, bias=False) for _ in range(2)])
|
||||
for layer in model:
|
||||
|
||||
@ -80,7 +80,7 @@ class TestSACILP(TestCase):
|
||||
# postprocessing due to the fact that for ModTracker, the post backward hook
|
||||
# is not being called for modules whose inputs don't require gradients
|
||||
# TODO: fix this in ModTracker and ensure it does not lead to any perf regression
|
||||
if _ModState.POST_BW not in mod_stats.snapshots:
|
||||
if _ModState.POST_BW not in mod_stats.snapshots.keys():
|
||||
mod_stats.snapshots.setdefault(_ModState.POST_BW, []).append(
|
||||
copy.deepcopy(last_snapshot)
|
||||
)
|
||||
|
||||
@ -16,7 +16,7 @@ from torch.distributed.argparse_util import check_env, env
|
||||
class ArgParseUtilTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# remove any lingering environment variables
|
||||
for e in os.environ.keys(): # noqa: SIM118
|
||||
for e in os.environ.keys():
|
||||
if e.startswith("PET_"):
|
||||
del os.environ[e]
|
||||
|
||||
|
||||
@ -207,7 +207,7 @@ class TestDefaultStager(TestCase):
|
||||
for i, result in enumerate(staged_results):
|
||||
self.assertIsInstance(result, dict)
|
||||
# Verify the result contains the expected keys
|
||||
for key in state_dicts[i]:
|
||||
for key in state_dicts[i].keys():
|
||||
self.assertIn(key, result)
|
||||
|
||||
stager.close()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user