Nvfuser code base nuke (#111447)

Removing the nvfuser code base.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111447
Approved by: https://github.com/albanD
Author: jjsjann123
Date: 2023-11-01 20:53:14 +00:00
Committed by: PyTorch MergeBot
Parent: 5a6f8014c4
Commit: 9d23440c81
313 changed files with 1 addition and 170447 deletions

View File

@@ -8,7 +8,7 @@ load("@pytorch//tools/rules:cu.bzl", "cu_library")
load("@pytorch//tools/config:defs.bzl", "if_cuda")
load("@pytorch//:aten.bzl", "generate_aten", "intern_build_aten_ops")
load(":build.bzl", "GENERATED_AUTOGRAD_CPP", "GENERATED_AUTOGRAD_PYTHON", "define_targets")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_nvfuser_generated_headers", "libtorch_nvfuser_runtime_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources")
load(":build_variables.bzl", "jit_core_sources", "lazy_tensor_ts_sources", "libtorch_core_sources", "libtorch_cuda_sources", "libtorch_distributed_sources", "libtorch_extra_sources", "libtorch_python_core_sources", "torch_cpp_srcs", "libtorch_python_cuda_sources", "libtorch_python_distributed_sources")
load(":ufunc_defs.bzl", "aten_ufunc_generated_cpu_kernel_sources", "aten_ufunc_generated_cpu_sources", "aten_ufunc_generated_cuda_sources")
load("//:tools/bazel.bzl", "rules")

View File

@@ -208,9 +208,6 @@ cmake_dependent_option(
cmake_dependent_option(
USE_CUSPARSELT "Use cuSPARSELt" ON
"USE_CUDA" OFF)
cmake_dependent_option(
BUILD_NVFUSER_BENCHMARK "Build C++ binaries for nvfuser benchmarks" OFF
"USE_CUDA" OFF)
cmake_dependent_option(
USE_EXPERIMENTAL_CUDNN_V8_API "Use experimental cuDNN v8 API" ON
"USE_CUDNN" OFF)

View File

@@ -16,42 +16,6 @@ GENERATED_LAZY_TS_CPP = [
"lazy/generated/RegisterLazy.cpp",
]
# NVFuser runtime library
libtorch_nvfuser_runtime_sources = [
"third_party/nvfuser/runtime/array.cu",
"third_party/nvfuser/runtime/array_rocm.cu",
"third_party/nvfuser/runtime/bf16_support.cu",
"third_party/nvfuser/runtime/bf16_support_rocm.cu",
"third_party/nvfuser/runtime/block_reduction.cu",
"third_party/nvfuser/runtime/block_sync_atomic.cu",
"third_party/nvfuser/runtime/block_sync_default.cu",
"third_party/nvfuser/runtime/block_sync_default_rocm.cu",
"third_party/nvfuser/runtime/broadcast.cu",
"third_party/nvfuser/runtime/fp16_support.cu",
"third_party/nvfuser/runtime/fused_reduction.cu",
"third_party/nvfuser/runtime/fused_welford_helper.cu",
"third_party/nvfuser/runtime/fused_welford_impl.cu",
"third_party/nvfuser/runtime/grid_broadcast.cu",
"third_party/nvfuser/runtime/grid_reduction.cu",
"third_party/nvfuser/runtime/grid_sync.cu",
"third_party/nvfuser/runtime/helpers.cu",
"third_party/nvfuser/runtime/index_utils.cu",
"third_party/nvfuser/runtime/memory.cu",
"third_party/nvfuser/runtime/random_numbers.cu",
"third_party/nvfuser/runtime/swizzle.cu",
"third_party/nvfuser/runtime/tensor.cu",
"third_party/nvfuser/runtime/tensorcore.cu",
"third_party/nvfuser/runtime/tuple.cu",
"third_party/nvfuser/runtime/type_traits.cu",
"third_party/nvfuser/runtime/warp.cu",
"third_party/nvfuser/runtime/warp_rocm.cu",
"third_party/nvfuser/runtime/welford.cu",
"aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh",
"aten/src/ATen/cuda/detail/UnpackRaw.cuh",
]
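# Generated header names are derived from the runtime source basenames,
# e.g. "third_party/nvfuser/runtime/array.cu" -> "array.h".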
libtorch_nvfuser_generated_headers = ["{}.h".format(name.split("/")[-1].split(".")[0]) for name in libtorch_nvfuser_runtime_sources]
def libtorch_generated_sources(gencode_pattern):
return [gencode_pattern.format(name) for name in [
"torch/csrc/autograd/generated/Functions.cpp",

View File

@@ -27,7 +27,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " BUILD_CAFFE2_OPS : ${BUILD_CAFFE2_OPS}")
message(STATUS " BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
message(STATUS " BUILD_TENSOREXPR_BENCHMARK: ${BUILD_TENSOREXPR_BENCHMARK}")
message(STATUS " BUILD_NVFUSER_BENCHMARK: ${BUILD_NVFUSER_BENCHMARK}")
message(STATUS " BUILD_BINARY : ${BUILD_BINARY}")
message(STATUS " BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
if(${CAFFE2_LINK_LOCAL_PROTOBUF})

View File

@@ -1,371 +0,0 @@
if(NOT BUILD_NVFUSER)
return()
endif()
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(nvfuser)
if(NOT USE_ROCM)
set(TORCHLIB_FLAVOR torch_cuda)
else()
set(TORCHLIB_FLAVOR torch_hip)
endif()
# --- project
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/nvfuser")
set(NVFUSER_ROOT ${PROJECT_SOURCE_DIR})
set(NVFUSER_SRCS_DIR "${NVFUSER_ROOT}/csrc")
set(TORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..")
set(TORCH_INSTALL_LIB_DIR ${TORCH_ROOT}/torch/lib)
# --- build nvfuser_codegen library
set(NVFUSER_SRCS)
set(NVFUSER_CODEGEN ${PROJECT_NAME}_codegen)
list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/arith.cpp
${NVFUSER_SRCS_DIR}/compute_at.cpp
${NVFUSER_SRCS_DIR}/inlining.cpp
${NVFUSER_SRCS_DIR}/compute_at_map.cpp
${NVFUSER_SRCS_DIR}/codegen.cpp
${NVFUSER_SRCS_DIR}/contiguity.cpp
${NVFUSER_SRCS_DIR}/dispatch.cpp
${NVFUSER_SRCS_DIR}/expr_evaluator.cpp
${NVFUSER_SRCS_DIR}/kernel_expr_evaluator.cpp
${NVFUSER_SRCS_DIR}/executor.cpp
${NVFUSER_SRCS_DIR}/executor_kernel_arg.cpp
${NVFUSER_SRCS_DIR}/executor_launch_params.cpp
${NVFUSER_SRCS_DIR}/evaluator_common.cpp
${NVFUSER_SRCS_DIR}/executor_utils.cpp
${NVFUSER_SRCS_DIR}/fusion.cpp
${NVFUSER_SRCS_DIR}/graph_fuser.cpp
${NVFUSER_SRCS_DIR}/grouped_reduction.cpp
${NVFUSER_SRCS_DIR}/index_compute.cpp
${NVFUSER_SRCS_DIR}/lower_index_compute.cpp
${NVFUSER_SRCS_DIR}/instrumentation.cpp
${NVFUSER_SRCS_DIR}/ir_base_nodes.cpp
${NVFUSER_SRCS_DIR}/ir_builder.cpp
${NVFUSER_SRCS_DIR}/ir_cloner.cpp
${NVFUSER_SRCS_DIR}/ir_container.cpp
${NVFUSER_SRCS_DIR}/ir_graphviz.cpp
${NVFUSER_SRCS_DIR}/ir_nodes.cpp
${NVFUSER_SRCS_DIR}/ir_iostream.cpp
${NVFUSER_SRCS_DIR}/ir_utils.cpp
${NVFUSER_SRCS_DIR}/iter_visitor.cpp
${NVFUSER_SRCS_DIR}/kernel.cpp
${NVFUSER_SRCS_DIR}/kernel_cache.cpp
${NVFUSER_SRCS_DIR}/kernel_ir.cpp
${NVFUSER_SRCS_DIR}/kernel_ir_dispatch.cpp
${NVFUSER_SRCS_DIR}/lower_alias_memory.cpp
${NVFUSER_SRCS_DIR}/lower_allocation.cpp
${NVFUSER_SRCS_DIR}/lower_double_buffer.cpp
${NVFUSER_SRCS_DIR}/lower_divisible_split.cpp
${NVFUSER_SRCS_DIR}/lower_expr_sort.cpp
${NVFUSER_SRCS_DIR}/lower_fused_reduction.cpp
${NVFUSER_SRCS_DIR}/lower_fusion_simplifier.cpp
${NVFUSER_SRCS_DIR}/lower_index.cpp
${NVFUSER_SRCS_DIR}/lower_index_hoist.cpp
${NVFUSER_SRCS_DIR}/lower_insert_syncs.cpp
${NVFUSER_SRCS_DIR}/lower_instrument.cpp
${NVFUSER_SRCS_DIR}/lower_loops.cpp
${NVFUSER_SRCS_DIR}/lower_magic_zero.cpp
${NVFUSER_SRCS_DIR}/lower_misaligned_vectorization.cpp
${NVFUSER_SRCS_DIR}/lower_predicate.cpp
${NVFUSER_SRCS_DIR}/lower_predicate_elimination.cpp
${NVFUSER_SRCS_DIR}/lower_replace_size.cpp
${NVFUSER_SRCS_DIR}/lower_shift.cpp
${NVFUSER_SRCS_DIR}/lower_sync_information.cpp
${NVFUSER_SRCS_DIR}/lower_thread_predicate.cpp
${NVFUSER_SRCS_DIR}/lower_trivial_broadcast.cpp
${NVFUSER_SRCS_DIR}/lower_trivial_reductions.cpp
${NVFUSER_SRCS_DIR}/lower_unroll.cpp
${NVFUSER_SRCS_DIR}/lower_utils.cpp
${NVFUSER_SRCS_DIR}/lower_validation.cpp
${NVFUSER_SRCS_DIR}/lower_warp_reduce.cpp
${NVFUSER_SRCS_DIR}/lower2device.cpp
${NVFUSER_SRCS_DIR}/lower_bank_conflict.cpp
${NVFUSER_SRCS_DIR}/manager.cpp
${NVFUSER_SRCS_DIR}/maxinfo_propagator.cpp
${NVFUSER_SRCS_DIR}/mutator.cpp
${NVFUSER_SRCS_DIR}/non_divisible_split.cpp
${NVFUSER_SRCS_DIR}/ops/alias.cpp
${NVFUSER_SRCS_DIR}/ops/composite.cpp
${NVFUSER_SRCS_DIR}/ops/normalization.cpp
${NVFUSER_SRCS_DIR}/parallel_dimension_map.cpp
${NVFUSER_SRCS_DIR}/parallel_type_bitmap.cpp
${NVFUSER_SRCS_DIR}/parser.cpp
${NVFUSER_SRCS_DIR}/partial_split_map.cpp
${NVFUSER_SRCS_DIR}/partition.cpp
${NVFUSER_SRCS_DIR}/predicate_compute.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_cache.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_definition.cpp
${NVFUSER_SRCS_DIR}/python_frontend/fusion_interface.cpp
${NVFUSER_SRCS_DIR}/register_interface.cpp
${NVFUSER_SRCS_DIR}/root_domain_map.cpp
${NVFUSER_SRCS_DIR}/scheduler/pointwise.cpp
${NVFUSER_SRCS_DIR}/scheduler/pointwise_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/transpose.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization.cpp
${NVFUSER_SRCS_DIR}/scheduler/reduction.cpp
${NVFUSER_SRCS_DIR}/scheduler/matmul.cpp
${NVFUSER_SRCS_DIR}/scheduler/reduction_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/registry.cpp
${NVFUSER_SRCS_DIR}/scheduler/utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/vectorize_helper.cpp
${NVFUSER_SRCS_DIR}/type_inference.cpp
${NVFUSER_SRCS_DIR}/type_promotion.cpp
${NVFUSER_SRCS_DIR}/fusion_segmenter.cpp
${NVFUSER_SRCS_DIR}/tensor_view.cpp
${NVFUSER_SRCS_DIR}/transform_iter.cpp
${NVFUSER_SRCS_DIR}/transform_replay.cpp
${NVFUSER_SRCS_DIR}/transform_rfactor.cpp
${NVFUSER_SRCS_DIR}/transform_view.cpp
${NVFUSER_SRCS_DIR}/type.cpp
${NVFUSER_SRCS_DIR}/utils.cpp
${NVFUSER_SRCS_DIR}/mma_type.cpp
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
)
add_library(${NVFUSER_CODEGEN} SHARED ${NVFUSER_SRCS})
if(NOT USE_ROCM)
target_compile_options(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
# NB: This must be target_compile_definitions, not target_compile_options,
# as the latter is not respected by nvcc
target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
else()
target_compile_options(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER_CODEGEN} PRIVATE
USE_ROCM
__HIP_PLATFORM_HCC__
)
endif()
target_link_libraries(${NVFUSER_CODEGEN} PRIVATE torch ${TORCHLIB_FLAVOR})
if(NOT USE_ROCM)
target_link_libraries(${NVFUSER_CODEGEN} PRIVATE ${CUDA_NVRTC_LIB} torch::nvtoolsext)
target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${CUDA_INCLUDE_DIRS})
else()
target_link_libraries(${NVFUSER_CODEGEN} PRIVATE ${ROCM_HIPRTC_LIB})
target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${Caffe2_HIP_INCLUDE})
endif()
if(NOT MSVC)
target_compile_options(${NVFUSER_CODEGEN} PRIVATE -Wno-unused-variable)
endif()
target_include_directories(${NVFUSER_CODEGEN}
PUBLIC $<BUILD_INTERFACE:${NVFUSER_SRCS_DIR}>)
set_property(TARGET ${NVFUSER_CODEGEN} PROPERTY CXX_STANDARD 17)
install(TARGETS ${NVFUSER_CODEGEN} EXPORT NvfuserTargets DESTINATION "${TORCH_INSTALL_LIB_DIR}")
file(WRITE "${TORCH_ROOT}/test/_nvfuser/.gitignore" "*")
# --- build nvfuser_python library
if(BUILD_PYTHON)
set(NVFUSER "${PROJECT_NAME}")
#find_package(pybind11 REQUIRED)
set(NVFUSER_PYTHON_SRCS)
list(APPEND NVFUSER_PYTHON_SRCS
${NVFUSER_SRCS_DIR}/python_frontend/python_bindings.cpp
${NVFUSER_SRCS_DIR}/python_frontend/python_bindings_extension.cpp
)
add_library(${NVFUSER} MODULE ${NVFUSER_PYTHON_SRCS})
if(NOT USE_ROCM)
target_compile_options(${NVFUSER} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
# NB: This must be target_compile_definitions, not target_compile_options,
# as the latter is not respected by nvcc
target_compile_definitions(${NVFUSER} PRIVATE "-DTORCH_CUDA_BUILD_MAIN_LIB")
target_link_libraries(${NVFUSER} PRIVATE torch::nvtoolsext)
else()
target_compile_options(${NVFUSER} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER} PRIVATE "-DTORCH_HIP_BUILD_MAIN_LIB")
target_compile_definitions(${NVFUSER} PRIVATE
USE_ROCM
__HIP_PLATFORM_HCC__
)
target_include_directories(${NVFUSER_CODEGEN} PRIVATE ${Caffe2_HIP_INCLUDE})
endif()
target_link_libraries(${NVFUSER} PRIVATE ${NVFUSER_CODEGEN})
target_link_libraries(${NVFUSER} PRIVATE torch torch_python ${TORCHLIB_FLAVOR})
target_link_libraries(${NVFUSER} PRIVATE pybind::pybind11)
target_include_directories(${NVFUSER} PRIVATE ${TORCH_ROOT})
target_compile_definitions(${NVFUSER} PRIVATE EXTENSION_NAME=_C)
target_compile_options(${NVFUSER} PRIVATE ${TORCH_PYTHON_COMPILE_OPTIONS})
# avoid using Python3_add_library, copied from functorch
set_target_properties(${NVFUSER} PROPERTIES PREFIX "" DEBUG_POSTFIX "")
if(NOT MSVC)
target_compile_options(${NVFUSER} PRIVATE -Wno-unused-variable)
set_target_properties(${NVFUSER} PROPERTIES SUFFIX ".so")
else()
set_target_properties(${NVFUSER} PROPERTIES SUFFIX ".pyd")
endif()
set_target_properties(${NVFUSER} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
${CMAKE_BINARY_DIR}/nvfuser)
set_target_properties(${NVFUSER} PROPERTIES INSTALL_RPATH "${_rpath_portable_origin}/../torch/lib")
if(TORCH_PYTHON_LINK_FLAGS AND NOT TORCH_PYTHON_LINK_FLAGS STREQUAL "")
message(STATUS "somehow this is happening")
set_target_properties(${NVFUSER} PROPERTIES LINK_FLAGS ${TORCH_PYTHON_LINK_FLAGS})
endif()
install(TARGETS ${NVFUSER} EXPORT NvfuserTargets DESTINATION ${TORCH_ROOT}/nvfuser/)
# install nvfuser python files
install(DIRECTORY "${NVFUSER_ROOT}/python/"
DESTINATION "${TORCH_ROOT}/nvfuser"
FILES_MATCHING PATTERN "*.py" )
file(WRITE "${TORCH_ROOT}/nvfuser/.gitignore" "*")
endif()
# --- generate runtime files
# The list of NVFUSER runtime files
list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/array.cu
${NVFUSER_ROOT}/runtime/block_reduction.cu
${NVFUSER_ROOT}/runtime/block_sync_atomic.cu
${NVFUSER_ROOT}/runtime/block_sync_default.cu
${NVFUSER_ROOT}/runtime/broadcast.cu
${NVFUSER_ROOT}/runtime/fp16_support.cu
${NVFUSER_ROOT}/runtime/fused_reduction.cu
${NVFUSER_ROOT}/runtime/fused_welford_helper.cu
${NVFUSER_ROOT}/runtime/fused_welford_impl.cu
${NVFUSER_ROOT}/runtime/bf16_support.cu
${NVFUSER_ROOT}/runtime/grid_broadcast.cu
${NVFUSER_ROOT}/runtime/grid_reduction.cu
${NVFUSER_ROOT}/runtime/grid_sync.cu
${NVFUSER_ROOT}/runtime/helpers.cu
${NVFUSER_ROOT}/runtime/index_utils.cu
${NVFUSER_ROOT}/runtime/random_numbers.cu
${NVFUSER_ROOT}/runtime/swizzle.cu
${NVFUSER_ROOT}/runtime/tensor.cu
${NVFUSER_ROOT}/runtime/tuple.cu
${NVFUSER_ROOT}/runtime/type_traits.cu
${NVFUSER_ROOT}/runtime/welford.cu
${NVFUSER_ROOT}/runtime/warp.cu
${NVFUSER_ROOT}/runtime/tensorcore.cu
${NVFUSER_ROOT}/runtime/memory.cu
${TORCH_ROOT}/aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh
${TORCH_ROOT}/aten/src/ATen/cuda/detail/UnpackRaw.cuh
)
if(USE_ROCM)
list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/array_rocm.cu
${NVFUSER_ROOT}/runtime/bf16_support_rocm.cu
${NVFUSER_ROOT}/runtime/block_sync_default_rocm.cu
${NVFUSER_ROOT}/runtime/warp_rocm.cu
)
endif()
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/include/nvfuser_resources")
# "stringify" NVFUSER runtime sources
# (generate C++ header files embedding the original input as a string literal)
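# (Each generated header presumably exposes the .cu text as a C++ string
# constant so the runtime sources can be embedded in the codegen library and
# handed to the runtime compiler together with generated kernels; the exact
# layout is whatever tools/stringify_file.py emits.)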
set(NVFUSER_STRINGIFY_TOOL "${NVFUSER_ROOT}/tools/stringify_file.py")
foreach(src ${NVFUSER_RUNTIME_FILES})
get_filename_component(filename ${src} NAME_WE)
set(dst "${CMAKE_BINARY_DIR}/include/nvfuser_resources/${filename}.h")
add_custom_command(
COMMENT "Stringify NVFUSER runtime source file"
OUTPUT ${dst}
DEPENDS ${src} "${NVFUSER_STRINGIFY_TOOL}"
COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst}
)
add_custom_target(nvfuser_rt_${filename} DEPENDS ${dst})
add_dependencies(${NVFUSER_CODEGEN} nvfuser_rt_${filename})
# also generate the resource headers during the configuration step
# (so tools like clang-tidy can run w/o requiring a real build)
execute_process(COMMAND
${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})
endforeach()
target_include_directories(${NVFUSER_CODEGEN} PRIVATE "${CMAKE_BINARY_DIR}/include")
# -- build tests
# note: ideally we wouldn't need USE_CUDA here, but our C++ tests are not ROCm-compatible.
if(BUILD_TEST AND USE_CUDA)
set(NVFUSER_TESTS "${PROJECT_NAME}_tests")
set(JIT_TEST_SRCS)
list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_definition.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_cache.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_SRCS_DIR}/python_frontend/test/test_nvfuser_fusion_record.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu1.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu2.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu3.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensor_factories.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_shift.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensorcore.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_view.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_transpose.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_utils.cpp)
add_executable(${NVFUSER_TESTS}
${TORCH_ROOT}/test/cpp/common/main.cpp
${TORCH_ROOT}/test/cpp/jit/test_utils.cpp
${JIT_TEST_SRCS})
target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_GTEST)
if(NOT USE_ROCM)
target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_CUDA)
else()
target_compile_definitions(${NVFUSER_TESTS} PRIVATE USE_ROCM)
endif()
target_include_directories(${NVFUSER_TESTS} PRIVATE "${NVFUSER_ROOT}" "${TORCH_ROOT}/torch/csrc/api/include/")
target_link_libraries(${NVFUSER_TESTS} PRIVATE ${NVFUSER_CODEGEN} torch ${TORCHLIB_FLAVOR} gtest_main gmock_main)
if(NOT MSVC)
target_compile_options(${NVFUSER_TESTS} PRIVATE -Wno-unused-variable)
endif()
install(TARGETS ${NVFUSER_TESTS} DESTINATION bin)
endif()
if(BUILD_NVFUSER_BENCHMARK)
set(NVFUSER_BENCHMARK "${PROJECT_NAME}_bench")
set(BENCHMARK_SRCS)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_first.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_first_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_last.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/batch_norm_channels_last_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/bert.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/broadcast.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/gelu_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/heuristic_lookup.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/shape_inference.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/instance_norm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/layer_norm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/layer_norm_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/rms_norm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/rms_norm_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/lstm_cell.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/reduction.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/softmax.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/softmax_backward.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/scale_bias_relu.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/transpose.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/matmul.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/timm.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/utils.cpp)
list(APPEND BENCHMARK_SRCS ${NVFUSER_ROOT}/benchmark/main.cpp)
add_executable(${NVFUSER_BENCHMARK} ${BENCHMARK_SRCS})
target_link_libraries(${NVFUSER_BENCHMARK} PRIVATE torch_library benchmark ${NVFUSER_CODEGEN})
target_include_directories(${NVFUSER_BENCHMARK} PRIVATE ${NVFUSER_ROOT})
if(NOT MSVC)
target_compile_options_if_supported(nvfuser_bench -Werror)
target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable)
target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy)
endif()
endif()

View File

@@ -1,35 +0,0 @@
if(USE_CUDA)
add_executable(nvfuser_bench
batch_norm_channels_first.cpp
batch_norm_channels_first_backward.cpp
batch_norm_channels_last.cpp
batch_norm_channels_last_backward.cpp
bert.cpp
broadcast.cpp
gelu_backward.cpp
heuristic_lookup.cpp
shape_inference.cpp
instance_norm.cpp
layer_norm.cpp
layer_norm_backward.cpp
rms_norm.cpp
rms_norm_backward.cpp
lstm_cell.cpp
reduction.cpp
softmax.cpp
softmax_backward.cpp
scale_bias_relu.cpp
transpose.cpp
matmul.cpp
timm.cpp
utils.cpp
main.cpp)
target_link_libraries(nvfuser_bench PRIVATE torch_library benchmark)
if(NOT MSVC)
target_compile_options_if_supported(nvfuser_bench -Werror)
target_compile_options_if_supported(nvfuser_bench -Wno-unused-variable)
target_compile_options_if_supported(nvfuser_bench -Wno-deprecated-copy)
endif()
endif()

View File

@@ -1,335 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
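// Builds the forward batch-norm fusion: contiguous input/weight/bias tensors
// plus fp32 running stats; half-precision inputs are up-cast to float for the
// computation and the output is cast back to half.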
static void setupBatchNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(running_mean);
fusion->addInput(running_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm(
input,
weight,
bias,
running_mean,
running_var,
kTraining,
momentum_ptr,
eps_ptr);
auto output = result.output;
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_BatchNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
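// Benchmark arguments are {batch size N, channels C, spatial extent H};
// the input tensor is N x C x H x H.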
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones({input_shape[1]}, options);
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
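// Approximate bytes read/written per iteration (activations in the benchmark
// dtype, running stats in fp32), so Google Benchmark reports an effective
// bandwidth alongside the timings.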
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones({input_shape[1]}, options);
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(at_weight);
auto ato_bias = c10::optional<at::Tensor>(at_bias);
auto ato_run_mean = c10::optional<at::Tensor>(at_run_mean);
auto ato_run_var = c10::optional<at::Tensor>(at_run_var);
auto output = at::batch_norm(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
clearL2Cache();
cudaDeviceSynchronize();
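// Manual timing loop: each iteration times a single at::batch_norm call with
// CudaKernelTimer (from the shared benchmark utils; elapsed() appears to be
// in milliseconds, hence the /1000.0 to report seconds) and clears the L2
// cache so every run starts cold.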
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = at::batch_norm(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_cuDNN_fp32(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_fp32,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_fp16,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_cuDNN_fp32)
// ->RangeMultiplier(2)
// cuDNN didn't make it to 1024
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_fp16,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_fp16,
setupBatchNorm,
NvFuserScheduler_BatchNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@@ -1,358 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <ATen/Operators.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
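// Builds the backward batch-norm fusion: forward activations, incoming
// gradient, and fp32 weight/running/saved statistics in; grad_input,
// grad_weight and grad_bias out (all three enabled via the output mask).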
static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto grad_output = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, DataType::Float);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
auto save_mean = makeContigTensor(1, DataType::Float);
auto save_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(grad_output);
fusion->addInput(weight);
fusion->addInput(running_mean);
fusion->addInput(running_var);
fusion->addInput(save_mean);
fusion->addInput(save_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
grad_output = castOp(DataType::Float, grad_output);
}
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm_backward(
input,
grad_output,
weight,
running_mean,
running_var,
save_mean,
save_var,
kTraining,
eps_ptr,
std::vector<bool>(3, true));
auto grad_input = result.grad_input;
auto grad_weight = result.grad_weight;
auto grad_bias = result.grad_bias;
if (dtype == DataType::Half) {
grad_input = castOp(DataType::Half, grad_input);
grad_weight = castOp(DataType::Half, grad_weight);
grad_bias = castOp(DataType::Half, grad_bias);
}
fusion->addOutput(grad_input);
fusion->addOutput(grad_weight);
fusion->addOutput(grad_bias);
}
static void NvFuserScheduler_BatchNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor weight = at::ones({input_shape[1]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[1]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_BWD(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor weight = at::ones({input_shape[1]}, fp32_options);
at::Tensor bias = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[1]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(weight);
auto ato_bias = c10::optional<at::Tensor>(bias);
auto ato_run_mean = c10::optional<at::Tensor>(run_mean);
auto ato_run_var = c10::optional<at::Tensor>(run_var);
auto ato_save_mean = c10::optional<at::Tensor>(save_mean);
auto ato_save_var = c10::optional<at::Tensor>(save_var);
auto fwd_result = at::_ops::_batch_norm_impl_index::call(
input,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
cudaDeviceSynchronize();
// Sync everything up before we start
clearL2Cache();
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
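// Time only the cuDNN backward call; std::get<3>(fwd_result) is the
// reserve-space tensor returned by the _batch_norm_impl_index forward pass.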
at::_ops::cudnn_batch_norm_backward::call(
input,
grad_out,
weight,
ato_run_mean,
ato_run_var,
save_mean,
save_var,
kEps,
std::get<3>(fwd_result));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_BWD_cuDNN_fp32(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_BWD_cuDNN_fp16(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_BWD(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_BWD_fp32,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_BWD_fp16,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
// cuDNN didn't make it to 1024
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_BWD_fp16,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_BWD_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_BWD_fp16,
setupBatchNorm_BWD,
NvFuserScheduler_BatchNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_BWD_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_BWD_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_BWD_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@@ -1,363 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupBatchNorm_nhwc(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(running_mean);
fusion->addInput(running_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm(
input,
weight,
bias,
running_mean,
running_var,
kTraining,
momentum_ptr,
eps_ptr,
true);
auto output = result.output;
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_BatchNorm_nhwc(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
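// Benchmark arguments are {batch size N, channels C, spatial extent H};
// here the input tensor is allocated directly as N x H x H x C
// (channels-last).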
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(2),
benchmark_state.range(1)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones({input_shape[3]}, options);
at::Tensor at_bias = at::zeros({input_shape[3]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[3]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
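// The baseline keeps the logical NCHW shape but materializes the data in
// channels-last memory format, so the cuDNN call sees NHWC-laid-out input.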
at::Tensor at_x = at::randn(input_shape, options)
.contiguous(c10::MemoryFormat::ChannelsLast);
at::Tensor at_weight = at::ones({input_shape[1]}, options);
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(at_weight);
auto ato_bias = c10::optional<at::Tensor>(at_bias);
auto ato_run_mean = c10::optional<at::Tensor>(at_run_mean);
auto ato_run_var = c10::optional<at::Tensor>(at_run_var);
auto output = at::batch_norm(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
clearL2Cache();
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
at::_ops::_batch_norm_impl_index::call(
at_x,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((2 * (at_x.numel() + at_weight.numel() + at_bias.numel())) *
int64_t(dataTypeSize(dtype)) +
(2 * (at_run_mean.numel() + at_run_var.numel()) *
int64_t(dataTypeSize(DataType::Float)))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc_cuDNN_fp32(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_nhwc_cuDNN_fp16(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_fp32,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32)
// ->RangeMultiplier(2)
// cuDNN didn't make it to 1024
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Permutation of TIMM sizes
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16,
setupBatchNorm_nhwc,
NvFuserScheduler_BatchNorm_nhwc,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16)
->ArgsProduct(
{{8, 16, 32, 64, 128, 256},
{24, 40, 48, 56, 72, 152, 184, 200, 368},
{7, 14, 28, 56, 112}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_fp16)
->ArgsProduct(
{{128, 256, 512, 1024, 2048},
{24, 40, 48, 56, 72, 152},
{7, 14, 28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@@ -1,383 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <ATen/Operators.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const bool kTraining = true;
const float kEps = 1e-5;
// setup fusion
auto input = makeContigTensor(4, dtype);
auto grad_output = makeContigTensor(4, dtype);
auto weight = makeContigTensor(1, DataType::Float);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
auto save_mean = makeContigTensor(1, DataType::Float);
auto save_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(grad_output);
fusion->addInput(weight);
fusion->addInput(running_mean);
fusion->addInput(running_var);
fusion->addInput(save_mean);
fusion->addInput(save_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
grad_output = castOp(DataType::Float, grad_output);
}
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto result = batch_norm_backward(
input,
grad_output,
weight,
running_mean,
running_var,
save_mean,
save_var,
kTraining,
eps_ptr,
std::vector<bool>(3, true),
true);
auto grad_input = result.grad_input;
auto grad_weight = result.grad_weight;
auto grad_bias = result.grad_bias;
if (dtype == DataType::Half) {
grad_input = castOp(DataType::Half, grad_input);
grad_weight = castOp(DataType::Half, grad_weight);
grad_bias = castOp(DataType::Half, grad_bias);
}
fusion->addOutput(grad_input);
fusion->addOutput(grad_weight);
fusion->addOutput(grad_bias);
}
static void NvFuserScheduler_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(2),
benchmark_state.range(1)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor weight = at::ones({input_shape[3]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[3]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[3]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc_BWD(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
const float kMomentum = 0.1;
const float kEps = 1e-5;
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options)
.contiguous(c10::MemoryFormat::ChannelsLast);
at::Tensor grad_out = at::randn(input_shape, options)
.contiguous(c10::MemoryFormat::ChannelsLast);
at::Tensor weight = at::ones({input_shape[1]}, fp32_options);
at::Tensor bias = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor run_var = at::ones({input_shape[1]}, fp32_options);
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(weight);
auto ato_bias = c10::optional<at::Tensor>(bias);
auto ato_run_mean = c10::optional<at::Tensor>(run_mean);
auto ato_run_var = c10::optional<at::Tensor>(run_var);
auto ato_save_mean = c10::optional<at::Tensor>(save_mean);
auto ato_save_var = c10::optional<at::Tensor>(save_var);
auto fwd_result = at::_ops::_batch_norm_impl_index::call(
input,
ato_weight,
ato_bias,
ato_run_mean,
ato_run_var,
true,
kMomentum,
kEps,
true);
cudaDeviceSynchronize();
// Sync everything up before we start
clearL2Cache();
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
at::_ops::cudnn_batch_norm_backward::call(
input,
grad_out,
weight,
ato_run_mean,
ato_run_var,
save_mean,
save_var,
kEps,
std::get<3>(fwd_result));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
cudaDeviceSynchronize();
clearL2Cache();
cudaDeviceSynchronize();
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(((3 * input.numel()) * int64_t(dataTypeSize(dtype))) +
(run_mean.numel() + run_var.numel() + save_mean.numel() +
save_var.numel() + weight.numel()) *
int64_t(dataTypeSize(DataType::Float))));
}
//------------------------------------------------------------------------------
static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Float);
}
static void Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16(
benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half);
}
// Simple aliases just for names in the printed output
static void Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half);
}
static void Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16(benchmark::State& benchmark_state) {
Baseline_BatchNorm_nhwc_BWD(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_BWD_fp32,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_BatchNorm_nhwc_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{64, 512}, {32, 128}, {2, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_BatchNorm_nhwc_BWD_cuDNN_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 64}, {2, 32}, {2, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
// RESNET and RESNEXT benchmarks
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNet_BatchNorm_nhwc_BWD_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_BWD_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Permutation of TIMM sizes
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16,
setupBatchNorm_nhwc_BWD,
NvFuserScheduler_BatchNorm_nhwc_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16)
->ArgsProduct(
{{8, 16, 32, 64, 128, 256},
{24, 40, 48, 56, 72, 152, 184, 200, 368},
{7, 14, 28, 56, 112}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_BatchNorm_nhwc_BWD_fp16)
->ArgsProduct(
{{128, 256, 512, 1024, 2048},
{24, 40, 48, 56, 72, 152},
{7, 14, 28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_ResNet_BatchNorm_nhwc_BWD_cuDNN_fp16)
->Args({256, 64, 112})
->Args({256, 64, 56})
->Args({256, 256, 56})
->Args({256, 128, 56})
->Args({256, 128, 28})
->Args({256, 512, 28})
->Args({256, 256, 28})
->Args({256, 256, 14})
->Args({256, 1024, 14})
->Args({256, 512, 14})
->Args({256, 512, 7})
->Args({256, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_ResNext_BatchNorm_nhwc_BWD_cuDNN_fp16)
->Args({128, 64, 112})
->Args({128, 128, 56})
->Args({128, 256, 56})
->Args({128, 128, 56})
->Args({128, 256, 28})
->Args({128, 512, 28})
->Args({128, 512, 14})
->Args({128, 1024, 14})
->Args({128, 1024, 7})
->Args({128, 2048, 7})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,766 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <sstream>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Build the forward fusion: divide by a (placeholder) scale, add the broadcast
// bias/mask, softmax over the last dimension, then dropout.
static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = TensorViewBuilder()
.ndims(4)
.dtype(dtype)
.contiguity({true, false, false, true})
.shape({-1, 1, 1, -1})
.build();
TensorView* tv1 = makeContigTensor(4, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
// TODO: should be input
auto d16 = IrBuilder::create<Double>(1.0);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
}
auto tv2 = div(tv1, d16);
auto tv3 = add(tv2, tv0);
auto tv10 = softmax(tv3, 3);
auto dropout_tvs = dropout(tv10, IrBuilder::create<Double>(0.9));
auto tv12 = dropout_tvs.mask;
auto tv14 = dropout_tvs.output;
if (is_fp16) {
tv14 = castOp(DataType::Half, tv14);
tv10 = castOp(DataType::Half, tv10);
tv3 = castOp(DataType::Half, tv3);
}
fusion->addOutput(tv14);
fusion->addOutput(tv12);
fusion->addOutput(tv10);
fusion->addOutput(tv3);
}
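// Backward counterpart of the fusion above: roughly dropout grad (mask times
// scale), softmax grad, then the division by the scale factor.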
static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) {
TensorView* tv0 = makeContigTensor(4, dtype);
// Strangely tv1 isn't used anywhere, need to come back to that...
TensorView* tv1 = makeContigTensor(4, dtype);
TensorView* tv2 = makeContigTensor(4, dtype);
TensorView* tv3 = makeContigTensor(4, DataType::Bool);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addInput(tv2);
fusion->addInput(tv3);
bool is_fp16 = dtype == DataType::Half;
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
tv2 = castOp(DataType::Float, tv2);
}
// TODO: should be inputs
auto d32 = IrBuilder::create<Double>(1.0);
// fusion->addInput(d32);
auto d33 = IrBuilder::create<Double>(2.0);
// fusion->addInput(d33);
auto tv4 = mul(tv2, tv3);
auto tv5 = mul(tv4, d33);
auto tv6 = mul(tv5, tv0);
auto tv7 = sum(tv6, {-1});
auto tv8 = broadcast(tv7, {false, false, false, true});
auto tv9 = mul(tv0, tv8);
auto tv10 = sub(tv6, tv9);
auto tv11 = div(tv10, d32);
if (is_fp16) {
tv10 = castOp(DataType::Half, tv10);
tv11 = castOp(DataType::Half, tv11);
}
fusion->addOutput(tv11);
fusion->addOutput(tv10);
}
static void MagicScheduler_DivMaxSoftDropFwd(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
auto y = benchmark_state.range(2);
auto z = benchmark_state.range(3);
setupDivMaxSoftmaxDropoutForward(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({w, 1, 1, z}, options);
at::Tensor t1 = at::randn({w, x, y, z}, options);
std::vector<c10::IValue> at_inputs = {t0, t1};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion({t0, t1}, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
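// Effective bandwidth: count every input and every fusion output once per iteration.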
int64_t bytes = 0;
for (auto tensor : std::vector<at::Tensor>({t0, t1})) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
static void MagicScheduler_DivMaxSoftDropBwd(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto w = benchmark_state.range(0);
auto x = benchmark_state.range(1);
auto y = benchmark_state.range(2);
auto z = benchmark_state.range(3);
setupDivMaxSoftmaxDropoutBackward(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({w, x, y, z}, options);
at::Tensor t1 = at::randn({w, x, y, z}, options);
at::Tensor t2 = at::randn({w, x, y, z}, options);
at::Tensor t3 = at::randn({w, x, y, z}, options).round().to(at::kBool);
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion({t0, t1, t2, t3}, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
// Some reason t1 isn't used, ignore it.
for (auto tensor : std::vector<at::Tensor>({t0, t2, t3})) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
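// Forward fusion: broadcast bias add, dropout, an elementwise add, then
// layer_norm; the dropout output, the tensor fed to layer_norm, the LN output,
// and the LN mean/invstd are all fusion outputs.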
static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(1, dtype);
TensorView* tv1 = makeContigTensor(1, dtype);
TensorView* tv2 = makeContigTensor(3, dtype);
TensorView* tv3 = makeContigTensor(3, dtype);
TensorView* tv4 = makeContigTensor(1, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addInput(tv2);
fusion->addInput(tv3);
fusion->addInput(tv4);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
tv2 = castOp(DataType::Float, tv2);
tv3 = castOp(DataType::Float, tv3);
tv4 = castOp(DataType::Float, tv4);
}
auto tv5 = broadcast(tv4, {true, true, false});
auto tv6 = add(tv3, tv5);
auto dropout_outs = dropout(tv6, IrBuilder::create<Double>(0.9));
auto tv8 = dropout_outs.output;
auto tv10 = dropout_outs.mask;
auto tv11 = add(tv10, tv2);
auto layer_norm_outs =
layer_norm(tv11, 1, tv0, tv1, IrBuilder::create<Double>(1e-5));
auto tv14 = layer_norm_outs.output;
auto tv21 = layer_norm_outs.mean;
auto tv26 = layer_norm_outs.invstd;
if (is_fp16) {
tv11 = castOp(DataType::Half, tv11);
tv14 = castOp(DataType::Half, tv14);
tv21 = castOp(DataType::Half, tv21);
tv26 = castOp(DataType::Half, tv26);
}
fusion->addOutput(tv8);
fusion->addOutput(tv11);
fusion->addOutput(tv14);
fusion->addOutput(tv21);
fusion->addOutput(tv26);
}
static void MagicScheduler_BiasDropoutAddLayernormFwd(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormFwd(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({z}, options);
at::Tensor t1 = at::randn({z}, options);
at::Tensor t2 = at::randn({x, y, z}, options);
at::Tensor t3 = at::randn({x, y, z}, options);
at::Tensor t4 = at::randn({z}, options);
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
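// LayerNorm backward, part 1: recompute x_hat = (x - mean) * invstd and reduce
// over the outer dimensions for the grad_bias and grad_weight terms.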
static void setupBiasDropoutAddLayernormBwd1(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv1 = makeContigTensor(3, dtype);
TensorView* tv2 = makeContigTensor(3, dtype);
TensorView* tv3 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
.contiguity({true, true, true})
.shape({-1, -1, 1})
.build();
TensorView* tv4 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
.contiguity({true, true, true})
.shape({-1, -1, 1})
.build();
fusion->addInput(tv1);
fusion->addInput(tv2);
fusion->addInput(tv3);
fusion->addInput(tv4);
if (is_fp16) {
tv1 = castOp(DataType::Float, tv1);
tv2 = castOp(DataType::Float, tv2);
tv3 = castOp(DataType::Float, tv3);
tv4 = castOp(DataType::Float, tv4);
}
auto tv7 = sub(tv2, tv3);
auto tv8 = mul(tv7, tv4);
auto tv24 = sum(tv1, {0, 1});
auto tv22 = mul(tv1, tv8);
auto tv23 = sum(tv22, {0, 1});
if (is_fp16) {
tv24 = castOp(DataType::Half, tv24);
tv23 = castOp(DataType::Half, tv23);
tv8 = castOp(DataType::Half, tv8);
}
fusion->addOutput(tv24);
fusion->addOutput(tv23);
fusion->addOutput(tv8);
}
static void MagicScheduler_BiasDropoutAddLayernormBwd1(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormBwd1(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({x, y, z}, options);
at::Tensor t1 = at::randn({x, y, z}, options);
at::Tensor t2 = at::randn({x, y, 1}, options);
at::Tensor t3 = at::randn({x, y, 1}, options);
std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getReductionHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
scheduleReduction(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
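// LayerNorm backward, part 2: the grad_input term, assembled from
// grad_out * weight, its per-row sums, the normalized input, and invstd / N.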
static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv4 = TensorViewBuilder()
.ndims(3)
.dtype(dtype)
.contiguity({true, true, true})
.shape({-1, -1, 1})
.build();
TensorView* tv5 = makeContigTensor(1, dtype);
TensorView* tv1 = makeContigTensor(3, dtype);
TensorView* tv8 = makeContigTensor(3, dtype);
fusion->addInput(tv4);
fusion->addInput(tv5);
fusion->addInput(tv1);
fusion->addInput(tv8);
if (is_fp16) {
tv4 = castOp(DataType::Float, tv4);
tv5 = castOp(DataType::Float, tv5);
tv1 = castOp(DataType::Float, tv1);
tv8 = castOp(DataType::Float, tv8);
}
auto d36 = mul(IrBuilder::create<Double>(1.0), tv1->axis(2)->extent());
auto d47 = unaryOp(UnaryOpType::Reciprocal, d36);
auto tv9 = broadcast(tv5, {true, true, false});
auto tv10 = mul(tv1, tv9);
auto tv14 = mul(tv10, tv8);
auto tv15 = sum(tv14, {2});
auto tv16 = broadcast(tv15, {false, false, true});
auto tv17 = mul(tv8, tv16);
auto tv12 = sum(tv10, {2});
auto tv13 = broadcast(tv12, {false, false, true});
auto tv11 = mul(d36, tv10);
auto tv18 = sub(tv11, tv13);
auto tv20 = mul(d47, tv4);
auto tv19 = sub(tv18, tv17);
auto tv21 = mul(tv20, tv19);
if (is_fp16) {
tv21 = castOp(DataType::Half, tv21);
}
fusion->addOutput(tv21);
}
static void MagicScheduler_BiasDropoutAddLayernormBwd2(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormBwd2(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t4 = at::randn({x, y, 1}, options);
at::Tensor t5 = at::randn({z}, options);
at::Tensor t1 = at::randn({x, y, z}, options);
at::Tensor t8 = at::randn({x, y, z}, options);
std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getPersistentHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
schedulePersistentKernel(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(3, dtype);
TensorView* tv21 = makeContigTensor(3, dtype);
fusion->addInput(tv0);
fusion->addInput(tv21);
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv21 = castOp(DataType::Float, tv21);
}
// Uncertain this is the right value, but going with it anyway
auto d34 = div(IrBuilder::create<Double>(1.0), tv0->axis(2)->extent());
auto tv25 = mul(tv21, tv0);
auto tv26 = mul(tv25, d34);
auto tv27 = sum(tv26, {0, 1});
if (is_fp16) {
tv26 = castOp(DataType::Half, tv26);
tv27 = castOp(DataType::Half, tv27);
}
fusion->addOutput(tv26);
fusion->addOutput(tv27);
}
static void MagicScheduler_BiasDropoutAddLayernormBwd3(
benchmark::State& benchmark_state,
DataType dtype) {
Fusion fusion;
FusionGuard fg(&fusion);
auto x = benchmark_state.range(0);
auto y = benchmark_state.range(1);
auto z = benchmark_state.range(2);
setupBiasDropoutAddLayernormBwd3(&fusion, dtype);
auto tvs = ir_utils::allTvs(&fusion);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({x, y, z}, options);
at::Tensor t21 = at::randn({x, y, z}, options);
std::vector<c10::IValue> at_inputs = {t0, t21};
std::vector<at::Tensor> cg_outputs;
auto norm_params = getReductionHeuristics(&fusion, at_inputs);
TORCH_CHECK(norm_params != nullptr, "Norm scheduler can't be used!");
scheduleReduction(&fusion, *norm_params);
FusionExecutor fe;
fe.compileFusion(&fusion);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
cg_outputs = fe.runFusion(at_inputs, norm_params->lparams);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
int64_t bytes = 0;
for (auto inp : at_inputs) {
auto tensor = inp.toTensor();
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
for (auto tensor : cg_outputs) {
bytes += tensor.numel() *
(int64_t)dataTypeSize(aten_to_data_type(tensor.scalar_type()));
}
benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
}
//------------------------------------------------------------------------------
static void DivMaxSoftDropFwd_fp32(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropFwd(benchmark_state, DataType::Float);
}
static void DivMaxSoftDropBwd_fp32(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropBwd(benchmark_state, DataType::Float);
}
static void DivMaxSoftDropFwd_fp16(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropFwd(benchmark_state, DataType::Half);
}
static void DivMaxSoftDropBwd_fp16(benchmark::State& benchmark_state) {
MagicScheduler_DivMaxSoftDropBwd(benchmark_state, DataType::Half);
}
static void BiasDropoutAddLayernormFwd_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormFwd_tf32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormFwd(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormBwd1_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd1(benchmark_state, DataType::Float);
}
// Use full ampere wave here
static void BiasDropoutAddLayernormBwd1_tf32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd1(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormBwd2_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd2(benchmark_state, DataType::Float);
}
static void BiasDropoutAddLayernormBwd3_fp32(
benchmark::State& benchmark_state) {
MagicScheduler_BiasDropoutAddLayernormBwd3(benchmark_state, DataType::Float);
}
//------------------------------------------------------------------------------
BENCHMARK(DivMaxSoftDropFwd_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(DivMaxSoftDropBwd_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(DivMaxSoftDropFwd_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(DivMaxSoftDropBwd_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {16, 16}, {128, 128}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormFwd_fp32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Use full ampere wave here
BENCHMARK(BiasDropoutAddLayernormFwd_tf32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {864, 864}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormBwd1_fp32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Use full ampere wave here
BENCHMARK(BiasDropoutAddLayernormBwd1_tf32)
// ->RangeMultiplier(2)
->Ranges({{32, 1024}, {128, 128}, {864, 864}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormBwd2_fp32)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(BiasDropoutAddLayernormBwd3_fp32)
->Ranges({{32, 1024}, {128, 128}, {1024, 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,367 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <sstream>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Build a fusion that broadcasts a 1D tensor along bcast_axis and adds it to a
// 2D tensor.
static void setupBroadcast(Fusion* fusion, DataType dtype, int bcast_axis) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(2, dtype);
TensorView* tv1 = makeContigTensor(1, dtype);
fusion->addInput(tv0);
fusion->addInput(tv1);
std::vector<bool> bcast_pattern(2, false);
bcast_pattern[bcast_axis] = true;
if (is_fp16) {
tv0 = castOp(DataType::Float, tv0);
tv1 = castOp(DataType::Float, tv1);
}
TensorView* tv2 = broadcast(tv1, bcast_pattern);
TensorView* tv3 = add(tv0, tv2);
if (is_fp16) {
tv3 = castOp(DataType::Half, tv3);
}
fusion->addOutput(tv3);
}
static void NvFuserScheduler_Broadcast(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 =
(bcast_dim ? at::randn({iter_size, bcast_size}, options)
: at::randn({bcast_size, iter_size}, options));
at::Tensor t1 = at::randn({iter_size}, options);
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs({t0, t1});
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs({t0, t1});
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want the CPU to run
// ahead of the GPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
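// Bytes: t0 is read and the output written (iter_size * bcast_size elements
// each), plus the 1D operand t1 is read.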
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * bcast_size * 2 + iter_size) * int64_t(dataTypeSize(dtype)));
}
static void Baseline_Broadcast(
benchmark::State& benchmark_state,
DataType dtype,
int bcast_dim) {
auto bcast_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor t0 =
(bcast_dim ? at::randn({iter_size, bcast_size}, options)
: at::randn({bcast_size, iter_size}, options));
at::Tensor t1 = at::randn({iter_size}, options);
// Sync everything up before we start
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = t0.add(t1.unsqueeze(bcast_dim));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * bcast_size * 2 + iter_size) * int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_Broadcast_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Float, 0);
}
static void Baseline_Broadcast_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Half, 0);
}
static void Baseline_Broadcast_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Float, 1);
}
static void Baseline_Broadcast_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Broadcast(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Outer_fp32,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Outer_fp16,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Inner_fp32,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Broadcast_Inner_fp16,
setupBroadcast,
NvFuserScheduler_Broadcast,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Broadcast_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,242 +0,0 @@
// Based on NVFuserTest.FusionBiasGeluBwd_CUDA
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static void setupFusion(Fusion* fusion) {
FusionGuard fg(fusion);
const float k_079 = 0.79788456;
const float k_004 = 0.044715;
const float k_010 = 0.1070322243;
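// k_079 ~= sqrt(2/pi), k_004 is the tanh-approximation cubic coefficient, and
// k_010 == 3 * k_004 * k_079. The fusion evaluates d/dx of the tanh-approximate
// GELU at x = input + bias and scales it by the incoming gradient.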
// gradient tensor
auto t0 = makeContigTensor(3, DataType::Half);
fusion->addInput(t0);
auto t1 = castOp(DataType::Float, t0);
// bias tensor
auto t2 = makeContigTensor(1, DataType::Half);
fusion->addInput(t2);
auto t3 = castOp(DataType::Float, t2);
// input tensor
auto t4 = makeContigTensor(3, DataType::Half);
fusion->addInput(t4);
auto t5 = castOp(DataType::Float, t4);
auto t6 = broadcast(t3, {true, true, false});
auto t7 = add(t6, t5);
auto t8 = mul(t7, IrBuilder::create<Double>(k_079));
auto t9 = mul(t7, IrBuilder::create<Double>(k_004));
auto t10 = mul(t9, t7);
auto t11 = add(t10, IrBuilder::create<Int>(1));
auto t12 = mul(t8, t11);
auto t13 = unaryOp(UnaryOpType::Tanh, t12);
auto t14 = mul(t7, IrBuilder::create<Double>(0.5));
auto t15 = mul(t13, t13);
auto t16 = unaryOp(UnaryOpType::Neg, t15);
auto t17 = add(t16, IrBuilder::create<Int>(1));
auto t18 = mul(t7, IrBuilder::create<Double>(k_010));
auto t19 = mul(t18, t7);
auto t20 = add(t19, IrBuilder::create<Double>(k_079));
auto t21 = mul(t17, t20);
auto t22 = mul(t14, t21);
auto t23 = add(t13, IrBuilder::create<Int>(1));
auto t24 = mul(t23, IrBuilder::create<Double>(0.5));
auto t25 = add(t22, t24);
auto t26 = mul(t25, t1);
// Save float output for validation
fusion->addOutput(t26);
auto t27 = castOp(DataType::Half, t26);
fusion->addOutput(t27);
}
static std::vector<c10::IValue> setupInputs() {
at::manual_seed(0);
auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
std::vector<int64_t> input_shape{6, 512, 4096};
std::vector<int64_t> bias_shape{4096};
auto at_input = at::randn(input_shape, options);
auto at_bias = at::randn(bias_shape, options);
auto at_grad = at::randn(input_shape, options);
return {at_grad, at_bias, at_input};
}
//------------------------------------------------------------------------------
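// The benchmarks below time each stage of the pipeline in isolation: fusion
// construction, pointwise auto-scheduling, lowering, compilation, and kernel
// execution (including GPU-only and host-side-only variants).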
static void GeluBackward_SetupFusion(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
Fusion fusion;
setupFusion(&fusion);
}
}
BENCHMARK(GeluBackward_SetupFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void GeluBackward_AutoSchedule(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Fusion fusion;
setupFusion(&fusion);
std::vector<c10::IValue> inputs = setupInputs();
benchmark_state.ResumeTiming();
// Auto-schedule
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
}
}
BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void GeluBackward_Lower(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
GpuLower gpu_lower(&fusion);
}
}
BENCHMARK(GeluBackward_Lower)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void GeluBackward_Compile(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
FusionExecutor executor;
executor.compileFusion(&fusion);
}
}
BENCHMARK(GeluBackward_Compile)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void GeluBackward_RunFusion(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
}
}
BENCHMARK(GeluBackward_RunFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void GeluBackward_RunFusion_GpuOnly(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setMeasureKernelTimeFlag(true);
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
benchmark_state.SetIterationTime(executor.kernelTimeMs() / 1000.0);
clearL2Cache();
}
}
BENCHMARK(GeluBackward_RunFusion_GpuOnly)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void GeluBackward_RunFusion_CpuOnly(benchmark::State& benchmark_state) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs();
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setExecuteKernelFlag(false);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
}
}
BENCHMARK(GeluBackward_RunFusion_CpuOnly)->Unit(benchmark::kMicrosecond);

View File

@ -1,165 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const size_t kM = shape.size();
const size_t kN = norm_shape.size();
const size_t kOuterNumDims = kM - kN;
std::vector<int64_t> outer_shape;
for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
outer_shape.push_back(shape[idx]);
}
for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
outer_shape.push_back(1);
}
auto grad_out = makeSymbolicTensor(shape.size());
auto input = makeSymbolicTensor(shape.size());
auto mean = makeConcreteTensor(outer_shape);
auto rstd = makeConcreteTensor(outer_shape);
auto weight = makeSymbolicTensor(norm_shape.size());
auto bias = makeSymbolicTensor(norm_shape.size());
fusion.addInput(grad_out);
fusion.addInput(input);
fusion.addInput(mean);
fusion.addInput(rstd);
fusion.addInput(weight);
fusion.addInput(bias);
auto grads = layer_norm_backward(
grad_out,
input,
norm_shape,
mean,
rstd,
weight,
bias,
{true, true, true});
fusion.addOutput(grads.grad_input);
fusion.addOutput(grads.grad_weight);
fusion.addOutput(grads.grad_bias);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_grad_out = at::randn(shape, options);
at::Tensor aten_input = at::randn(shape, options);
at::Tensor aten_weight = at::randn(norm_shape, options);
at::Tensor aten_bias = at::randn(norm_shape, options);
auto at_weight = c10::optional<at::Tensor>(aten_weight);
auto at_bias = c10::optional<at::Tensor>(aten_bias);
const float kEps = 1e-5;
auto aten_results =
at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
auto aten_output = std::get<0>(aten_results);
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
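// Measures only the heuristics lookup on an already-compiled kernel runtime;
// no kernels are launched inside the timed loop.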
static void LayerNormBackward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(aten_inputs).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(aten_inputs);
}
}
static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);
auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
fusion.addOutput(result.output);
fusion.addOutput(result.mean);
fusion.addOutput(result.invstd);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
static void LayerNormForward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(aten_inputs).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(aten_inputs);
}
}
BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);

View File

@ -1,171 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const size_t kM = shape.size();
const size_t kN = norm_shape.size();
const size_t kOuterNumDims = kM - kN;
std::vector<int64_t> outer_shape;
for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
outer_shape.push_back(shape[idx]);
}
for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
outer_shape.push_back(1);
}
auto grad_out = makeSymbolicTensor(shape.size());
auto input = makeSymbolicTensor(shape.size());
auto mean = makeConcreteTensor(outer_shape);
auto rstd = makeConcreteTensor(outer_shape);
auto weight = makeSymbolicTensor(norm_shape.size());
auto bias = makeSymbolicTensor(norm_shape.size());
fusion.addInput(grad_out);
fusion.addInput(input);
fusion.addInput(mean);
fusion.addInput(rstd);
fusion.addInput(weight);
fusion.addInput(bias);
auto grads = layer_norm_backward(
grad_out,
input,
norm_shape,
mean,
rstd,
weight,
bias,
{true, true, true});
fusion.addOutput(grads.grad_input);
fusion.addOutput(grads.grad_weight);
fusion.addOutput(grads.grad_bias);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_grad_out = at::randn(shape, options);
at::Tensor aten_input = at::randn(shape, options);
at::Tensor aten_weight = at::randn(norm_shape, options);
at::Tensor aten_bias = at::randn(norm_shape, options);
auto at_weight = c10::optional<at::Tensor>(aten_weight);
auto at_bias = c10::optional<at::Tensor>(aten_bias);
const float kEps = 1e-5;
auto aten_results =
at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
auto aten_output = std::get<0>(aten_results);
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
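// Same lookup benchmark, except the inputs are first wrapped in a
// KernelArgumentHolder before querying the heuristics.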
static void LayerNormBackward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(args);
}
}
static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);
auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
fusion.addOutput(result.output);
fusion.addOutput(result.mean);
fusion.addOutput(result.invstd);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
static void LayerNormForward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
for (auto _ : benchmark_state) {
// The cached heuristics lookup is the operation being timed
runtime->getMaybeHeuristicsFor(args);
}
}
BENCHMARK(LayerNormBackward_HeuristicLookup)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_HeuristicLookup)->Unit(benchmark::kMicrosecond);

View File

@ -1,316 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
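// Instance norm followed by ReLU; running stats are kept in fp32 even when the
// compute dtype is half.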
static void setupInstanceNorm(
Fusion* fusion,
DataType dtype,
bool channels_last_3d = false) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
auto input = makeContigTensor(4, dtype);
if (channels_last_3d) {
input = makeContigTensor(5, dtype);
}
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto running_mean = makeContigTensor(1, DataType::Float);
auto running_var = makeContigTensor(1, DataType::Float);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(running_mean);
fusion->addInput(running_var);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
const bool kTraining = true;
const float kMomentum = 0.1;
const float kEps = 1e-5;
auto momentum_ptr = IrBuilder::create<Double>(kMomentum);
auto eps_ptr = IrBuilder::create<Double>(kEps);
auto norm = instance_norm(
input,
weight,
bias,
running_mean,
running_var,
kTraining,
momentum_ptr,
eps_ptr,
channels_last_3d);
auto output = unaryOp(UnaryOpType::Relu, norm.output);
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
//------------------------------------------------------------------------------
static void NvFuserScheduler_InstanceNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
bool channels_last_3d = false) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(1),
benchmark_state.range(1)};
std::vector<int64_t> input_shape_3d{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x =
at::randn(channels_last_3d ? input_shape_3d : input_shape, options);
at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options);
at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options);
at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options);
at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options);
std::vector<c10::IValue> aten_inputs = {
at_x, at_weight, at_bias, at_mean, at_var};
std::vector<at::Tensor> outputs;
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
const size_t kChannels = benchmark_state.range(2);
// Read: x, weight, bias, running_mean, running_var
// Write: y, running_mean, running_var
benchmark_state.SetBytesProcessed(
benchmark_state.iterations() *
((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) +
(kChannels * 2 * 2) * dataTypeSize(DataType::Float)));
}
static void Baseline_InstanceNorm(
benchmark::State& benchmark_state,
DataType dtype,
bool channels_last_3d = false) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(1),
benchmark_state.range(1)};
std::vector<int64_t> input_shape_3d{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(1),
};
const float kMomentum = 0.1;
const float kEps = 1e-5;
const auto aten_dtype = data_type_to_aten(dtype);
at::manual_seed(0);
auto options = at::TensorOptions().dtype(aten_dtype).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
if (channels_last_3d) {
at_x = at::randn(
input_shape_3d,
options.memory_format(c10::MemoryFormat::ChannelsLast3d));
}
at::Tensor at_weight = at::ones({benchmark_state.range(2)}, options);
at::Tensor at_bias = at::zeros({benchmark_state.range(2)}, options);
at::Tensor at_mean = at::zeros({benchmark_state.range(2)}, fp32_options);
at::Tensor at_var = at::ones({benchmark_state.range(2)}, fp32_options);
auto ato_weight = c10::optional<at::Tensor>(at_weight);
auto ato_bias = c10::optional<at::Tensor>(at_bias);
auto ato_running_mean = c10::optional<at::Tensor>(at_mean);
auto ato_running_var = c10::optional<at::Tensor>(at_var);
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto norm = at::instance_norm(
at_x,
ato_weight,
ato_bias,
ato_running_mean,
ato_running_var,
true,
kMomentum,
kEps,
false);
auto output = at::relu(norm);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
const size_t kChannels = benchmark_state.range(2);
// Read: x, weight, bias, running_mean, running_var
// Write: y, running_mean, running_var
benchmark_state.SetBytesProcessed(
benchmark_state.iterations() *
((kChannels * 2 + at_x.numel() * 2) * dataTypeSize(dtype) +
(kChannels * 2 * 2) * dataTypeSize(DataType::Float)));
}
//------------------------------------------------------------------------------
static void Baseline_InstanceNorm_fp32(benchmark::State& benchmark_state) {
Baseline_InstanceNorm(benchmark_state, DataType::Float);
}
static void Baseline_InstanceNorm_fp16(benchmark::State& benchmark_state) {
Baseline_InstanceNorm(benchmark_state, DataType::Half);
}
static void Baseline_InstanceNorm_fp32_channels_last_3d(
benchmark::State& benchmark_state) {
Baseline_InstanceNorm(benchmark_state, DataType::Float, true);
}
//------------------------------------------------------------------------------
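// Each NVFUSER_BENCHMARK_DEFINE pairs a fusion setup function with a run
// function; the RUN blocks below sweep {batch, spatial, channels} ranges.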
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_InstanceNorm_fp32,
setupInstanceNorm,
NvFuserScheduler_InstanceNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_InstanceNorm_fp16,
setupInstanceNorm,
NvFuserScheduler_InstanceNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_InstanceNorm3d_channels_last_fp32,
setupInstanceNorm,
NvFuserScheduler_InstanceNorm,
DataType::Float,
true);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {128, 128}, {32, 32}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {64, 64}, {64, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {32, 32}, {128, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {16, 16}, {256, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_InstanceNorm3d_channels_last_fp32)
->RangeMultiplier(2)
->Ranges({{1, 8}, {4, 8}, {320, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_InstanceNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {128, 128}, {32, 32}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {64, 64}, {64, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {16, 16}, {256, 256}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_InstanceNorm_fp32_channels_last_3d)
->RangeMultiplier(2)
->Ranges({{2, 8}, {4, 8}, {320, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------

View File

@ -1,240 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
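// Builds a layer_norm fusion over the innermost dimension of a 2D input;
// fp16 inputs are upcast to float for the normalization and cast back on
// output.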
static void setupLayerNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
// setup fusion
auto input = makeContigTensor(2, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto layer_norm_results = layer_norm(input, 1, weight, bias, eps_ptr);
auto output = layer_norm_results.output;
if (dtype != DataType::Float) {
output = castOp(dtype, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_LayerNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[1]}, options);
at::Tensor bias = at::randn({input_shape[1]}, options);
std::vector<c10::IValue> aten_inputs({input, weight, bias});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * input.numel() + weight.numel() + bias.numel()) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_LayerNorm(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
const size_t kReductionAxis = 1;
std::vector<int64_t> norm_shape;
for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
norm_shape.push_back(input_shape[idx]);
}
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[1]}, options);
at::Tensor bias = at::randn({input_shape[1]}, options);
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = at::layer_norm(input, norm_shape, weight, bias);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * input.numel() + weight.numel() + bias.numel()) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_LayerNorm_fp32(benchmark::State& benchmark_state) {
Baseline_LayerNorm(benchmark_state, DataType::Float);
}
static void Baseline_LayerNorm_fp16(benchmark::State& benchmark_state) {
Baseline_LayerNorm(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_fp32,
setupLayerNorm,
NvFuserScheduler_LayerNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_fp16,
setupLayerNorm,
NvFuserScheduler_LayerNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,274 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
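// Builds the layer_norm backward fusion. mean and rstd are per-row fp32
// statistics with shape {N, 1} so they broadcast over the normalized
// dimension.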
static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
// setup fusion
auto grad_out = makeContigTensor(2, dtype);
auto input = makeContigTensor(2, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto mean = TensorViewBuilder()
.contiguity({false, false})
.shape({-1, 1})
.dtype(DataType::Float)
.build();
auto rstd = TensorViewBuilder()
.contiguity({false, false})
.shape({-1, 1})
.dtype(DataType::Float)
.build();
fusion->addInput(grad_out);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(mean);
fusion->addInput(rstd);
if (dtype == DataType::Half) {
grad_out = castOp(DataType::Float, grad_out);
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
}
auto layer_norm_results = layer_norm_backward(
grad_out, input, {1}, mean, rstd, weight, bias, {true, true, true});
if (dtype != DataType::Float) {
layer_norm_results.grad_input =
castOp(dtype, layer_norm_results.grad_input);
layer_norm_results.grad_bias = castOp(dtype, layer_norm_results.grad_bias);
layer_norm_results.grad_weight =
castOp(dtype, layer_norm_results.grad_weight);
}
fusion->addOutput(layer_norm_results.grad_input);
fusion->addOutput(layer_norm_results.grad_bias);
fusion->addOutput(layer_norm_results.grad_weight);
}
static void NvFuserScheduler_LayerNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
// inputs
at::manual_seed(0);
auto maybe_fp16_options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options);
at::Tensor input = at::randn(input_shape, maybe_fp16_options);
at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options);
at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{grad_out, input, weight, bias, mean, rstd});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() + weight.numel() + bias.numel() + mean.numel() +
rstd.numel()) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_LayerNorm_BWD(
benchmark::State& benchmark_state,
DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
const size_t kReductionAxis = 1;
std::vector<int64_t> norm_shape;
for (auto idx = kReductionAxis; idx < input_shape.size(); ++idx) {
norm_shape.push_back(input_shape[idx]);
}
// inputs
at::manual_seed(0);
auto maybe_fp16_options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor grad_out = at::randn(input_shape, maybe_fp16_options);
at::Tensor input = at::randn(input_shape, maybe_fp16_options);
at::Tensor weight = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor bias = at::randn({input_shape[1]}, maybe_fp16_options);
at::Tensor mean = at::randn({input_shape[0], 1}, fp32_options);
at::Tensor rstd = at::randn({input_shape[0], 1}, fp32_options);
std::array<bool, 3> output_mask = {true, true, true};
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
// Time only the backward kernel; the forward layer_norm is excluded from
// this baseline.
at::native_layer_norm_backward(
grad_out, input, norm_shape, mean, rstd, weight, bias, output_mask);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() + weight.numel() + bias.numel() + mean.numel() +
rstd.numel()) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_LayerNorm_BWD_fp32(benchmark::State& benchmark_state) {
Baseline_LayerNorm_BWD(benchmark_state, DataType::Float);
}
static void Baseline_LayerNorm_BWD_fp16(benchmark::State& benchmark_state) {
Baseline_LayerNorm_BWD(benchmark_state, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_BWD_fp32,
setupLayerNorm_BWD,
NvFuserScheduler_LayerNorm_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_LayerNorm_BWD_fp16,
setupLayerNorm_BWD,
NvFuserScheduler_LayerNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{160, 320}, {2, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_LayerNorm_BWD_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,257 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// TODO: add LSTM function to composite operations
// Function Signature: cy, hy = lstm(x, cx)
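// The 16 inputs are the four pre-chunked addends of each of the four LSTM
// gates (input, forget, cell, output); cx is the previous cell state.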
static void setupFusion(Fusion* fusion) {
FusionGuard fg(fusion);
TensorView* tvs[16];
for (size_t i = 0; i < 16; i++) {
tvs[i] = makeContigTensor(2, DataType::Float);
fusion->addInput(tvs[i]);
}
const auto cx = makeContigTensor(2, DataType::Float);
fusion->addInput(cx);
const auto in_x = add(add(add(tvs[0], tvs[1]), tvs[2]), tvs[3]);
const auto forget_x = add(add(add(tvs[4], tvs[5]), tvs[6]), tvs[7]);
const auto cell_x = add(add(add(tvs[8], tvs[9]), tvs[10]), tvs[11]);
const auto out_x = add(add(add(tvs[12], tvs[13]), tvs[14]), tvs[15]);
auto lstm_result = lstm(cx, in_x, forget_x, cell_x, out_x);
fusion->addOutput(lstm_result.cell);
fusion->addOutput(lstm_result.hidden);
}
static std::vector<c10::IValue> setupInputs(
int hidden_features,
int batch_size) {
at::manual_seed(0);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
const at::Tensor large_tensor0 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor1 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor2 =
at::randn({batch_size, hidden_features * 4}, options);
const at::Tensor large_tensor3 =
at::randn({batch_size, hidden_features * 4}, options);
const auto chunked0 = large_tensor0.chunk(4, 1);
const auto chunked1 = large_tensor1.chunk(4, 1);
const auto chunked2 = large_tensor2.chunk(4, 1);
const auto chunked3 = large_tensor3.chunk(4, 1);
std::vector<c10::IValue> inputs;
inputs.insert(inputs.end(), chunked0.begin(), chunked0.end());
inputs.insert(inputs.end(), chunked1.begin(), chunked1.end());
inputs.insert(inputs.end(), chunked2.begin(), chunked2.end());
inputs.insert(inputs.end(), chunked3.begin(), chunked3.end());
const auto at_cx = at::randn({batch_size, hidden_features}, options);
inputs.push_back(at_cx);
return inputs;
}
//------------------------------------------------------------------------------
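// The benchmarks below time individual stages of the pipeline: fusion IR
// construction, pointwise auto-scheduling, lowering, compilation, and
// end-to-end execution (with GPU-only and CPU-only variants).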
static void LstmCell_SetupFusion(benchmark::State& benchmark_state) {
for (auto _ : benchmark_state) {
Fusion fusion;
setupFusion(&fusion);
}
}
BENCHMARK(LstmCell_SetupFusion)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_AutoSchedule(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
for (auto _ : benchmark_state) {
// Setup (not included in the measurement)
benchmark_state.PauseTiming();
Fusion fusion;
setupFusion(&fusion);
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
benchmark_state.ResumeTiming();
// Auto-schedule
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
}
}
BENCHMARK(LstmCell_AutoSchedule)->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_Lower(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
GpuLower gpu_lower(&fusion);
}
}
BENCHMARK(LstmCell_Lower)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void LstmCell_Compile(benchmark::State& benchmark_state) {
constexpr int kHiddenFeatures = 512;
constexpr int kBatchSize = 64;
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(kHiddenFeatures, kBatchSize);
schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
for (auto _ : benchmark_state) {
FusionExecutor executor;
executor.compileFusion(&fusion);
}
}
BENCHMARK(LstmCell_Compile)->Unit(benchmark::kMillisecond);
//------------------------------------------------------------------------------
static void LstmCell_RunFusion(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.compileFusion(&fusion);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion, Small, 512, 64)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(LstmCell_RunFusion, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);
//------------------------------------------------------------------------------
static void LstmCell_RunFusion_GpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setMeasureKernelTimeFlag(true);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
clearL2Cache();
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
benchmark_state.SetIterationTime(executor.kernelTimeMs() / 1000.0);
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK_CAPTURE(LstmCell_RunFusion_GpuOnly, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void LstmCell_RunFusion_CpuOnly(
benchmark::State& benchmark_state,
int hidden_features,
int batch_size) {
Fusion fusion;
// setup fusion
setupFusion(&fusion);
// inputs
std::vector<c10::IValue> inputs = setupInputs(hidden_features, batch_size);
// outputs
std::vector<at::Tensor> outputs;
auto lparams = schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));
FusionExecutor executor;
executor.setExecuteKernelFlag(false);
executor.compileFusion(&fusion);
for (auto _ : benchmark_state) {
outputs = executor.runFusion(c10::ArrayRef<c10::IValue>(inputs), lparams);
}
}
BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Small, 512, 64)
->Unit(benchmark::kMicrosecond);
BENCHMARK_CAPTURE(LstmCell_RunFusion_CpuOnly, Medium, 1024, 128)
->Unit(benchmark::kMicrosecond);

View File

@ -1,3 +0,0 @@
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();

View File

@ -1,357 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/matmul.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
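// Guards to skip benchmarks on devices that lack the required compute
// capability or opt-in shared memory size.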
bool cudaArchGuardShouldSkip(int required_major, int required_minor) {
int capability_major = at::cuda::getCurrentDeviceProperties()->major;
int capability_minor = at::cuda::getCurrentDeviceProperties()->minor;
if (capability_major < required_major ||
(capability_major == required_major &&
capability_minor < required_minor)) {
return true;
}
return false;
}
bool hasRequiredSmemSize(size_t required_size) {
// Only checking device 0
return at::cuda::getDeviceProperties(0)->sharedMemPerBlockOptin >=
required_size;
}
#define NVFUSER_BENCHMARK_ARCH_SMEM_GUARD( \
REQUIRED_MAJOR, REQUIRED_MINOR, SMEM_SIZE, STATE) \
if (cudaArchGuardShouldSkip(REQUIRED_MAJOR, REQUIRED_MINOR) || \
!hasRequiredSmemSize(SMEM_SIZE)) { \
STATE.SkipWithError("Unsupported arch or not enough smem!"); \
return; \
}
// Utility to track supported matmul operand layouts.
using MatmulLayout = MmaOptions::MmaInputLayout;
static constexpr std::array<MatmulLayout, 3> kAllSupportedLayout = {
MatmulLayout::TT,
MatmulLayout::NT,
MatmulLayout::TN};
// Generic interface to get matmul op with the given layout.
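// Layouts describe the operand shapes: TT takes A[M,K] and B[K,N],
// TN takes A[M,K] and B[N,K], NT takes A[K,M] and B[K,N]
// (see atMatmul and fp16MatmulAtInput below).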
TensorView* matmul(TensorView* a, TensorView* b, MatmulLayout layout) {
TORCH_CHECK(
a->nDims() == 2 && b->nDims() == 2, "only pure matmuls for these tests");
TensorView *tv2 = nullptr, *tv0b = nullptr, *tv1b = nullptr;
switch (layout) {
case MatmulLayout::TT:
tv0b = broadcast(a, {false, false, true});
tv1b = broadcast(b, {true, false, false});
tv2 = fusedMultiplySum(tv0b, tv1b, {1});
break;
case MatmulLayout::TN:
tv0b = broadcast(a, {false, true, false});
tv1b = broadcast(b, {true, false, false});
tv2 = fusedMultiplySum(tv0b, tv1b, {2});
break;
case MatmulLayout::NT:
tv0b = broadcast(a, {false, false, true});
tv1b = broadcast(b, {false, true, false});
tv2 = fusedMultiplySum(tv0b, tv1b, {0});
break;
default:
TORCH_CHECK(false, "unsupported data layout.");
}
return tv2;
}
// Utility to compute the eager-mode (ATen) reference matmul for the given layout
at::Tensor atMatmul(at::Tensor a, at::Tensor b, MatmulLayout layout) {
switch (layout) {
case MatmulLayout::TT:
return a.matmul(b);
case MatmulLayout::TN:
return a.matmul(b.t());
case MatmulLayout::NT:
return a.t().matmul(b);
default:
TORCH_CHECK(false, "unsupported data layout.");
}
return at::Tensor();
}
// Utility to generate fp16 matmul input tensors for the given layout
std::pair<at::Tensor, at::Tensor> fp16MatmulAtInput(
int M,
int N,
int K,
MatmulLayout layout) {
auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
switch (layout) {
case MatmulLayout::TT:
return std::make_pair(
at::randn({M, K}, options), at::randn({K, N}, options));
case MatmulLayout::TN:
return std::make_pair(
at::randn({M, K}, options), at::randn({N, K}, options));
case MatmulLayout::NT:
return std::make_pair(
at::randn({K, M}, options), at::randn({K, N}, options));
default:
TORCH_CHECK(false, "unsupported data layout.");
}
return std::make_pair(at::Tensor(), at::Tensor());
}
// TODO: separate compute and schedule definition once the can-schedule
// logic and pattern matching are ready.
void setupMatmul(Fusion* fusion, MatmulLayout layout, MatmulParam params) {
// Only hgemm on the initial setup
auto a = makeContigTensor(2, DataType::Half);
auto b = makeContigTensor(2, DataType::Half);
auto c = matmul(a, b, layout);
fusion->addInput(a);
fusion->addInput(b);
fusion->addOutput(c);
scheduleMatmul(c, a, b, params);
}
static void SingleMatmulBase(
benchmark::State& benchmark_state,
MatmulLayout layout,
MatmulParam params) {
std::vector<int64_t> input_mnk{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);
// Define fusion graph
setupMatmul(fusion, layout, params);
// inputs
at::manual_seed(0);
// Tensor inputs
auto inputs = fp16MatmulAtInput(
input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(
{inputs.first, inputs.second});
// Always use 32b indexing mode for now.
TORCH_INTERNAL_ASSERT(args.getIndexMode() == KernelIndexMode::INT32);
// Compile kernel
FusionExecutor fe;
fe.compileFusion(fusion, args, LaunchParams());
// Warm up run
auto outputs = fe.runFusion({inputs.first, inputs.second});
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
cudaDeviceSynchronize();
for (auto _ : benchmark_state) {
clearL2Cache();
auto outputs = fe.runFusion({inputs.first, inputs.second});
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
cudaDeviceSynchronize();
// TODO: FLOPS calculation
}
static void EagerModeMatmul(
benchmark::State& benchmark_state,
MatmulLayout layout) {
std::vector<int64_t> input_mnk{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto inputs = fp16MatmulAtInput(
input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
// warm up run
auto outputs = atMatmul(inputs.first, inputs.second, layout);
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
outputs = atMatmul(inputs.first, inputs.second, layout);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
cudaDeviceSynchronize();
}
// Actual benchmarking
// -----------------------------------------------------------------
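// Shared memory estimate: one CTA tile of A plus one of B in half precision,
// multiplied by the number of pipeline stages (double-buffer depth).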
size_t getSmemSize(GemmTile cta_tile, int stage_number) {
return ((cta_tile.m * cta_tile.k) + (cta_tile.n * cta_tile.k)) *
dataTypeSize(DataType::Half) * stage_number;
}
// TODO: this part eventually will be automated by heuristics
MatmulParam getMatmulParams(
GemmTile cta_tile,
int stage_number,
MatmulLayout layout) {
MatMulTileOptions gemm_tile;
gemm_tile.cta_tile = cta_tile;
// TODO: pipe through split K
gemm_tile.warp_tile = GemmTile(64, 64, cta_tile.k);
gemm_tile.instruction_tile = GemmTile(16, 16, 16);
// Collect mma swizzle info
auto mma_builder =
MmaBuilder(MmaOptions::MacroType::Ampere_16_16_16, gemm_tile)
.layout(layout);
MatmulParam params(mma_builder);
params.tile_sizes = gemm_tile;
params.async_gmem_load_operands = true;
params.double_buffer_options.double_buffer_smem_write = true;
params.double_buffer_options.double_buffer_smem_read = true;
params.double_buffer_options.smem_double_buffer_stage = stage_number;
return params;
}
static void Nvfuser_Matmul_4warp3stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(128, 128, 32);
int number_of_stage = 3;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
static void Nvfuser_Matmul_8warp3stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(256, 128, 32);
int number_of_stage = 3;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
static void Nvfuser_Matmul_4warp4stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(128, 128, 32);
int number_of_stage = 4;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
static void Nvfuser_Matmul_8warp4stage(
benchmark::State& benchmark_state,
MatmulLayout layout) {
auto cta_tile = GemmTile(256, 128, 32);
int number_of_stage = 4;
auto params = getMatmulParams(cta_tile, number_of_stage, layout);
NVFUSER_BENCHMARK_ARCH_SMEM_GUARD(
8, 0, getSmemSize(cta_tile, number_of_stage), benchmark_state);
// Run benchmark:
SingleMatmulBase(benchmark_state, layout, params);
}
// ----------------------------- Benchmark Instantiation-------
// Common utils:
#define NO_TILE_QUANTIZATION_ARGS \
ArgsProduct( \
{{2048}, {3456}, benchmark::CreateDenseRange(512, 4096, /*step=*/512)}) \
->Unit(benchmark::kMicrosecond) \
->UseManualTime();
#define ForAllLayouts(run) \
run(TT, MatmulLayout::TT); \
run(TN, MatmulLayout::TN); \
run(NT, MatmulLayout::NT)
// Instantiations:
#define Nvfuser_4warp3stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_4warp3stage, \
no_quant_nvfuser_4warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Nvfuser_8warp3stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_8warp3stage, \
no_quant_nvfuser_8warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Nvfuser_4warp4stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_4warp4stage, \
no_quant_nvfuser_4warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Nvfuser_8warp4stage_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
Nvfuser_Matmul_8warp4stage, \
no_quant_nvfuser_8warp_##layout_label, \
layout) \
->NO_TILE_QUANTIZATION_ARGS
#define Eagermode_test(layout_label, layout) \
BENCHMARK_CAPTURE( \
EagerModeMatmul, no_quant_eagermode_##layout_label, layout) \
->NO_TILE_QUANTIZATION_ARGS
ForAllLayouts(Nvfuser_4warp3stage_test);
ForAllLayouts(Nvfuser_4warp4stage_test);
ForAllLayouts(Nvfuser_8warp3stage_test);
ForAllLayouts(Nvfuser_8warp4stage_test);
ForAllLayouts(Eagermode_test);

View File

@ -1,384 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <sstream>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
// Build a sum reduction over one axis of a 2D tensor; fp16 inputs are cast
// to float for the reduction and the result is cast back to fp16.
static void setupReduction(Fusion* fusion, DataType dtype, int red_axis) {
FusionGuard fg(fusion);
bool is_fp16 = dtype == DataType::Half;
TensorView* tv0 = makeContigTensor(2, dtype);
fusion->addInput(tv0);
TensorView* tv0_cast = tv0;
if (is_fp16) {
tv0_cast = castOp(DataType::Float, tv0);
}
TensorView* tv1 = sum(tv0_cast, {red_axis});
TensorView* tv1_cast = tv1;
if (is_fp16) {
tv1_cast = castOp(DataType::Half, tv1);
}
fusion->addOutput(tv1_cast);
}
static void NvFuserScheduler_Reduction(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int reduction_dim) {
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input =
(reduction_dim ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs({aten_input});
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto rparams = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(rparams + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs({aten_input});
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished, don't want to run ahead on the
// cpu while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * reduction_size + iter_size) * int64_t(dataTypeSize(dtype)));
}
static void Baseline_Reduction(
benchmark::State& benchmark_state,
DataType dtype,
int reduction_dim) {
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input =
(reduction_dim ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
// Sync everything up before we start
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto output = aten_input.sum({reduction_dim});
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(iter_size * reduction_size + iter_size) * int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void Baseline_Reduction_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Float, 0);
}
static void Baseline_Reduction_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Half, 0);
}
static void Baseline_Reduction_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Float, 1);
}
static void Baseline_Reduction_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Reduction(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Outer_fp32,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Outer_fp16,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Inner_fp32,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Reduction_Inner_fp16,
setupReduction,
NvFuserScheduler_Reduction,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1024, 1024 * 512}, {2, 4 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 4 * 1024}, {1024, 1024 * 512}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1024, 1024 * 1024}, {2, 4 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 4 * 1024}, {1024, 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 64 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 64 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Reduction_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,170 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
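// Builds an rms_norm fusion over the hidden (innermost) dimension;
// fp16 inputs are upcast to float for the normalization.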
static void setupRMSNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
FusionGuard fg(fusion);
const float kEps = 1e-6;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
// setup fusion
auto input = makeContigTensor(3, dtype);
auto weight = makeContigTensor(1, dtype);
fusion->addInput(input);
fusion->addInput(weight);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
}
auto rms_norm_results = rms_norm(input, 1, weight, eps_ptr);
auto output = rms_norm_results.output;
if (dtype != DataType::Float) {
output = castOp(dtype, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_RMSNorm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[2]}, options);
std::vector<c10::IValue> aten_inputs({input, weight});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * input.numel() + weight.numel()) * int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_fp32,
setupRMSNorm,
NvFuserScheduler_RMSNorm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{18, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{22, 44}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp32)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_fp16,
setupRMSNorm,
NvFuserScheduler_RMSNorm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{18, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{22, 44}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_fp16)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// TODO: Automatically disable/enable if bf16 is supported
// NVFUSER_BENCHMARK_DEFINE(
// NvFuserScheduler_RMSNorm_bf16,
// setupRMSNorm,
// NvFuserScheduler_RMSNorm,
// DataType::BFloat16);
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{16, 64}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{18, 56}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{22, 44}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{24, 48}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();

View File

@ -1,163 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
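// Builds the rms_norm backward fusion. rstd holds the saved per-(batch,
// sequence) normalization statistic with a trailing broadcast dimension of 1.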
static void setupRMSNorm_BWD(Fusion* fusion, DataType dtype) {
FusionGuard fg(fusion);
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
// setup fusion
auto grad_out = makeContigTensor(3, dtype);
auto input = makeContigTensor(3, dtype);
auto weight = makeContigTensor(1, dtype);
auto rstd = TensorViewBuilder()
.contiguity({false, false, false})
.shape({-1, -1, 1})
.dtype(dtype)
.build();
fusion->addInput(grad_out);
fusion->addInput(input);
fusion->addInput(weight);
fusion->addInput(rstd);
if (dtype == DataType::Half) {
grad_out = castOp(DataType::Float, grad_out);
input = castOp(DataType::Float, input);
weight = castOp(DataType::Float, weight);
rstd = castOp(DataType::Float, rstd);
}
auto rms_norm_results =
rms_norm_backward(grad_out, input, {1}, rstd, weight, {true, true, true});
if (dtype != DataType::Float) {
rms_norm_results.grad_input = castOp(dtype, rms_norm_results.grad_input);
rms_norm_results.grad_weight = castOp(dtype, rms_norm_results.grad_weight);
}
fusion->addOutput(rms_norm_results.grad_input);
fusion->addOutput(rms_norm_results.grad_weight);
}
static void NvFuserScheduler_RMSNorm_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
TORCH_INTERNAL_ASSERT(
dtype == DataType::Float || dtype == DataType::Half ||
dtype == DataType::BFloat16);
std::vector<int64_t> input_shape{8, benchmark_state.range(0), 1024};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor grad_out = at::randn(input_shape, options);
at::Tensor input = at::randn(input_shape, options);
at::Tensor weight = at::randn({input_shape[2]}, options);
at::Tensor rstd = at::randn({input_shape[0], input_shape[1], 1}, options);
std::vector<c10::IValue> aten_inputs({grad_out, input, weight, rstd});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() + weight.numel() + rstd.numel()) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_BWD_fp32,
setupRMSNorm_BWD,
NvFuserScheduler_RMSNorm_BWD,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32)
->RangeMultiplier(2)
->Ranges({{28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp32)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_RMSNorm_BWD_fp16,
setupRMSNorm_BWD,
NvFuserScheduler_RMSNorm_BWD,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16)
->RangeMultiplier(2)
->Ranges({{16, 64}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16)
->RangeMultiplier(2)
->Ranges({{28, 56}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_fp16)
->RangeMultiplier(2)
->Ranges({{24, 48}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// TODO: Automatically disable/enable if bf16 is supported
// NVFUSER_BENCHMARK_DEFINE(
// NvFuserScheduler_RMSNorm_BWD_bf16,
// setupRMSNorm_BWD,
// NvFuserScheduler_RMSNorm_BWD,
// DataType::BFloat16);
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{16, 64}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{28, 56}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();
// NVFUSER_BENCHMARK_RUN(NvFuserScheduler_RMSNorm_BWD_bf16)
// ->RangeMultiplier(2)
// ->Ranges({{24, 48}})
// ->Unit(benchmark::kMicrosecond)
// ->UseManualTime();


@@ -1,406 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static void setupSBR(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const size_t kNumberOfDims = 4;
std::vector<int64_t> bcast_shape(kNumberOfDims, 1);
bcast_shape[bcast_shape.size() - 1] = -1;
std::vector<bool> bcast_contig(kNumberOfDims, false);
bcast_contig[bcast_contig.size() - 1] = true;
auto x = makeContigTensor(kNumberOfDims, dtype);
auto scale = TensorViewBuilder()
.contiguity(bcast_contig)
.shape(bcast_shape)
.dtype(dtype)
.build();
auto bias = TensorViewBuilder()
.contiguity(bcast_contig)
.shape(bcast_shape)
.dtype(dtype)
.build();
fusion->addInput(x);
fusion->addInput(scale);
fusion->addInput(bias);
if (dtype == DataType::Half) {
x = castOp(DataType::Float, x);
scale = castOp(DataType::Float, scale);
bias = castOp(DataType::Float, bias);
}
auto scale_bias = add(mul(x, scale), bias);
auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);
if (dtype == DataType::Half) {
scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
}
fusion->addOutput(scale_bias_relu);
}
static void setupSBRNorm(Fusion* fusion, DataType dtype) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
const size_t kNumberOfDims = 4;
auto x = makeContigTensor(kNumberOfDims, dtype);
auto weight = makeContigTensor(1, dtype);
auto bias = makeContigTensor(1, dtype);
auto mean = makeContigTensor(1, dtype);
auto var = makeContigTensor(1, dtype);
fusion->addInput(x);
fusion->addInput(weight);
fusion->addInput(bias);
fusion->addInput(mean);
fusion->addInput(var);
std::vector<bool> broadcast_mask(kNumberOfDims, true);
broadcast_mask[broadcast_mask.size() - 1] = false;
if (dtype == DataType::Half) {
x = castOp(DataType::Float, x);
weight = castOp(DataType::Float, weight);
bias = castOp(DataType::Float, bias);
mean = castOp(DataType::Float, mean);
var = castOp(DataType::Float, var);
}
auto rsqrt = unaryOp(UnaryOpType::Rsqrt, var);
auto this_scale = mul(weight, rsqrt);
auto this_bias = mul(sub(bias, mean), this_scale);
auto bcast_scale = broadcast(this_scale, broadcast_mask);
auto bcast_bias = broadcast(this_bias, broadcast_mask);
auto scale_bias = add(mul(x, bcast_scale), bcast_bias);
auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);
if (dtype == DataType::Half) {
scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
}
fusion->addOutput(scale_bias_relu);
}
//------------------------------------------------------------------------------
static void NvFuserScheduler_SBR(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{1, 1, 1, -1};
// inputs
at::manual_seed(0);
std::vector<int64_t> static_bcast_shape{1, 1, 1, benchmark_state.range(2)};
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_scale = at::ones(static_bcast_shape, options);
at::Tensor at_bias = at::zeros(static_bcast_shape, options);
// inputs
std::vector<c10::IValue> aten_inputs = {at_x, at_scale, at_bias};
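// One profiled warm-up run captures the scheduler and launch parameters, which
// are attached to the benchmark label below.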
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs(aten_inputs);
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_y = at::randn(input_shape, options);
at::Tensor at_scale = at::ones(bcast_shape, options);
at::Tensor at_bias = at::zeros(bcast_shape, options);
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
auto scale = at::mul(at_x, at_scale);
auto bias = at::add(scale, at_bias);
auto output = at::relu(bias);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
clearL2Cache();
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
static void NvFuserScheduler_SBR_Norm(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones(bcast_shape, options);
at::Tensor at_bias = at::zeros(bcast_shape, options);
at::Tensor at_mean = at::zeros(bcast_shape, options);
at::Tensor at_var = at::ones(bcast_shape, options);
// inputs
std::vector<c10::IValue> aten_inputs = {
at_x, at_weight, at_bias, at_mean, at_var};
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs(aten_inputs);
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
fusion_executor_cache->profile(false);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
static void Baseline_SBR_Norm(
benchmark::State& benchmark_state,
DataType dtype) {
// N, H, W, C format
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(1),
benchmark_state.range(2)};
std::vector<int64_t> bcast_shape{1, 1, 1, benchmark_state.range(2)};
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_x = at::randn(input_shape, options);
at::Tensor at_weight = at::ones(bcast_shape, options);
at::Tensor at_bias = at::zeros(bcast_shape, options);
at::Tensor at_mean = at::zeros(bcast_shape, options);
at::Tensor at_var = at::ones(bcast_shape, options);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
CudaKernelTimer timer;
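// Eager-mode reference: rebuild scale = weight * rsqrt(var) and
// shift = (bias - mean) * scale on every iteration, then apply them and ReLU.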
auto this_scale = at::mul(at_weight, at::rsqrt(at_var));
auto this_bias = at::mul(at::sub(at_bias, at_mean), this_scale);
auto scale = at::mul(at_x, this_scale);
auto bias = at::add(scale, this_bias);
auto output = at::relu(bias);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
const size_t size =
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
const size_t channels = input_shape[3];
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
int64_t(dataTypeSize(dtype)));
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_fp32,
setupSBR,
NvFuserScheduler_SBR,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_fp16,
setupSBR,
NvFuserScheduler_SBR,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_Norm_fp32,
setupSBRNorm,
NvFuserScheduler_SBR_Norm,
DataType::Float);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_SBR_Norm_fp16,
setupSBRNorm,
NvFuserScheduler_SBR_Norm,
DataType::Half);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void Baseline_SBR_fp32(benchmark::State& benchmark_state) {
Baseline_SBR(benchmark_state, DataType::Float);
}
BENCHMARK(Baseline_SBR_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void Baseline_SBR_fp16(benchmark::State& benchmark_state) {
Baseline_SBR(benchmark_state, DataType::Half);
}
BENCHMARK(Baseline_SBR_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void Baseline_SBR_Norm_fp32(benchmark::State& benchmark_state) {
Baseline_SBR_Norm(benchmark_state, DataType::Float);
}
BENCHMARK(Baseline_SBR_Norm_fp32)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void Baseline_SBR_Norm_fp16(benchmark::State& benchmark_state) {
Baseline_SBR_Norm(benchmark_state, DataType::Half);
}
BENCHMARK(Baseline_SBR_Norm_fp16)
// ->RangeMultiplier(2)
->Ranges({{8, 8}, {640, 640}, {64, 128}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,211 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static auto getLayerBackwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const size_t kM = shape.size();
const size_t kN = norm_shape.size();
const size_t kOuterNumDims = kM - kN;
std::vector<int64_t> outer_shape;
for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
outer_shape.push_back(shape[idx]);
}
for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
outer_shape.push_back(1);
}
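// mean/rstd keep the outer dimensions and use size-1 trailing dims so they
// broadcast over the normalized axes.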
auto grad_out = makeSymbolicTensor(shape.size());
auto input = makeSymbolicTensor(shape.size());
auto mean = makeConcreteTensor(outer_shape);
auto rstd = makeConcreteTensor(outer_shape);
auto weight = makeSymbolicTensor(norm_shape.size());
auto bias = makeSymbolicTensor(norm_shape.size());
fusion.addInput(grad_out);
fusion.addInput(input);
fusion.addInput(mean);
fusion.addInput(rstd);
fusion.addInput(weight);
fusion.addInput(bias);
auto grads = layer_norm_backward(
grad_out,
input,
norm_shape,
mean,
rstd,
weight,
bias,
{true, true, true});
fusion.addOutput(grads.grad_input);
fusion.addOutput(grads.grad_weight);
fusion.addOutput(grads.grad_bias);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_grad_out = at::randn(shape, options);
at::Tensor aten_input = at::randn(shape, options);
at::Tensor aten_weight = at::randn(norm_shape, options);
at::Tensor aten_bias = at::randn(norm_shape, options);
auto at_weight = c10::optional<at::Tensor>(aten_weight);
auto at_bias = c10::optional<at::Tensor>(aten_bias);
const float kEps = 1e-5;
auto aten_results =
at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
auto aten_output = std::get<0>(aten_results);
auto aten_mean = std::get<1>(aten_results);
auto aten_rstd = std::get<2>(aten_results);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {
aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
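// Host-side benchmark: kernel launches are disabled below, so the timed loop
// measures shape inference and heuristics lookup (plus launch-parameter
// recomputation when that cache is disabled) rather than GPU execution.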
void LayerNormBackward_ShapeInference_Base(
benchmark::State& benchmark_state,
bool disable_launch_parameter_cache) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerBackwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
fec->profile(true);
fec->disableKernelLaunch();
fec->runFusionWithInputs(aten_inputs);
if (disable_launch_parameter_cache) {
fec->disableLaunchParamCache();
}
for (auto _ : benchmark_state) {
// Only host-side work runs here; kernel launch was disabled above.
fec->runFusionWithInputs(aten_inputs);
}
}
static void LayerNormBackward_ShapeInference(
benchmark::State& benchmark_state) {
LayerNormBackward_ShapeInference_Base(benchmark_state, true);
}
static void LayerNormBackward_NoShapeInferenceCachedBaseline(
benchmark::State& benchmark_state) {
LayerNormBackward_ShapeInference_Base(benchmark_state, false);
}
static auto getLayerForwardNormRuntime(
std::unique_ptr<Fusion> fusion_ptr,
std::unique_ptr<FusionExecutorCache>& fec,
std::vector<at::IValue>& aten_inputs,
std::vector<int64_t>& shape,
std::vector<int64_t>& norm_shape) {
Fusion& fusion = *fusion_ptr.get();
const float kEps = 1e-5;
Double* eps_ptr = IrBuilder::create<Double>(kEps);
auto input = makeSymbolicTensor(shape.size());
fusion.addInput(input);
auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);
fusion.addOutput(result.output);
fusion.addOutput(result.mean);
fusion.addOutput(result.invstd);
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(shape, options);
fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
aten_inputs = {aten_input};
auto cg_outputs = fec->runFusionWithInputs(aten_inputs);
return fec->getMostRecentKernelRuntime();
}
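// Forward-pass counterpart of the shape-inference benchmark above; kernel
// launches are again disabled so only host-side work is timed.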
void LayerNormForward_ShapeInferenceBase(
benchmark::State& benchmark_state,
bool disable_launch_param_cache) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
FusionGuard fg(fusion_ptr.get());
// PreAllocate
std::unique_ptr<FusionExecutorCache> fec;
std::vector<at::IValue> aten_inputs;
std::vector<int64_t> shape{20, 100, 35, 67};
std::vector<int64_t> norm_shape{67};
auto runtime = getLayerForwardNormRuntime(
std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
KernelArgumentHolder args = KernelArgumentHolder::createKernelArgumentHolder(aten_inputs);
TORCH_INTERNAL_ASSERT(
runtime->getMaybeHeuristicsFor(args).has_value());
fec->profile(true);
fec->disableKernelLaunch();
fec->runFusionWithInputs(aten_inputs);
if (disable_launch_param_cache) {
fec->disableLaunchParamCache();
}
for (auto _ : benchmark_state) {
// Only host-side work runs here; kernel launch was disabled above.
fec->runFusionWithInputs(aten_inputs);
}
}
static void LayerNormForward_NoShapeInferenceCachedBaseline(
benchmark::State& benchmark_state) {
LayerNormForward_ShapeInferenceBase(benchmark_state, false);
}
static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
LayerNormForward_ShapeInferenceBase(benchmark_state, true);
}
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
->Unit(benchmark::kMicrosecond);


@@ -1,454 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupSoftmax(
Fusion* fusion,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
// setup fusion
auto input = makeContigTensor(2, dtype);
fusion->addInput(input);
if (dtype == DataType::Half) {
input = castOp(DataType::Float, input);
}
auto output = softmax(input, reduction_axis);
if (dtype == DataType::Half) {
output = castOp(DataType::Half, output);
}
fusion->addOutput(output);
}
static void NvFuserScheduler_Softmax(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor aten_input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
std::vector<c10::IValue> aten_inputs({aten_input});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
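// The 2x factor below accounts for one read of the input and one write of the
// same-sized output per element.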
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
// Warp softmax comparison
static void Softmax_WarpReduceReference(benchmark::State& benchmark_state) {
auto dtype = DataType::Float;
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);
setupSoftmax(fusion, dtype, 1);
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(input_shape, options);
std::vector<c10::IValue> aten_inputs({aten_input});
// Schedule through magic scheduler:
SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true);
TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule(
ScheduleHeuristic::Persistent, fusion, runtime_info));
auto scheduler = SchedulerEntry::makeEntry(
ScheduleHeuristic::Persistent, fusion, runtime_info);
scheduler->schedule(fusion);
FusionExecutor fe;
fe.compileFusion(fusion);
auto outputs = fe.runFusion(aten_inputs);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
for (auto _ : benchmark_state) {
clearL2Cache();
auto outputs = fe.runFusion(aten_inputs);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
static void Softmax_WarpReduce(benchmark::State& benchmark_state) {
auto dtype = DataType::Float;
std::vector<int64_t> input_shape{
benchmark_state.range(0), benchmark_state.range(1)};
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);
setupSoftmax(fusion, dtype, 1);
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor aten_input = at::randn(input_shape, options);
std::vector<c10::IValue> aten_inputs({aten_input});
// Schedule through magic scheduler:
SchedulerRuntimeInfo runtime_info(fusion, aten_inputs, true);
TORCH_INTERNAL_ASSERT(SchedulerEntry::canSchedule(
ScheduleHeuristic::Persistent, fusion, runtime_info));
auto scheduler = SchedulerEntry::makeEntry(
ScheduleHeuristic::Persistent, fusion, runtime_info);
scheduler->schedule(fusion);
// Modify the schedule to use warp reduction
auto used_vals = fusion->usedMathVals();
for (auto tv : ir_utils::filterByType<TensorView>(used_vals)) {
for (IterDomain* id : tv->domain()->domain()) {
if (id->getParallelType() == ParallelType::TIDx) {
id->padToMultipleOfWarp();
}
}
}
FusionExecutor fe;
fe.compileFusion(fusion);
auto outputs = fe.runFusion(aten_inputs);
fe.setMeasureKernelTimeFlag(true);
// Sync everything up before we start
for (auto _ : benchmark_state) {
clearL2Cache();
auto outputs = fe.runFusion(aten_inputs);
benchmark_state.SetIterationTime(fe.kernelTimeMs() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
BENCHMARK(Softmax_WarpReduce)
->RangeMultiplier(2)
->Ranges({{8, 8}, {16 * 197, 16 * 197}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Softmax_WarpReduceReference)
->RangeMultiplier(2)
->Ranges({{8, 8}, {16 * 197, 16 * 197}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
static void Baseline_Softmax(
benchmark::State& benchmark_state,
DataType dtype,
const int reduction_axis) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor aten_input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto output = at::_softmax(aten_input, reduction_axis, false);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(2 * aten_input.numel() * int64_t(dataTypeSize(dtype))));
}
static void Baseline_Softmax_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Float, 0);
}
static void Baseline_Softmax_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Float, 1);
}
static void Baseline_Softmax_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Half, 0);
}
static void Baseline_Softmax_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Outer_fp32,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Inner_fp32,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Outer_fp16,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Inner_fp16,
setupSoftmax,
NvFuserScheduler_Softmax,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,364 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
static void setupSoftmaxBWD(
Fusion* fusion,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
// setup fusion
auto grad_output = makeContigTensor(2, dtype);
auto output = makeContigTensor(2, dtype);
auto input = makeContigTensor(2, dtype);
fusion->addInput(grad_output);
fusion->addInput(output);
fusion->addInput(input);
if (dtype == DataType::Half) {
grad_output = castOp(DataType::Float, grad_output);
output = castOp(DataType::Float, output);
input = castOp(DataType::Float, input);
}
auto grad_input = softmax_backward(grad_output, output, reduction_axis);
if (dtype == DataType::Half) {
grad_input = castOp(DataType::Half, grad_input);
}
fusion->addOutput(grad_input);
}
static void NvFuserScheduler_Softmax_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
const int reduction_axis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor grad_output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
std::vector<c10::IValue> aten_inputs({grad_output, output, input});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
static void Baseline_Softmax_BWD(
benchmark::State& benchmark_state,
DataType dtype,
const int reduction_axis) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
auto reduction_size = benchmark_state.range(0);
auto iter_size = benchmark_state.range(1);
at::Tensor input =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor grad_output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
at::Tensor output =
(reduction_axis ? at::randn({iter_size, reduction_size}, options)
: at::randn({reduction_size, iter_size}, options));
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto grad_input = at::_softmax_backward_data(
grad_output, output, reduction_axis, data_type_to_aten(dtype));
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(3 * input.numel() * int64_t(dataTypeSize(dtype))));
}
static void Baseline_Softmax_BWD_Outer_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Float, 0);
}
static void Baseline_Softmax_BWD_Inner_fp32(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Float, 1);
}
static void Baseline_Softmax_BWD_Outer_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Half, 0);
}
static void Baseline_Softmax_BWD_Inner_fp16(benchmark::State& benchmark_state) {
Baseline_Softmax_BWD(benchmark_state, DataType::Half, 1);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Outer_fp32,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Float,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Inner_fp32,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Float,
1);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Outer_fp16,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Half,
0);
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_BWD_Inner_fp16,
setupSoftmaxBWD,
NvFuserScheduler_Softmax_BWD,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Outer_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp32)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{1, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_BWD_Inner_fp16)
// ->RangeMultiplier(2)
->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,377 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
//------------------------------------------------------------------------------
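// Scaled, masked softmax followed by dropout, matching the attention pattern
// scores / sqrt(head_size) + mask -> softmax -> dropout; the 768/12 constants
// correspond to a BERT-base-sized attention block.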
static void setupSoftmaxDropout(
Fusion* fusion,
DataType dtype,
const int kReductionAxis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
FusionGuard fg(fusion);
constexpr int kHiddenSize = 768;
constexpr int kNumAttentionHeads = 12;
constexpr int kAttentionHeadSize = kHiddenSize / kNumAttentionHeads;
constexpr float kDropoutProbability = 0.9;
constexpr float kScale = 1.0f / kDropoutProbability;
// setup fusion
auto attention_scores = makeContigTensor(4, dtype);
auto attention_mask = makeContigTensor(4, dtype);
Double* divisor = IrBuilder::create<Double>();
fusion->addInput(attention_scores);
fusion->addInput(attention_mask);
fusion->addInput(divisor);
if (dtype == DataType::Half) {
attention_scores = castOp(DataType::Float, attention_scores);
attention_mask = castOp(DataType::Float, attention_mask);
}
attention_scores = div(attention_scores, divisor);
attention_scores = add(attention_scores, attention_mask);
auto attention_probs = softmax(attention_scores, kReductionAxis);
auto prob = IrBuilder::create<Double>(kDropoutProbability);
auto scale = IrBuilder::create<Double>(kScale);
auto dropout_results = dropout(attention_probs, prob, scale);
auto output = dropout_results.output;
if (dtype == DataType::Half) {
attention_scores = castOp(DataType::Half, attention_scores);
attention_probs = castOp(DataType::Half, attention_probs);
output = castOp(DataType::Half, output);
}
fusion->addOutput(attention_scores);
fusion->addOutput(attention_probs);
fusion->addOutput(output);
fusion->addOutput(dropout_results.mask);
}
static void NvFuserScheduler_SoftmaxDropout(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
const int kReductionAxis) {
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
// reduce across 1, [256, 12, 100, 8]
std::vector<int64_t> input_shape{256, 12, 100, benchmark_state.range(0)};
constexpr int kHiddenSize = 768;
constexpr int kNumAttentionHeads = 12;
constexpr int kAttentionHeadSize = kHiddenSize / kNumAttentionHeads;
constexpr float kDropoutProbability = 0.9;
constexpr float kScale = 1.0f / kDropoutProbability;
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor at_scores = at::randn(input_shape, options);
at::Tensor at_mask = at::randn(input_shape, options);
std::vector<c10::IValue> aten_inputs(
{at_scores, at_mask, sqrt(kAttentionHeadSize)});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// 5 dtype: attention_scores + attention_mask + attention_scores_out +
// attention_probs_out + output
// 1 bool: dropout_results.mask
// All the same size
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * 5 * at_scores.numel() *
int64_t(dataTypeSize(dtype)) +
// bool mask
int64_t(benchmark_state.iterations()) * at_scores.numel() *
int64_t(dataTypeSize(DataType::Bool)));
}
//------------------------------------------------------------------------------
static void Baseline_Softmax_Dropout(
benchmark::State& benchmark_state,
const int kReductionAxis,
DataType dtype) {
std::vector<int64_t> input_shape{256, 12, 100, benchmark_state.range(0)};
constexpr int kHiddenSize = 768;
constexpr int kNumAttentionHeads = 12;
constexpr float kDropoutProbability = 0.1;
constexpr int kAttentionHeadSize = kHiddenSize / kNumAttentionHeads;
// inputs
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
at::Tensor attention_scores = at::randn(input_shape, options);
at::Tensor at_y = at::randn(input_shape, options);
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
attention_scores = attention_scores / sqrt(kAttentionHeadSize);
attention_scores = attention_scores + at_y;
auto attention_probs =
at::_softmax(attention_scores, kReductionAxis, false);
attention_probs = at::dropout(attention_probs, kDropoutProbability, true);
// Record
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
// Sync everything up before we're finished; we don't want to run ahead on the
// CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
// 5 dtype: attention_scores + attention_mask + attention_scores_out +
// attention_probs_out + output
// 1 bool: dropout_results.mask
// All the same size
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * 5 * attention_scores.numel() *
int64_t(dataTypeSize(dtype)) +
// bool mask
int64_t(benchmark_state.iterations()) * attention_scores.numel() *
int64_t(dataTypeSize(DataType::Bool)));
}
//------------------------------------------------------------------------------
static void Baseline_Softmax_Dropout_Inner_fp32(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 3, DataType::Float);
}
static void Baseline_Softmax_Dropout_Outer_fp32(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 1, DataType::Float);
}
static void Baseline_Softmax_Dropout_Inner_fp16(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 3, DataType::Half);
}
static void Baseline_Softmax_Dropout_Outer_fp16(
benchmark::State& benchmark_state) {
Baseline_Softmax_Dropout(benchmark_state, 1, DataType::Half);
}
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Inner_fp32,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Float,
3);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Inner_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Outer_fp32,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Float,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Outer_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Inner_fp16,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Half,
3);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Inner_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_Softmax_Dropout_Outer_fp16,
setupSoftmaxDropout,
NvFuserScheduler_SoftmaxDropout,
DataType::Half,
1);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_Softmax_Dropout_Outer_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_Dropout_Inner_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Dropout_Outer_fp32)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Softmax_Dropout_Inner_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Softmax_Dropout_Outer_fp16)
->Arg(8)
->Arg(16)
->Arg(24)
->Arg(32)
->Arg(40)
->Arg(48)
->Arg(56)
->Arg(64)
->Arg(72)
->Arg(80)
->Arg(88)
->Arg(96)
->Arg(104)
->Arg(112)
->Arg(120)
->Arg(128)
->Unit(benchmark::kMicrosecond)
->UseManualTime();


@@ -1,738 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <benchmarks/cpp/nvfuser/utils.h>
using namespace torch::jit::fuser::cuda;
static void setup_vit_base_patch16_224_bcast7(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t2 = makeContigTensor(3, DataType::Float);
auto t3 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
auto t4 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
auto t7 = makeContigTensor(3, DataType::Half);
fusion->addInput(t2);
fusion->addInput(t3);
fusion->addInput(t4);
fusion->addInput(t7);
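// t3/t4 are presumably the saved mean and rstd: t10/t11 recompute the
// normalized input, and the reductions below produce layer-norm-backward-style
// grad_weight (sum(dy * x_hat)) and grad_bias (sum(dy)) over the outer dims.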
auto t8 = castOp(DataType::Float, t7);
auto t9 = set(t8);
auto t10 = sub(t2, t3);
auto t11 = mul(t10, t4);
auto t25 = mul(t9, t11);
auto t26 = sum(t25, {0, 1});
auto t36 = set(t26);
auto t27 = sum(t9, {0, 1});
auto t37 = set(t27);
auto t39 = castOp(DataType::Half, t11);
fusion->addOutput(t36);
fusion->addOutput(t37);
fusion->addOutput(t39);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t2 = at::randn(input_shape, fp32_options);
auto t3 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t4 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t7 = at::randn(input_shape, fp16_options);
std::vector<c10::IValue> aten_inputs({t2, t3, t4, t7});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensor - float + halfx2 - t2, t7, t39
// Inner most dimension only - floatx2 - t36, t37
// Outer two dimensions only - floatx2 - t3, t4
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
// t2 + t7 t3 + t4 t36 + t37
t2.numel() * (4 + 2) + t3.numel() * 4 * 2 + input_shape[2] * (4 * 2) +
// T39
t2.numel() * 2);
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7,
setup_vit_base_patch16_224_bcast7,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast7,
nullptr);
// pwise case, broadcasting both sides
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast7)
->Args({64, 197, 768})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
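// The next fusion reproduces a bias-add + dropout + residual-add + LayerNorm
// segment, apparently traced from TIMM's vit_base_patch16_224; tensor numbering
// follows the original trace.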
static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t2 = makeContigTensor(3, DataType::Float);
auto t5 = makeContigTensor(1, DataType::Float);
auto t3 = makeContigTensor(3, DataType::Half);
auto t0 = makeContigTensor(1, DataType::Float);
auto t1 = makeContigTensor(1, DataType::Float);
fusion->addInput(t2);
fusion->addInput(t5);
fusion->addInput(t3);
fusion->addInput(t0);
fusion->addInput(t1);
std::vector<bool> bcast_pattern0({true, true, false});
std::vector<bool> bcast_pattern1({false, false, true});
auto t4 = castOp(DataType::Float, t3);
auto t6 = set(t5);
auto t7 = broadcast(t6, bcast_pattern0);
auto t8 = add(t4, t7);
auto t9 = rand_like(t8);
auto d34 =
sub(IrBuilder::create<Double>(1.0), IrBuilder::create<Double>(0.0));
auto t10 = lt(t9, d34);
auto t11 = castOp(DataType::Float, t10);
auto t12 = mul(t8, t11);
auto b36 = eq(d34, IrBuilder::create<Double>(0.0));
auto d37 = castOp(DataType::Double, b36);
auto d38 = add(d37, d34);
auto d40 = div(IrBuilder::create<Double>(1.0), d38);
auto t13 = mul(t12, d40);
auto t14 = set(t13);
auto t15 = add(t2, t14);
auto t16 = set(t15);
auto t36 = sum(t16, {2});
auto d151 = castOp(DataType::Double, t2->axis(2)->extent());
auto d152 = mul(IrBuilder::create<Double>(1.0), d151);
auto t19 = div(t36, d152);
auto t22 = broadcast(t19, bcast_pattern1);
auto t23 = sub(t16, t22);
auto t37 = mul(t23, t23);
auto t20 = sum(t37, {2});
auto t24 = broadcast(t20, bcast_pattern1);
auto d95 = castOp(DataType::Double, t2->axis(2)->extent());
auto d105 = reciprocal(d95);
auto t25 = mul(t24, d105);
auto t26 = add(t25, IrBuilder::create<Double>(1e-6));
auto t27 = rsqrt(t26);
auto t28 = mul(t23, t27);
auto t17 = set(t1);
auto t29 = broadcast(t17, bcast_pattern0);
auto t30 = mul(t28, t29);
auto t18 = set(t0);
auto t31 = broadcast(t18, bcast_pattern0);
auto t32 = add(t30, t31);
auto t33 = set(t32);
auto t34 = castOp(DataType::Half, t33);
fusion->addOutput(t16); // full 3d float
fusion->addOutput(t10); // full 3d bool
fusion->addOutput(t22); // bcast last dim float
fusion->addOutput(t27); // bcast last dim float
fusion->addOutput(t18); // passthrough t0 float
fusion->addOutput(t17); // passthrough t1 float
fusion->addOutput(t34); // full 3d half
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t2 = at::randn(input_shape, fp32_options);
auto t5 = at::randn({input_shape[2]}, fp32_options);
auto t3 = at::randn(input_shape, fp16_options);
auto t0 = at::randn({input_shape[2]}, fp32_options);
auto t1 = at::randn({input_shape[2]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t2, t5, t3, t0, t1});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
  // Full tensor - floatx2, halfx2, bool - t2, t16, t3, t34, t10
// Inner most dim only - floatx5 - t5, t0, t1, t7, t17
// Outer two dims only - floatx2 - t22, t27
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t2.numel() * (2 * 4 + 2 * 2 + 1) + t5.numel() * 5 * 4 +
       input_shape[0] * input_shape[1] * 2 * 4));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW,
setup_vit_base_patch16_224_bcast5,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5,
nullptr);
// Broadcast on both sides
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_vit_base_patch16_224_bcast5_NCHW)
->Args({64, 197, 768})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_bcast_outer2(
Fusion* fusion,
void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Half);
auto t2 = makeContigTensor(1, DataType::Float);
fusion->addInput(t0);
fusion->addInput(t2);
auto t1 = castOp(DataType::Float, t0);
auto t3 = set(t2);
auto t4 = broadcast(t3, {true, true, false});
auto t5 = add(t1, t4);
auto t6 = castOp(DataType::Half, t5);
auto t7 = castOp(DataType::Half, t3);
fusion->addOutput(t6);
fusion->addOutput(t7);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
auto t2 = at::randn({input_shape[2]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t0, t2});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensor - halfx2 - t0, t6
  // inner dimension only - float + half - t2, t7
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (2 + 2) + input_shape[2] * (2 + 4)));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2,
setup_vit_base_patch16_224_bcast_outer2,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer2,
nullptr);
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer2)
->Args({64, 197, 2304})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(4, DataType::Half);
fusion->addInput(t0);
auto d13 = IrBuilder::create<Double>();
fusion->addInput(d13);
auto t1 = castOp(DataType::Float, t0);
auto t2 = set(t1);
auto t3 = mul(t2, d13);
auto t4 = set(t3);
auto t5 = max(t4, {3});
auto t6 = broadcast(t5, {false, false, false, true});
auto t7 = sub(t4, t6);
auto t8 = exp(t7);
auto t9 = sum(t8, {3});
auto t10 = broadcast(t9, {false, false, false, true});
auto t11 = reciprocal(t10);
auto t12 = mul(t8, t11);
auto t13 = rand_like(t12);
auto d79 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
auto t14 = lt(t13, d79);
auto t15 = castOp(DataType::Float, t14);
auto b81 = eq(d79, IrBuilder::create<Double>(0));
auto d82 = castOp(DataType::Double, b81);
auto d83 = add(d82, d79);
auto d85 = div(IrBuilder::create<Double>(1), d83);
auto t16 = mul(t12, t15);
auto t17 = mul(t16, d85);
auto t18 = set(t17);
auto t19 = castOp(DataType::Half, t18);
fusion->addOutput(t19);
fusion->addOutput(t14);
fusion->addOutput(t12);
fusion->addOutput(t4);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
std::vector<c10::IValue> aten_inputs({t0, 0.125});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// Full tensors - floatx2, half x2, bool - t12, t4, t0, t19, t14
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * t0.numel() * 13);
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3,
setup_vit_base_patch16_224_norm_inner3,
NvFuserScheduler_TIMM_vit_base_patch16_224_norm_inner3,
nullptr);
// Norm inner dim
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_norm_inner3)
->Args({64, 12, 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_bcast_outer6(
Fusion* fusion,
void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Half);
auto t2 = makeContigTensor(1, DataType::Float);
fusion->addInput(t0);
fusion->addInput(t2);
auto t1 = castOp(DataType::Float, t0);
auto t3 = set(t2);
auto t4 = broadcast(t3, {true, true, false});
auto t5 = add(t1, t4);
auto t6 = set(t5);
auto t7 = mul(t6, IrBuilder::create<Double>(0.707106));
auto t8 = erf(t7);
auto t9 = add(IrBuilder::create<Double>(1), t8);
auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
auto t11 = mul(t6, t10);
auto t12 = rand_like(t11);
auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
auto t13 = lt(t12, d66);
auto t14 = castOp(DataType::Float, t13);
auto t15 = mul(t11, t14);
auto b68 = eq(d66, IrBuilder::create<Double>(0));
auto d69 = castOp(DataType::Double, b68);
auto d70 = add(d69, d66);
auto d72 = div(IrBuilder::create<Double>(1), d70);
auto t16 = mul(t15, d72);
auto t17 = set(t16);
auto t18 = castOp(DataType::Half, t17);
auto t19 = castOp(DataType::Half, t3);
fusion->addOutput(t18);
fusion->addOutput(t13);
fusion->addOutput(t6);
fusion->addOutput(t19);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
auto t2 = at::randn({input_shape[2]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t0, t2});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensors - float, halfx2, bool - t6, t0, t18, t13
// inner dimension only - float, half - t2, t19
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (2 + 2 + 1 + 4) + input_shape[2] * (4 + 2)));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6,
setup_vit_base_patch16_224_bcast_outer6,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_outer6,
nullptr);
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_outer6)
// First size is original, the rest are variations to check perf
// reliability.
->Args({64, 197, 3 * 1024})
->Args({64, 197, 2 * 1024})
->Args({64, 197, 1024})
->Args({64, 197, 512})
->Args({3, 1024, 64 * 197})
->Args({2, 1024, 64 * 197})
->Args({1, 1024, 64 * 197})
->Args({2, 256, 64 * 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
// Reverse the broadcast dimensions to check for consistency in scheduling.
static void setup_vit_base_patch16_224_bcast_inner6(
Fusion* fusion,
void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Half);
auto t2 = makeContigTensor(2, DataType::Float);
fusion->addInput(t0);
fusion->addInput(t2);
auto t1 = castOp(DataType::Float, t0);
auto t3 = set(t2);
auto t4 = broadcast(t3, {false, false, true});
auto t5 = add(t1, t4);
auto t6 = set(t5);
auto t7 = mul(t6, IrBuilder::create<Double>(0.707106));
auto t8 = erf(t7);
auto t9 = add(IrBuilder::create<Double>(1), t8);
auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
auto t11 = mul(t6, t10);
auto t12 = rand_like(t11);
auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
auto t13 = lt(t12, d66);
auto t14 = castOp(DataType::Float, t13);
auto t15 = mul(t11, t14);
auto b68 = eq(d66, IrBuilder::create<Double>(0));
auto d69 = castOp(DataType::Double, b68);
auto d70 = add(d69, d66);
auto d72 = div(IrBuilder::create<Double>(1), d70);
auto t16 = mul(t15, d72);
auto t17 = set(t16);
auto t18 = castOp(DataType::Half, t17);
auto t19 = castOp(DataType::Half, t3);
fusion->addOutput(t18);
fusion->addOutput(t13);
fusion->addOutput(t6);
fusion->addOutput(t19);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options);
auto t2 = at::randn({input_shape[0], input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs({t0, t2});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// full tensors - float, halfx2, bool - t6, t0, t18, t13
// outer two dimensions only - float, half - t2, t19
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (2 + 2 + 1 + 4) +
       input_shape[0] * input_shape[1] * (4 + 2)));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6,
setup_vit_base_patch16_224_bcast_inner6,
NvFuserScheduler_TIMM_vit_base_patch16_224_bcast_inner6,
nullptr);
NVFUSER_BENCHMARK_RUN(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_bcast_inner6)
->Args({64, 197, 3 * 1024})
->Args({64, 197, 2 * 1024})
->Args({64, 197, 1024})
->Args({64, 197, 512})
->Args({3, 1024, 64 * 197})
->Args({2, 1024, 64 * 197})
->Args({1, 1024, 64 * 197})
->Args({2, 256, 64 * 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void setup_vit_base_patch16_224_LN_BWD(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t0 = makeContigTensor(3, DataType::Bool);
fusion->addInput(t0);
auto t1 = makeContigTensor(3, DataType::Half);
fusion->addInput(t1);
auto t2 = castOp(DataType::Float, t1);
auto t3 = makeContigTensor(3, DataType::Half);
fusion->addInput(t3);
auto t4 = castOp(DataType::Float, t3);
auto d35 = t3->axis(2)->extent();
auto t5 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
fusion->addInput(t5);
auto t6 = TensorViewBuilder()
.shape({-1, -1, 1})
.dtype(DataType::Float)
.contiguity({true, true, false})
.build();
fusion->addInput(t6);
auto t7 = makeContigTensor(1, DataType::Half);
fusion->addInput(t7);
auto t8 = castOp(DataType::Float, t7);
auto t9 = makeContigTensor(1, DataType::Half);
fusion->addInput(t9);
auto t11 = sub(t4, t5);
auto t12 = mul(t11, t6);
auto t13 = broadcast(t8, {true, true, false});
auto t14 = mul(t2, t13);
auto t15 = mul(d35, t14);
auto t16 = sum(t14, {2});
auto t17 = broadcast(t16, {false, false, true});
auto t18 = mul(t14, t12);
auto t19 = sum(t18, {2});
auto t20 = broadcast(t19, {false, false, true});
auto t40 = castOp(DataType::Half, t12);
auto t41 = castOp(DataType::Float, t40);
auto t42 = castOp(DataType::Half, t20);
auto t43 = castOp(DataType::Float, t42);
auto t21 = mul(t42, t43);
auto t38 = castOp(DataType::Half, t15);
auto t39 = castOp(DataType::Float, t38);
auto t44 = castOp(DataType::Half, t17);
auto t45 = castOp(DataType::Float, t44);
auto t22 = sub(t39, t45);
auto t23 = sub(t22, t21);
auto d87 = reciprocal(d35);
auto t24 = mul(d87, t6);
auto t25 = mul(t24, t23);
auto t26 = mul(t2, t41);
auto t27 = sum(t26, {0, 1});
auto t28 = sum(t2, {0, 1});
auto t29 = castOp(DataType::Float, t0);
auto t30 = mul(t25, t29);
auto d33 = IrBuilder::create<Double>();
fusion->addInput(d33);
auto t31 = mul(t30, d33);
auto t32 = sum(t31, {0, 1});
auto t33 = castOp(DataType::Half, t32);
auto t34 = castOp(DataType::Half, t31);
auto t35 = castOp(DataType::Half, t25);
auto t36 = castOp(DataType::Half, t27);
auto t37 = castOp(DataType::Half, t28);
fusion->addOutput(t33);
fusion->addOutput(t34);
fusion->addOutput(t35);
fusion->addOutput(t36);
fusion->addOutput(t37);
}
static void NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
at::manual_seed(0);
// auto bool_options = at::TensorOptions().dtype(at::kBool).device(at::kCUDA,
// 0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto fp32_options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn(input_shape, fp16_options).to(at::kBool);
auto t1 = at::randn(input_shape, fp16_options);
auto t3 = at::randn(input_shape, fp16_options);
auto t5 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t6 = at::randn({input_shape[0], input_shape[1], 1}, fp32_options);
auto t7 = at::randn({input_shape[2]}, fp16_options);
auto t9 = at::randn({input_shape[2]}, fp16_options);
std::vector<c10::IValue> aten_inputs({t0, t1, t3, t5, t6, t7, t9, 1.0});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// Full tensors - bool, halfx4 - t0, t1, t3, t34, t35
// Outer two dimensions - floatx2 - t5, t6
// Inner dimension - halfx5 - t7, t9, t33, t36, t37
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (t0.numel() * (4 * 2 + 1) + t5.numel() * 4 * 2 + t7.numel() * 5 * 2));
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD,
setup_vit_base_patch16_224_LN_BWD,
NvFuserScheduler_TIMM_vit_base_patch16_224_LN_BWD,
nullptr);
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_NCHW_vit_base_patch16_224_LN_BWD)
->Args({128, 197, 768})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
static void nhwc_seresnet152d_transpose65(Fusion* fusion, void* null) {
FusionGuard fg(fusion);
auto t2 = makeContigTensor(4, DataType::Half);
auto t5 = makeContigTensor(4, DataType::Half);
auto t7 = makeContigTensor(4, DataType::Half);
auto t9 = makeContigTensor(4, DataType::Half);
auto t4 = makeConcreteTensor({}, DataType::Half);
fusion->addInput(t2);
fusion->addInput(t5);
fusion->addInput(t7);
fusion->addInput(t9);
fusion->addInput(t4);
auto d86 = IrBuilder::create<Double>(0);
auto t3 = castOp(DataType::Float, t2);
auto t6 = castOp(DataType::Float, t5);
auto t8 = castOp(DataType::Float, t7);
auto t10 = castOp(DataType::Float, t9);
auto t11 = add(t8, t10);
auto t12 = set(t11);
auto t13 = set(t6);
auto t14 = lt(t13, d86);
auto t15 = broadcast(t4, {true, true, true, true});
auto t16 = where(t14, t15, t12);
auto t17 = set(t16);
auto t29 = castOp(DataType::Half, t17);
auto t18 = mul(t17, t3);
auto t19 = permute(t18, {0, 2, 3, 1});
auto t30 = castOp(DataType::Half, t19);
fusion->addOutput(t29);
fusion->addOutput(t30);
}
static void NvFuserScheduler_nhwc_seresnet152d_transpose65(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
void* null) {
std::vector<int64_t> input_shape{
benchmark_state.range(0),
benchmark_state.range(2),
benchmark_state.range(2),
benchmark_state.range(1)};
at::manual_seed(0);
auto fp16_options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
auto t2 = at::randn(input_shape, fp16_options);
auto t5 = at::randn(input_shape, fp16_options);
auto t7 = at::randn(input_shape, fp16_options);
auto t9 = at::randn(input_shape, fp16_options);
  // Need a zero-dim tensor; not sure how to create one directly, so just
  // reduce a 1D tensor instead.
auto t4 = at::randn({2}, fp16_options).sum();
std::vector<c10::IValue> aten_inputs({t2, t5, t7, t9, t4});
runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
// Full tensors - halfx6 - t2, t5, t7, t9, t29, t30
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) * t2.numel() * 6 * 2);
}
NVFUSER_BENCHMARK_DEFINE(
NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65,
nhwc_seresnet152d_transpose65,
NvFuserScheduler_nhwc_seresnet152d_transpose65,
nullptr);
// Norm inner dim Half version of vit_base_patch16_224_norm_inner3
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_TIMM_nhwc_seresnet152d_transpose65)
->Args({128, 12, 197})
->Unit(benchmark::kMicrosecond)
->UseManualTime();

View File

@ -1,476 +0,0 @@
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <cuda_runtime.h>
#include <benchmarks/cpp/nvfuser/utils.h>
#define TRANSPOSE_CONFIG {true, false, false, false}
using namespace torch::jit::fuser::cuda;
struct TransposeConfig {
bool input1_transpose_axes = false;
bool input2_transpose_axes = false;
bool intermediate_transpose_axes = false;
bool output_transpose_axes = false;
};
std::vector<at::Tensor> generateInputs(
DataType dtype,
int num_dims,
std::pair<int, int> axes,
int perm_size,
int innerdim_size,
bool input1_transpose_axes,
bool input2_transpose_axes,
bool non_vectorize_offset = false,
int iter_size = 32) {
at::manual_seed(0);
auto options =
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
std::vector<int64_t> transpose_shape(num_dims, iter_size);
transpose_shape[axes.second] = innerdim_size;
transpose_shape[axes.first] = perm_size;
std::vector<int64_t> non_transpose_shape(num_dims, iter_size);
non_transpose_shape[axes.first] = innerdim_size;
non_transpose_shape[axes.second] = perm_size;
// TensorType: Concrete, Contig, Symbolic
// Vectorization | Unroll - Add 1 to sizes
// Shift axis by 1 to disable vectorize loads
if (non_vectorize_offset) {
for (auto idx : c10::irange(transpose_shape.size())) {
transpose_shape[idx] += 1;
}
for (auto idx : c10::irange(non_transpose_shape.size())) {
non_transpose_shape[idx] += 1;
}
}
auto optionalTransposeSize =
[&transpose_shape, &non_transpose_shape](bool transpose_tensor) {
return (transpose_tensor) ? transpose_shape : non_transpose_shape;
};
at::Tensor aten_input1 =
at::randn(optionalTransposeSize(input1_transpose_axes), options);
at::Tensor aten_input2 =
at::randn(optionalTransposeSize(input2_transpose_axes), options);
return {aten_input1, aten_input2};
}
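// Worked example (hypothetical sizes): with num_dims = 3, axes = {0, 2},
// perm_size = P, innerdim_size = I, and the default iter_size = 32, the
// transposed input has shape {P, 32, I} and the non-transposed input has shape
// {I, 32, P}, so the two line up once axes 0 and 2 are swapped.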
//------------------------------------------------------------------------------
static void setupTranspose(
Fusion* fusion,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
FusionGuard fg(fusion);
auto optionalTranspose = [axes](TensorView* tv, bool is_transpose) {
return (is_transpose) ? transpose(tv, axes.first, axes.second) : tv;
};
auto input1 = makeContigTensor(num_dims, dtype);
auto input2 = makeContigTensor(num_dims, dtype);
fusion->addInput(input1);
fusion->addInput(input2);
auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes);
auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes);
auto intermediate = add(ot_input1, ot_input2);
auto ot_intermediate =
optionalTranspose(intermediate, tc.intermediate_transpose_axes);
auto output = relu(ot_intermediate);
auto ot_output = optionalTranspose(output, tc.output_transpose_axes);
fusion->addOutput(ot_output);
}
static void NvFuserScheduler_Transpose(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
auto aten_inputs = generateInputs(
dtype,
num_dims,
axes,
benchmark_state.range(0),
benchmark_state.range(1),
tc.input1_transpose_axes,
tc.input2_transpose_axes);
auto at_input1 = aten_inputs[0];
auto at_input2 = aten_inputs[1];
std::vector<c10::IValue> fuser_inputs = {at_input1, at_input2};
runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs);
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
((at_input1.numel() * 3) * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
#define NVFUSER_TRANSPOSE_SQUARE_RUN( \
TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
NVFUSER_BENCHMARK_DEFINE( \
TITLE, \
setupTranspose, \
NvFuserScheduler_Transpose, \
DTYPE, \
NUM_DIMS, \
{AXIS1, AXIS2}, \
CONFIG); \
\
NVFUSER_BENCHMARK_RUN(TITLE) \
->RangeMultiplier(8) \
->Args({9, 2408}) \
->Args({16, 512}) \
->Args({18, 96}) \
->Args({24, 96}) \
->Args({24, 256}) \
->Args({24, 512}) \
->Args({32, 27}) \
->Args({32, 96}) \
->Args({32, 288}) \
->Args({32, 864}) \
->Args({40, 120}) \
->Args({48, 128}) \
->Args({48, 256}) \
->Args({49, 512}) \
->Args({49, 1024}) \
->Args({49, 2048}) \
->Args({49, 4608}) \
->Args({64, 64}) \
->Args({64, 96}) \
->Args({64, 128}) \
->Args({64, 147}) \
->Args({64, 192}) \
->Args({64, 256}) \
->Args({64, 288}) \
->Args({64, 512}) \
->Args({80, 64}) \
->Args({81, 1728}) \
->Args({83, 1728}) \
->Args({96, 864}) \
->Args({100, 1280}) \
->Args({100, 4032}) \
->Args({120, 40}) \
->Args({128, 128}) \
->Args({128, 512}) \
->Args({128, 1152}) \
->Args({192, 128}) \
->Args({192, 256}) \
->Args({192, 720}) \
->Args({192, 768}) \
->Args({192, 1120}) \
->Args({192, 1728}) \
->Args({196, 256}) \
->Args({196, 512}) \
->Args({196, 1024}) \
->Args({196, 2304}) \
->Args({256, 256}) \
->Args({256, 1024}) \
->Args({256, 2304}) \
->Args({284, 512}) \
->Args({320, 1280}) \
->Args({320, 1728}) \
->Args({324, 2592}) \
->Args({361, 768}) \
->Args({361, 1120}) \
->Args({384, 2}) \
->Args({384, 32}) \
->Args({384, 128}) \
->Args({384, 256}) \
->Args({384, 512}) \
->Args({384, 1280}) \
->Args({384, 2592}) \
->Args({384, 4032}) \
->Args({448, 1280}) \
->Args({480, 16}) \
->Args({480, 256}) \
->Args({512, 2}) \
->Args({512, 16}) \
->Args({512, 128}) \
->Args({512, 256}) \
->Args({512, 1024}) \
->Args({512, 2048}) \
->Args({512, 3072}) \
->Args({512, 4608}) \
->Args({784, 40}) \
->Args({784, 120}) \
->Args({784, 128}) \
->Args({784, 1152}) \
->Args({1001, 2408}) \
->Args({1024, 16}) \
->Args({1024, 256}) \
->Args({1024, 512}) \
->Args({1024, 1024}) \
->Args({1024, 3072}) \
->Args({1369, 192}) \
->Args({1369, 256}) \
->Args({1369, 288}) \
->Args({2048, 512}) \
->Args({2048, 1024}) \
->Args({2250, 27}) \
->Args({3072, 512}) \
->Args({3072, 1024}) \
->Args({3136, 64}) \
->Args({5329, 720}) \
->Args({5625, 64}) \
->Args({12544, 147}) \
->Args({22201, 288}) \
->Unit(benchmark::kMicrosecond)
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_2D_01_Axis,
DataType::Float,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_3D_02_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Inner_3D_12_Axis,
DataType::Float,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp32_Outer_3D_01_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_2D_01_Axis,
DataType::Half,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_3D_02_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Inner_3D_12_Axis,
DataType::Half,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_SQUARE_RUN(
NF_Transpose_Random_fp16_Outer_3D_01_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
NVFUSER_BENCHMARK_DEFINE( \
TITLE, \
setupTranspose, \
NvFuserScheduler_Transpose, \
DTYPE, \
NUM_DIMS, \
{AXIS1, AXIS2}, \
CONFIG); \
\
NVFUSER_BENCHMARK_RUN(TITLE) \
->RangeMultiplier(8) \
->Ranges({{2, 256 * 256}, {160, 320}}) \
      ->Unit(benchmark::kMicrosecond)
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_2D_01_Axis,
DataType::Float,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_3D_02_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Inner_3D_12_Axis,
DataType::Float,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp32_Outer_3D_01_Axis,
DataType::Float,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_2D_01_Axis,
DataType::Half,
2 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_3D_02_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Inner_3D_12_Axis,
DataType::Half,
3 /* num_dims */,
1 /* axis1 */,
2 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
NVFUSER_TRANSPOSE_RUN(
NF_Transpose_fp16_Outer_3D_01_Axis,
DataType::Half,
3 /* num_dims */,
0 /* axis1 */,
1 /* axis2 */,
TransposeConfig(TRANSPOSE_CONFIG));
//------------------------------------------------------------------------------
static void Baseline_Transpose(
benchmark::State& benchmark_state,
DataType dtype,
int num_dims,
std::pair<int, int> axes,
TransposeConfig tc) {
auto aten_inputs = generateInputs(
dtype,
num_dims,
axes,
benchmark_state.range(0),
benchmark_state.range(1),
tc.input1_transpose_axes,
tc.input2_transpose_axes);
auto at_input1 = aten_inputs[0];
auto at_input2 = aten_inputs[1];
auto optionalTransposeAten = [&axes](at::Tensor x, bool is_transpose) {
return (is_transpose) ? at::transpose(x, axes.first, axes.second) : x;
};
for (auto _ : benchmark_state) {
clearL2Cache();
CudaKernelTimer timer;
auto at_ot_input1 =
optionalTransposeAten(at_input1, tc.input1_transpose_axes);
auto at_ot_input2 =
optionalTransposeAten(at_input2, tc.input2_transpose_axes);
auto at_intermediate = add(at_ot_input1, at_ot_input2);
auto at_ot_intermediate =
optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes);
auto at_output = relu(at_ot_intermediate);
auto at_ot_output =
optionalTransposeAten(at_output, tc.output_transpose_axes);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
  // Sync everything up before we're finished; we don't want to run ahead on
  // the CPU while benchmarking.
cudaDeviceSynchronize();
benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
(at_input1.numel() * 3 * int64_t(dataTypeSize(dtype))));
}
//------------------------------------------------------------------------------
static void Baseline_Transpose_fp32_Inner_2D_01_Axis(
benchmark::State& benchmark_state) {
Baseline_Transpose(
benchmark_state,
DataType::Float,
2 /* num_dims */,
{0, 1} /* axes */,
TRANSPOSE_CONFIG);
}
static void Baseline_Transpose_fp16_Inner_2D_01_Axis(
benchmark::State& benchmark_state) {
Baseline_Transpose(
benchmark_state,
DataType::Half,
2 /* num_dims */,
{0, 1} /* axes */,
TRANSPOSE_CONFIG);
}
//------------------------------------------------------------------------------
BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis)
// ->RangeMultiplier(2)
->Ranges({{2, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis)
// ->RangeMultiplier(2)
->Ranges({{2, 1024 * 1024}, {160, 320}})
->Unit(benchmark::kMicrosecond)
->UseManualTime();
//------------------------------------------------------------------------------

View File

@ -1,228 +0,0 @@
#include <benchmarks/cpp/nvfuser/utils.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <sstream>
using namespace torch::jit::fuser::cuda;
std::string toString(const ReductionParams& rparams) {
std::stringstream ss;
ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
<< (rparams.persistent_kernel ? "Persistent Kernel // " : "")
<< (rparams.project_persistent_buffers ? "Project Persistent Buffers // "
: "");
if (rparams.schedule_3D) {
ss << "3D Schedule // "
<< "Outer Reduction: "
<< (rparams.cross_block_outer_reduction ? "cross block / " : "")
<< (rparams.cross_grid_outer_reduction ? "cross grid / " : "")
<< (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : "");
if (rparams.batches_per_block_outer_reduction > 1 ||
rparams.persistent_kernel) {
ss << "persistent batch - " << rparams.batches_per_block_outer_reduction
<< " / ";
}
}
ss << " // Iteration Domain: "
<< (rparams.multiple_reds_per_blk ? "multiple reductions per block / "
: "")
<< (rparams.split_grid_dim_iter_dom ? "split grid dimension / " : "")
<< (rparams.vectorize_iter_dom ? "vectorize / " : "")
<< (rparams.unroll_factor_iter_dom > 1 && !rparams.vectorize_iter_dom
? "unroll / "
: "");
if (rparams.unroll_factor_iter_dom > 1 || rparams.vectorize_iter_dom) {
ss << "factor " << rparams.unroll_factor_iter_dom;
}
ss << " // Inner Reduction Domain: "
<< (rparams.cross_block_inner_reduction ? "cross block reduction / " : "")
<< (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "")
<< (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : "");
if (rparams.batches_per_block_inner_reduction > 1 ||
rparams.persistent_kernel) {
ss << "persistent batch - " << rparams.batches_per_block_inner_reduction
<< " / ";
}
ss << (rparams.cross_grid_inner_reduction &&
rparams.split_grid_dim_inner_reduction
? "split grid dimension / "
: "")
<< (rparams.vectorize_inner_reduction ? "vectorize / " : "")
<< (rparams.unroll_factor_inner_reduction > 1 &&
!rparams.vectorize_inner_reduction
? "unroll / "
: "");
if (rparams.unroll_factor_inner_reduction > 1 ||
rparams.vectorize_inner_reduction) {
ss << "factor " << rparams.unroll_factor_inner_reduction;
}
return ss.str();
}
std::string toString(const PointwiseParams& params) {
std::stringstream ss;
if (params.break_point) {
ss << "2D Schedule at " << params.break_point << "/";
if (params.split_block) {
ss << " Split block into y-dim/";
}
if (params.split_grid_y_dim) {
ss << " Split y grid dim/";
}
} else {
ss << "1D"
<< "/";
}
if (params.unroll_factor > 1) {
if (params.vectorize) {
ss << "Vectorize, Factor: " << params.unroll_factor;
} else {
ss << "Unroll, Factor: " << params.unroll_factor;
}
}
return ss.str();
}
std::string toString(const TransposeParams& params) {
std::stringstream ss;
ss << "Tile size: (" << params.tile_size1 << "," << params.tile_size2
<< ")/";
ss << "Vectorize size: (" << params.vectorize_factor1 << ","
<< params.vectorize_factor2 << ")";
return ss.str();
}
std::string toString(const std::shared_ptr<HeuristicParams>& params) {
auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
if (rparams) {
return toString(*rparams);
}
auto pparams = std::dynamic_pointer_cast<PointwiseParams>(params);
if (pparams) {
return toString(*pparams);
}
auto tparams = std::dynamic_pointer_cast<TransposeParams>(params);
if (tparams) {
return toString(*tparams);
}
TORCH_INTERNAL_ASSERT(
false,
"Unknown heuristic parameter type. Did you just added a new heuristic parameter type but forget to update here?");
}
std::string toString(LaunchParams lparams) {
std::stringstream ss;
ss << "/Launch_Parameters["
<< "block(" << lparams.bdimz() << "/" << lparams.bdimy() << "/"
<< lparams.bdimx() << ")/grid(" << lparams.gdimz() << "/"
<< lparams.gdimy() << "/" << lparams.gdimx() << ")/" << lparams.smem()
<< "]";
return ss.str();
}
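// Rough L2 flush (a sketch of the intent, not a guarantee): allocating a float
// tensor the size of the device's L2 and cloning it forces the cache contents
// to be replaced, so each timed iteration starts cold.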
void clearL2Cache() {
torch::NoGradGuard no_grad;
auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize;
auto options =
torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0);
auto l2_elems = l2_cache_size / 4;
torch::Tensor t0 = torch::empty(l2_elems, options);
torch::Tensor t1 = torch::clone(t0);
};
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype) {
return TensorViewBuilder().ndims(ndims).dtype(dtype).build();
}
TensorView* makeContigTensor(size_t ndims, DataType dtype) {
return TensorViewBuilder()
.ndims(ndims)
.dtype(dtype)
.contiguity(std::vector<bool>(ndims, true))
.build();
}
TensorView* makeConcreteTensor(std::vector<int64_t> shape, DataType dtype) {
return TensorViewBuilder().shape(shape).dtype(dtype).build();
}
TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype) {
return TensorViewBuilder()
.shape(shape)
.dtype(dtype)
.contiguity(std::vector<bool>(shape.size(), true))
.build();
}
void runBenchmarkIterations(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
std::vector<c10::IValue>& aten_inputs) {
fusion_executor_cache->runFusionWithInputs(aten_inputs);
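  // The first run above compiles and schedules the fusion, so we can now check
  // whether the scheduler split it into more than one kernel.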
bool segmented =
fusion_executor_cache->getMostRecentKernelRuntime()->isSegmented() &&
fusion_executor_cache->getMostRecentKernelRuntime()
->fusionSegments()
->groups()
.size() > 1;
if (!segmented) {
fusion_executor_cache->profile(true);
fusion_executor_cache->runFusionWithInputs(aten_inputs);
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
auto executor_instance = compile_log.fusion_executor;
auto params = toString(compile_log.params);
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
benchmark_state.SetLabel(params + lparams);
executor_instance->setMeasureKernelTimeFlag(true);
// Sync everything up before we start
C10_CUDA_CHECK(cudaDeviceSynchronize());
for (auto _ : benchmark_state) {
clearL2Cache();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(
executor_instance->kernelTimeMs() / 1000.0);
}
    // Sync everything up before we're finished; we don't want to run ahead on
    // the CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
} else {
// Segmented
// Sync everything up before we start
{
// Compile/warmup
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
}
C10_CUDA_CHECK(cudaDeviceSynchronize());
CudaKernelTimer timer;
for (auto _ : benchmark_state) {
clearL2Cache();
timer.restart();
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
}
    // Sync everything up before we're finished; we don't want to run ahead on
    // the CPU while benchmarking.
C10_CUDA_CHECK(cudaDeviceSynchronize());
}
}
namespace executorCache {
thread_local ExecutorMap executor_map_;
ExecutorMap& getGlobalMap() {
return executor_map_;
}
} // namespace executorCache

View File

@ -1,204 +0,0 @@
#pragma once
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <benchmark/benchmark.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
using namespace torch::jit::fuser::cuda;
// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
// but unknown sizes
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float);
// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
// but unknown sizes. Taken from test_gpu.cpp
TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float);
// Make a non-contiguous tensor of compile-time known sizes
TensorView* makeConcreteTensor(
std::vector<int64_t> shape,
DataType dtype = DataType::Float);
// Make a contiguous tensor of compile-time known sizes
TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype = DataType::Float);
std::string toString(const ReductionParams& rparams);
std::string toString(const PointwiseParams& params);
std::string toString(const TransposeParams& params);
std::string toString(const std::shared_ptr<HeuristicParams>& params);
std::string toString(LaunchParams lparams);
// Run benchmark iterations with provided inputs. If not segmented, report
// kernel time from the runtime, as well as heuristic parameters. If segmented,
// use timers. Make sure to clear the L2 cache between iterations.
void runBenchmarkIterations(
benchmark::State& benchmark_state,
FusionExecutorCache* fusion_executor_cache,
std::vector<c10::IValue>& aten_inputs);
void clearL2Cache();
class CudaKernelTimer {
public:
CudaKernelTimer() {
// Setup
C10_CUDA_CHECK(cudaEventCreate(&start_event));
C10_CUDA_CHECK(cudaEventCreate(&finish_event));
C10_CUDA_CHECK(cudaEventRecord(start_event));
}
~CudaKernelTimer() {
C10_CUDA_IGNORE_ERROR(cudaEventDestroy(start_event));
C10_CUDA_IGNORE_ERROR(cudaEventDestroy(finish_event));
}
void restart() {
C10_CUDA_CHECK(cudaEventRecord(start_event));
}
float elapsed() {
// Record
C10_CUDA_CHECK(cudaEventRecord(finish_event));
C10_CUDA_CHECK(cudaEventSynchronize(start_event));
C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
C10_CUDA_CHECK(
cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
return kernel_time_ms_;
}
private:
// Create
float kernel_time_ms_ = 0;
cudaEvent_t start_event = {};
cudaEvent_t finish_event = {};
};
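// Illustrative sketch (hypothetical helper, not part of the original API):
// timing an arbitrary callable per iteration with the event-based timer above,
// mirroring the segmented path in runBenchmarkIterations.
template <typename Fn>
void timeIterationsExample(benchmark::State& benchmark_state, Fn&& fn) {
  CudaKernelTimer timer;
  for (auto _ : benchmark_state) {
    clearL2Cache(); // start each iteration with a cold L2
    timer.restart();
    fn();
    // elapsed() returns milliseconds; google benchmark expects seconds.
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
  }
  C10_CUDA_CHECK(cudaDeviceSynchronize());
}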
namespace executorCache {
using ExecutorPtr = std::unique_ptr<FusionExecutorCache>;
using ExecutorMap = std::unordered_map<std::string, ExecutorPtr>;
ExecutorMap& getGlobalMap();
} // namespace executorCache
//! Utility to manage FusionExecutorCache instances for
//! all defined benchmarks
class BenchmarkGraph : public benchmark::Fixture {
public:
using SetupFusionFunction = std::function<void(Fusion*)>;
using SetupFusionMap = std::unordered_map<std::string, SetupFusionFunction>;
virtual std::string graphName() = 0;
virtual SetupFusionFunction setupFusion() = 0;
FusionExecutorCache* getExecutorCache() {
auto& executor_ = getExecutorCacheMap()[graphName()];
TORCH_INTERNAL_ASSERT(executor_);
return executor_.get();
}
void SetUp(const ::benchmark::State& state) {
auto& executor_ = getExecutorCacheMap()[graphName()];
// Makes sure same graph hasn't been compiled before
if (!executor_) {
auto fusion_ptr = std::make_unique<Fusion>();
      FusionGuard fg(fusion_ptr.get());
setupFusion()(fusion_ptr.get());
getExecutorCacheMap()[graphName()] =
std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
}
}
void TearDown(const ::benchmark::State& state) {}
protected:
static executorCache::ExecutorMap& getExecutorCacheMap() {
return executorCache::getGlobalMap();
}
};
#define NVFUSER_TO_STRING_HELPER(n) std::string(#n)
#define NVFUSER_TO_STRING(n) NVFUSER_TO_STRING_HELPER(n)
//! NVFUSER_BENCHMARK_RUN utility usage:
//! This utility helps create and manage FusionExecutorCaches and tries to use
//! the caching
//! mechanism in NVFuser to avoid re-compilation.
//!
//! There are two macros in this utility: NVFUSER_BENCHMARK_DEFINE, and
//! NVFUSER_BENCHMARK_RUN,
//! and the user needs to supply two functions, SETUP_FUSION and RUN_FUSION, with
//! following signatures:
//!
//! SETUP_FUSION(Fusion* , args...);
//! RUN_FUSION(benchmark::State&, FusionExecutorCache* , args...);
//!
//! where args... are additional arguments, and they need to be the same for
//! SETUP_FUSION and RUN_FUSION.
//!
//! SETUP_FUSION is called once in each benchmark definition to build the
//! fusion IR graph.
//!
//! RUN_FUSION is just like the normal benchmark instance, except that a
//! FusionExecutorCache
//! will be provided for scheduling, running and timing the fusion runs. It is
//! called once in each benchmark instance. For example:
//! NVFUSER_BENCHMARK_RUN(my_benchmark)
//! ->RangeMultiplier(2)
//! ->Ranges({{1, 4}})
//! Calls RUN_FUSION 3 times.
//!
//! To register a benchmark, the API is:
//!
//! NVFUSER_BENCHMARK_DEFINE(my_benchmark,SETUP_FUSION,RUN_FUSION,args...);
//!
//! where my_benchmark is any unique name given for this benchmark,
//! SETUP_FUSION and RUN_FUSION are as described above, and
//! args... is the arg list supplied to both SETUP_FUSION and RUN_FUSION
//!
//! each NVFUSER_BENCHMARK_DEFINE registers a benchmark with a single
//! FusionExecutorCache, i.e. a single fusion graph, and multiple benchmark
//! data points can be registered like:
//!
//! NVFUSER_BENCHMARK_RUN(my_benchmark)
//! ->Ranges({{1,2}});
//!
//! NVFUSER_BENCHMARK_RUN(my_benchmark)
//! ->Ranges({{3,4}});
//!
//! All datapoints will use the same FusionExecutorCache so recompilation is
//! avoided as much as possible.
#define NVFUSER_BENCHMARK_DEFINE( \
BENCHMARK_NAME, SETUP_FUSION, RUN_FUSION, ...) \
class BENCHMARK_NAME##___GRAPH : public BenchmarkGraph { \
public: \
std::string graphName() { \
return NVFUSER_TO_STRING(BENCHMARK_NAME##___GRAPH); \
} \
SetupFusionFunction setupFusion() { \
return [](Fusion* fusion) { SETUP_FUSION(fusion, __VA_ARGS__); }; \
} \
}; \
BENCHMARK_DEFINE_F(BENCHMARK_NAME##___GRAPH, BENCHMARK_NAME) \
(benchmark::State & benchmark_state) { \
RUN_FUSION( \
benchmark_state, \
BENCHMARK_NAME##___GRAPH::getExecutorCache(), \
__VA_ARGS__); \
}
#define NVFUSER_BENCHMARK_RUN(BENCHMARK_NAME) \
BENCHMARK_REGISTER_F(BENCHMARK_NAME##___GRAPH, BENCHMARK_NAME)
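// Illustrative sketch (hypothetical names): a minimal pointwise benchmark
// wired through the two macros above, following the same pattern as the TIMM
// benchmarks earlier in this diff. It is the kind of code that would live in
// one of the benchmark .cpp files rather than in this header. The setup
// function builds the fusion IR once; the run function feeds ATen inputs
// through the shared FusionExecutorCache.
static void setup_example_add(Fusion* fusion, void* null) {
  FusionGuard fg(fusion);
  auto tv0 = makeContigTensor(2, DataType::Float);
  auto tv1 = makeContigTensor(2, DataType::Float);
  fusion->addInput(tv0);
  fusion->addInput(tv1);
  fusion->addOutput(add(tv0, tv1));
}
static void NvFuserScheduler_example_add(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    void* null) {
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn(
      {benchmark_state.range(0), benchmark_state.range(1)}, options);
  auto t1 = at::randn(
      {benchmark_state.range(0), benchmark_state.range(1)}, options);
  std::vector<c10::IValue> aten_inputs({t0, t1});
  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);
}
NVFUSER_BENCHMARK_DEFINE(
    Example_add, setup_example_add, NvFuserScheduler_example_add, nullptr);
NVFUSER_BENCHMARK_RUN(Example_add)
    ->Args({1024, 1024})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();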

File diff suppressed because it is too large

View File

@ -1,676 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <ir_interface_nodes.h>
#include <type.h>
#include <type_promotion.h>
class Val;
/*
 * The operations defined in this header are intended as user-facing functions.
 * Generally, users should not directly instantiate temporary TensorViews; they
 * should instead use the functions below, which will automatically create IR
 * nodes and return a resulting TensorView with correctly tracked shapes.
*/
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Insertion of casting op to dtype, returns new resulting val
TORCH_CUDA_CU_API Val* castOp(DataType dtype, Val* v1);
TORCH_CUDA_CU_API TensorView* castOp(DataType dtype, TensorView* v1);
TORCH_CUDA_CU_API Val* bitCastOp(DataType dtype, Val* v1);
TORCH_CUDA_CU_API TensorView* bitCastOp(DataType dtype, TensorView* v1);
// Perform unary op type and return the output
TORCH_CUDA_CU_API Val* unaryOp(UnaryOpType type, Val* v1);
TORCH_CUDA_CU_API TensorView* unaryOp(UnaryOpType type, TensorView* v1);
TORCH_CUDA_CU_API Val* unaryIsOp(UnaryOpType type, Val* v1);
TORCH_CUDA_CU_API TensorView* unaryIsOp(UnaryOpType type, TensorView* v1);
TORCH_CUDA_CU_API Val* unaryOp(
UnaryOpType type,
Val* v1,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* unaryOp(
UnaryOpType type,
TensorView* v1,
const TypePromotionConfig& config);
// Perform binary op type on v1 and v2 and return a type promoted output.
// Mod, CeilDiv, and LT are considered Int only output operations for now.
TORCH_CUDA_CU_API Val* binaryOp(
BinaryOpType type,
Val* v1,
Val* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
Val* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
Val* v1,
TensorView* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
TensorView* v2,
DataType out_dtype = DataType::Null);
TORCH_CUDA_CU_API Val* binaryOp(
BinaryOpType type,
Val* v1,
Val* v2,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
Val* v2,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
Val* v1,
TensorView* v2,
const TypePromotionConfig& config);
TORCH_CUDA_CU_API TensorView* binaryOp(
BinaryOpType type,
TensorView* v1,
TensorView* v2,
const TypePromotionConfig& config);
// Perform a reduction operation on v1, initial value for reduction is init,
// reduces across axes, and reduction operation defined by BinaryOp.
TORCH_CUDA_CU_API TensorView* reductionOp(
BinaryOpType reduction_op_type,
const std::vector<int>& axes,
Val* init,
TensorView* v1,
bool keep_dim = false,
DataType dtype = DataType::Null);
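// Illustrative sketch (assumes IrBuilder from ir_builder.h is available in the
// including file): a sum over the last axis of a 3D TensorView expressed via
// the generic reductionOp above, with 0.0 as the reduction identity. It is the
// long-hand form of what the benchmarks in this diff write as sum(tv, {2}).
//
//   TensorView* out = reductionOp(
//       BinaryOpType::Add, {2}, IrBuilder::create<Double>(0.0), tv /*3D*/);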
//! Auxiliary struct holding the result of
//! a single Welford op on a TensorView
class TORCH_CUDA_CU_API WelfordResult {
public:
TensorView* avg;
TensorView* var_sum;
TensorView* n;
explicit WelfordResult(
TensorView* in_avg,
TensorView* in_var_sum,
TensorView* in_n);
};
//! Welford operator on specified axes. This is currently the only scan op with
//! multiple outputs that is supported. May consider generalization if more scan
//! ops are added.
TORCH_CUDA_CU_API WelfordResult Welford(
TensorView* tv,
const std::vector<int>& axes,
TensorView* init_avg = nullptr,
TensorView* init_var = nullptr,
    // Initialized to 0 in the function definition; done this way so we don't
    // have to import IrBuilder just for this one interface.
Int* init_N = nullptr);
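// Illustrative sketch (hypothetical helper): per-channel statistics of a
// [N, C] TensorView via the Welford op above.
inline WelfordResult channelStatsExample(TensorView* tv2d /* [N, C] */) {
  // Reduce over axis 0: avg and var_sum are per-channel; var_sum is the sum of
  // squared deviations and must still be divided by n for the variance.
  return Welford(tv2d, {0});
}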
// RNG OPERATIONS
TORCH_CUDA_CU_API TensorView* rand(
const std::vector<Val*>& shape,
DataType dtype);
TORCH_CUDA_CU_API Val* rand_like(Val*);
TORCH_CUDA_CU_API TensorView* rand_like(TensorView*);
TORCH_CUDA_CU_API TensorView* uniform(
const std::vector<Val*>& shape,
Val* low,
Val* high,
DataType dtype);
// TENSOR FACTORIES
TORCH_CUDA_CU_API TensorView* full(
const std::vector<Val*>& shape,
Val* fill_value,
DataType dtype);
TORCH_CUDA_CU_API TensorView* full_like(TensorView* tv, Val* fill_value);
TORCH_CUDA_CU_API Val* full_like(Val* tv, Val* fill_value);
TORCH_CUDA_CU_API TensorView* zeros(
const std::vector<Val*>& shape,
DataType dtype);
TORCH_CUDA_CU_API TensorView* zeros_like(TensorView*);
TORCH_CUDA_CU_API Val* zeros_like(Val*);
TORCH_CUDA_CU_API TensorView* ones(
const std::vector<Val*>& shape,
DataType dtype);
TORCH_CUDA_CU_API TensorView* ones_like(TensorView*);
TORCH_CUDA_CU_API Val* ones_like(Val*);
//! WARNING: giving invalid combinations of the start, end and step
//! arguments can result in undefined behavior. Specifically, the
//! signs of `end - start` and step must be the same.
TORCH_CUDA_CU_API TensorView* arange(Val* end, DataType dtype = DataType::Int);
TORCH_CUDA_CU_API TensorView* arange(
Val* start,
Val* end,
DataType dtype = DataType::Int);
TORCH_CUDA_CU_API TensorView* arange(
Val* start,
Val* end,
Val* step,
DataType dtype = DataType::Int);
TORCH_CUDA_CU_API TensorView* eye(Val* size, DataType dtype);
TORCH_CUDA_CU_API TensorView* eye(Val* rows, Val* cols, DataType dtype);
// UNARY OPERATIONS
// abs
TORCH_CUDA_CU_API Val* abs(Val*);
TORCH_CUDA_CU_API TensorView* abs(TensorView*);
// acos
TORCH_CUDA_CU_API Val* acos(Val*);
TORCH_CUDA_CU_API TensorView* acos(TensorView*);
// asin
TORCH_CUDA_CU_API Val* asin(Val*);
TORCH_CUDA_CU_API TensorView* asin(TensorView*);
// atan
TORCH_CUDA_CU_API Val* atan(Val*);
TORCH_CUDA_CU_API TensorView* atan(TensorView*);
// atanh
TORCH_CUDA_CU_API Val* atanh(Val*);
TORCH_CUDA_CU_API TensorView* atanh(TensorView*);
// ceil
TORCH_CUDA_CU_API Val* ceil(Val*);
TORCH_CUDA_CU_API TensorView* ceil(TensorView*);
// cos
TORCH_CUDA_CU_API Val* cos(Val*);
TORCH_CUDA_CU_API TensorView* cos(TensorView*);
// cosh
TORCH_CUDA_CU_API Val* cosh(Val*);
TORCH_CUDA_CU_API TensorView* cosh(TensorView*);
// exp
TORCH_CUDA_CU_API Val* exp(Val*);
TORCH_CUDA_CU_API TensorView* exp(TensorView*);
// expm1
TORCH_CUDA_CU_API Val* expm1(Val*);
TORCH_CUDA_CU_API TensorView* expm1(TensorView*);
// erf
TORCH_CUDA_CU_API Val* erf(Val*);
TORCH_CUDA_CU_API TensorView* erf(TensorView*);
// erfc
TORCH_CUDA_CU_API Val* erfc(Val*);
TORCH_CUDA_CU_API TensorView* erfc(TensorView*);
// floor
TORCH_CUDA_CU_API Val* floor(Val*);
TORCH_CUDA_CU_API TensorView* floor(TensorView*);
// frac
TORCH_CUDA_CU_API Val* frac(Val*);
TORCH_CUDA_CU_API TensorView* frac(TensorView*);
// silu
TORCH_CUDA_CU_API Val* silu(Val*);
TORCH_CUDA_CU_API TensorView* silu(TensorView*);
// lgamma
TORCH_CUDA_CU_API Val* lgamma(Val*);
TORCH_CUDA_CU_API TensorView* lgamma(TensorView*);
// log
TORCH_CUDA_CU_API Val* log(Val*);
TORCH_CUDA_CU_API TensorView* log(TensorView*);
// log10
TORCH_CUDA_CU_API Val* log10(Val*);
TORCH_CUDA_CU_API TensorView* log10(TensorView*);
// log1p
TORCH_CUDA_CU_API Val* log1p(Val*);
TORCH_CUDA_CU_API TensorView* log1p(TensorView*);
// log2
TORCH_CUDA_CU_API Val* log2(Val*);
TORCH_CUDA_CU_API TensorView* log2(TensorView*);
// neg
TORCH_CUDA_CU_API Val* neg(Val*);
TORCH_CUDA_CU_API TensorView* neg(TensorView*);
// real
TORCH_CUDA_CU_API Val* real(Val*);
TORCH_CUDA_CU_API TensorView* real(TensorView*);
// reciprocal
TORCH_CUDA_CU_API Val* reciprocal(Val*);
TORCH_CUDA_CU_API TensorView* reciprocal(TensorView*);
// relu
TORCH_CUDA_CU_API Val* relu(Val*);
TORCH_CUDA_CU_API TensorView* relu(TensorView*);
// rsqrt
TORCH_CUDA_CU_API Val* rsqrt(Val*);
TORCH_CUDA_CU_API TensorView* rsqrt(TensorView*);
// round
TORCH_CUDA_CU_API Val* round(Val*);
TORCH_CUDA_CU_API TensorView* round(TensorView*);
// set
TORCH_CUDA_CU_API Val* set(Val*);
TORCH_CUDA_CU_API TensorView* set(TensorView*);
// sigmoid
TORCH_CUDA_CU_API Val* sigmoid(Val*);
TORCH_CUDA_CU_API TensorView* sigmoid(TensorView*);
// sin
TORCH_CUDA_CU_API Val* sin(Val*);
TORCH_CUDA_CU_API TensorView* sin(TensorView*);
// sinh
TORCH_CUDA_CU_API Val* sinh(Val*);
TORCH_CUDA_CU_API TensorView* sinh(TensorView*);
// sqrt
TORCH_CUDA_CU_API Val* sqrt(Val*);
TORCH_CUDA_CU_API TensorView* sqrt(TensorView*);
// tan
TORCH_CUDA_CU_API Val* tan(Val*);
TORCH_CUDA_CU_API TensorView* tan(TensorView*);
// tanh
TORCH_CUDA_CU_API Val* tanh(Val*);
TORCH_CUDA_CU_API TensorView* tanh(TensorView*);
// trunc
TORCH_CUDA_CU_API Val* trunc(Val*);
TORCH_CUDA_CU_API TensorView* trunc(TensorView*);
// bitwise_not
TORCH_CUDA_CU_API Val* bitwise_not(Val*);
TORCH_CUDA_CU_API TensorView* bitwise_not(TensorView*);
// imag
TORCH_CUDA_CU_API Val* imag(Val*);
TORCH_CUDA_CU_API TensorView* imag(TensorView*);
// isfinite
TORCH_CUDA_CU_API Val* isfinite(Val*);
TORCH_CUDA_CU_API TensorView* isfinite(TensorView*);
// isinf
TORCH_CUDA_CU_API Val* isinf(Val*);
TORCH_CUDA_CU_API TensorView* isinf(TensorView*);
// isnan
TORCH_CUDA_CU_API Val* isnan(Val*);
TORCH_CUDA_CU_API TensorView* isnan(TensorView*);
// isneginf
TORCH_CUDA_CU_API Val* isneginf(Val*);
TORCH_CUDA_CU_API TensorView* isneginf(TensorView*);
// isposinf
TORCH_CUDA_CU_API Val* isposinf(Val*);
TORCH_CUDA_CU_API TensorView* isposinf(TensorView*);
// isreal
TORCH_CUDA_CU_API Val* isreal(Val*);
TORCH_CUDA_CU_API TensorView* isreal(TensorView*);
// print
TORCH_CUDA_CU_API Val* print(Val*);
TORCH_CUDA_CU_API TensorView* print(TensorView*);
// Broadcasts inp based on a bool vector. The size of the broadcast bool vector
// should be the number of dims desired in the broadcasted tensor. An entry
// should be true if the corresponding output dim is a broadcasted dim, and
// false if it is not. The number of false entries must match the number of
// input dims.
TORCH_CUDA_CU_API TensorView* broadcast(
TensorView* inp,
const std::vector<bool>& is_broadcast_dim);
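// Illustrative sketch (hypothetical helper): broadcasting a 1D bias so it can
// combine with a 3D [N, S, C] activation, as the TIMM benchmarks in this diff
// do with {true, true, false}. One bool per output dim; the single false marks
// the dim that maps to the input's only dim.
inline TensorView* broadcastBiasExample(TensorView* bias1d /* [C] */) {
  return broadcast(bias1d, {true, true, false}); // [1, 1, C]
}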
// Expands input based on provided sizes. expand_sizes should be larger than
// the input's root domain (really rfactor) and will broadcast on inner
// dimensions. expand_sizes should be -1 for any dimension that should remain a
// symbolic size. Dimensions that should remain broadcast after the expand
// should be set to 1. Any dimension being expanded must be marked as a
// broadcast in the input and will be expanded to the provided constant size.
// Any dimension
// that's symbolic in the input but specified as a non -1 value will be set to
// that constant value.
TORCH_CUDA_CU_API TensorView* expand(
TensorView* inp,
const std::vector<Val*>& expanded_sizes);
// Expands input based on other. For dimensions in inp that are broadcast with a
// matching entry in other that's either a broadcast with expanded extent or a
// non broadcasted iter domain, inp will be expanded to other's size.
TORCH_CUDA_CU_API TensorView* expand_as(TensorView* inp, TensorView* other);
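// Illustrative sketch (hypothetical helper): growing a [1, 1, C] broadcast
// tensor to the full shape of a [N, S, C] reference with expand_as above.
// expand with explicit sizes behaves analogously but takes Val* extents,
// with -1 keeping a dimension symbolic, per the comment above.
inline TensorView* expandToReferenceExample(TensorView* bcast, TensorView* ref) {
  return expand_as(bcast, ref);
}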
// BINARY OPERATIONS
// add
TORCH_CUDA_CU_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* add(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* add(TensorView* v1, TensorView* v2);
// atan2
TORCH_CUDA_CU_API Val* atan2(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* atan2(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* atan2(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* atan2(TensorView* v1, TensorView* v2);
// div
TORCH_CUDA_CU_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, TensorView* v2);
// fmod
TORCH_CUDA_CU_API Val* fmod(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* fmod(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* fmod(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* fmod(TensorView* v1, TensorView* v2);
// mul
TORCH_CUDA_CU_API Val* mul(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mul(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mul(TensorView* v1, TensorView* v2);
// pow
TORCH_CUDA_CU_API Val* pow(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* pow(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* pow(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* pow(TensorView* v1, TensorView* v2);
// remainder
TORCH_CUDA_CU_API Val* remainder(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* remainder(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* remainder(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* remainder(TensorView* v1, TensorView* v2);
// sub
TORCH_CUDA_CU_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, TensorView* v2);
// Integer binary ops
// mod
TORCH_CUDA_CU_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* mod(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* mod(TensorView* v1, TensorView* v2);
// ceilDiv
TORCH_CUDA_CU_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ceilDiv(TensorView* v1, TensorView* v2);
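// For example, ceilDiv(10, 4) evaluates to 3: the quotient is rounded up
// rather than truncated.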
// Bitwise binary ops
// bitwise_and
TORCH_CUDA_CU_API Val* bitwise_and(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_and(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_and(TensorView* v1, TensorView* v2);
// bitwise_left_shift
TORCH_CUDA_CU_API Val* bitwise_left_shift(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_left_shift(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_left_shift(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_left_shift(
TensorView* v1,
TensorView* v2);
// bitwise_right_shift
TORCH_CUDA_CU_API Val* bitwise_right_shift(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_right_shift(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_right_shift(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_right_shift(
TensorView* v1,
TensorView* v2);
// bitwise_or
TORCH_CUDA_CU_API Val* bitwise_or(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_or(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_or(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_or(TensorView* v1, TensorView* v2);
// bitwise_xor
TORCH_CUDA_CU_API Val* bitwise_xor(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* bitwise_xor(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* bitwise_xor(TensorView* v1, TensorView* v2);
// Logical binary ops
// eq
TORCH_CUDA_CU_API Val* eq(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* eq(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* eq(TensorView* v1, TensorView* v2);
// ge
TORCH_CUDA_CU_API Val* ge(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ge(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ge(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ge(TensorView* v1, TensorView* v2);
// gt
TORCH_CUDA_CU_API Val* gt(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* gt(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* gt(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* gt(TensorView* v1, TensorView* v2);
// le
TORCH_CUDA_CU_API Val* le(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* le(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* le(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* le(TensorView* v1, TensorView* v2);
// lt
TORCH_CUDA_CU_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* lt(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* lt(TensorView* v1, TensorView* v2);
// ne
TORCH_CUDA_CU_API Val* ne(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* ne(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* ne(TensorView* v1, TensorView* v2);
// REDUCTION OPERATIONS
TORCH_CUDA_CU_API TensorView* sum(
TensorView* v1,
const std::vector<int>& reduction_axes,
bool keep_dim = false,
DataType dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* max(
TensorView* v1,
const std::vector<int>& reduction_axes,
bool keep_dim = false,
DataType dtype = DataType::Null);
TORCH_CUDA_CU_API TensorView* min(
TensorView* v1,
const std::vector<int>& reduction_axes,
bool keep_dim = false,
DataType dtype = DataType::Null);
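// Usage sketch (hedged): tv0 is assumed to be an existing 2-D input in the
// active fusion.
//   // Row-wise sum, keeping the reduced axis as a size-1 broadcast:
//   // [I0, I1] -> [I0, B(1)]
//   TensorView* tv1 = sum(tv0, {1}, /*keep_dim=*/true);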
// COMPOUND OPERATIONS
// add_alpha
TORCH_CUDA_CU_API Val* add_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* add_alpha(TensorView* v1, TensorView* v2, Val* s);
// sub_alpha
TORCH_CUDA_CU_API Val* sub_alpha(Val* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, Val* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(Val* v1, TensorView* v2, Val* s);
TORCH_CUDA_CU_API TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* s);
// lerp
TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(TensorView* start, Val* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, TensorView* end, Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(Val* start, Val* end, TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
Val* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
Val* end,
TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
Val* start,
TensorView* end,
TensorView* weight);
TORCH_CUDA_CU_API TensorView* lerp(
TensorView* start,
TensorView* end,
TensorView* weight);
// addcmul
TORCH_CUDA_CU_API Val* addcmul(Val* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(TensorView* v1, Val* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, TensorView* v2, Val* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(Val* v1, Val* v2, TensorView* v3, Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
Val* v3,
Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
Val* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
Val* v1,
TensorView* v2,
TensorView* v3,
Val* s);
TORCH_CUDA_CU_API TensorView* addcmul(
TensorView* v1,
TensorView* v2,
TensorView* v3,
Val* s);
// TERNARY OPERATIONS
// where
TORCH_CUDA_CU_API Val* where(Val* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* where(TensorView* c, Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(Val* c, TensorView* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* where(
TensorView* c,
TensorView* v1,
TensorView* v2);
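// Usage sketch (hedged): an elementwise max built from the comparison and
// selection ops declared above; tv0 and tv1 are assumed inputs of equal shape.
//   TensorView* mask = gt(tv0, tv1);
//   TensorView* tv_max = where(mask, tv0, tv1);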
// threshold
TORCH_CUDA_CU_API Val* threshold(Val* in, Val* thresh, Val* value);
TORCH_CUDA_CU_API TensorView* threshold(
TensorView* in,
Val* thresh,
Val* value);
// clamp
TORCH_CUDA_CU_API Val* clamp(Val* in, Val* min_val, Val* max_val);
TORCH_CUDA_CU_API TensorView* clamp(TensorView* in, Val* min_val, Val* max_val);
//! Internal operator for supporting backward graphs
//!
//! example:
//! v1 = T1 [I0(10),I1(20),I2(30),I3(40)]
//! v2 = sum_to(v1,{30,1}) ------> v2 = T2[I2,R3 (keep_dim)]
//!
//! This operator will return v1* directly if the sizes of the v1 root domain
//! are already the same as shape.
//!
//! The name of sum_to is different from NV fuser naming;
//! this is to align with the operator name of at::sum_to.
TORCH_CUDA_CU_API TensorView* sum_to(
TensorView* v1,
const std::vector<Int*>& sum_to_size);
TORCH_CUDA_CU_API TensorView* sum_to(
TensorView* v1,
const std::vector<int64_t>& sum_to_size);
//! Shift a tensor to a direction specified by offsets.
//!
//! Example:
//! t0: 2D tensor of size N by M
//! t1 = shift(t0, {1, -1});
//!
//! then:
//! t1[i, j] = t0[i-1, j+1] for 1 <= i < N and 0 <= j < M-1.
//! t1[i, j] = 0, otherwise
//!
//! The pad option controls how out-of-boundary accesses are
//! handled. It specifies how many zeros are logically padded. If no
//! pad option is given, it automatically pads the input tensor so
//! that the output tensor has the same extent for each axis.
//!
//! When a padding value is smaller than the absolute value of a shift
//! offset, the output axis still has the same extent but its start or
//! stop offset is moved inward to signify those outside of the offset
//! are invalid.
//!
//! It is not allowed to use padding values that are larger than shift
//! offsets, which would mean output extents would be larger than
//! input extents.
TORCH_CUDA_CU_API TensorView* shift(
TensorView* inp,
const std::vector<int>& offsets,
const std::vector<int>& pad_width = {});
TORCH_CUDA_CU_API TensorView* shift(
TensorView* inp,
const std::vector<int>& offsets,
bool pad);
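// Usage sketch (hedged), following the example above: shift down by one row
// and left by one column without padding, so the out-of-bounds region is
// marked invalid (start/stop offsets moved inward) rather than zero-filled.
//   TensorView* t1 = shift(t0, {1, -1}, /*pad=*/false);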
//! Gather a window of nearby elements for each element.
//!
//! Each window of size window_shape is stored as an additional
//! innermost domain, meaning that the number of dimensions of the
//! output tensor doubles. The pad_width parameter specifies the
//! padding width of each side of each axis. The strides parameter
//! specifies striding of the operation. Non-unit striding is
//! implemented with strided split, whose outer output domain becomes
//! the root domain for subsequent consumers. The inner output domain
//! becomes a Stride domain, which is ignored by subsequent consumers.
//! Only valid input ranges are fed into strided splits.
//!
//! When trim_out_of_bounds is true, the values at the first and last
//! ends that are outside of the start and stop offsets are
//! effectively trimmed by partial split by 1.
//!
//! Example 1:
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {1, 3}, {{0, 0}, {1, 1}});
//!
//! then:
//! t1: [N, M, 1, 3]
//! t1[i, j, k, l] = The value at the window position of [k, l]
//! for t0[i, j]
//!
//! Example 2.1 (without trimming):
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}});
//!
//! then:
//! t1: [N (stop offset: 1), M (stop offset: 1), 2, 2]
//!
//! Example 2.2 (with trimming):
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}, true);
//!
//! then:
//! t1: [ceilDiv(N - 1, 1), ceilDiv(M - 1, 1), 2, 2]
//!
//! Example 3:
//! t0: 2D tensor of [N, M]
//! t1 = gather(t0, {3, 3}, {{0, 0}, {0, 0}}, {3, 3});
//!
//! then:
//! t1: [ceilDiv(N - 2, 3), ceilDiv(M - 2, 3), 3, 3]
//!
TORCH_CUDA_CU_API TensorView* gather(
TensorView* inp,
const std::vector<int>& window_shape,
const std::vector<std::vector<int>>& pad_width,
const std::vector<int>& strides = {},
bool trim_out_of_bounds = false);
// Append a new IterDomain to the end of a TensorView to allow
// iterating on a vector type. The input tensor must have
// vector dtype.
TORCH_CUDA_CU_API TensorView* viewAsScalar(TensorView* inp);
//! A fused pointwise multiply and sum
//! operator that instantiates the following
//! fused pattern:
//! c = mul(tv_a, tv_b);
//! return sum(c, axes)
//!
//! \param tv_a first multiply operand
//! \param tv_b second multiply operand
//! \param axes axes to sum over
//! \param init sum initial value
//!
//! Note & TODO:
//! currently this interface only supports lowering to an mma op
//! and only supports fp16 inputs. Converting back to multiply
//! and reduce will be supported in a follow-up.
TORCH_CUDA_CU_API TensorView* fusedMultiplySum(
TensorView* tv_a,
TensorView* tv_b,
const std::vector<int>& axes,
Val* init = nullptr);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,23 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <kernel.h>
#include <string>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace codegen {
//! Generates a CUDA kernel definition for the given kernel
TORCH_CUDA_CU_API std::string generateCudaKernel(
const kir::Kernel* kernel,
const std::string& kernel_name = "CUDAGeneratedKernel");
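// Usage sketch (hedged): kernel is assumed to be a lowered kir::Kernel*
// obtained from the lowering pipeline, which is outside this header.
//   std::string cuda_src =
//       codegen::generateCudaKernel(kernel, "my_generated_kernel");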
} // namespace codegen
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,277 +0,0 @@
#include <compute_at.h>
#include <instrumentation.h>
#include <ir_all_nodes.h>
#include <ir_iostream.h>
#include <ir_utils.h>
#include <lower_utils.h>
#include <root_domain_map.h>
#include <transform_iter.h>
#include <c10/util/irange.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Simple selector that only propagates across tensor views in the provided
// unordered_set. Will also propagate to all consumers of those tensors, and the
// siblings of those tensors.
class ComputeAtSelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> selected_;
public:
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return selected_.count(to) > 0;
}
virtual bool allowP2C(TensorView* from, TensorView* to) override {
// If the producer is in the selected set, then the consumer must also be
// replayed to obtain a compatible loop structure so that this producer
// can be consumed in this loop.
return selected_.count(from) > 0 || selected_.count(to) > 0;
}
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
ComputeAtSelector(std::unordered_set<TensorView*> selected)
: selected_(std::move(selected)) {}
const std::unordered_set<TensorView*>& selected() const {
return selected_;
}
};
namespace {
// Wrapper around set_intersection
template <typename T>
std::set<T> set_intersection(const std::set<T>& set1, const std::set<T>& set2) {
std::set<T> intersection;
std::set_intersection(
set1.begin(),
set1.end(),
set2.begin(),
set2.end(),
std::inserter(intersection, intersection.begin()));
return intersection;
}
std::deque<std::deque<TensorView*>> tvChains(
std::deque<std::deque<Val*>> val_chains) {
std::deque<std::deque<TensorView*>> tv_chains(val_chains.size());
for (const auto i : c10::irange(val_chains.size())) {
auto tv_iterable = ir_utils::filterByType<TensorView>(val_chains[i]);
tv_chains[i] =
std::deque<TensorView*>(tv_iterable.begin(), tv_iterable.end());
}
return tv_chains;
}
std::unordered_set<TensorView*> getAllTVsBetween(
TensorView* producer,
TensorView* consumer) {
TORCH_CHECK(
DependencyCheck::isDependencyOf(producer, consumer),
"Compute At expects ",
producer->name(),
" is a dependency of ",
consumer->name(),
", however it is not.");
auto between_vals =
DependencyCheck::getAllValsBetween({producer}, {consumer});
auto between_tvs = ir_utils::filterByType<TensorView>(between_vals);
std::unordered_set<TensorView*> result(
between_tvs.begin(), between_tvs.end());
result.erase(consumer);
return result;
}
TensorView* getCommonConsumer(TensorView* producer, TensorView* consumer) {
FUSER_PERF_SCOPE("ComputeAt::setCommonConsumer");
auto producer_use_chains_ =
tvChains(DependencyCheck::getAllUseChains(producer));
// Convert the first chain to a set.
std::set<TensorView*> common_consumers(
producer_use_chains_.front().begin(), producer_use_chains_.front().end());
// Run through all use chains of producer, and intersect them to find common
// TVs
for (auto tv_chain : producer_use_chains_) {
common_consumers = set_intersection(
common_consumers,
std::set<TensorView*>(tv_chain.begin(), tv_chain.end()));
}
auto all_chains =
tvChains(DependencyCheck::getAllDependencyChains(producer, consumer));
// Right now we only support compute at if at some point in the graph consumer
// is dependent on producer.
TORCH_CHECK(
!all_chains.empty(),
"Compute At expects ",
producer->name(),
" is a dependency of ",
consumer->name(),
", however it is not.");
// Remove all TVs from producer to consumer as common consumer must be at or
// after consumer
for (const auto& tv_chain : all_chains) {
for (auto tv : tv_chain) {
if (tv != consumer)
common_consumers.erase(tv);
}
}
// If there is a common consumer, grab the first one at or after consumer
TensorView* common_consumer = nullptr;
if (!common_consumers.empty()) {
for (auto tv : producer_use_chains_.front()) {
if (common_consumers.find(tv) != common_consumers.end()) {
common_consumer = tv;
break;
}
}
TORCH_INTERNAL_ASSERT(
common_consumer != nullptr,
"Hit a logical inconsistency in the computeAt pass.");
}
return common_consumer;
}
void pullInSiblings(std::unordered_set<TensorView*>& s) {
for (auto tv : s) {
for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
if (sibling_tv == tv) {
continue;
}
s.emplace(sibling_tv);
}
}
}
// I am just trying to get the same set of tensors being transformed matching
// the previous behavior of ComputeAt. The algorithm to compute this set is
// horrible, but I don't care because I will eventually completely remove
// ComputeAt, and this algorithm is not worse than the previous ComputeAt. :)
std::unordered_set<TensorView*> getPropagationSubgraph(
TensorView* producer,
TensorView* consumer) {
TORCH_CHECK(
DependencyCheck::isDependencyOf(producer, consumer),
"Compute At expects ",
producer->name(),
" is a dependency of ",
consumer->name(),
", however it is not.");
TensorView* common_consumer = getCommonConsumer(producer, consumer);
if (common_consumer != nullptr) {
auto result = getAllTVsBetween(producer, common_consumer);
pullInSiblings(result);
return result;
}
auto result_vals = DependencyCheck::getAllDependentVals({producer});
result_vals.emplace(producer);
auto result_tvs = ir_utils::filterByType<TensorView>(result_vals);
std::unordered_set<TensorView*> result;
std::copy_if(
result_tvs.begin(),
result_tvs.end(),
std::inserter(result, result.begin()),
[](TensorView* tv) { return !tv->uses().empty(); });
pullInSiblings(result);
return result;
}
} // namespace
void ComputeAt::runAt(
TensorView* producer,
TensorView* consumer,
int64_t consumer_position,
ComputeAtMode mode) {
FUSER_PERF_SCOPE("ComputeAt::runAt");
// Make sure the correct fusion is set up between this and consumer.
TORCH_CHECK(
producer->fusion() == consumer->fusion(),
producer,
" and ",
consumer,
" are not in the same fusion.");
if (mode == ComputeAtMode::MostInlined) {
consumer_position = -1;
}
FusionGuard fg(producer->fusion());
auto selected = getPropagationSubgraph(producer, consumer);
ComputeAtSelector selector(selected);
MaxRootDomainInfoSpanningTree path(consumer, consumer_position, &selector);
if (mode == ComputeAtMode::MostInlined) {
MostInlinedTransformPropagator propagator;
path.traverse(&propagator);
inlineMost(selected);
} else {
TransformPropagator propagator(consumer, consumer_position);
path.traverse(&propagator);
inlineSelectedAt(
selected,
consumer,
consumer_position,
mode == ComputeAtMode::BestEffort);
}
}
void ComputeAt::runWith(
TensorView* producer,
TensorView* consumer,
int64_t producer_position,
ComputeAtMode mode) {
FUSER_PERF_SCOPE("ComputeAt::runWith");
// Make sure the correct fusion is set up between this and consumer.
TORCH_CHECK(
producer->fusion() == consumer->fusion(),
producer,
" and ",
consumer,
" are not in the same fusion.");
if (mode == ComputeAtMode::MostInlined) {
producer_position = -1;
}
FusionGuard fg(producer->fusion());
auto selected = getPropagationSubgraph(producer, consumer);
ComputeAtSelector selector(selected);
MaxRootDomainInfoSpanningTree path(producer, producer_position, &selector);
if (mode == ComputeAtMode::MostInlined) {
MostInlinedTransformPropagator propagator;
path.traverse(&propagator);
inlineMost(selected);
} else {
TransformPropagator propagator(producer, producer_position);
path.traverse(&propagator);
inlineSelectedAt(
selected,
producer,
producer_position,
mode == ComputeAtMode::BestEffort);
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,45 +0,0 @@
#pragma once
#include <inlining.h>
#include <root_domain_map.h>
#include <transform_replay.h>
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <deque>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class TensorDomain;
class TensorView;
struct ComputeAt {
public:
// Runs the compute at pass making producer look like consumer, computing
// producer relative to consumer
static void runAt(
TensorView* producer,
TensorView* consumer,
int64_t consumer_position,
ComputeAtMode mode = ComputeAtMode::Standard);
// Runs the compute with pass making consumer look like producer, computing
// producer relative to consumer
static void runWith(
TensorView* producer,
TensorView* consumer,
int64_t producer_position,
ComputeAtMode mode = ComputeAtMode::Standard);
};
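// Usage sketch (hedged): tv_producer and tv_consumer are assumed to be
// TensorViews in the same fusion, with the producer feeding the consumer.
//   // Inline the producer into the consumer's loop nest at position 1,
//   // using the default Standard mode.
//   ComputeAt::runAt(tv_producer, tv_consumer, 1);
//   // Or inline as far as possible:
//   ComputeAt::runAt(tv_producer, tv_consumer, -1, ComputeAtMode::MostInlined);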
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,264 +0,0 @@
#pragma once
#include <disjoint_set.h>
#include <ir_all_nodes.h>
#include <kernel_ir.h>
#include <lower_trivial_reductions.h>
#include <deque>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// There are three modes of these iter domain mappings, all uniquely important in
// the lowering process.
//
// For EXACT/PERMISSIVE mode consider:
//
// consumer[i0, b1] = producer[i0]
// consumer->merge(0) (consumer will now be [i0 * b1])
// When producer is replayed as consumer (the direction we use for mapping)
// with BestEffortReplay forward_bcast_mismatch = True the producer to
// consumer map will have both a mapping of consumer(i0) to producer(i0) as
// well as consumer(i0*b1) to producer(i0). This latter mapping is important
// for loop nest mappings as the consumer will generate a loop based on i0*b1
// and the producer may be computeAt inside this loop nest. However, for
// indexing we do not want these two maps as producer may be indexed as i0*i1
// depending on the loop nest structure and how it was built. Therefore we
// really need to carry (at least) two sets of maps around for lowering.
//
// LOOP mode is important if we have something like:
// consumer[i0o, threadIdx.x{i0i}] = producer[i0o, threadIdx.y{i0i}](computeAt
// = 1) which can easily happen when using shared memory. We want to make sure
// that the iteration domain used for loop construction (concreteId) has the
// proper parallelization strategy. In parallel mode we do typical iteration
// domain mapping, however we remove from it any iteration domains outside the
// computeAt of producer when mapping. This guarantees we won't map
// IterDomains that could have different parallelization strategies. We also
// propagate the parallel strategy in parallel mode so all mapped IDs that
// must have the same parallel type, do.
//
// IdMappingMode::LOOP
// Only maps leaf axes to left of compute at
// Forward broadcast axes in replay
// IdMappingMode::PERMISSIVE
// Forward broadcast axes in replay
// Map all iteration domains
// Always contain root mappings (otherwise they could have been forwarded in
// broadcast)
// IdMappingMode::EXACT
// Don't map any broadcast axes to non-broadcast axes
// Do not forward through any broadcast IDs
class TORCH_CUDA_CU_API IterDomainGraph {
public:
IterDomainGraph(Fusion* fusion, bool allow_self_mapping = false);
const DisjointSets<IterDomain*>& permissiveNodes() const {
return permissive_nodes_;
}
const DisjointSets<IterDomain*>& exactNodes() const {
return exact_nodes_;
}
const DisjointSets<IterDomain*>& loopNodes() const {
return loop_nodes_;
}
// Consumers and producers are not symmetric like the other sets
const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
consumers() const {
return consumers_;
}
const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
producers() const {
return producers_;
}
const DisjointSets<IterDomain*>& siblings() const {
return sibling_sets_;
}
const VectorOfUniqueEntries<IterDomain*>& allIds() const {
return all_ids_;
}
const std::unordered_set<IterDomain*>& viewRfactorIds() const {
return view_rfactor_ids_;
}
// Returns if first and second are expressions through which the provided
// id_map have matching inputs (if forward), or outputs (if not forward).
// Returning true means the expressions are "the same", in that they modify
// matching original extents by the same amount.
static bool exprsMap(
Expr* first,
Expr* second,
bool forward,
const DisjointSets<IterDomain*>& id_map);
bool hasSelfMapping() const {
return self_mapping_info_.has_value();
}
private:
void build(Fusion* fusion);
void initializeId(IterDomain* id, bool is_view_rfactor_id, bool is_leaf_id);
// Checks exprsMap; if the expressions match, maps the outputs (if forward) or
// the inputs (if not) in the exact and permissive maps.
void mapThroughExpr(Expr* first, Expr* second, bool forward);
DisjointSets<IterDomain*> permissive_nodes_;
DisjointSets<IterDomain*> exact_nodes_;
DisjointSets<IterDomain*> loop_nodes_;
// Consumers and producers are not symmetric like the other sets
std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
consumers_;
std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
producers_;
DisjointSets<IterDomain*> sibling_sets_;
VectorOfUniqueEntries<IterDomain*> all_ids_;
std::unordered_set<IterDomain*> view_rfactor_ids_;
c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
self_mapping_info_ = c10::nullopt;
};
class TrivialReductionInfo;
using DoubleBufferIndices = std::unordered_map<DoubleBufferLoopStage, Int*>;
class TORCH_CUDA_CU_API ComputeAtMap {
public:
ComputeAtMap() = delete;
ComputeAtMap(const ComputeAtMap&) = delete;
ComputeAtMap& operator=(const ComputeAtMap&) = delete;
ComputeAtMap(ComputeAtMap&&) = default;
ComputeAtMap& operator=(ComputeAtMap&&) = default;
ComputeAtMap(Fusion* fusion);
//! Run through disjoint sets in the LOOP map, make sure there's only one
//! non-serial parallel type in each disjoint set, set the parallel type of
//! all IterDomains in the disjoint set to that PType.
void validateAndPropagatePType();
//! Run through disjoint sets in the LOOP map and allocate the index
//! variable for the associated for loop that will be generated
//! for each disjoint set in the loop map. This pre-allocation makes
//! 2 key assumptions about the computeAt map that would very likely be
//! long-term invariants:
//! 1. All kir::ForLoop created in the lowering pass should belong
//! to one of the disjoint sets in the loop map.
//! 2. The lowering pass will *never* create a loop nest with 2
//! different nesting levels mapped together, i.e. the case below
//! never occurs:
//! for i in IterDomain1
//! for j in IterDomain2
//! ...
//! With loop_map.areMapped(IterDomain1, IterDomain2) == true.
//! Under this condition, we can pre-allocate all required index
//! variable integers before creating any kir::ForLoop, and this
//! would help optimizing the generated integer math for indexing.
void allocateIndexVariables();
//! Returns if id0 and id1 are mapped to each other with the provided IdMappingMode
bool areMapped(IterDomain* id0, IterDomain* id1, IdMappingMode mode) const;
//! Returns an iter domain that is the maximum expanded size of all iter
//! domains the one provided maps to. Useful for opening loops to the correct
//! iteration size. Not guaranteed to return the same ID every call, but is
//! guaranteed to return iter domains in the same disjoint set.
IterDomain* getConcreteMappedID(IterDomain* id, IdMappingMode mode) const;
// Prints mapping information, forwards to an internal IterDomainGraph
std::string toString() const;
// Returns if the provided ID is a view like rfactor id
bool isViewRfactor(IterDomain* ref_id) const;
// Returns all rfactor domains in rfactor_concrete_count_reset_domains_ that
// are in the disjoint set of the provided IterDomain. This will be every view
// like rfactor ID the provided ID "depends" on in the map.
std::vector<IterDomain*> getViewRfactorDomainsOfIdGroup(
IterDomain* ref_id,
IdMappingMode mode) const;
const IterDomainGraph& idGraph() const {
return id_graph_;
}
//! Get the ID sets for a provided IdMappingMode
const DisjointSets<IterDomain*>& getIdSets(IdMappingMode mode) const;
// Returns if the ID actually has a disjoint set meaning it has been processed
// in the creation of the compute at map.
bool idExistsInMap(IterDomain* id) const;
//! Returns the pre-allocated index variable integer used in
//! the kir::ForLoop corresponding to the given IterDomain.
//! This interface is only valid if the ID has a loop mapping;
//! ca_map will throw an exception if the given IterDomain doesn't
//! have a loop map entry.
Val* getIndexVariable(
IterDomain* id,
DoubleBufferLoopStage double_buffer_loop_stage =
DoubleBufferLoopStage::NotApplicable) const;
private:
// Build id_graph_
void build(Fusion* fusion);
// Build concrete_id_cache_
// Build a single entry in concrete_id_cache_
IterDomain* computeConcreteId(IterDomain* id, IdMappingMode mode);
void buildConcreteIds();
// Produce the disjoint set containing provided id with mapping mode.
const std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>& disjointSetOf(
IterDomain* id,
IdMappingMode mode) const;
// Should be built once and never modified again.
IterDomainGraph id_graph_;
TrivialReductionInfo trivial_reduction_info_;
// Prevent needing to recompute concrete_id's in compute at map.
// VectorOfUniqueEntries is unique across mapping modes, so don't need to use
// mapping mode directly in this cache. const
// VectorOfUniqueEntries<IterDomain*>& is what's returned by
// ComputeAtMap::disjointSetOf which can be used directly.
std::unordered_map<
std::shared_ptr<VectorOfUniqueEntries<IterDomain*>>,
IterDomain*>
concrete_id_cache_;
//! Allocated Loop index variable through the CA map.
//! only valid for disjoint sets on the loop ca map.
std::unordered_map<const VectorOfUniqueEntries<IterDomain*>*, Val*>
loop_index_variable_map_;
//! Allocated loop indices for double buffer loop.
//! only valid for disjoint sets on the loop ca map
//! that have double-buffered IterDomains.
using DoubleBufferIndicesPtr = std::unique_ptr<DoubleBufferIndices>;
std::unordered_map<
const VectorOfUniqueEntries<IterDomain*>*,
DoubleBufferIndicesPtr>
double_buffered_loop_index_variable_map_;
// Shortcut to access the fusion this computeAt map was
// built from.
Fusion* fusion_;
};
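// Usage sketch (hedged): fusion (a Fusion*), id0 and id1 are assumed to
// already exist; typical lowering-side queries against the map look like:
//   ComputeAtMap ca_map(fusion);
//   bool mapped = ca_map.areMapped(id0, id1, IdMappingMode::LOOP);
//   IterDomain* concrete =
//       ca_map.getConcreteMappedID(id0, IdMappingMode::EXACT);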
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,619 +0,0 @@
#include <ir_utils.h>
#include <iter_visitor.h>
#include <lower2device.h>
#include <contiguity.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
OrderedIdInformation::OrderedIdInformation(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info)
: active_ids_(root_domain), concrete_info_(concrete_info) {
if (ids.empty() || root_domain.empty()) {
return;
}
// Grab root ids and initialize them.
for (const auto root_i : c10::irange(root_domain.size())) {
auto root_id = root_domain[root_i]->as<IterDomain>();
// Initialize id_to_root_ids to map roots to themselves
id_to_root_ids_[root_id] = {root_id};
// Initialize roots as being made up of correctly ordered transforms.
consistently_ordered_ids_.emplace(root_id);
exclusively_consumes_roots_.emplace(root_id);
}
// Iterate from the root domain to the provided ids and fill
// consistently_ordered_ids_, id_to_root_ids_, and exclusively_consumes_roots_
// for all the IDs
auto exprs = StmtSort::getExprsBetween(
ids[0]->fusion(),
{root_domain.begin(), root_domain.end()},
{ids.begin(), ids.end()});
for (auto expr : exprs) {
OptInDispatch::handle(expr);
}
}
bool OrderedIdInformation::checkExclusivelyConsumesRoots(IterDomain* id) {
TORCH_INTERNAL_ASSERT(
std::find(active_ids_.begin(), active_ids_.end(), id) !=
active_ids_.end(),
"Error replaying transforms in contiguous ID checker, expected ",
id->toString(),
" to be in the active ID set.");
auto root_id_it = id_to_root_ids_.find(id);
TORCH_INTERNAL_ASSERT(
root_id_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker, couldn't find mapped roots of ",
id->toString());
const auto& root_ids = root_id_it->second;
// Check all the roots of all other ids, to see if any root_ids in id are also
// in them.
for (auto other_active_id : active_ids_) {
if (other_active_id == id || other_active_id == nullptr) {
continue;
}
auto root_id_it = id_to_root_ids_.find(other_active_id);
TORCH_INTERNAL_ASSERT(
root_id_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker, couldn't find mapped roots of ",
other_active_id->toString());
const auto& other_root_ids = root_id_it->second;
for (auto other_root_id : other_root_ids) {
if (root_ids.has(other_root_id)) {
return false;
}
}
}
return true;
}
void OrderedIdInformation::handle(Merge* merge) {
// Find inputs in the active_ids_ vector
const auto inner_it =
std::find(active_ids_.begin(), active_ids_.end(), merge->inner());
const auto outer_it =
std::find(active_ids_.begin(), active_ids_.end(), merge->outer());
// If either isn't in active_ids_ it means the inputs were detected to not be
// ordered correctly before hitting this expression.
if (inner_it == active_ids_.end() || outer_it == active_ids_.end()) {
return;
}
auto inner_pos = std::distance(active_ids_.begin(), inner_it);
auto outer_pos = std::distance(active_ids_.begin(), outer_it);
// Find inputs in the ordered transforms map
const auto inner_ordered_it = consistently_ordered_ids_.find(merge->inner());
const auto outer_ordered_it = consistently_ordered_ids_.find(merge->outer());
bool inner_ordered = inner_ordered_it != consistently_ordered_ids_.end();
bool outer_ordered = outer_ordered_it != consistently_ordered_ids_.end();
// Get root ids of the two inputs
const auto inner_root_ids_it = id_to_root_ids_.find(merge->inner());
const auto outer_root_ids_it = id_to_root_ids_.find(merge->outer());
TORCH_INTERNAL_ASSERT(
inner_root_ids_it != id_to_root_ids_.end() &&
outer_root_ids_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker.");
const auto& inner_root_ids = inner_root_ids_it->second;
const auto& outer_root_ids = outer_root_ids_it->second;
// TODO: Concretization may prevent contiguous indexing or vectorization.
// It prevents contiguous indexing if the concretization is within the IDs
// that are used for indexing.
// For vectorization it just means we need to make sure the extents of the
// axes to the right of the broadcast root domain in the contiguous merge are
// bigger than the vectorization dimension, and that the tensor buffer
// supports the vector word size (always done).
bool outer_is_concretized_bcast = merge->outer()->isBroadcast() &&
concrete_info_->isConcretized(merge->outer());
bool inner_is_concretized_bcast = merge->inner()->isBroadcast() &&
concrete_info_->isConcretized(merge->inner());
// Update maps
// Find the position inner would have to have to be considered ordered
auto pos_after_outer = outer_pos + 1;
for (; pos_after_outer < int64_t(active_ids_.size()); pos_after_outer++) {
if (active_ids_[pos_after_outer] == nullptr) {
// Can't be considered ordered after a nullptr
break;
}
if (active_ids_[pos_after_outer]->isReduction() ||
((active_ids_[pos_after_outer]->isBroadcast() &&
!concrete_info_->isConcretized(active_ids_[pos_after_outer])))) {
// Skip reduction or broadcast axes that aren't concretized in the fusion
continue;
}
break;
}
// The output is ordered as long as the inputs were ordered and outer position
// is directly left of the inner position.
bool out_ordered = inner_ordered && outer_ordered;
out_ordered = out_ordered &&
// If inner_pos is before outer_pos it's not ordered correctly. If for
// some reason it's the same, that would be an error.
inner_pos > outer_pos &&
// Inner could be a broadcast, so doesn't have to be right on
// pos_after_outer as that ID (if it exists) should not be a broadcast.
// However, merging over a broadcast should be fine.
inner_pos <= pos_after_outer && !inner_is_concretized_bcast &&
!outer_is_concretized_bcast;
if (out_ordered) {
consistently_ordered_ids_.emplace(merge->out());
}
// Don't just remove active_ids_, as if we have something like:
// [i0, i1, i2, i3]
// ->merge(0, 2)
// ->merge(1)
// The latter merge looks like it's ordered correctly, if we update the active
// map as:
// [i0, i1, i2, i3] -> [i0*i2, i1, i3]
// However if we instead mark it as:
// [i0, i1, i2, i3] -> [i0*i2, i1, nullptr, i3]
// Or:
// [i0, i1, i2, i3] -> [nullptr, i1, i0*i2, i3]
// It's clear the second merge is not ordered correctly. Doesn't matter which
// direction we put the iter domain in, prefer putting it in outer as we often
// are looking for inner dimensions that are contiguous. We don't want to
// always do this, as it could make ordered merges look non-ordered.
// For example: [i0, i1, i2, i3]
// ->merge(0)
// ->merge(1)
// ->merge(0)
// If it's updated as:
// [i0, i1, i2, i3]
// -> [i0*i1, nullptr, i2, i3]
// -> [i0*i1, nullptr, i2*i3, nullptr]
// Now the final merge looks non-ordered but it is. So only insert a nullptr
// entry if the out is not ordered.
active_ids_[outer_pos] = merge->out();
if (!out_ordered) {
active_ids_[inner_pos] = nullptr;
} else {
active_ids_.erase(active_ids_.begin() + inner_pos);
for (auto i = outer_pos + 1; i < inner_pos; i++) {
// If there are broadcast axes between outer and inner and the merge was
// contiguous, there may be broadcasts between outer and inner that cannot
// be ordered merged anywhere else so remove them.
active_ids_.erase(active_ids_.begin() + outer_pos + 1);
}
}
// Update the root_id entry for the output.
VectorOfUniqueEntries<IterDomain*> root_ids = inner_root_ids;
root_ids.pushBack(outer_root_ids);
id_to_root_ids_[merge->out()] = root_ids;
// Need to check this after updating active_ids_ and id_to_root_ids_
if (checkExclusivelyConsumesRoots(merge->out())) {
exclusively_consumes_roots_.emplace(merge->out());
}
}
void OrderedIdInformation::handle(Split* split) {
// Find the input in the active_ids_ vector
const auto in_it =
std::find(active_ids_.begin(), active_ids_.end(), split->in());
if (in_it == active_ids_.end()) {
return;
}
auto in_pos = std::distance(active_ids_.begin(), in_it);
// Find the input in the ordered transforms map
const auto in_ordered_it = consistently_ordered_ids_.find(split->in());
bool in_ordered = in_ordered_it != consistently_ordered_ids_.end();
// Get root ids of the input
const auto in_root_ids_it = id_to_root_ids_.find(split->in());
TORCH_INTERNAL_ASSERT(
in_root_ids_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker.");
VectorOfUniqueEntries<IterDomain*> in_root_ids = in_root_ids_it->second;
// Update map for outputs
// Remove inputs from the active_ids_ and insert the output ID
active_ids_[in_pos] = split->outer();
active_ids_.insert(active_ids_.begin() + in_pos + 1, split->inner());
// The outputs are ordered as long as the input is ordered.
if (in_ordered) {
consistently_ordered_ids_.emplace(split->outer());
consistently_ordered_ids_.emplace(split->inner());
}
// Update the root_id entry for the outputs.
id_to_root_ids_[split->outer()] = in_root_ids;
id_to_root_ids_[split->inner()] = in_root_ids;
}
// Swizzle generally can't be contiguous because of the non-affine nature of it,
// but we can still analyze the operation in the same way as merge/split.
void OrderedIdInformation::handle(Swizzle2D* swizzle) {
// Find inputs in the active_ids_ vector
const auto in_x_it =
std::find(active_ids_.begin(), active_ids_.end(), swizzle->inX());
const auto in_y_it =
std::find(active_ids_.begin(), active_ids_.end(), swizzle->inY());
if (in_x_it == active_ids_.end() || in_y_it == active_ids_.end()) {
return;
}
auto in_x_pos = std::distance(active_ids_.begin(), in_x_it);
auto in_y_pos = std::distance(active_ids_.begin(), in_y_it);
// Find inputs in the ordered transforms map
const auto in_x_ordered_it = consistently_ordered_ids_.find(swizzle->inX());
const auto in_y_ordered_it = consistently_ordered_ids_.find(swizzle->inY());
bool in_x_ordered = in_x_ordered_it != consistently_ordered_ids_.end();
bool in_y_ordered = in_y_ordered_it != consistently_ordered_ids_.end();
// Get root ids of the two inputs
const auto in_x_root_ids_it = id_to_root_ids_.find(swizzle->inX());
const auto in_y_root_ids_it = id_to_root_ids_.find(swizzle->inY());
TORCH_INTERNAL_ASSERT(
in_x_root_ids_it != id_to_root_ids_.end() &&
in_y_root_ids_it != id_to_root_ids_.end(),
"Error replaying transforms in contiguous ID checker.");
const auto& in_x_root_ids = in_x_root_ids_it->second;
const auto& in_y_root_ids = in_y_root_ids_it->second;
// Update map for outputs
// Remove inputs from the active_ids_ and insert the output ID
active_ids_[in_x_pos] = swizzle->outX();
active_ids_[in_y_pos] = swizzle->outY();
// In the case of no real swizzle we can forward properties on each domain
// independently.
if (swizzle->swizzleType() == Swizzle2DType::NoSwizzle) {
if (in_x_ordered) {
consistently_ordered_ids_.emplace(swizzle->outX());
}
if (exclusivelyConsumesRoots(swizzle->inX())) {
exclusively_consumes_roots_.emplace(swizzle->outX());
}
if (in_y_ordered) {
consistently_ordered_ids_.emplace(swizzle->outY());
}
if (exclusivelyConsumesRoots(swizzle->inY())) {
exclusively_consumes_roots_.emplace(swizzle->outY());
}
id_to_root_ids_[swizzle->outX()] = in_x_root_ids;
id_to_root_ids_[swizzle->outY()] = in_y_root_ids;
} else {
VectorOfUniqueEntries<IterDomain*> root_ids = in_x_root_ids;
root_ids.pushBack(in_y_root_ids);
id_to_root_ids_[swizzle->outX()] = root_ids;
id_to_root_ids_[swizzle->outY()] = root_ids;
}
}
NonDivisibleSplitDependencies::NonDivisibleSplitDependencies(
// TODO: Revisit reduction rfactor axes and propagation. Should probably use
// ca_map to propogate non divisibility dependencies across exact map. Still
// need to think through divisible split and non divisible dependencies to
// see if there's conflicts where a split might look non divisible but
// actually is divisible and one's overruling the other.
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::unordered_set<Split*>& divisible_splits) {
if (ids.empty() || root_domain.empty()) {
return;
}
auto transforms = StmtSort::getExprsBetween(
ids[0]->fusion(),
{root_domain.begin(), root_domain.end()},
{ids.begin(), ids.end()});
for (auto transform : transforms) {
auto inp_ids = ir_utils::filterByType<IterDomain>(transform->inputs());
for (auto inp_id : inp_ids) {
if (std::find(root_domain.begin(), root_domain.end(), inp_id) !=
root_domain.end()) {
// This generally shouldn't happen as there shouldn't be
// transformations before the root ids, but in case we eventually do
// have cases like that, we should reset the root_ids if they've been
// placed in the non divisible split set.
depends_on_non_divisible_split.erase(inp_id);
}
}
bool inputs_non_divisible =
std::any_of(inp_ids.begin(), inp_ids.end(), [this](IterDomain* inp_id) {
return depends_on_non_divisible_split.find(inp_id) !=
depends_on_non_divisible_split.end();
});
auto out_ids = ir_utils::filterByType<IterDomain>(transform->outputs());
if (inputs_non_divisible) {
// If any inputs are known to be dependent on a non-divisible split
// Mark outputs as dependent on a non_divisible split
depends_on_non_divisible_split.insert(out_ids.begin(), out_ids.end());
continue;
}
if (!transform->isA<Split>()) {
continue;
}
auto split = transform->as<Split>();
// If this transform is a non-divisible split
if (divisible_splits.find(split) == divisible_splits.end()) {
// Mark outputs as dependent on a non_divisible split
auto out_ids = ir_utils::filterByType<IterDomain>(transform->outputs());
depends_on_non_divisible_split.insert(out_ids.begin(), out_ids.end());
}
}
}
ContigIDs::ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map,
bool ignore_indexability,
bool ignore_consistent_ordering)
: root_domain_(root_domain),
root_contiguity_(root_contiguity),
final_ids_(final_ids),
index_map_(index_map),
divisible_splits_(divisible_splits),
p2c_id_map_(std::move(p2c_id_map)),
ignore_indexability_(ignore_indexability),
ignore_consistent_ordering_(ignore_consistent_ordering),
non_divisible_id_info_(ids, root_domain_, divisible_splits_) {
if (ids.size() > 0) {
// This constructor doesn't provide the following information so it needs to
// be built.
ca_map_ = std::make_shared<ComputeAtMap>(ids[0]->fusion());
halo_info_ = std::make_shared<HaloInfo>(ids[0]->fusion(), ca_map_);
concrete_info_ =
std::make_shared<ConcretizedBroadcastDomains>(ids[0]->fusion());
consistent_transform_info_ = std::make_unique<const OrderedIdInformation>(
ids, root_domain, concrete_info_);
}
build(ids);
}
ContigIDs::ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::shared_ptr<const ComputeAtMap> ca_map,
std::shared_ptr<const HaloInfo> halo_info,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map,
bool ignore_indexability,
bool ignore_consistent_ordering)
: root_domain_(root_domain),
root_contiguity_(root_contiguity),
final_ids_(final_ids),
index_map_(index_map),
divisible_splits_(divisible_splits),
ca_map_(ca_map),
halo_info_(halo_info),
concrete_info_(concrete_info),
p2c_id_map_(std::move(p2c_id_map)),
ignore_indexability_(ignore_indexability),
ignore_consistent_ordering_(ignore_consistent_ordering),
consistent_transform_info_(std::make_unique<const OrderedIdInformation>(
ids,
root_domain,
concrete_info_)),
non_divisible_id_info_(ids, root_domain, divisible_splits_) {
build(ids);
}
ContigIDs ContigIDs::getNonContigIDs() {
return ContigIDs({}, {}, {}, {}, {}, {});
}
void ContigIDs::build(const std::vector<IterDomain*>& ids) {
if (ids.empty() || root_domain_.empty()) {
return;
}
TORCH_INTERNAL_ASSERT(
root_domain_.size() == root_contiguity_.size(),
"Arguments don't match ",
root_domain_.size(),
" != ",
root_contiguity_.size());
for (const auto root_domain_i : c10::irange(root_domain_.size())) {
auto root_domain_id = root_domain_[root_domain_i]->as<IterDomain>();
root_to_indexed_id_[root_domain_id] = root_domain_id;
// Initialize to false
is_contig_root_[root_domain_id] = false;
// If a root domain has halo, can't use merged domain even if
// both inputs are contiguous. HaloInfo is also initialized for
// rfactor root domains, which should just return "zero"
// RootAxisInfo. This should be safe as no rfactor tensor should
// need halo.
if (root_contiguity_[root_domain_i] &&
!halo_info_->getRootAxisInfo(root_domain_id).hasHalo()) {
contig_ids_.emplace(root_domain_id);
is_contig_root_[root_domain_id] = true;
within_contig_ids_[root_domain_id] = std::unordered_set<IterDomain*>();
}
}
if (!contig_ids_.empty()) {
auto exprs = StmtSort::getExprsBetween(
ids[0]->fusion(),
{root_domain_.begin(), root_domain_.end()},
{ids.begin(), ids.end()});
for (auto expr : exprs) {
handle(expr);
}
}
}
void ContigIDs::handle(Merge* merge) {
// If output is not consistently ordered or doesn't solely consume all root
// domains in its dependencies, then it can't be a contiguously indexable
// iterdomain.
if (!(ignore_consistent_ordering_ ||
consistent_transform_info_->isConsistentlyOrdered(merge->out()))) {
return;
}
if (!consistent_transform_info_->exclusivelyConsumesRoots(merge->out())) {
return;
}
// If output is not "directly indexable" then it's definitely not contiguously
// indexable.
if (!ignore_indexability_ && !isIndexable(merge->out())) {
return;
}
// If inputs are marked as final, stop
if (final_ids_.count(merge->inner()) || final_ids_.count(merge->outer())) {
return;
}
// Check root domains for contiguity
auto root_ids_it =
consistent_transform_info_->idToRootIds().find(merge->out());
TORCH_INTERNAL_ASSERT(
root_ids_it != consistent_transform_info_->idToRootIds().end(),
"\nError in contiguous analysis, merge info doesn't exist for:\n",
merge->toString(),
"\nId: ",
merge->out()->toString());
VectorOfUniqueEntries<IterDomain*> root_ids = root_ids_it->second;
bool is_indexing_pass = !ignore_consistent_ordering_;
IterDomain* last_root = nullptr;
for (auto root_id_i : c10::irange(root_domain_.size())) {
auto root_id = root_domain_[root_id_i];
if (root_ids.has(root_id)) {
// ID found, remove it
root_ids.erase(root_id);
// If we're indexing:
// we could still potentially consider this ID linearly indexable, as we
// could multiply the index by the last root's stride.
//
// If we're computing predicates (ignore_consistent_ordering_==true),
// then we don't have this same constraint, we can just ignore
// contiguity of the roots all together.
if (!root_contiguity_[root_id_i] && is_indexing_pass) {
if (!root_ids.empty()) {
return;
}
}
last_root = root_id;
}
}
// If there's a non_divisible split in the history of merge->out then it can't
// be contiguously indexable.
if (non_divisible_id_info_.dependsOnNonDivisibleSplit(merge->out())) {
return;
}
// Now we know merge->out is a contiguously indexable ID
TORCH_INTERNAL_ASSERT(
last_root != nullptr,
"Issue processing root ids for ",
merge->out()->toString());
// Reset root_ids
root_ids = root_ids_it->second;
for (auto root_id : root_ids) {
root_to_indexed_id_[root_id] = merge->out();
}
auto all_within_vals = DependencyCheck::getAllValsBetween(
{root_domain_.begin(), root_domain_.end()}, {merge->out()});
auto all_within_ids = ir_utils::filterByType<IterDomain>(all_within_vals);
std::unordered_set<IterDomain*> within_id_set(
all_within_ids.begin(), all_within_ids.end());
within_id_set.erase(merge->out());
within_contig_ids_[merge->out()] = within_id_set;
for (auto id : all_within_ids) {
contig_ids_.erase(id);
}
contig_ids_.emplace(merge->out());
}
IterDomain* ContigIDs::getMappedId(IterDomain* id) const {
auto it = p2c_id_map_.find(id);
if (it != p2c_id_map_.end()) {
return it->second;
} else {
return id;
}
}
bool ContigIDs::isIndexable(IterDomain* id) const {
// If the ID is mapped to the consumer through the permissive map but not the
// exact map, it will not be mapped through to the exact map via the p2c map.
// Therefore reject it, because it involves broadcast resolution.
if (!ca_map_->idExistsInMap(getMappedId(id))) {
return false;
}
auto c_id =
ca_map_->getConcreteMappedID(getMappedId(id), IdMappingMode::EXACT);
return index_map_.find(c_id) != index_map_.end();
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,311 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <compute_at_map.h>
#include <disjoint_set.h>
#include <ir_all_nodes.h>
#include <lower_shift.h>
#include <lower_trivial_broadcast.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Goes through the transformations associated with a series of ids and root
// ids. Checks the ordering of the iteration domains through these operations to
// pick out which operations are consistently ordered. For example:
// [i0, i1, i2]
// ->split(0, 4)->merge(1)->merge(1)->merge(0)
// are consistently ordered from largest to smallest extents, but
// ->split(0, 4)->merge(1)->merge(0, 2)->merge(0) is not consistently ordered
// with the roots.
//
// This property is important to understand the contiguity of dimensions through
// complex transformations.
class OrderedIdInformation : public OptInDispatch {
public:
OrderedIdInformation() = delete;
OrderedIdInformation(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info);
const std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>&
idToRootIds() const {
return id_to_root_ids_;
}
bool isConsistentlyOrdered(IterDomain* id) const {
return consistently_ordered_ids_.find(id) !=
consistently_ordered_ids_.end();
}
bool exclusivelyConsumesRoots(IterDomain* id) const {
return exclusively_consumes_roots_.find(id) !=
exclusively_consumes_roots_.end();
}
private:
// Returns if the id in active_ids should be in exclusively_consumes_roots_
bool checkExclusivelyConsumesRoots(IterDomain* id);
void handle(Split*) override;
void handle(Merge* merge) override;
void handle(Swizzle2D* swizzle) override;
// Track which root ids were used to generate each iter domain
std::unordered_map<IterDomain*, VectorOfUniqueEntries<IterDomain*>>
id_to_root_ids_;
// Track all IterDomains that have correct ordered transforms for contiguity.
// i.e. if we have:
//
// root = [i0, i1, i2]
// i3 = merge(i0, i2)
// would not be consistently ordered transformed
//
// root = [i0, i1, i2]
// i4, i5 = split(merge(merge(i0, i1), i2), 4)
// would be consistently ordered transforms
//
// root = [i0, i1, i2, i3]
// i4 = merge(i1, i2) would also be consistently ordered transformed
std::unordered_set<IterDomain*> consistently_ordered_ids_;
// Active series of IterDomains that are updated while we're processing the
// domain. Helps us identify which ids are consistently_ordered_ids_. Used
// for intermediate storage, not to return.
std::vector<IterDomain*> active_ids_;
// IterDomains in this set exclusively consume all the uses of their roots.
// For example:
// [i0, i1] split(0, f)->merge(1)
// [ceilDiv(i0, f), f*i1]
// neither iter domain exclusively consumes the roots. With another:
// merge(0) -> [ceilDiv(i0, f)*f*i1]
// The resulting iter domain does exclusively consume the roots.
//
// Also:
// [i0, i1, i2, i3] merge(1)->merge(1)
// ->[i0, i1*i2*i3]
// both resulting iter domains do exclusively consume their roots
std::unordered_set<IterDomain*> exclusively_consumes_roots_;
// Broadcast domains that are concretized cannot be considered contiguously
// indexable.
// TODO: This constraint is more conservative than necessary as it's only if
// the domain is concretized within the local indexing, not in the entire
// fusion.
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info_;
};
// Based on provided divisible split set, goes through expressions and marks all
// IterDomains that are dependent on a non-divisible split.
class NonDivisibleSplitDependencies : public OptInDispatch {
public:
NonDivisibleSplitDependencies() = delete;
NonDivisibleSplitDependencies(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::unordered_set<Split*>& divisible_splits);
bool dependsOnNonDivisibleSplit(IterDomain* id) const {
return depends_on_non_divisible_split.find(id) !=
depends_on_non_divisible_split.end();
}
private:
std::unordered_set<IterDomain*> depends_on_non_divisible_split;
};
// A merge is contiguous if:
// Inputs of outer are to the left in the root domain of the inputs of RHS.
// All inputs are contiguous in the root domain:
// - All marked as contiguous
// - Only gaps between inputs are broadcast or reduction dims
// There are no split transformations performed on outer or inner
// All transformations on outer or inner are contiguous merges
// If this criteria holds, then we can index the input root domains of this
// merge with the indexing provided to the output of the merge in the backward
// index pass
class ContigIDs : public OptInDispatch {
public:
//! Check through the history of ids whose inputs map to root_domain with
//! contiguity root_contiguity. Return unordered_set of all merges that are
//! contiguous. Ignore root order is primarily used for predicate generation.
//! In this case we can linearize indexing of any ID that only consists of
//! merge operations.
//!
//! Mapping information from CA Index concrete to reference domains
//! is used to find if merged output domains can be indexed. If there's
//! no mapping to a reference domain, there's no corresponding
//! index, so it isn't marked as a contiguous merge.
//!
//! p2c_id_map can be used when replayed producer domains are
//! analyzed, in which case producer-to-consumer maps should be
//! passed.
//!
//! If ignore_indexability and ignore_halo_constraint are true,
//! ignore the constraint on indexing and halo, respectively. It is
//! the caller that is responsible for its correctness.
//! Not really sure why but clang-tidy only complains about
//! std::unordered_map if passed as a const reference.
ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map = {},
bool ignore_indexability = false,
bool ignore_consistent_ordering = false);
//! \param ids IterDomains on the leaves of the domain we're looking for
//! contiguous indexing into.
//! \param root_domain the root domain of the domain we're looking for
//! contiguous indexing into.
//! \param root_contiguity the contiguity of the root_domain.
//! \param concrete_to_ref concrete ids of the exact map that the reference
//! index is using for indexing.
//! \param divisible_splits a set of all splits in the fusion that are
//! divisible.
//! \param ca_map compute at map of the fusion.
//! \param halo_info halo information of the fusion.
//! \param concrete_info concretized broadcast information of the fusion.
//! \param p2c_id_map map from producer to consumer ids used for indexing
//! producer tensors.
//! \param ignore_consistent_ordering true for actual indexing into tensors
//! but false for predicate analysis. Ordering of merges doesn't matter for
//! predicate generation as they don't map to a physical address.
//! \param ignore_indexability can only be true if a real concrete_to_ref
//! map is provided, as what it checks is whether the index is actually
//! indexable based on the reference.
ContigIDs(
const std::vector<IterDomain*>& ids,
const std::vector<IterDomain*>& root_domain,
const std::vector<bool>& root_contiguity,
const std::unordered_set<IterDomain*>& final_ids,
const std::unordered_map<IterDomain*, Val*>& index_map,
const std::unordered_set<Split*>& divisible_splits,
std::shared_ptr<const ComputeAtMap> ca_map,
std::shared_ptr<const HaloInfo> halo_info,
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info,
std::unordered_map<IterDomain*, IterDomain*> p2c_id_map = {},
bool ignore_indexability = false,
bool ignore_consistent_ordering = false);
//! Return an empty ContigIDs with no contiguous ID
static ContigIDs getNonContigIDs();
const std::unordered_set<IterDomain*>& contigIDs() const {
return contig_ids_;
}
const std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>&
withinContigIDs() const {
return within_contig_ids_;
}
const std::unordered_map<IterDomain*, IterDomain*>& rootToIndexedID() const {
return root_to_indexed_id_;
}
VectorOfUniqueEntries<IterDomain*> indexedRootIDs(IterDomain* id) const {
auto root_ids_it = consistent_transform_info_->idToRootIds().find(id);
if (root_ids_it == consistent_transform_info_->idToRootIds().end()) {
return {};
}
return root_ids_it->second;
}
private:
using OptInDispatch::handle;
bool inRoot(const std::vector<IterDomain*>& ids) {
return std::all_of(ids.begin(), ids.end(), [this](IterDomain* id) {
return is_contig_root_.find(id) != is_contig_root_.end();
});
}
bool isContig(IterDomain* id) {
return contig_ids_.find(id) != contig_ids_.end();
}
// Split outputs are not contiguous, don't need to do anything.
void handle(Split*) override {}
void handle(Merge* merge) override;
// TODO:
// Currently not propagating any contiguity information
// as contiguity is generally not preserved after swizzles.
// But in follow ups we could gradually add back a few special
// cases, depending on specific swizzle type and axes.
void handle(Swizzle2D* swizzle) override {}
IterDomain* getCAIndexConcreteId(IterDomain* id) const;
//! True if an ID is indexable.
//! E.g., a merged domain with broadcast may not be indexable when
//! its corresponding reference tensor has non-broadcast domains.
bool isIndexable(IterDomain* id) const;
//! Return an ID mapped with id_map_ or itself
IterDomain* getMappedId(IterDomain* id) const;
private:
void build(const std::vector<IterDomain*>& ids);
//! Root domains to analyze contiguity
const std::vector<IterDomain*>& root_domain_;
//! Contiguity of root_domain_
const std::vector<bool>& root_contiguity_;
//! Domains where indexing/predicates cannot be done with their
//! consumers' domains
const std::unordered_set<IterDomain*>& final_ids_;
//! Mapping of concrete domains to indices. Just used to check if
//! there's an index for an IterDomain.
const std::unordered_map<IterDomain*, Val*> index_map_;
// Divisible split information as we can still consider iter domains
// contiguous through divisible splits.
const std::unordered_set<Split*>& divisible_splits_;
std::shared_ptr<const ComputeAtMap> ca_map_;
std::shared_ptr<const HaloInfo> halo_info_;
std::shared_ptr<const ConcretizedBroadcastDomains> concrete_info_;
//! Producer-to-consumer index map in the case of analyzing replayed
//! producer tensors
const std::unordered_map<IterDomain*, IterDomain*> p2c_id_map_;
const bool ignore_indexability_ = false;
const bool ignore_consistent_ordering_ = false;
//! Mapping of root domain to bool indicating contiguity
std::unordered_map<IterDomain*, bool> is_contig_root_;
// Mark if ids are the result of contiguous merges
std::unordered_set<IterDomain*> contig_ids_;
// Given contiguous domain, return all iter domains within its history.
std::unordered_map<IterDomain*, std::unordered_set<IterDomain*>>
within_contig_ids_;
//! Mapping of root domain to the actual indexed domain, which can
//! be itself or a contig merged domain if found.
std::unordered_map<IterDomain*, IterDomain*> root_to_indexed_id_;
std::unique_ptr<const OrderedIdInformation> consistent_transform_info_;
NonDivisibleSplitDependencies non_divisible_id_info_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,330 +0,0 @@
#pragma once
#include <c10/util/Exception.h>
#include <algorithm>
#include <initializer_list>
#include <unordered_map>
#include <unordered_set>
#include <vector>
// For printing of the set when using a Statement as the type for the set
#include <ir_base_nodes.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
template <typename T>
std::string abstractToString(T* ptr) {
return ptr->toString();
}
template <typename T>
std::string abstractToString(T ref) {
return ref.toString();
}
} // namespace
// Vector-like class that prevents adding duplicate entries by also
// maintaining a set
template <typename T, typename Hash = std::hash<T>>
class VectorOfUniqueEntries {
public:
VectorOfUniqueEntries() = default;
VectorOfUniqueEntries(const std::initializer_list<T>& x)
: vector_(x), set_(x) {}
// Returns if a node was actually added
bool pushBack(T entry) {
if (set_.emplace(entry).second) {
vector_.push_back(entry);
return true;
}
return false;
}
// Returns if any node was added
bool pushBack(const VectorOfUniqueEntries<T, Hash>& other) {
bool any_added = false;
for (auto entry : other) {
any_added = any_added | pushBack(entry);
}
return any_added;
}
// Returns a const vector useful for iterating on
const std::vector<T>& vector() const {
return vector_;
}
// Returns first element in vector
T front() const {
return vector_.front();
}
// Returns last element in vector
T back() const {
return vector_.back();
}
// Removes and returns the last element in the vector
T popBack() {
T v = vector_.back();
set_.erase(v);
vector_.pop_back();
return v;
}
// Returns if this container is empty
bool empty() const {
return vector_.empty();
}
// Returns the number of elements in this container
size_t size() const {
return vector_.size();
}
// Returns if entry is in this vector
bool has(T entry) const {
return set_.find(entry) != set_.end();
}
// Erase given entry from the containers if
// there is a match.
void erase(T entry) {
vector_.erase(
std::remove_if(
vector_.begin(),
vector_.end(),
[entry](T val) { return val == entry; }),
vector_.end());
set_.erase(entry);
}
// Insert elements at the end of the container.
template <typename InputIt>
void insert(InputIt begin, InputIt end) {
for (auto it = begin; it != end; it++) {
pushBack(*it);
}
}
// Returns iterator pointing to the beginning of vector container
auto begin() const {
return vector().begin();
}
// Returns iterator pointing to the end of vector container
auto end() const {
return vector().end();
}
// Returns iterator pointing to the beginning of vector container
auto begin() {
return vector().begin();
}
// Returns iterator pointing to the end of vector container
auto end() {
return vector().end();
}
std::string toString() {
std::stringstream ss;
ss << "{ ";
for (auto entry : vector()) {
ss << abstractToString(entry);
if (entry != vector().back()) {
ss << "; ";
}
}
ss << " }";
return ss.str();
}
private:
std::vector<T> vector_;
std::unordered_set<T, Hash> set_;
};
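// Usage sketch (illustrative only, not part of the original header); shows
// the deduplicating push/erase behavior with plain ints:
//
//   VectorOfUniqueEntries<int> entries;
//   entries.pushBack(1); // returns true, 1 is added
//   entries.pushBack(2); // returns true, 2 is added
//   entries.pushBack(1); // returns false, duplicate is ignored
//   // Iteration follows insertion order: visits 1, then 2
//   for (auto entry : entries) {
//     process(entry); // `process` is a hypothetical consumer
//   }
//   entries.erase(1); // removed from both the vector and the set
//   TORCH_INTERNAL_ASSERT(!entries.has(1) && entries.size() == 1);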
//! Container class DisjointSets models equivalence relationships
//!
//! Each instance of this class keeps equivalence sets
//! DisjointSets::mapEntries(a,b) makes the full set of a and b equivalent
//! DisjointSets::*AreMapped(a,b) checks if a and b belong to the same disjoint
//! set
template <typename T, typename Hash = std::hash<T>>
class DisjointSets {
public:
DisjointSets() = default;
// Warning: returned values should never be modified. This accessor isn't
// strictly safe as VectorOfUniqueEntries is not returned as a const.
const std::
unordered_map<T, std::shared_ptr<VectorOfUniqueEntries<T, Hash>>, Hash>&
disjointSetMap() const {
return disjoint_set_maps_;
}
// Warning: returned values should never be modified. This accessor isn't
// strictly safe as VectorOfUniqueEntries is not returned as a const.
const std::vector<std::shared_ptr<VectorOfUniqueEntries<T, Hash>>>&
disjointSets() const {
return disjoint_sets_;
}
// Return the entire disjoint set of provided entry
const VectorOfUniqueEntries<T, Hash>& getDisjointSetOf(T entry) const {
auto set_it = disjoint_set_maps_.find(entry);
TORCH_INTERNAL_ASSERT(
set_it != disjoint_set_maps_.end(),
"Could not find entry for ",
entry->toString());
return *(set_it->second);
}
// Initializes a new set for provided entry
//
// TODO: Return iterator
void initializeSet(T entry) {
if (disjoint_set_maps_.find(entry) != disjoint_set_maps_.end()) {
return;
}
disjoint_sets_.push_back(
std::make_shared<VectorOfUniqueEntries<T, Hash>>());
disjoint_sets_.back()->pushBack(entry);
disjoint_set_maps_.emplace(std::make_pair(entry, disjoint_sets_.back()));
}
// Adds all of the disjoint set belonging to entry1 to the disjoint set
// belonging to entry0, maps all entries of disjoint set belonging to entry1
// to entry0, removes original disjoint set belonging to entry1.
void mapEntries(T entry0, T entry1) {
auto set_it_0 = disjoint_set_maps_.find(entry0);
auto set_it_1 = disjoint_set_maps_.find(entry1);
// Track if we need to reset iterators, optimize for case where both entries
// exist
bool invalid_iterators = false;
if (set_it_0 == disjoint_set_maps_.end()) {
initializeSet(entry0);
invalid_iterators = true;
}
if (set_it_1 == disjoint_set_maps_.end()) {
initializeSet(entry1);
invalid_iterators = true;
}
// TODO: We can avoid refinding one iterator if initialize set returns an
// iterator, though if we insert entry1 we'd have to refind entry0 as it
// could invalidate all iterators
if (invalid_iterators) {
set_it_0 = disjoint_set_maps_.find(entry0);
set_it_1 = disjoint_set_maps_.find(entry1);
}
auto set0_shared_ptr = set_it_0->second;
auto set1_shared_ptr = set_it_1->second;
// If the sets are already the same, do nothing
if (set0_shared_ptr == set1_shared_ptr) {
return;
}
// Place everything in set1 into set0 and remap all entries in set1 to set0
for (auto entry : set1_shared_ptr->vector()) {
set0_shared_ptr->pushBack(entry);
disjoint_set_maps_[entry] = set0_shared_ptr;
}
// set1 no longer needed as its entries are copied into set0
disjoint_sets_.erase(std::find(
disjoint_sets_.begin(), disjoint_sets_.end(), set1_shared_ptr));
}
// Will assert if provided entry0 is not in any disjoint set, otherwise
// returns if entry0 and entry1 are in the same disjoint set.
bool strictAreMapped(T entry0, T entry1) const {
auto entry_it = disjointSetMap().find(entry0);
TORCH_INTERNAL_ASSERT(
entry_it != disjointSetMap().end(),
"Strict mapping failed on element: ",
abstractToString(entry0),
" either an error occurred, or non strict mapping should have been used.");
return entry_it->second->has(entry1);
}
// If entry0 doesn't have a disjoint set returns false, otherwise returns if
// entry0 and entry1 are in the same disjoint set.
bool permissiveAreMapped(T entry0, T entry1) const {
auto entry_it = disjointSetMap().find(entry0);
if (entry_it == disjointSetMap().end()) {
return false;
}
return entry_it->second->has(entry1);
}
// Returns if a set exists with provided entry
bool mappingExists(T entry) const {
return disjoint_set_maps_.find(entry) != disjoint_set_maps_.end();
}
// Returns a deterministic list of all entries that have been added to any
// disjoint set.
//
// Warning: constructed on every call, consider caching result.
VectorOfUniqueEntries<T, Hash> getAllElements() const {
VectorOfUniqueEntries<T, Hash> all_elements;
for (auto set : disjoint_sets_) {
for (auto entry : set->vector()) {
all_elements.pushBack(entry);
}
}
return all_elements;
}
// Completely clears all disjoint sets
void clear() {
disjoint_set_maps_.clear();
disjoint_sets_.clear();
}
std::string toString() const {
std::stringstream ss;
ss << "disjoint sets{\n";
const std::string sep(" ");
for (auto s_ptr : disjoint_sets_) {
auto& set = *s_ptr;
ss << sep << "{\n";
for (auto entry : set.vector()) {
ss << sep << sep << abstractToString(entry) << "\n";
}
ss << sep << "}\n";
}
ss << "}";
return ss.str();
}
private:
// Disjoint sets
std::unordered_map<T, std::shared_ptr<VectorOfUniqueEntries<T, Hash>>, Hash>
disjoint_set_maps_;
// Keep a list of disjoint_sets that's deterministic to iterate over
std::vector<std::shared_ptr<VectorOfUniqueEntries<T, Hash>>> disjoint_sets_;
};
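// Usage sketch (illustrative only, not part of the original header); id0..id4
// stand for hypothetical IterDomain pointers, as used elsewhere in the fuser:
//
//   DisjointSets<IterDomain*> sets;
//   sets.mapEntries(id0, id1); // sets: {id0, id1}
//   sets.mapEntries(id2, id3); // sets: {id0, id1} {id2, id3}
//   sets.mapEntries(id1, id2); // sets: {id0, id1, id2, id3}
//   TORCH_INTERNAL_ASSERT(sets.strictAreMapped(id0, id3));
//   TORCH_INTERNAL_ASSERT(!sets.permissiveAreMapped(id0, id4)); // id4 unseen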
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,378 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <utils.h>
#include <unordered_map>
// dispatch.h prevents the need from adding manual dispatch in every class that
// wants to define how to process a series of nodes. dispatch.h provides 4
// classes that can be inherited providing a means to override functions on a
// per-node basis. There are currently 4 provided dispatch mechanisms:
//
// OptOutDispatch:
//
// provides the functions:
// virtual void handle(ValType* irnode){}
//
// This provides a mechanism to override this handle for particular node
// types. For example if we only wanted to actually run a function on
// BinaryOps, we could inherit OptOutDispatch and simply override: void
// handle(BinaryOp*) { doSomething; } Then we could run through all our
// Statement* and call OptOutDispatch::handle(statement). When a BinaryOp is
// encountered our override function will be called. For every other node,
// nothing will be done.
//
// OptInDispatch:
//
// This class is similar to OptOutDispatch, however if we encounter a node
// that we haven't specified an override for in the derived class, an error
// will be thrown. This is useful if we create a class that is expected to
// handle any type of node it encounters.
//
// OptOutMutator:
//
// This class is similar to OptOutDispatch except the functions provided are of
// type: virtual Statement* mutate(Statement*) this is useful for when we want
// to have an IR node result from our overloaded functions.
//
// OptInMutator:
//
// This class is similar to OptInDispatch except the functions provided are of
// type: virtual Statement* mutate(Statement*) this is useful for when we want
// to have an IR node result from our overloaded functions.
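//
// Usage sketch (illustrative only, not part of the original header); a
// hypothetical pass that counts BinaryOps in a list of exprs:
//
//   class BinaryOpCounter : public OptOutDispatch {
//    public:
//     using OptOutDispatch::handle;
//     void handle(BinaryOp*) override {
//       count++;
//     }
//     int count = 0;
//   };
//
//   BinaryOpCounter counter;
//   for (auto expr : exprs) {
//     counter.handle(expr);
//   }
//   // counter.count now holds the number of BinaryOps encountered.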
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class IrContainer;
class Fusion;
// Hierarchical dispatch functions for handle
class Statement;
class Expr;
class Val;
// Vals
class IterDomain;
class TensorDomain;
class TensorView;
class Bool;
class Double;
class Int;
class ComplexDouble;
class NamedScalar;
// Exprs
class FullOp;
class ARangeOp;
class EyeOp;
class UnaryOp;
class BinaryOp;
class TernaryOp;
class RNGOp;
class ReductionOp;
class GroupedReductionOp;
class WelfordOp;
class GroupedWelfordOp;
class LoadStoreOp;
class MmaOp;
class BroadcastOp;
class TransposeOp;
class ExpandOp;
class ShiftOp;
class GatherOp;
class ViewAsScalar;
class ViewOp;
// Exprs
class Split;
class Merge;
class Swizzle2D;
namespace kir {
class Predicate;
class TensorIndex;
class IntPair;
class Allocate;
class BlockSync;
class GridSync;
class CpAsyncWait;
class CpAsyncCommit;
class ForLoop;
class IfThenElse;
class GridReduction;
class GroupedGridReduction;
class GridBroadcast;
class GridWelford;
class GroupedGridWelford;
class AllocateFusedReduction;
class InitMagicZero;
class UpdateMagicZero;
class Swizzle2DInt;
class PairSelect;
} // namespace kir
// By default, all IR nodes are handled in this dispatch, and will call an empty
// function on all nodes.
class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase {
protected:
virtual void unhandled(const Statement*) {}
public:
// Hierarchical dispatch functions for handle
virtual void handle(const Statement*);
virtual void handle(const Expr*);
virtual void handle(const Val*);
// Vals
virtual void handle(const IterDomain* stmt);
virtual void handle(const TensorDomain* stmt);
virtual void handle(const TensorView* stmt);
virtual void handle(const Bool* stmt);
virtual void handle(const Double* stmt);
virtual void handle(const Int* stmt);
virtual void handle(const ComplexDouble* stmt);
virtual void handle(const NamedScalar* stmt);
virtual void handle(const kir::Predicate*);
virtual void handle(const kir::TensorIndex*);
virtual void handle(const kir::IntPair*);
// Exprs
virtual void handle(const FullOp* stmt);
virtual void handle(const ARangeOp* stmt);
virtual void handle(const EyeOp* stmt);
virtual void handle(const UnaryOp* stmt);
virtual void handle(const BinaryOp* stmt);
virtual void handle(const TernaryOp* stmt);
virtual void handle(const RNGOp* stmt);
virtual void handle(const ReductionOp* stmt);
virtual void handle(const GroupedReductionOp* stmt);
virtual void handle(const WelfordOp* stmt);
virtual void handle(const GroupedWelfordOp* stmt);
virtual void handle(const LoadStoreOp* stmt);
virtual void handle(const MmaOp* stmt);
virtual void handle(const BroadcastOp* stmt);
virtual void handle(const Split* stmt);
virtual void handle(const Merge* stmt);
virtual void handle(const Swizzle2D* stmt);
virtual void handle(const TransposeOp* stmt);
virtual void handle(const ExpandOp* stmt);
virtual void handle(const ShiftOp* stmt);
virtual void handle(const GatherOp* stmt);
virtual void handle(const ViewAsScalar* stmt);
virtual void handle(const ViewOp* stmt);
virtual void handle(const kir::Allocate*);
virtual void handle(const kir::BlockSync*);
virtual void handle(const kir::GridSync*);
virtual void handle(const kir::CpAsyncWait*);
virtual void handle(const kir::CpAsyncCommit*);
virtual void handle(const kir::InitMagicZero*);
virtual void handle(const kir::UpdateMagicZero*);
virtual void handle(const kir::ForLoop*);
virtual void handle(const kir::IfThenElse*);
virtual void handle(const kir::GridReduction*);
virtual void handle(const kir::GroupedGridReduction*);
virtual void handle(const kir::GridBroadcast*);
virtual void handle(const kir::GridWelford*);
virtual void handle(const kir::GroupedGridWelford*);
virtual void handle(const kir::AllocateFusedReduction*);
virtual void handle(const kir::Swizzle2DInt*);
virtual void handle(const kir::PairSelect*);
};
class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase {
protected:
virtual void unhandled(Statement*);
public:
// Hierarchical dispatch functions for handle
virtual void handle(Statement*);
virtual void handle(Expr*);
virtual void handle(Val*);
// Vals
virtual void handle(Bool* stmt);
virtual void handle(Double* stmt);
virtual void handle(Int* stmt);
virtual void handle(ComplexDouble* stmt);
virtual void handle(NamedScalar* stmt);
virtual void handle(IterDomain* stmt);
virtual void handle(TensorDomain* stmt);
virtual void handle(TensorView* stmt);
virtual void handle(kir::Predicate*);
virtual void handle(kir::TensorIndex*);
virtual void handle(kir::IntPair*);
// Exprs
virtual void handle(FullOp* stmt);
virtual void handle(ARangeOp* stmt);
virtual void handle(EyeOp* stmt);
virtual void handle(UnaryOp* stmt);
virtual void handle(BinaryOp* stmt);
virtual void handle(TernaryOp* stmt);
virtual void handle(RNGOp* stmt);
virtual void handle(ReductionOp* stmt);
virtual void handle(GroupedReductionOp* stmt);
virtual void handle(WelfordOp* stmt);
virtual void handle(GroupedWelfordOp* stmt);
virtual void handle(LoadStoreOp* stmt);
virtual void handle(MmaOp* stmt);
virtual void handle(BroadcastOp* stmt);
virtual void handle(Split* stmt);
virtual void handle(Merge* stmt);
virtual void handle(Swizzle2D* stmt);
virtual void handle(TransposeOp* stmt);
virtual void handle(ExpandOp* stmt);
virtual void handle(ShiftOp* stmt);
virtual void handle(GatherOp* stmt);
virtual void handle(ViewAsScalar* stmt);
virtual void handle(ViewOp* stmt);
virtual void handle(kir::Allocate* stmt);
virtual void handle(kir::BlockSync* stmt);
virtual void handle(kir::GridSync* stmt);
virtual void handle(kir::CpAsyncWait* stmt);
virtual void handle(kir::CpAsyncCommit* stmt);
virtual void handle(kir::InitMagicZero* stmt);
virtual void handle(kir::UpdateMagicZero* stmt);
virtual void handle(kir::ForLoop* stmt);
virtual void handle(kir::IfThenElse* stmt);
virtual void handle(kir::GridReduction* stmt);
virtual void handle(kir::GroupedGridReduction* stmt);
virtual void handle(kir::GridBroadcast* stmt);
virtual void handle(kir::GridWelford* stmt);
virtual void handle(kir::GroupedGridWelford* stmt);
virtual void handle(kir::AllocateFusedReduction* stmt);
virtual void handle(kir::Swizzle2DInt* stmt);
virtual void handle(kir::PairSelect* stmt);
};
class TORCH_CUDA_CU_API OptInConstDispatch : public OptOutConstDispatch {
public:
using OptOutConstDispatch::handle;
protected:
virtual void unhandled(const Statement* stmt) final;
};
class TORCH_CUDA_CU_API OptInDispatch : public OptOutDispatch {
public:
using OptOutDispatch::handle;
protected:
virtual void unhandled(Statement* stmt) final;
};
// Class to perform mutations on Fusion IR. Exprs can simply be redefined, but
// when mutating values they have to be registered through registerMutation so
// that exprs can detect there's been a mutation and know to modify all
// instances of that Val. This means each Val should be mutated "consistently".
// Otherwise behavior may be difficult to understand as it depends on the order
// in which mutate is called. This class expects the user to call the
// statements of interest topologically, so inputs are visited and mutated
// before the exprs that depend on them.
//
// Warning: TensorViews need to be treated carefully, as we don't generally
// register their mutation when only their tensor domains change. If a TV needs
// to be swapped out, it needs to be registered as a "proper" mutation like
// other vals, on top of the TensorDomain being updated in the mutated TensorView.
//
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase {
public:
// Hierarchical dispatch functions for handle
virtual void mutate(Statement* s);
virtual void mutate(Expr* e);
virtual void mutate(Val* v);
void registerMutation(Val* val, Val* mutation);
Val* maybeMutated(Val* val) {
if (mutations.find(val) == mutations.end()) {
return val;
}
return mutations.at(val);
}
std::unordered_map<Val*, Val*> mutations;
//****Functions below defined in mutator.cpp*****
// Vals
virtual void mutate(Bool*);
virtual void mutate(Double*);
virtual void mutate(Int*);
virtual void mutate(ComplexDouble*);
virtual void mutate(NamedScalar*);
virtual void mutate(IterDomain*);
virtual void mutate(TensorDomain*);
virtual void mutate(TensorView*);
virtual void mutate(kir::Predicate*);
virtual void mutate(kir::TensorIndex*);
virtual void mutate(kir::IntPair*);
// Exprs
virtual void mutate(FullOp*);
virtual void mutate(ARangeOp*);
virtual void mutate(EyeOp*);
virtual void mutate(UnaryOp*);
virtual void mutate(BinaryOp*);
virtual void mutate(TernaryOp*);
virtual void mutate(RNGOp*);
virtual void mutate(ReductionOp*);
virtual void mutate(GroupedReductionOp*);
virtual void mutate(WelfordOp*);
virtual void mutate(GroupedWelfordOp*);
virtual void mutate(LoadStoreOp*);
virtual void mutate(MmaOp*);
virtual void mutate(BroadcastOp*);
virtual void mutate(Split*);
virtual void mutate(Merge*);
virtual void mutate(Swizzle2D*);
virtual void mutate(TransposeOp*);
virtual void mutate(ExpandOp*);
virtual void mutate(ShiftOp*);
virtual void mutate(GatherOp*);
virtual void mutate(ViewAsScalar*);
virtual void mutate(ViewOp*);
virtual void mutate(kir::Allocate*);
virtual void mutate(kir::BlockSync*);
virtual void mutate(kir::GridSync*);
virtual void mutate(kir::CpAsyncWait*);
virtual void mutate(kir::CpAsyncCommit*);
virtual void mutate(kir::InitMagicZero*);
virtual void mutate(kir::UpdateMagicZero*);
virtual void mutate(kir::ForLoop*);
virtual void mutate(kir::IfThenElse*);
virtual void mutate(kir::GridReduction*);
virtual void mutate(kir::GroupedGridReduction*);
virtual void mutate(kir::GridBroadcast*);
virtual void mutate(kir::GridWelford*);
virtual void mutate(kir::GroupedGridWelford*);
virtual void mutate(kir::AllocateFusedReduction*);
virtual void mutate(kir::Swizzle2DInt*);
virtual void mutate(kir::PairSelect*);
protected:
void removeExpr(IrContainer*, Expr*);
};
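// Usage sketch (illustrative only, not part of the original header); a
// minimal pass that swaps a known scalar `old_extent` for `new_extent` across
// topologically ordered statements `stmts`, assuming the default mutate
// overloads rebuild exprs with registered replacements:
//
//   OptOutMutator mutator;
//   mutator.registerMutation(old_extent, new_extent);
//   for (auto stmt : stmts) {
//     mutator.mutate(stmt); // inputs must be mutated before dependent exprs
//   }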
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1 +0,0 @@
html

View File

@ -1,23 +0,0 @@
#error This is used exclusively for generating the documentation (not a real header)
//! \namespace torch::jit::fuser
//! \brief Main PyTorch JIT Fuser namespace
//! \namespace torch::jit::fuser::cuda
//! \brief CUDA specific components
//! \namespace torch::jit::fuser::cuda::executor_utils
//! \brief Fuser executor related utilities
//! \namespace torch::jit::fuser::kir
//! \brief Kernel IR
//! \namespace torch::jit::fuser::ir_utils
//! \brief IR manipulation utilities
//! \namespace torch::jit::fuser::loop_utils
//! \brief Loop utilities
//! \namespace torch::jit::fuser::scope_utils
//! \brief Scope utilities

File diff suppressed because it is too large

Binary file not shown.

View File

@ -1,8 +0,0 @@
This is the implementation reference for the CUDA PyTorch JIT Fuser
- [PyTorch GitHub Page](https://github.com/pytorch/pytorch)
- [Fuser Source Tree](https://github.com/pytorch/pytorch/tree/master/torch/csrc/jit/codegen/cuda)
- Main documentation indexes: [Namespaces](namespaces.html) and [Classes](annotated.html)
![Fuser Architecture Overview](images/ir_architecture.png)

View File

@ -1,304 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <cmath>
#include <iostream>
#include <variant>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_CU_API IntOrDouble {
std::variant<double, int64_t> value_;
public:
IntOrDouble(int64_t i) : value_(i) {}
IntOrDouble(double d) : value_(d) {}
IntOrDouble(int i) : value_((int64_t)i) {}
IntOrDouble(size_t i) : value_((int64_t)i) {}
IntOrDouble() : IntOrDouble(0) {}
IntOrDouble(const IntOrDouble& other) = default;
IntOrDouble& operator=(const IntOrDouble& other) = default;
IntOrDouble(IntOrDouble&& other) noexcept = default;
IntOrDouble& operator=(IntOrDouble&& other) noexcept = default;
bool is_int() const {
return std::holds_alternative<int64_t>(value_);
}
template <typename T>
T as() const {
TORCH_CHECK(
std::holds_alternative<T>(value_),
"The expected dtype and the actual dtype does not match in IntOrDouble");
return std::get<T>(value_);
}
template <typename T>
T cast() const;
#define DEFINE_ARITHMETIC_OP(op) \
IntOrDouble operator op(const IntOrDouble& other) const { \
switch ((int)is_int() << 1 | (int)other.is_int()) { \
case 0b00: \
return IntOrDouble(as<double>() op other.as<double>()); \
case 0b01: \
return IntOrDouble(as<double>() op other.as<int64_t>()); \
case 0b10: \
return IntOrDouble(as<int64_t>() op other.as<double>()); \
case 0b11: \
return IntOrDouble(as<int64_t>() op other.as<int64_t>()); \
} \
TORCH_INTERNAL_ASSERT(false); \
} \
template <typename T> \
IntOrDouble operator op(T other) const { \
if (is_int()) { \
return IntOrDouble(as<int64_t>() op other); \
} \
return IntOrDouble(as<double>() op other); \
}
DEFINE_ARITHMETIC_OP(+)
DEFINE_ARITHMETIC_OP(-)
DEFINE_ARITHMETIC_OP(*)
DEFINE_ARITHMETIC_OP(/)
DEFINE_ARITHMETIC_OP(&&)
#undef DEFINE_ARITHMETIC_OP
#define DEFINE_ASSIGN_OP(assign, op) \
IntOrDouble& operator assign(const IntOrDouble& other) { \
switch ((int)is_int() << 1 | (int)other.is_int()) { \
case 0b00: \
return *this = IntOrDouble(as<double>() op other.as<double>()); \
case 0b01: \
return *this = IntOrDouble(as<double>() op other.as<int64_t>()); \
case 0b10: \
return *this = IntOrDouble(as<int64_t>() op other.as<double>()); \
case 0b11: \
return *this = IntOrDouble(as<int64_t>() op other.as<int64_t>()); \
} \
TORCH_INTERNAL_ASSERT(false); \
} \
template <typename T> \
IntOrDouble& operator assign(T other) { \
if (is_int()) { \
return *this = IntOrDouble(as<int64_t>() op other); \
} \
return *this = IntOrDouble(as<double>() op other); \
}
DEFINE_ASSIGN_OP(+=, +)
DEFINE_ASSIGN_OP(-=, -)
DEFINE_ASSIGN_OP(*=, *)
DEFINE_ASSIGN_OP(/=, /)
#undef DEFINE_ASSIGN_OP
IntOrDouble operator%(const IntOrDouble& other) const {
if (is_int() && other.is_int()) {
return IntOrDouble(as<int64_t>() % other.as<int64_t>());
}
TORCH_INTERNAL_ASSERT(false);
}
IntOrDouble operator%(int64_t other) const {
if (is_int()) {
return IntOrDouble(as<int64_t>() % other);
}
TORCH_INTERNAL_ASSERT(false);
}
IntOrDouble& operator%=(const IntOrDouble& other) {
if (is_int() && other.is_int()) {
return *this = IntOrDouble(as<int64_t>() % other.as<int64_t>());
}
TORCH_INTERNAL_ASSERT(false);
}
IntOrDouble& operator%=(int64_t other) {
if (is_int()) {
return *this = IntOrDouble(as<int64_t>() % other);
}
TORCH_INTERNAL_ASSERT(false);
}
#define DEFINE_COMPARE_OP(op) \
bool operator op(const IntOrDouble& other) const { \
switch ((int)is_int() << 1 | (int)other.is_int()) { \
case 0b00: \
return as<double>() op other.as<double>(); \
case 0b01: \
return as<double>() op other.as<int64_t>(); \
case 0b10: \
return as<int64_t>() op other.as<double>(); \
case 0b11: \
return as<int64_t>() op other.as<int64_t>(); \
} \
TORCH_INTERNAL_ASSERT(false); \
} \
bool operator op(double other) { \
if (is_int()) { \
return as<int64_t>() op other; \
} \
return as<double>() op other; \
} \
bool operator op(int64_t other) { \
if (is_int()) { \
return as<int64_t>() op other; \
} \
return as<double>() op other; \
} \
bool operator op(int other) { \
if (is_int()) { \
return as<int64_t>() op other; \
} \
return as<double>() op other; \
}
DEFINE_COMPARE_OP(>)
DEFINE_COMPARE_OP(>=)
DEFINE_COMPARE_OP(<)
DEFINE_COMPARE_OP(<=)
DEFINE_COMPARE_OP(==)
DEFINE_COMPARE_OP(!=)
#undef DEFINE_COMPARE_OP
IntOrDouble operator-() const {
if (is_int()) {
return IntOrDouble(-as<int64_t>());
}
return IntOrDouble(-as<double>());
}
explicit operator double() const;
explicit operator int64_t() const;
explicit operator size_t() const;
explicit operator int() const;
};
#define DEFINE_ARITHMETIC_OP(op) \
template <typename T> \
inline IntOrDouble operator op(T lhs, IntOrDouble rhs) { \
if (rhs.is_int()) { \
return IntOrDouble(lhs op rhs.as<int64_t>()); \
} \
return IntOrDouble(lhs op rhs.as<double>()); \
}
DEFINE_ARITHMETIC_OP(+)
DEFINE_ARITHMETIC_OP(-)
DEFINE_ARITHMETIC_OP(*)
DEFINE_ARITHMETIC_OP(/)
#undef DEFINE_ARITHMETIC_OP
template <>
inline double IntOrDouble::cast<double>() const {
if (is_int()) {
return (double)as<int64_t>();
}
return as<double>();
}
template <>
inline int64_t IntOrDouble::cast<int64_t>() const {
if (!is_int()) {
return (int64_t)as<double>();
}
return as<int64_t>();
}
inline IntOrDouble::operator double() const {
return as<double>();
}
inline IntOrDouble::operator int64_t() const {
return as<int64_t>();
}
inline IntOrDouble::operator size_t() const {
return as<int64_t>();
}
inline IntOrDouble::operator int() const {
return as<int64_t>();
}
#define DEFINE_EQ_OP(op) \
inline bool operator op(double lhs, const IntOrDouble& rhs) { \
if (rhs.is_int()) { \
return false; \
} \
return lhs op rhs.as<double>(); \
} \
\
inline bool operator op(int64_t lhs, const IntOrDouble& rhs) { \
if (rhs.is_int()) { \
return lhs op rhs.as<int64_t>(); \
} \
return false; \
} \
\
inline bool operator op(int lhs, const IntOrDouble& rhs) { \
return operator op((int64_t)lhs, rhs); \
}
DEFINE_EQ_OP(==)
DEFINE_EQ_OP(!=)
#undef DEFINE_EQ_OP
inline std::ostream& operator<<(std::ostream& os, const IntOrDouble& val) {
if (val.is_int()) {
return os << val.as<int64_t>();
}
return os << val.as<double>();
}
namespace IntOrDouble_functions {
inline IntOrDouble ceildiv(const IntOrDouble& a, const IntOrDouble& b) {
if (a.is_int() && b.is_int()) {
auto aa = a.as<int64_t>();
auto bb = b.as<int64_t>();
if (bb > 0) {
return (aa + bb - 1) / bb;
} else {
return (aa + bb + 1) / bb;
}
}
return std::ceil((a / b).as<double>());
}
inline IntOrDouble max(const IntOrDouble& a, const IntOrDouble& b) {
if (a.is_int() && b.is_int()) {
return std::max(a.as<int64_t>(), b.as<int64_t>());
}
return (a > b ? a : b).cast<double>();
}
inline IntOrDouble min(const IntOrDouble& a, const IntOrDouble& b) {
if (a.is_int() && b.is_int()) {
return std::min(a.as<int64_t>(), b.as<int64_t>());
}
return (a < b ? a : b).cast<double>();
}
inline IntOrDouble abs(const IntOrDouble& a) {
if (a.is_int()) {
return IntOrDouble(std::abs(a.as<int64_t>()));
} else {
return IntOrDouble(std::abs(a.as<double>()));
}
}
} // namespace IntOrDouble_functions
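// Usage sketch (illustrative only, not part of the original header):
//
//   IntOrDouble a(7);   // holds int64_t
//   IntOrDouble b(2.0); // holds double
//   auto c = a + b;     // mixed arithmetic promotes to double: 9.0
//   TORCH_INTERNAL_ASSERT(!c.is_int() && c.as<double>() == 9.0);
//   auto d = IntOrDouble_functions::ceildiv(IntOrDouble(7), IntOrDouble(2));
//   TORCH_INTERNAL_ASSERT(d.is_int() && d.as<int64_t>() == 4);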
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,602 +0,0 @@
#include <expr_evaluator.h>
#include <instrumentation.h>
#include <ir_utils.h>
#include <kernel_expr_evaluator.h>
#include <lower2device.h>
#include <evaluator_common.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
template <typename VALTYPE>
std::vector<VALTYPE*> getImmediateProducers(VALTYPE* val) {
if (val->definition()) {
auto expr = val->definition();
return expr->inputs();
} else {
return {};
}
}
//! IR-Generic utility, collects all the producers required for the
//! given list of IR values and returns them along with the original
//! list in topological order.
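//! For example (illustrative), if d = a + b and e = d * c, then calling this
//! with {e} returns a list in which a and b appear before d, and d and c
//! appear before e, so the values can be evaluated front to back.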
template <typename VALTYPE>
std::vector<VALTYPE*> makeSortedEvaluationList(std::vector<VALTYPE*> input) {
// Deduplicate
std::vector<VALTYPE*> to_sort;
std::unordered_set<VALTYPE*> visited;
for (auto val : input) {
if (!visited.count(val)) {
to_sort.push_back(val);
visited.insert(val);
}
}
std::vector<VALTYPE*> sorted;
visited.clear();
// Topological Sort
// Note: didn't explicitly exclude producers that are not in the original
// list. This should be acceptable for the intended use.
while (!to_sort.empty()) {
auto top_val = to_sort.back();
if (visited.count(top_val)) {
to_sort.pop_back();
} else {
bool ready_to_pop = true;
for (auto producer : getImmediateProducers(top_val)) {
if (!visited.count(producer)) {
ready_to_pop = false;
to_sort.push_back(producer);
}
}
if (ready_to_pop) {
visited.insert(top_val);
sorted.push_back(top_val);
to_sort.pop_back();
}
}
}
return sorted;
}
//! Kernel IR utility, collects all the symbolic values
//! used in allocation nodes.
void collectBufferSizes(
std::vector<Val*>& into,
const std::vector<Expr*>& exprs) {
for (auto expr : exprs) {
if (auto allocate = dynamic_cast<kir::Allocate*>(expr)) {
into.push_back(allocate->size());
} else if (auto for_loop = dynamic_cast<kir::ForLoop*>(expr)) {
collectBufferSizes(into, for_loop->body().exprs());
} else if (auto ite = dynamic_cast<kir::IfThenElse*>(expr)) {
collectBufferSizes(into, ite->thenBody().exprs());
collectBufferSizes(into, ite->elseBody().exprs());
}
}
}
//! Kernel IR utility, collects all the kernel symbolic
//! values we will need at runtime, i.e. after the
//! generated cuda kernel has already been compiled.
//! The values are to be used for runtime logic, like
//! `computeLaunchparams`.
std::vector<Val*> collectRuntimeUsedValues(kir::Kernel* kernel) {
std::vector<Val*> ret;
auto all_tvs = ir_utils::allTvs(kernel);
// Collect extent and inputs
for (auto tv : all_tvs) {
for (auto id : tv->domain()->domain()) {
ret.push_back(id->extent());
}
}
for (auto inp : kernel->inputs()) {
if (inp->isA<Int>() || inp->isA<Double>()) {
ret.push_back(inp);
}
}
// Collect allocation sizes:
collectBufferSizes(ret, kernel->topLevelExprs());
return makeSortedEvaluationList(ret);
}
std::vector<Val*> collectRuntimeUsedValues(Fusion* fusion) {
std::vector<Val*> ret;
auto all_tvs = ir_utils::allTvs(fusion);
// Collect extent and inputs
for (auto tv : all_tvs) {
for (auto id : tv->domain()->domain()) {
ret.push_back(id->extent());
}
}
for (auto inp : fusion->inputs()) {
if (inp->isA<Int>() || inp->isA<Double>()) {
ret.push_back(inp);
}
}
return makeSortedEvaluationList(ret);
}
} // namespace
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::initializeValueList(
typename IRContext::EVALUATOR_TYPE& const_evaluator,
const std::vector<Val*>& sorted_value_list) {
// Initialize workspace
num_of_values_ = sorted_value_list.size();
defined_ = std::vector<bool>(num_of_values_, false);
is_constant_ = std::vector<bool>(num_of_values_, false);
values_ = std::vector<IntOrDouble>(num_of_values_, -1);
// Fill in constants and assign evaluator indices
for (const auto i : c10::irange(num_of_values_)) {
// Use an expression evaluator to test if value is const
auto const_val = const_evaluator.evaluate(sorted_value_list[i]);
if (const_val.has_value()) {
is_constant_[i] = true;
values_[i] = const_val.value();
}
sorted_value_list[i]->setEvaluatorIndex(i);
}
}
template <typename IRContext>
c10::optional<IntOrDouble> PrecomputedValuesBase<IRContext>::getMaybeValueFor(
const Val* val) {
auto index = val->evaluatorIndex();
if (index < 0) {
return c10::nullopt;
}
if (!defined_[index] && !is_constant_[index]) {
return c10::nullopt;
}
return values_[index];
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::print() const {
std::cout << "Precomputed Values:\n";
for (auto i : c10::irange(symbols_.size())) {
if (defined_[i]) {
std::cout << symbols_[i]->toInlineString() << " = " << values_[i]
<< std::endl;
}
}
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::evaluate() {
FUSER_PERF_SCOPE("PrecomputedValues::Evaluate");
value_machine_->run();
validate();
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::invalidate() {
// clear binding values
binding_log_.clear();
// invalidate value entries
std::fill(defined_.begin(), defined_.end(), false);
// invalidate flag
has_valid_values_ = false;
}
template <typename IRContext>
void PrecomputedValuesBase<IRContext>::validate() {
FUSER_PERF_SCOPE("PrecomputedValuess::Validate");
for (auto it : binding_log_) {
TORCH_INTERNAL_ASSERT(
values_[it.first] == it.second,
"Precomputed values failed to validate.",
"\nSomething unexpected changed between the compilation and execution.\n",
values_[it.first],
" != ",
it.second);
}
has_valid_values_ = true;
}
template <typename IRContext>
NaiveValueMachine<IRContext>::NaiveValueMachine(
PrecomputedValuesBase<IRContext>& precomputed_values)
: precomputed_values_(precomputed_values) {
num_of_instructions_ = 0;
for (auto val : precomputed_values_.symbols_) {
auto def = val->definition();
if (def) {
if (auto uop = dynamic_cast<UnaryOp*>(def)) {
makeUnaryOp(uop);
} else if (auto bop = dynamic_cast<BinaryOp*>(def)) {
makeBinaryOp(bop);
} else {
TORCH_INTERNAL_ASSERT(false, "Unsupported expr");
}
}
}
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::run() {
for (const auto i : c10::irange(num_of_instructions_)) {
// Skip this instruction if the dest location
// has already been computed or is constant.
if (precomputed_values_.defined_[dest_[i]] ||
precomputed_values_.is_constant_[dest_[i]]) {
continue;
}
runInstruction(i);
}
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::makeUnaryOp(UnaryOp* uop) {
int in = uop->inputs()[0]->evaluatorIndex();
int out = uop->outputs()[0]->evaluatorIndex();
TORCH_INTERNAL_ASSERT(in >= 0, "Integer Machine: unknown input: ", uop);
TORCH_INTERNAL_ASSERT(out >= 0, "Integer Machine: unknown out: ", uop);
int index = makeInstructionEntry();
inst_type_[index] = InstructionType::UNARY_OP;
uop_type_[index] = IRContext::getOpType(uop);
if (uop_type_[index] == UnaryOpType::Cast) {
data_type_[index] = uop->out()->getDataType().value();
}
src0_[index] = in;
dest_[index] = out;
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::makeBinaryOp(BinaryOp* bop) {
int in0 = bop->inputs()[0]->evaluatorIndex();
int in1 = bop->inputs()[1]->evaluatorIndex();
int out = bop->outputs()[0]->evaluatorIndex();
TORCH_INTERNAL_ASSERT(in0 >= 0, "Integer Machine: unknown lhs: ", bop);
TORCH_INTERNAL_ASSERT(in1 >= 0, "Integer Machine: unknown rhs: ", bop);
TORCH_INTERNAL_ASSERT(out >= 0, "Integer Machine: unknown out: ", bop);
int index = makeInstructionEntry();
inst_type_[index] = InstructionType::BINARY_OP;
bop_type_[index] = IRContext::getOpType(bop);
src0_[index] = in0;
src1_[index] = in1;
dest_[index] = out;
}
template <typename IRContext>
int NaiveValueMachine<IRContext>::makeInstructionEntry() {
int index = num_of_instructions_++;
inst_type_.push_back(InstructionType::UNARY_OP);
uop_type_.push_back(UnaryOpType::Abs);
bop_type_.push_back(BinaryOpType::Add);
data_type_.push_back(DataType::Null);
src0_.push_back(-1);
src1_.push_back(-1);
dest_.push_back(-1);
return index;
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::runInstruction(int index) {
switch (inst_type_[index]) {
case InstructionType::UNARY_OP:
runUnaryOp(index);
break;
case InstructionType::BINARY_OP:
runBinaryOp(index);
break;
}
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::runUnaryOp(int index) {
using namespace IntOrDouble_functions;
int src_index = src0_[index];
bool src_defined = precomputed_values_.defined_[src_index];
bool src_is_const = precomputed_values_.is_constant_[src_index];
if (!src_defined && !src_is_const) {
return;
}
int dest_index = dest_[index];
auto& src = precomputed_values_.values_[src_index];
auto& dest = precomputed_values_.values_[dest_index];
switch (uop_type_[index]) {
case UnaryOpType::Neg:
dest = -src;
break;
case UnaryOpType::Set:
dest = src;
break;
case UnaryOpType::Cast:
if (data_type_[index] == DataType::Double) {
dest = src.template cast<double>();
} else if (data_type_[index] == DataType::Int) {
dest = src.template cast<int64_t>();
} else {
TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
}
break;
case UnaryOpType::Abs:
dest = abs(src);
break;
default:
TORCH_CHECK(!"Unexpected operator type ", uop_type_[index]);
}
precomputed_values_.defined_[dest_index] = true;
}
template <typename IRContext>
void NaiveValueMachine<IRContext>::runBinaryOp(int index) {
using namespace IntOrDouble_functions;
int src0_index = src0_[index];
int src1_index = src1_[index];
bool src0_is_const = precomputed_values_.is_constant_[src0_index];
bool src1_is_const = precomputed_values_.is_constant_[src1_index];
bool src_defined =
(precomputed_values_.defined_[src0_index] || src0_is_const) &&
(precomputed_values_.defined_[src1_index] || src1_is_const);
if (!src_defined) {
return;
}
int dest_index = dest_[index];
auto& lhs = precomputed_values_.values_[src0_index];
auto& rhs = precomputed_values_.values_[src1_index];
auto& dest = precomputed_values_.values_[dest_index];
switch (bop_type_[index]) {
case BinaryOpType::Add:
dest = lhs + rhs;
break;
case BinaryOpType::Sub:
dest = lhs - rhs;
break;
case BinaryOpType::Mul:
dest = lhs * rhs;
break;
case BinaryOpType::Div:
TORCH_CHECK(rhs != 0);
dest = lhs / rhs;
break;
case BinaryOpType::Mod:
TORCH_CHECK(rhs != 0);
dest = lhs % rhs;
break;
case BinaryOpType::CeilDiv:
TORCH_CHECK(rhs != 0);
dest = ceildiv(lhs, rhs);
break;
case BinaryOpType::And:
dest = Int::ScalarType(lhs && rhs);
break;
case BinaryOpType::Max:
dest = lhs > rhs ? lhs : rhs;
break;
case BinaryOpType::Min:
dest = lhs < rhs ? lhs : rhs;
break;
default:
TORCH_CHECK(!"Unexpected operator type");
}
precomputed_values_.defined_[dest_index] = true;
}
KernelPrecomputedValues::KernelPrecomputedValues(kir::Kernel* kernel) {
loadSymbols(collectRuntimeUsedValues(kernel));
kir::ExpressionEvaluator evaluator;
initializeValueList(evaluator, symbols());
initializeNamedScalars();
initializeIntegerMachine();
}
// TODO: put this to base class
void KernelPrecomputedValues::bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract) {
const auto root_domain =
TensorDomain::noReductions(tv->domain()->getMaybeRFactorDomain());
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract->getRank() == static_cast<int>(root_domain.size()),
"Something went wrong configuring launch. Inputs do not match.");
for (const auto dim : c10::irange(root_domain.size())) {
auto extent = root_domain[dim]->extent();
auto value = tensor_arg_abstract->getSize(dim);
bindValue(extent->evaluatorIndex(), value);
}
}
namespace {
//! Compares the name of the given scalar with thread size strings
//! and returns the corresponding parallel type if a match
//! is found.
c10::optional<ParallelType> getMaybeThreadSizeParallelType(
NamedScalar* named_scalar) {
auto& var_name = named_scalar->name();
for (auto ptype : kParallelTypeThreads) {
if (var_name == stringifyThreadSize(ptype)) {
return ptype;
}
}
return c10::nullopt;
}
} // namespace
void KernelPrecomputedValues::initializeNamedScalars() {
for (auto val : symbols()) {
if (auto named_scalar = dynamic_cast<NamedScalar*>(val)) {
auto maybe_parallel_type = getMaybeThreadSizeParallelType(named_scalar);
if (maybe_parallel_type.has_value()) {
auto& index_list =
thread_dim_value_indices_[maybe_parallel_type.value()];
if (!index_list) {
index_list = std::make_unique<std::vector<int>>();
}
index_list->push_back(val->evaluatorIndex());
}
}
}
}
// TODO: merge this one with above.
void KernelPrecomputedValues::bindKernelInputs(
kir::Kernel* kernel,
const KernelArgumentHolder& args) {
if (hasValidValues()) {
invalidate();
}
const auto& inputs = kernel->inputs();
TORCH_INTERNAL_ASSERT(
args.size() == inputs.size(), "kernel inputs size does not match args");
for (const auto i : c10::irange(inputs.size())) {
auto arg = args[i];
const auto input = inputs[i];
if (auto tensor_input = dynamic_cast<TensorView*>(input)) {
if (const auto& tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(arg)) {
bindTensorMetaData(tensor_input, tensor_arg_abstract);
} else {
// TODO: cpu scalar of int type should be bound as scalar int as well
TORCH_CHECK(
arg->isType(ArgType::CpuScalarTensor),
"binding input to TensorView expects input arg to be of tensor type");
}
} else if (input->isScalar()) {
if (input->dtype() == DataType::Int) {
TORCH_CHECK(
arg->isType(ArgType::Long),
"binding input to integer type expects input arg to be a scalar of Long type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const int64_t*>(arg->arg()));
} else if (input->dtype() == DataType::Double) {
TORCH_CHECK(
arg->isType(ArgType::Double),
"binding input to double type expects input arg to be a scalar of Double type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const double*>(arg->arg()));
}
}
}
}
void KernelPrecomputedValues::bindParallelExtents(
const ParallelExtentMap& parallel_extents,
const LaunchParams& launch_constraint) {
// Bind values of extents of parallelized
// iterdomains from launch_constraint when applicable.
// Consistency will be checked at validate().
for (const auto& it : parallel_extents) {
auto raw_val = launch_constraint.getRawVal(it.first);
if (raw_val > 0) {
for (auto extent : it.second) {
bindValue(extent->evaluatorIndex(), raw_val);
}
}
}
}
void KernelPrecomputedValues::bindConcreteParallelTypeValue(
ParallelType pt,
int64_t value) {
auto index_list_it = thread_dim_value_indices_.find(pt);
if (index_list_it != thread_dim_value_indices_.end()) {
for (auto index : *(index_list_it->second)) {
bindValue(index, value);
}
}
}
FusionPrecomputedValues::FusionPrecomputedValues(Fusion* fusion)
: fusion_(fusion) {
loadSymbols(collectRuntimeUsedValues(fusion));
ExpressionEvaluator evaluator(fusion);
initializeValueList(evaluator, symbols());
initializeIntegerMachine();
}
// TODO: put this to base class
void FusionPrecomputedValues::bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract) {
const auto root_domain =
TensorDomain::noReductions(tv->getMaybeRFactorDomain());
TORCH_INTERNAL_ASSERT(
tensor_arg_abstract->getRank() == static_cast<int>(root_domain.size()),
"Something went wrong configuring launch. Inputs do not match.");
for (const auto dim : c10::irange(root_domain.size())) {
auto extent = root_domain[dim]->extent();
auto value = tensor_arg_abstract->getSize(dim);
precomputedValuesBaseType::bindValue(extent->evaluatorIndex(), value);
}
}
void FusionPrecomputedValues::bindFusionInputs(
const KernelArgumentHolder& args) {
if (hasValidValues()) {
precomputedValuesBaseType::invalidate();
}
const auto& inputs = fusion_->inputs();
TORCH_INTERNAL_ASSERT(
args.size() == inputs.size(), "kernel inputs size does not match args");
for (const auto i : c10::irange(inputs.size())) {
const auto input = inputs[i];
const ArgAbstract* arg = args[i];
if (auto tensor_input = dynamic_cast<TensorView*>(input)) {
if (const auto& tensor_arg_abstract =
dynamic_cast<const TensorArgAbstract*>(arg)) {
bindTensorMetaData(tensor_input, tensor_arg_abstract);
} else {
TORCH_CHECK(
arg->isType(ArgType::CpuScalarTensor),
"binding input to TensorView expects input arg to be of tensor type");
}
} else if (input->isScalar()) {
if (input->getDataType() == DataType::Int) {
TORCH_CHECK(
arg->isType(ArgType::Long),
"binding input to integer type expects input arg to be a scalar of Long type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const int64_t*>(arg->arg()));
} else if (input->getDataType() == DataType::Double) {
TORCH_CHECK(
arg->isType(ArgType::Double),
"binding input to double type expects input arg to be a scalar of Double type");
precomputedValuesBaseType::bindValue(
input->evaluatorIndex(), *static_cast<const double*>(arg->arg()));
}
}
}
}
template class PrecomputedValuesBase<FusionIRContext>;
template class PrecomputedValuesBase<KernelIRContext>;
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,343 +0,0 @@
#pragma once
#include <dynamic_type.h>
#include <executor_kernel_arg.h>
#include <executor_launch_params.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <lower2device.h>
#include <c10/core/DeviceType.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! This is the common space for expression evaluators in the
//! fusion IR and kernel IR contexts. Many of the evaluator
//! optimizations and runtimes can share the same code
//! path, and they are collected here.
class ExpressionEvaluator;
namespace kir {
class ExpressionEvaluator;
} // namespace kir
//! IR Contexts to be passed to generic evaluator optimizations
//! and runtimes. Defines the essential interface for the
//! generic logic to get necessary type and function info
//! from the IR nodes. Generic optimizations will assume
//! the same list of static definitions is provided
//! in each of the contexts, currently just FusionIR
//! and KernelIR.
//! Context for using generic logic on FusionIR
class FusionIRContext {
public:
using TV_TYPE = TensorView;
using EVALUATOR_TYPE = ExpressionEvaluator;
static BinaryOpType getOpType(BinaryOp* bop) {
return bop->getBinaryOpType();
}
static UnaryOpType getOpType(UnaryOp* uop) {
return uop->getUnaryOpType();
}
};
//! Context for using generic logic on KernelIR
class KernelIRContext {
public:
using EVALUATOR_TYPE = kir::ExpressionEvaluator;
static BinaryOpType getOpType(BinaryOp* bop) {
return bop->getBinaryOpType();
}
static UnaryOpType getOpType(UnaryOp* uop) {
return uop->getUnaryOpType();
}
};
template <typename IRContext>
class PrecomputedValuesBase;
//! NaiveValueMachine:
//! This is an un-optimized runtime for evaluating a
//! set of values in one run. The runtime contains
//! a vector of instructions inferred from IR at compile-time
//! and it currently must be associated with an instance of
//! PrecomputedValuesBase that will provide the workspace
//! containing the concrete values for the symbols being evaluated.
template <typename IRContext>
class NaiveValueMachine {
//! The generic types of instructions supported for this
//! machine, currently only binary and unary.
enum class InstructionType { UNARY_OP, BINARY_OP };
public:
//! Constructor lowers all the expr IR nodes stored in precomputed_values
//! and stores them in the private state.
NaiveValueMachine(PrecomputedValuesBase<IRContext>& precomputed_values);
//! Runs all the instructions and write results to the associated
//! precomputed_values.
void run();
private:
//! Convert a unary IR expr to an instruction
void makeUnaryOp(UnaryOp* uop);
//! Convert a binary IR expr to an instruction
void makeBinaryOp(BinaryOp* bop);
//! Create an empty instruction with all default values
//! and place it at the end of the instruction buffer.
int makeInstructionEntry();
//! Run a single instruction at the given index of
//! the instruction buffer. Decodes and dispatches
//! to the corresponding instruction handle functions.
void runInstruction(int index);
//! Runs a unary operation at given index of instruction buffer
void runUnaryOp(int index);
//! Runs a binary operation at given index of instruction buffer
void runBinaryOp(int index);
private:
friend PrecomputedValuesBase<IRContext>;
//! Reference to the PrecomputedValues workspace associated with
//! this runtime. All the instructions will read and write the
//! values in this workspace.
PrecomputedValuesBase<IRContext>& precomputed_values_;
//! Instruction buffer. All states are in separate vectors and
//! the entry of each vector at the same index corresponds to
//! the same instruction.
//! Total number of instructions
int num_of_instructions_ = 0;
//! Machine instruction type for each instruction i.e.
//! unary or binary
std::vector<InstructionType> inst_type_;
//! Unary operator type if applicable, contains a default
//! value at each index corresponding to a binary op.
std::vector<UnaryOpType> uop_type_;
//! Data type for unary op of type UnaryOpType::Cast, contains a default
//! value at each index corresponding other ops.
std::vector<DataType> data_type_;
//! Binary operator type if applicable, contains a default
//! value at each index corresponding to a unary op.
std::vector<BinaryOpType> bop_type_;
//! Indexes of operands and destination of each instruction.
//! The indexes correspond to positions in the workspace
//! where concrete values are hosted.
//! Operand 0 of each instruction.
std::vector<int> src0_;
//! Operand 1 of each instruction, a default value at
//! each index corresponding to a unary op.
std::vector<int> src1_;
//! Destination of each instruction.
std::vector<int> dest_;
};
//! PrecomputedValuesBase:
//! A class to support optimized evaluation of values
//! at runtime.
//! At compile time all necessary values are collected
//! from the given IR nodes, and a runtime and a workspace containing
//! the concrete values are created and pre-allocated.
//! At runtime the value machine is used to evaluate all the
//! values and store them in the workspace ahead of time.
template <typename IRContext>
class PrecomputedValuesBase {
using VALUE_MACHINE = NaiveValueMachine<IRContext>;
public:
explicit PrecomputedValuesBase() = default;
//! Returns if the workspace contains evaluated results.
bool ready() {
return has_valid_values_;
}
//! Runs the internal value machine that will compute
//! the values allocated in the workspace.
void evaluate();
//! Returns value for the given IR node if it's stored
//! in the workspace and has been evaluated.
c10::optional<IntOrDouble> getMaybeValueFor(const Val* val);
//! Debugging helper, prints all the currently known values
void print() const;
protected:
//! Initialize the workspace before first use.
//! Assume the given value list IR nodes have
//! been topologically sorted.
void initializeValueList(
typename IRContext::EVALUATOR_TYPE& evaluator,
const std::vector<Val*>& sorted_value_list);
//! Bind concrete value to the given index
//! if the index is valid.
void bindValue(int index, IntOrDouble value) {
if (index < 0 || is_constant_[index]) {
return;
}
defined_[index] = true;
values_[index] = value;
binding_log_.emplace_back(index, value);
}
//! Invalidate all computed values in the workspace.
void invalidate();
//! Interface for subclasses to access symbols_
void loadSymbols(std::vector<Val*> symbols) {
symbols_ = std::move(symbols);
}
//! Interface for subclasses to access symbols_
std::vector<Val*>& symbols() {
return symbols_;
}
//! Initialize the value runtime that will
//! infer instructions from the workspace.
void initializeIntegerMachine() {
value_machine_ = std::make_unique<VALUE_MACHINE>(*this);
}
bool hasValidValues() {
return has_valid_values_;
}
private:
//! Post evaluation check, throws if any computed value
//! is inconsistent with its bound value
void validate();
//! Returns true if workspace has a computed or constant
//! value for given index.
bool hasValue(int index) {
TORCH_INTERNAL_ASSERT(index > 0);
return defined_[index] || is_constant_[index];
}
private:
friend VALUE_MACHINE;
//! Marks if an evaluation has finished
bool has_valid_values_ = false;
//! The size of workspace
int num_of_values_ = -1;
//! Marks if a value has been bound or
//! computed at each index.
std::vector<bool> defined_;
//! Marks if a value is compile-time constant
//! at each index.
std::vector<bool> is_constant_;
//! Stores the concrete values at each index.
std::vector<IntOrDouble> values_;
//! Stores the IR nodes corresponding to each index.
std::vector<Val*> symbols_;
//! An internal log to keep track of all the bindings
//! used in each evaluation cycle. To be used for
//! consistency check.
std::vector<std::pair<int, IntOrDouble>> binding_log_;
//! Integer runtime for realizing the values computations.
std::unique_ptr<VALUE_MACHINE> value_machine_;
};
//! PrecomputedValues workspace in Fusion IR context,
//! defines the set of values to be collected in each
//! fusion graph and the input value binding given each
//! fusion runtime input.
class FusionPrecomputedValues : public PrecomputedValuesBase<FusionIRContext> {
using precomputedValuesBaseType = PrecomputedValuesBase<FusionIRContext>;
public:
FusionPrecomputedValues(Fusion* fusion);
//! Bind concrete values from fusion runtime inputs
void bindFusionInputs(const KernelArgumentHolder& args);
private:
void bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract);
private:
Fusion* fusion_ = nullptr;
};
//! PrecomputedValues workspace in Fusion IR context,
//! defines the set of values to be collected in each
//! kernel IR sequence and the input value binding given each
//! fusion runtime input and launch constraints.
class KernelPrecomputedValues : public PrecomputedValuesBase<KernelIRContext> {
using precomputedValuesBaseType = PrecomputedValuesBase<KernelIRContext>;
public:
using ParallelExtentMap =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
KernelPrecomputedValues(kir::Kernel* kernel);
//! Bind concrete values from fusion runtime inputs
void bindKernelInputs(kir::Kernel* kernel, const KernelArgumentHolder& args);
//! Bind concrete values from launch constraints
void bindParallelExtents(
const ParallelExtentMap& parallel_extents,
const LaunchParams& launch_constraint);
//! Bind the NamedScalars corresponding to the
//! concrete parallel dimension sizes after the
//! actual value has been resolved.
void bindConcreteParallelTypeValue(ParallelType pt, int64_t value);
private:
void bindTensorMetaData(
TensorView* tv,
const TensorArgAbstract* tensor_arg_abstract);
//! Iterate through all the named scalars corresponding
//! to thread sizes and pre-group them by their parallel
//! types.
void initializeNamedScalars();
private:
//! Contains all the named scalars correspond
//! to thread size of each parallel type.
std::unordered_map<ParallelType, std::unique_ptr<std::vector<int>>, TypeHash>
thread_dim_value_indices_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
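// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming only the declarations above, the evaluate-ahead-of-time flow for a
// fusion would have looked roughly like this; `fusion`, `args` and `extent`
// are hypothetical caller-provided objects.
namespace {
void precomputedValuesExample(
    torch::jit::fuser::cuda::Fusion* fusion,
    const torch::jit::fuser::cuda::KernelArgumentHolder& args,
    const torch::jit::fuser::cuda::Val* extent) {
  using namespace torch::jit::fuser::cuda;
  FusionPrecomputedValues precomputed(fusion);
  precomputed.bindFusionInputs(args); // bind concrete runtime input values
  precomputed.evaluate();             // run the internal value machine once
  if (precomputed.ready()) {
    // Every collected value can now be read back without re-walking the IR.
    auto maybe_value = precomputed.getMaybeValueFor(extent);
    (void)maybe_value;
  }
}
} // namespace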

File diff suppressed because it is too large

View File

@ -1,330 +0,0 @@
#pragma once
#include <executor_launch_params.h>
#include <executor_utils.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_cloner.h>
#include <ir_printer.h>
#include <kernel_expr_evaluator.h>
#include <lower2device.h>
#include <utils.h>
#include <c10/core/DeviceType.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
TORCH_CUDA_CU_API bool shouldFillAllocationWithNan();
TORCH_CUDA_CU_API void setFillAllocationWithNan(bool value);
// TODO: Should this actually be in launch params?
struct TORCH_CUDA_CU_API CompileOptions {
c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
KernelIndexMode index_mode = KernelIndexMode::INT64;
};
class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
public:
// Unsafe compilation that's useful for debugging kernels, iterating over
// slight modifications of a generated kernel
void debugCompileFusionFromStr(
Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options = CompileOptions());
//! Infers output sizes by returning a non-allocated KernelArgumentHolder.
//! This function is useful for async compilation of segmented fusions.
KernelArgumentHolder inferOutputSizes(
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints);
void compileFusion(
Fusion* fusion,
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints = LaunchParams());
// TODO: merge it with the overload above.
//! This API is merely here so we don't have to go back and update all cpp
//! tests.
void compileFusion(
Fusion* fusion,
const at::ArrayRef<IValue>& inputs = {},
const LaunchParams& launch_constraints = LaunchParams()) {
KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs);
compileFusion(fusion, args, launch_constraints);
}
std::vector<at::Tensor> runFusion(
KernelArgumentHolder& args,
const LaunchParams& launch_constraints = LaunchParams(),
const std::vector<at::Tensor>& outputs = {});
std::vector<at::Tensor> runFusion(
const at::ArrayRef<IValue>& inputs,
const std::vector<at::Tensor>& outputs,
const LaunchParams& launch_constraints = LaunchParams(),
const c10::optional<size_t>& opt_code = c10::nullopt) {
KernelArgumentHolder args =
KernelArgumentHolder::createKernelArgumentHolder(inputs);
if (opt_code.has_value()) {
args.setCacheId(*opt_code);
}
return runFusion(args, launch_constraints, outputs);
}
std::vector<at::Tensor> runFusion(
const at::ArrayRef<IValue>& inputs,
const LaunchParams& launch_constraints = LaunchParams(),
const c10::optional<size_t>& opt_code = c10::nullopt) {
return runFusion(inputs, {}, launch_constraints, opt_code);
}
// function to query whether a `FusionExecutor` has a compiled kernel to
// execute
bool compiled() const {
return fusion_id_ != -1 && lowered_;
};
void evictCache(size_t cache_id) {
executor_entry_lookup_.erase(cache_id);
}
// struct used to hold necessary information to launch compiled kernel on a
// given input set.
//
// TODO: strides would also be important when we handle permutations in
// codegen.
//
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct ExecutorEntry {
bool init = false;
LaunchParams launch_params;
std::vector<std::pair<int, int>> io_alias_indices;
std::vector<std::vector<int64_t>> output_sizes;
std::vector<std::vector<int64_t>> output_strides;
std::vector<at::ScalarType> output_types;
std::vector<std::vector<int64_t>> buffer_sizes;
std::vector<at::ScalarType> buffer_types;
std::vector<bool> buffer_zero_init;
uint64_t rand_offset;
};
using ExecutorCompileTimeInfoCache =
executor_utils::caching::ExecutorCompileTimeInfoCache;
kir::Kernel* kernel() const {
TORCH_INTERNAL_ASSERT(lowered_);
return lowered_->kernel();
}
//! Internal knob used for debugging/profiling only
void setExecuteKernelFlag(bool execute_kernel) {
execute_kernel_ = execute_kernel;
}
//! Internal knob used for debugging/profiling only
void setMeasureKernelTimeFlag(bool measure_kernel_time) {
measure_kernel_time_ = measure_kernel_time;
}
//! Returns the last kernel execution time, in milliseconds
//!
//! \note The kernel time is only tracked if enabled by calling
//! setMeasureKernelTimeFlag(true)
//!
float kernelTimeMs() const {
return measure_kernel_time_ ? kernel_time_ms_ : 0;
}
//! Returns the number of bytes processed in the last kernel execution
int64_t bytesProcessed() const {
return bytes_processed_;
}
//! Returns the launch parameters from the last kernel execution
LaunchParams lastLaunchParams() const {
return launch_params_;
}
//! Returns the string of the compiled kernel
std::string kernelString() const {
return kernel_code_;
}
//! Returns the latest compile log
std::string compilerLog() const {
return last_compiler_log_;
}
std::string kernelName() const {
std::stringstream ss;
ss << "kernel" << fusion_id_;
return ss.str();
}
//! Internal tests only. Compiles CUDA code with NVRTC directly from
//! string. This util provides a path to test runtime code, i.e. the resource
//! strings.
void compileRtc(
const std::string& code,
const std::string& name,
bool structured = false,
CompileOptions options = CompileOptions());
//! Internal tests only. Runs the compiled CUDA kernel from compileRtc.
void runRtc(
const LaunchParams& launch_params,
const std::vector<at::Tensor>& args);
//! Internal knob used for debugging/profiling only
void disableLaunchParamCache() {
disable_parameter_cache_ = true;
}
private:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct GlobalBuffers {
std::vector<at::Tensor> buffers;
std::vector<bool> zero_init;
at::Tensor profile_buffer;
};
static std::string kernelNamespace() {
return "CudaCodeGen";
}
// Add preamble and wrap in namespace
std::string getStructuredCode(const std::string& kernel);
LaunchParams computeLaunchParams(
const LaunchParams& launch_constraints,
kir::ExpressionEvaluator& expr_eval,
const int warp_size);
uint64_t computeSharedMemory(
kir::ExpressionEvaluator& expr_eval,
const std::vector<const kir::Allocate*>& buffers,
bool align_padding = false,
uint64_t total = 0);
// Returns a pair of vectors of tensors, where tensors in the first vector are
// not initialized, while the second vector contains zero-initialized tensors
GlobalBuffers allocGlobalVals(kir::ExpressionEvaluator& expr_eval);
// alias_indices: indices of outputs that are aliases of inputs; we should
// skip allocating real storage for those, but still keep their spots to
// preserve the indexing from output aliases to inputs
std::vector<at::Tensor> allocOutputs(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices = {});
void setUsedTVs();
const std::vector<TensorView*>& getUsedTVs() const {
return used_tvs_;
};
ExecutorCompileTimeInfoCache* compileTimeDataCache() {
return &compile_time_info_cache_;
}
//! Returns a KernelArgumentHolder representing the output sizes from kernel
//! execution. Note: 1. this API ignores aliased outputs and instead
//! pushes scalar int 0 as a placeholder; 2. this API doesn't actually
//! allocate outputs in memory, but rather is used just to infer output sizes.
KernelArgumentHolder evaluateOutputSizes(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices = {});
private:
CompileOptions options_;
//! Current configured total shared mem size from cudaDeviceProp
size_t configured_device_smem_ = std::numeric_limits<size_t>().max();
//! Available shared memory space for dynamic allocation for the current
//! compiled kernel at the current shared memory/L1 configuration
c10::optional<size_t> maybe_available_dynamic_smem_ = c10::nullopt;
//! Absolute limit of all available shared mem space from cudaDeviceProp
size_t device_smem_limit_ = std::numeric_limits<size_t>().max();
// Assuming sm70 or above:
// limit of statically allocated smem is 48 KB:
// See:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-8-x
const uint64_t max_static_smem_ = 48 << 10;
int warp_size_ = 0;
executor_utils::NvrtcFunction compiled_kernel_;
// TensorViews actually used in the kernel.
std::vector<TensorView*> used_tvs_;
// Counter to be used for kernel name.
int fusion_id_ = -1;
static int fusion_id_counter_;
std::unique_ptr<GpuLower> lowered_;
// Copy of lowered_->kernel()
Fusion* fusion_ = nullptr;
// Track the block size this kernel was compiled with. If the block size
// increases, recompile to adjust the max register count.
int64_t block_size_high_water_mark = 1;
// Lookup table used as a shortcut to retrieve recorded information in order to
// launch kernels without re-inferring parameters.
std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;
// Compile time information caching. This is used for shape inference
// support. The cache stores graph information that is available
// without shape information so that each shape inference call will
// not need to re-compute it.
ExecutorCompileTimeInfoCache compile_time_info_cache_;
// Cached expr eval
std::unique_ptr<KernelPrecomputedValues> evaluator_precomputed_values_ =
nullptr;
// Profiling support: knob to control whether we actually execute the
// kernel on the GPU or not
bool execute_kernel_ = true;
// Profiling support: knob to enable measuring kernel execution time
bool measure_kernel_time_ = false;
// Profiling support: the last kernel execution time, if measure_kernel_time_
// is true
float kernel_time_ms_ = 0;
// Profiling support: the last kernel Bytes processed
int64_t bytes_processed_ = 0;
// Profiling support: the last launch param used
LaunchParams launch_params_;
// Profiling support: disable caching of launch params and output allocation.
// Output allocation is also disabled when output sizes are dependent on
// runtime scalar inputs, such as in the case of tensor factories. See
// https://github.com/csarofeen/pytorch/issues/2002
bool disable_parameter_cache_ = false;
// Profiling support: kept copy of the cuda kernel
std::string kernel_code_;
// Profiling support: nvrtc log for debugging
std::string last_compiler_log_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
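// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, compiling and running a fusion through the
// executor would have looked roughly like this; `fusion` and `inputs` are
// hypothetical caller-provided objects.
namespace {
std::vector<at::Tensor> compileAndRunExample(
    torch::jit::fuser::cuda::Fusion* fusion,
    const at::ArrayRef<c10::IValue>& inputs) {
  using namespace torch::jit::fuser::cuda;
  FusionExecutor fe;
  // Lowers the fusion, generates CUDA code and compiles it with NVRTC.
  fe.compileFusion(fusion, inputs);
  TORCH_INTERNAL_ASSERT(fe.compiled());
  // Launch parameters are inferred from the inputs unless constrained here.
  return fe.runFusion(inputs);
}
} // namespace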

View File

@ -1,320 +0,0 @@
#include <c10/util/irange.h>
// Extract size and strides
#include <kernel_cache.h>
#include <executor_kernel_arg.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
template <typename T, typename nvfuser_index_t>
std::unique_ptr<TensorArgAbstract> getTensorArg(int nDims) {
switch (nDims) {
case (0):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 0, nvfuser_index_t>,
nvfuser_index_t>>();
case (1):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 1, nvfuser_index_t>,
nvfuser_index_t>>();
case (2):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 2, nvfuser_index_t>,
nvfuser_index_t>>();
case (3):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 3, nvfuser_index_t>,
nvfuser_index_t>>();
case (4):
return std::make_unique<TensorArg<
TensorArgCodegen<T, 4, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (5):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 5, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (6):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 6, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (7):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 7, nvfuser_index_t>,
nvfuser_index_t>>();
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
case (8):
// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
return std::make_unique<TensorArg<
TensorArgCodegen<T, 8, nvfuser_index_t>,
nvfuser_index_t>>();
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to generate a tensor to run a generated kernel with ",
nDims,
" dimensions, however only 0 to 8 dimensional tensor are supported.");
}
return nullptr;
}
template <typename INDEX_MODE>
std::unique_ptr<TensorArgAbstract> getTensorArg(
c10::ScalarType dtype,
int nDims) {
switch (dtype) {
case c10::ScalarType::Double:
return getTensorArg<double, INDEX_MODE>(nDims);
case c10::ScalarType::Float:
return getTensorArg<float, INDEX_MODE>(nDims);
case c10::ScalarType::Half:
return getTensorArg<at::Half, INDEX_MODE>(nDims);
case c10::ScalarType::BFloat16:
return getTensorArg<at::BFloat16, INDEX_MODE>(nDims);
case c10::ScalarType::Bool:
return getTensorArg<bool, INDEX_MODE>(nDims);
case c10::ScalarType::Long:
return getTensorArg<int64_t, INDEX_MODE>(nDims);
case c10::ScalarType::Int:
return getTensorArg<int32_t, INDEX_MODE>(nDims);
case c10::ScalarType::ComplexFloat:
return getTensorArg<c10::complex<float>, INDEX_MODE>(nDims);
case c10::ScalarType::ComplexDouble:
return getTensorArg<c10::complex<double>, INDEX_MODE>(nDims);
default:
TORCH_CHECK(
false,
"Dtype: ",
dtype,
" not currently supported in code generated kernels.");
}
}
std::unique_ptr<TensorArgAbstract> getTensorArg(
c10::ScalarType dtype,
int nDims,
KernelIndexMode index_mode) {
switch (index_mode) {
case KernelIndexMode::INT32:
return getTensorArg<int>(dtype, nDims);
case KernelIndexMode::INT64:
return getTensorArg<int64_t>(dtype, nDims);
default:
break;
}
TORCH_INTERNAL_ASSERT(false, "unknown index mode");
return nullptr;
}
} // namespace
KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(
const c10::ArrayRef<c10::IValue>& inputs) {
if (inputs.empty()) {
// default to int32 on device 0
KernelArgumentHolder args(KernelIndexMode::INT32);
args.setDeviceIndex(0);
return args;
}
auto device_index = getCommonDeviceCUDA(inputs);
auto index_mode = collectIndexMode(inputs);
KernelArgumentHolder args(index_mode);
args.setDeviceIndex(device_index);
args.push(inputs);
return args;
}
// Push a tensor to the arguments
void KernelArgumentHolder::push(const at::Tensor& tensor) {
changed_ = true;
if (is_cpu_scalar(tensor)) {
switch (tensor.scalar_type()) {
case c10::ScalarType::ComplexDouble:
arguments_.push_back(std::make_unique<CpuScalarTensorArg<
CpuScalarTensorCodegen<c10::complex<double>>>>(
tensor.data_ptr<c10::complex<double>>()[0]));
break;
case c10::ScalarType::ComplexFloat:
arguments_.push_back(std::make_unique<CpuScalarTensorArg<
CpuScalarTensorCodegen<c10::complex<float>>>>(
tensor.data_ptr<c10::complex<float>>()[0]));
break;
case c10::ScalarType::Double:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<double>>>(
tensor.data_ptr<double>()[0]));
break;
case c10::ScalarType::Float:
arguments_.push_back(
std::make_unique<CpuScalarTensorArg<CpuScalarTensorCodegen<float>>>(
tensor.data_ptr<float>()[0]));
break;
case c10::ScalarType::Half:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<at::Half>>>(
tensor.data_ptr<at::Half>()[0]));
break;
case c10::ScalarType::BFloat16:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<at::BFloat16>>>(
tensor.data_ptr<at::BFloat16>()[0]));
break;
case c10::ScalarType::Bool:
arguments_.push_back(
std::make_unique<CpuScalarTensorArg<CpuScalarTensorCodegen<bool>>>(
tensor.data_ptr<bool>()[0]));
break;
case c10::ScalarType::Long:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<int64_t>>>(
tensor.data_ptr<int64_t>()[0]));
break;
case c10::ScalarType::Int:
arguments_.push_back(
std::make_unique<
CpuScalarTensorArg<CpuScalarTensorCodegen<int32_t>>>(
tensor.data_ptr<int32_t>()[0]));
break;
default:
TORCH_CHECK(
false,
"Dtype: ",
tensor.scalar_type(),
" not currently supported in code generated kernels.");
}
} else {
int nDims = tensor.ndimension();
c10::ScalarType dtype = tensor.scalar_type();
std::unique_ptr<TensorArgAbstract> tensor_arg =
getTensorArg(dtype, nDims, index_mode_);
tensor_arg->setTensor(tensor);
tensor_arg->setPointer(tensor.data_ptr());
tensor_arg->setDataType(aten_to_data_type(dtype));
for (const auto i : c10::irange(nDims)) {
tensor_arg->setSize(i, tensor.sizes()[i]);
tensor_arg->setStride(i, tensor.strides()[i]);
}
arguments_.push_back(std::move(tensor_arg));
}
}
// Push a scalar or integer to the arguments
void KernelArgumentHolder::push(const IValue& val) {
changed_ = true;
TORCH_INTERNAL_ASSERT(
val.isScalar(),
"Tried to push an arg to run in a fused kernel, expected a scalar but got, ",
val);
auto scalar_val = val.toScalar();
switch (scalar_val.type()) {
// NOLINTNEXTLINE(bugprone-branch-clone)
case c10::ScalarType::ComplexDouble:
arguments_.push_back(
std::make_unique<ComplexDoubleArg>(scalar_val.toComplexDouble()));
return;
case c10::ScalarType::Double:
arguments_.push_back(std::make_unique<DoubleArg>(scalar_val.toDouble()));
return;
case c10::ScalarType::Long:
arguments_.push_back(std::make_unique<LongArg>(scalar_val.toLong()));
return;
case c10::ScalarType::Bool:
arguments_.push_back(std::make_unique<BoolArg>(scalar_val.toBool()));
return;
default:
TORCH_INTERNAL_ASSERT(
false,
" Tried to create argument to send to a fused kernel, but got an unexpected type.");
}
TORCH_INTERNAL_ASSERT(
false,
" Tried to create argument to send to a fused kernel, but got a non-scalar type.");
}
void KernelArgumentHolder::push(int64_t val) {
arguments_.push_back(std::make_unique<LongArg>(val));
}
void KernelArgumentHolder::push(const at::PhiloxCudaState& val) {
arguments_.push_back(std::make_unique<PhiloxCudaStateArg>(val));
}
// Create buffer, flatten arguments into it, align by 8 Bytes, return pointers
// in the buffer
void** KernelArgumentHolder::getBuffer() {
if (changed_) {
void_ptrs_ = std::vector<void*>(arguments_.size(), nullptr);
for (const auto i : c10::irange(arguments_.size())) {
void_ptrs_[i] = static_cast<void*>(arguments_[i]->arg());
}
changed_ = false;
}
return void_ptrs_.data();
}
void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
// Naive I/O setup; I'm ignoring all the potential transformations (i.e. the I/O
// allocated here from the subgraph could be, and very likely is, different
// from the I/O expected by the generated CUDA kernel).
for (const auto& arg : args) {
if (arg.isTensor()) {
push(arg.toTensor());
} else {
push(arg);
}
}
}
void KernelArgumentHolder::push(const std::vector<at::Tensor>& tensors) {
for (const auto& tensor : tensors) {
push(tensor);
}
}
void KernelArgumentHolder::push(const ArgAbstract* arg) {
changed_ = true;
arguments_.emplace_back(arg->copy_unique_ptr());
}
void KernelArgumentHolder::swap(int i, const ArgAbstract* arg) {
changed_ = true;
auto holder = arg->copy_unique_ptr();
arguments_[i].swap(holder);
}
void KernelArgumentHolder::appendPhiloxRNGSeed(uint64_t rand_offset) {
at::PhiloxCudaState philox_engine_inputs;
auto gen = at::cuda::detail::getDefaultCUDAGenerator();
{
// See Note [Acquire lock when using random generators]
std::lock_guard<std::mutex> lock(gen.mutex());
philox_engine_inputs =
at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_cuda_state(
rand_offset);
}
push(philox_engine_inputs);
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
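// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations in <executor_kernel_arg.h>, packing launch
// arguments would have looked roughly like this; the tensors and the scalar
// value are hypothetical.
namespace {
void packArgumentsExample(const at::Tensor& in, const at::Tensor& out) {
  using namespace torch::jit::fuser::cuda;
  KernelArgumentHolder args(KernelIndexMode::INT64);
  args.push(in);                       // records sizes/strides/dtype/pointer only
  args.push(out);
  args.push(static_cast<int64_t>(42)); // scalar arguments are copied by value
  // getBuffer() flattens all arguments into an array of void* suitable for a
  // cuLaunchKernel-style call.
  void** kernel_args = args.getBuffer();
  (void)kernel_args;
}
} // namespace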

View File

@ -1,397 +0,0 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <c10/util/Exception.h>
#include <type.h>
#include <torch/csrc/jit/ir/ir.h>
#include <array>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// This should match the tensor used in the code generation (almost exactly)
template <typename T, int N, typename nvfuser_index_t>
struct TensorArgCodegen {
T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
std::array<nvfuser_index_t, N> size;
std::array<nvfuser_index_t, N> stride;
constexpr int nDims() const {
return N;
}
void setSize(int i, nvfuser_index_t s) {
size[i] = s;
}
void setStride(int i, nvfuser_index_t s) {
stride[i] = s;
}
nvfuser_index_t getSize(int i) const {
return size[i];
}
nvfuser_index_t getStride(int i) const {
return stride[i];
}
};
// 0-Dim GPU based tensor
template <typename T, typename nvfuser_index_t>
struct TensorArgCodegen<T, 0, nvfuser_index_t> {
T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
constexpr int nDims() const {
return 0;
}
void setSize(int, nvfuser_index_t) {
TORCH_INTERNAL_ASSERT(false, "Tried to set size of a 0-dim tensor");
}
void setStride(int, nvfuser_index_t) {
TORCH_INTERNAL_ASSERT(false, "Tried to set stride of a 0-dim tensor");
}
nvfuser_index_t getSize(int i) const {
TORCH_INTERNAL_ASSERT(false, "Tried to get size of a 0-dim tensor");
}
nvfuser_index_t getStride(int i) const {
TORCH_INTERNAL_ASSERT(false, "Tried to get stride of a 0-dim tensor");
}
};
// Specialization for the 0-dim case that makes it easy to pass in a CPU-based
// tensor without memcpy
template <typename T>
struct CpuScalarTensorCodegen {
T& operator[](int) {
return data;
};
T data;
};
// TODO: macro this and the printer below
enum class ArgType {
PhiloxCudaState,
Long,
Double,
ComplexDouble,
Bool,
Tensor,
CpuScalarTensor
};
inline std::string argTypeToString(ArgType type) {
std::string ret;
switch (type) {
case ArgType::PhiloxCudaState:
ret = "PhiloxCudaState";
break;
case ArgType::Long:
ret = "Long";
break;
case ArgType::Double:
ret = "Double";
break;
case ArgType::ComplexDouble:
ret = "ComplexDouble";
break;
case ArgType::Bool:
ret = "Bool";
break;
case ArgType::Tensor:
ret = "Tensor";
break;
case ArgType::CpuScalarTensor:
ret = "CpuScalarTensor";
break;
}
return ret;
}
struct ArgAbstract {
virtual ~ArgAbstract() = default;
virtual const void* arg() const = 0;
virtual void* arg() = 0;
virtual bool isType(ArgType type) const = 0;
virtual ArgType type() const = 0;
virtual std::unique_ptr<ArgAbstract> copy_unique_ptr() const = 0;
virtual void print() const {
printf("input type: %s\n", argTypeToString(type()).c_str());
};
};
#define DEF_HELPEE_FUNC(TARGET_TYPE, ARG_NAME) \
bool isType(ArgType type) const override { \
return ArgType::TARGET_TYPE == type; \
} \
ArgType type() const override { \
return ArgType::TARGET_TYPE; \
} \
const void* arg() const override { \
return &ARG_NAME; \
} \
void* arg() override { \
return &ARG_NAME; \
} \
std::unique_ptr<ArgAbstract> copy_unique_ptr() const override { \
return std::make_unique<TARGET_TYPE##Arg>(*this); \
}
#define DEF_PRINT_FUNC \
void print() const override { \
std::cout << val_ << std::endl; \
}
struct PhiloxCudaStateArg : public ArgAbstract {
at::PhiloxCudaState val_;
PhiloxCudaStateArg(at::PhiloxCudaState _val) : val_(_val){};
DEF_HELPEE_FUNC(PhiloxCudaState, val_)
};
struct LongArg : public ArgAbstract {
int64_t val_;
explicit LongArg(int64_t _val) : val_(_val) {}
DEF_HELPEE_FUNC(Long, val_)
DEF_PRINT_FUNC
};
struct DoubleArg : public ArgAbstract {
double val_;
explicit DoubleArg(double _val) : val_(_val) {}
DEF_HELPEE_FUNC(Double, val_)
DEF_PRINT_FUNC
};
struct ComplexDoubleArg : public ArgAbstract {
c10::complex<double> val_;
explicit ComplexDoubleArg(c10::complex<double> _val) : val_(_val) {}
DEF_HELPEE_FUNC(ComplexDouble, val_)
DEF_PRINT_FUNC
};
struct BoolArg : public ArgAbstract {
bool val_;
explicit BoolArg(bool _val) : val_(_val) {}
DEF_HELPEE_FUNC(Bool, val_)
DEF_PRINT_FUNC
};
struct TensorArgAbstract : ArgAbstract {
virtual void setSize(int i, int64_t size) = 0;
virtual void setStride(int i, int64_t stride) = 0;
virtual void setPointer(void* ptr) = 0;
virtual void setDataType(DataType data_type) = 0;
virtual void setTensor(at::Tensor tensor) = 0;
virtual int64_t getRank() const = 0;
virtual int64_t getSize(int i) const = 0;
virtual int64_t getStride(int i) const = 0;
virtual void* getPointer() const = 0;
virtual DataType getDataType() const = 0;
virtual int64_t numel() const = 0;
virtual at::Tensor getTensor() const = 0;
// TODO: clean it up and also print out dtype
void print() const override {
auto rank = getRank();
std::cout << "tensor dtype: " << getDataType() << " sizes: (";
for (auto i = 0; i < rank; i++) {
std::cout << getSize(i) << ", ";
}
std::cout << ") stride: (";
for (auto i = 0; i < rank; i++) {
std::cout << getStride(i) << ", ";
}
std::cout << ") pointer: " << getPointer() << std::endl;
}
};
template <typename TENSOR_TYPE, typename nvfuser_index_t>
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct TensorArg : public TensorArgAbstract {
TENSOR_TYPE instance_;
// TODO: this is ugly, we should be extracting data type from `instance_`
// instead
DataType data_type_ = DataType::Null;
at::Tensor tensor_;
void setSize(int i, int64_t size) override {
instance_.setSize(i, (nvfuser_index_t)size);
}
void setStride(int i, int64_t stride) override {
instance_.setStride(i, (nvfuser_index_t)stride);
}
void setPointer(void* ptr) override {
instance_.data = static_cast<decltype(TENSOR_TYPE::data)>(ptr);
}
void setDataType(DataType data_type) override {
data_type_ = data_type;
}
void setTensor(at::Tensor tensor) override {
tensor_ = tensor;
}
int64_t getSize(int i) const override {
return instance_.getSize(i);
}
int64_t getStride(int i) const override {
return instance_.getStride(i);
}
int64_t getRank() const override {
return instance_.nDims();
}
void* getPointer() const override {
return instance_.data;
}
DataType getDataType() const override {
return data_type_;
}
at::Tensor getTensor() const override {
return tensor_;
}
int64_t numel() const override {
int64_t ret = 1;
for (auto i : c10::irange(instance_.nDims())) {
ret *= instance_.getSize(i);
}
return ret;
}
DEF_HELPEE_FUNC(Tensor, instance_)
};
template <typename CPU_TENSOR_TYPE>
struct CpuScalarTensorArg : public ArgAbstract {
CPU_TENSOR_TYPE instance_;
CpuScalarTensorArg() = delete;
explicit CpuScalarTensorArg(decltype(CPU_TENSOR_TYPE::data) _data) {
instance_.data = _data;
}
DEF_HELPEE_FUNC(CpuScalarTensor, instance_)
};
// TODO: This class needs some further clean up and refactor
//! KernelArgumentHolder copies meta information from kernel inputs, including
//! tensor sizes/shapes/dtype/memory_ptr and copies scalar inputs. It is used
//! for both compilation as well as kernel execution. The important thing is to
//! strip ownership of tensor from KernelArgumentHolder, so that during async
//! compilation, we are not unnecessarily holding memory that is not needed.
class TORCH_CUDA_CU_API KernelArgumentHolder {
public:
//! Create a KernelArgumentHolder from c10 inputs. Note that we are not taking
//! ownership of the memory from the original inputs, but just recording
//! their meta data for kernel execution/compilation.
static KernelArgumentHolder createKernelArgumentHolder(
const c10::ArrayRef<c10::IValue>& inputs);
KernelIndexMode getIndexMode() const {
return index_mode_;
}
explicit KernelArgumentHolder(KernelIndexMode index_mode)
: index_mode_(index_mode) {}
KernelArgumentHolder(const KernelArgumentHolder& self)
: device_index_(self.getDeviceIndex()),
cache_id_(self.getCacheId()),
index_mode_(self.getIndexMode()) {
for (const auto& arg : self.arguments_) {
push(arg.get());
}
}
KernelArgumentHolder& operator=(const KernelArgumentHolder& self) {
device_index_ = self.getDeviceIndex();
index_mode_ = self.getIndexMode();
for (const auto& arg : self.arguments_) {
push(arg.get());
}
return *this;
}
// Push a tensor to the arguments
void push(const at::Tensor& tensor);
// Push a scalar or integer to the arguments
void push(const IValue& val);
void push(const at::PhiloxCudaState& val);
// Create buffer, flatten arguments into it, align by 8 Bytes, return pointers
// in the buffer
void** getBuffer();
void push(const c10::ArrayRef<c10::IValue>& args);
void push(const std::vector<at::Tensor>& tensors);
void push(const ArgAbstract* arg);
void swap(int i, const ArgAbstract* arg);
// push int64
void push(int64_t val);
const ArgAbstract* back() const {
return arguments_.back().get();
}
void appendPhiloxRNGSeed(uint64_t rand_offset);
const ArgAbstract* operator[](int ind) const {
return arguments_.at(ind).get();
};
size_t size() const {
return arguments_.size();
}
bool empty() const {
return arguments_.empty();
}
void setDeviceIndex(int index) {
device_index_ = index;
}
int getDeviceIndex() const {
return device_index_;
}
void setCacheId(size_t id) {
cache_id_ = id;
}
c10::optional<size_t> getCacheId() const {
return cache_id_;
}
void print() const {
for (const auto& arg : arguments_) {
arg->print();
}
}
private:
std::vector<std::unique_ptr<ArgAbstract>> arguments_;
std::vector<void*> void_ptrs_;
bool changed_ = true;
int device_index_ = 0;
c10::optional<size_t> cache_id_ = c10::nullopt;
KernelIndexMode index_mode_ = KernelIndexMode::INT64;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
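// Illustrative sketch, not part of the original nvfuser sources:
// TensorArgCodegen mirrors the Tensor struct emitted into the generated CUDA
// kernel, so the host-side argument bytes line up with the device-side
// parameter. A 2-D float tensor with 64-bit indexing, for example, would be
// described as follows (the pointer, sizes and strides are hypothetical):
namespace {
void describeTensorExample(float* device_ptr) {
  using Float2D = torch::jit::fuser::cuda::TensorArgCodegen<float, 2, int64_t>;
  Float2D t;
  t.data = device_ptr; // raw device pointer
  t.setSize(0, 128);
  t.setSize(1, 256);
  t.setStride(0, 256);
  t.setStride(1, 1);
  // sizeof(Float2D) is exactly what the generated kernel expects for this
  // parameter: one pointer plus two arrays of two nvfuser_index_t each.
  (void)t;
}
} // namespace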

View File

@ -1,134 +0,0 @@
#include <executor_launch_params.h>
#include <ATen/cuda/CUDAContext.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
void LaunchParams::assertValid() {
TORCH_INTERNAL_ASSERT(
bdimx() * bdimy() * bdimz() > 0 &&
bdimx() * bdimy() * bdimz() <=
(int64_t)at::cuda::getCurrentDeviceProperties()
->maxThreadsPerMultiProcessor,
"Selected invalid number of threads for cuda: ",
bdimx() * bdimy() * bdimz());
TORCH_INTERNAL_ASSERT(
gdimx() > 0 && gdimx() < (std::int64_t(1) << 32) - 1,
"Invalid number of blocks in x direction: ",
gdimx());
TORCH_INTERNAL_ASSERT(
gdimy() > 0 && gdimy() <= 65535,
"Invalid number of blocks in y direction: ",
gdimy());
TORCH_INTERNAL_ASSERT(
gdimz() > 0 && gdimz() <= 65535,
"Invalid number of blocks in z direction: ",
gdimz());
}
void LaunchParams::bind(int64_t val, ParallelType p_type) {
switch (p_type) {
case ParallelType::TIDx:
checkAndSet(val, bdimx_, "blockDim.x");
break;
case ParallelType::BIDx:
checkAndSet(val, gdimx_, "gridDim.x");
break;
case ParallelType::TIDy:
checkAndSet(val, bdimy_, "blockDim.y");
break;
case ParallelType::BIDy:
checkAndSet(val, gdimy_, "gridDim.y");
break;
case ParallelType::TIDz:
checkAndSet(val, bdimz_, "blockdim.z");
break;
case ParallelType::BIDz:
checkAndSet(val, gdimz_, "gridDim.z");
break;
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to bind invalid parallel type in launch config: ",
p_type);
}
assertValid();
}
int64_t LaunchParams::getDim(ParallelType p_type) const {
switch (p_type) {
case ParallelType::TIDx:
return bdimx();
case ParallelType::BIDx:
return gdimx();
case ParallelType::TIDy:
return bdimy();
case ParallelType::BIDy:
return gdimy();
case ParallelType::TIDz:
return bdimz();
case ParallelType::BIDz:
return gdimz();
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to get with invalid parallel type in launch config: ",
p_type);
}
}
bool LaunchParams::hasDim(ParallelType p_type) const {
return getRawVal(p_type) != UNINITIALIZED_VAL;
}
const int64_t& LaunchParams::getRawVal(ParallelType p_type) const {
switch (p_type) {
case ParallelType::TIDx:
return bdimx_;
case ParallelType::BIDx:
return gdimx_;
case ParallelType::TIDy:
return bdimy_;
case ParallelType::BIDy:
return gdimy_;
case ParallelType::TIDz:
return bdimz_;
case ParallelType::BIDz:
return gdimz_;
default:
TORCH_INTERNAL_ASSERT(
false,
"Tried to get with invalid parallel type in launch config: ",
p_type);
}
}
bool LaunchParams::operator==(const LaunchParams& other) const {
return gdimx_ == other.gdimx_ && gdimy_ == other.gdimy_ &&
bdimx_ == other.bdimx_ && bdimy_ == other.bdimy_ && smem_ == other.smem_;
}
void LaunchParams::print() const {
std::cout << toString();
}
std::string LaunchParams::toString() const {
std::stringstream ss;
ss << "Launch Parameters: "
<< "BlockDim.x = " << (bdimx_ == UNINITIALIZED_VAL ? -1 : bdimx_) << ", "
<< "BlockDim.y = " << (bdimy_ == UNINITIALIZED_VAL ? -1 : bdimy_) << ", "
<< "BlockDim.z = " << (bdimz_ == UNINITIALIZED_VAL ? -1 : bdimz_) << ", "
<< "GridDim.x = " << (gdimx_ == UNINITIALIZED_VAL ? -1 : gdimx_) << ", "
<< "GridDim.y = " << (gdimy_ == UNINITIALIZED_VAL ? -1 : gdimy_) << ", "
<< "GridDim.z = " << (gdimz_ == UNINITIALIZED_VAL ? -1 : gdimz_) << ", "
<< "Smem Size = " << smem() << "\n";
return ss.str();
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,136 +0,0 @@
#pragma once
#include <type.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class TORCH_CUDA_CU_API LaunchParams {
public:
static constexpr int64_t UNINITIALIZED_VAL = -1;
LaunchParams(
int64_t gdimx = UNINITIALIZED_VAL,
int64_t gdimy = UNINITIALIZED_VAL,
int64_t gdimz = UNINITIALIZED_VAL,
int64_t bdimx = UNINITIALIZED_VAL,
int64_t bdimy = UNINITIALIZED_VAL,
int64_t bdimz = UNINITIALIZED_VAL)
: gdimx_(gdimx),
gdimy_(gdimy),
gdimz_(gdimz),
bdimx_(bdimx),
bdimy_(bdimy),
bdimz_(bdimz) {
assertValid();
}
void assertValid();
void setSmem(int64_t smem) {
smem_ = smem;
}
int64_t smem() const {
return smem_;
}
int64_t nBlocks() const {
return std::abs(gdimx_ * gdimy_ * gdimz_);
}
int64_t nThreads() const {
return std::abs(bdimx_ * bdimy_ * bdimz_);
}
int64_t bdimx() const {
return static_cast<int64_t>(bdimx_ == UNINITIALIZED_VAL ? 1 : bdimx_);
}
int64_t gdimx() const {
return static_cast<int64_t>(gdimx_ == UNINITIALIZED_VAL ? 1 : gdimx_);
}
int64_t bdimy() const {
return static_cast<int64_t>(bdimy_ == UNINITIALIZED_VAL ? 1 : bdimy_);
}
int64_t gdimy() const {
return static_cast<int64_t>(gdimy_ == UNINITIALIZED_VAL ? 1 : gdimy_);
}
int64_t bdimz() const {
return static_cast<int64_t>(bdimz_ == UNINITIALIZED_VAL ? 1 : bdimz_);
}
int64_t gdimz() const {
return static_cast<int64_t>(gdimz_ == UNINITIALIZED_VAL ? 1 : gdimz_);
}
void checkAndSet(
const int64_t incoming_val,
int64_t& class_val,
std::string val) {
TORCH_INTERNAL_ASSERT(
class_val == UNINITIALIZED_VAL || incoming_val == class_val,
"Tried to set ",
val,
" from ",
class_val,
" to ",
incoming_val,
", but it was already set and new value does not match.",
" Thread dims all have to be bound to the same value.");
TORCH_CHECK(
incoming_val > 0,
"Received a thread binding on ",
val,
" that is ",
incoming_val,
". Cannot create negative threads.");
if (class_val == UNINITIALIZED_VAL) {
class_val = incoming_val;
}
assertValid();
}
// Binds the dim associated with p_type to val
void bind(int64_t val, ParallelType p_type);
// Adjusted value based on get functions above for each value
int64_t getDim(ParallelType p_type) const;
// Returns raw value which may be UNINITIALIZED_VAL
const int64_t& getRawVal(ParallelType p_type) const;
// Returns false if value associated with p_type == UNINITIALIZED_VAL
bool hasDim(ParallelType p_type) const;
bool operator==(const LaunchParams& other) const;
void print() const;
std::string toString() const;
private:
// Spell them out because I want signed ints to know if they were initialized
// or not.
// TODO: convert to c10::optional
int64_t gdimx_ = UNINITIALIZED_VAL;
int64_t gdimy_ = UNINITIALIZED_VAL;
int64_t gdimz_ = UNINITIALIZED_VAL;
int64_t bdimx_ = UNINITIALIZED_VAL;
int64_t bdimy_ = UNINITIALIZED_VAL;
int64_t bdimz_ = UNINITIALIZED_VAL;
int64_t smem_ = 0;
// TODO: Fill in output sizes
std::vector<std::vector<int64_t>> output_sizes;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
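// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, binding launch dimensions would have looked
// roughly like this; the concrete sizes are hypothetical.
namespace {
void launchParamsExample() {
  using namespace torch::jit::fuser::cuda;
  LaunchParams lp;                   // every dimension starts UNINITIALIZED_VAL
  lp.bind(128, ParallelType::TIDx);  // blockDim.x = 128
  lp.bind(1024, ParallelType::BIDx); // gridDim.x = 1024
  lp.bind(128, ParallelType::TIDx);  // re-binding the same value is accepted
  // lp.bind(256, ParallelType::TIDx);  // would assert: conflicting binding
  int64_t bdimx = lp.getDim(ParallelType::TIDx); // 128
  int64_t bdimy = lp.getDim(ParallelType::TIDy); // unbound dims read back as 1
  (void)bdimx;
  (void)bdimy;
}
} // namespace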

File diff suppressed because it is too large

View File

@ -1,314 +0,0 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <c10/core/DeviceType.h>
#include <c10/util/Exception.h>
#include <cuda.h>
#include <torch/csrc/jit/ir/ir.h>
#include <executor_kernel_arg.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <kernel.h>
#include <kernel_expr_evaluator.h>
#include <lower2device.h>
#include <string>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace executor_utils {
// Include all the functions we might need in generated code
std::string kernelPreamble();
void validateKernelInputs(
Fusion* fusion,
const KernelArgumentHolder& args,
const c10::Device& device);
void validateKernelOutputs(
Fusion* fusion,
const std::vector<at::Tensor>& outputs,
const c10::Device& device);
//! Bind kernel input values to runtime values
kir::ExpressionEvaluator bindKernelInputs(
const KernelArgumentHolder& args,
kir::Kernel* kernel,
bool check_consistency = true);
//! Bind fusion input values to runtime values
TORCH_CUDA_CU_API ExpressionEvaluator
bindFusionInputs(const KernelArgumentHolder& args, Fusion* fusion);
struct NvrtcFunction {
CUmodule module = CUmodule();
CUfunction function = CUfunction();
};
// Returns executable function and the ptxas log from compilation
std::pair<NvrtcFunction, std::string> nvrtcCompile(
const std::string& code,
const std::string& func_name,
int id,
c10::optional<int> opt_block_size = c10::nullopt);
namespace caching {
// TODO: Could consider putting some of
// the logic in the common space and re-use
//! List of all the possible entry types in
//! `FusionExecutor` compile-time data cache.
enum class CompileTimeEntryType {
PARALLEL_BINDING_ITERDOMAINS,
PARALLEL_ITER_EXTENT_MAP,
SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP,
WARP_PADDED_PARALLEL_EXTENTS,
VECTORIZED_TENSOR_VALIDATION,
INPUT_ALIAS_INDICES,
OUTPUT_ALIAS_INDICES
};
//! Entry class definitions for each entry type:
//! each class defines the data type for each entry type
//! Compile-time info to be cached in each FusionExecutor:
//! ParallelBindingIterDomains:
//! Stores all the iterdomains that are parallelized
//! on the scheduled Fusion graph. They will be used
//! in launch param iteration and their extents may
//! come from launch constraints.
class ParallelBindingIterDomains {
public:
using DataType = std::vector<IterDomain*>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::PARALLEL_BINDING_ITERDOMAINS;
};
//! Compile-time info to be cached in each FusionExecutor:
//! ParallelIterExtentMap
//! Stores the symbolic extents of all the parallelized
//! iterdomains corresponding to each used parallel type.
class ParallelIterExtentMap {
public:
using DataType =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::PARALLEL_ITER_EXTENT_MAP;
};
//! Compile-time info to be cached in each FusionExecutor:
//! SimplifiedParallelIterExtentMap
//! This entry type is a simplified version of ParallelIterExtentMap.
//!
//! For launch parameter binding we only need the most concrete iterdomain
//! in each disjoint set stored in CaParallelMap. This entry stores the
//! remaining list of extents for binding after this simplification.
//!
//! We still need ParallelIterExtentMap since we want to bind the concrete
//! values to the extents of all parallelized iterdomains. We would be
//! able to save these bindings if the integer machine has a notion of
//! equality and could be configured at compile time. But that'd be a longer
//! term target.
class SimplifiedParallelIterExtentMap {
public:
using DataType =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP;
};
//! WarpPaddedExtentsInfo:
//! Auxiliary data type for entry class WarpPaddedParallelExtents
struct WarpPaddedExtentsInfo {
std::unordered_set<const Val*> warp_padded_extent_set;
std::unordered_map<const Val*, int64_t> warp_padded_constant;
};
//! Compile-time info to be cached in each FusionExecutor:
//! WarpPaddedParallelExtents
//! Stores the symbolic and constant extents of warp
//! padded parallel iterdomains.
class WarpPaddedParallelExtents {
public:
using DataType = WarpPaddedExtentsInfo;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::WARP_PADDED_PARALLEL_EXTENTS;
};
//! VectorizedTensorInfo:
//! Auxiliary data type for entry class VectorizedTensorValidation
struct VectorizedTensorInfo {
//! Aligned vectorized fusion inputs
std::vector<int> aligned_vectorized_inp_tensor_pos;
//! Aligned vectorized fusion outputs
std::vector<int> aligned_vectorized_out_tensor_pos;
//! Misaligned vectorized input tensors
std::unordered_set<TensorView*> global_inp_misaligned_tv;
//! Misaligned vectorized output tensors
std::unordered_set<TensorView*> global_out_misaligned_tv;
//! Positions of misaligned input tensors
std::vector<int> inp_misaligned_tensors_pos;
//! Positions of misaligned output tensors
std::vector<int> out_misaligned_tensors_pos;
};
//! Compile-time info to be cached in each FusionExecutor:
//! VectorizedTensorValidation
//! Stores position info and vector word sizes of
//! vectorized input/output tensors, to be used
//! in misaligned vectorization validation.
class VectorizedTensorValidation {
public:
using DataType = VectorizedTensorInfo;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::VECTORIZED_TENSOR_VALIDATION;
};
//! Compile-time info to be cached in each FusionExecutor:
//! InputAliasIndices
//! Stores position info of aliased input tensors
class InputAliasIndices {
public:
using DataType = std::vector<std::pair<int, int>>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::INPUT_ALIAS_INDICES;
};
//! Compile-time info to be cached in each FusionExecutor:
//! OutputAliasIndices
//! Stores position info of aliased output tensors
class OutputAliasIndices {
public:
using DataType = std::unordered_set<int>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::OUTPUT_ALIAS_INDICES;
};
//! Base abstract class for unified storage in `ExecutorCompileTimeInfoCache`,
//! each entry in `ExecutorCompileTimeInfoCache` will be a subclass.
class CompileTimeInfoBase : public PolymorphicBase {
public:
CompileTimeInfoBase(CompileTimeEntryType entry_type)
: entry_type_(entry_type) {}
CompileTimeEntryType type() {
return entry_type_;
}
private:
CompileTimeEntryType entry_type_;
};
// Note: Do NOT export this class. MSVC issue with exported class that contains
// std::vector<unique_ptr<xxx>>: https://godbolt.org/z/3E4e8T1P1
//! Compile-time information cache
class ExecutorCompileTimeInfoCache {
using Entry = CompileTimeInfoBase;
using EntryOwningPtr = std::unique_ptr<Entry>;
using EntryPtr = Entry*;
using EntryType = CompileTimeEntryType;
public:
void insert(EntryOwningPtr new_entry);
EntryPtr at(EntryType entry_type) {
return entry_type_map_.at(entry_type);
}
bool has(EntryType entry_type) {
return entry_type_map_.count(entry_type);
}
private:
std::vector<EntryOwningPtr> entries_;
std::unordered_map<EntryType, EntryPtr> entry_type_map_;
};
//! A utility class to facilitate accessing ExecutorCompileTimeInfoCache.
template <typename EntryClass>
class ExecutorCompileTimeEntry {
using EntryDataType = typename EntryClass::DataType;
using EntryDataTypeOwnPtr = std::unique_ptr<EntryDataType>;
using MakerFnType = std::function<EntryDataTypeOwnPtr()>;
public:
//! Creates a data entry with type defined in EntryClass,
//! eg. EntryClass = VectorizableInputsAndOutputs;
//!
//! @param data_cache, a pointer to an instantiated compile-time
//! info cache. The info data will be
//! 1. read from data cache if data cache has the corresponding entry.
//! 2. written into data cache if data cache doesn't have the entry.
//! 3. managed by owned_data_ if data cache is nullptr
//! @param fn:
//! The factory function that needs to return an owning pointer
//! i.e. std::unique_ptr<EntryClass::DataType>. It will only
//! be called either when data cache is missing an entry or when no data
//! cache is given.
ExecutorCompileTimeEntry(
ExecutorCompileTimeInfoCache* data_cache,
MakerFnType fn);
//! Unified interface to get actual data, either from cache
//! or from factory function.
EntryDataType& get() {
return *data_ptr_;
}
private:
//! Internal data owning pointer that will manage the computed
//! data when there is no data cache.
EntryDataTypeOwnPtr owned_data_ = nullptr;
//! Pointer to the valid data entry that could be accessed.
EntryDataType* data_ptr_ = nullptr;
};
} // namespace caching
//! Returns the vector of iterdomains, derived from the used tensorviews, that
//! will be used to bind parallel dimensions.
std::vector<IterDomain*> getParallelBindingsIterDomains(
GpuLower* lower,
const std::vector<TensorView*>& used_tvs);
using ParallelExtentMap =
std::unordered_map<ParallelType, std::vector<const Val*>, TypeHash>;
//! Returns the extents of all parallel binding iterdomains corresponding
//! to each parallel type.
std::unique_ptr<ParallelExtentMap> getParallelIterExtents(
std::vector<IterDomain*>& parallel_binding_ids);
//! Returns the simplified set of extents necessary for launch parameter
//! binding.
std::unique_ptr<ParallelExtentMap> getSimplifiedParallelIterExtents(
GpuLower* lower,
std::vector<IterDomain*>& parallel_binding_ids);
//! Returns the symbolic or constant extents of warp padded parallel
//! iterdomains in the given vector.
std::unique_ptr<caching::WarpPaddedExtentsInfo> getWarpPaddedExtentsInfo(
kir::Kernel* lower,
std::vector<IterDomain*>& parallel_binding_ids);
void validateVectorizedTensors(
kir::Kernel* kernel,
const KernelArgumentHolder& args,
const std::vector<at::Tensor>& outputs,
caching::ExecutorCompileTimeInfoCache* data_cache,
kir::ExpressionEvaluator& expr_eval);
} // namespace executor_utils
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
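// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, a read-through lookup of the parallel
// iterdomain extents via the compile-time cache would have looked roughly like
// this; `data_cache` and `parallel_binding_ids` are hypothetical
// caller-provided objects (data_cache may be nullptr).
namespace {
void compileTimeCacheExample(
    torch::jit::fuser::cuda::executor_utils::caching::
        ExecutorCompileTimeInfoCache* data_cache,
    std::vector<torch::jit::fuser::cuda::IterDomain*>& parallel_binding_ids) {
  using namespace torch::jit::fuser::cuda::executor_utils;
  // Reads the entry from the cache if present, computes and inserts it via the
  // factory lambda otherwise, and owns the data locally if data_cache is null.
  caching::ExecutorCompileTimeEntry<caching::ParallelIterExtentMap> extents(
      data_cache, [&parallel_binding_ids]() {
        return getParallelIterExtents(parallel_binding_ids);
      });
  auto& extent_map = extents.get();
  (void)extent_map;
}
} // namespace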

View File

@ -1,202 +0,0 @@
#include <evaluator_common.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <instrumentation.h>
#include <ir_all_nodes.h>
#include <ir_iostream.h>
#include <iostream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
bool equals(Val* value, const IntOrDouble& concrete_value) {
switch (value->getDataType().value()) {
case DataType::Int: {
if (!concrete_value.is_int()) {
return false;
}
auto val = value->getInt();
return val.has_value() && val.value() == concrete_value.as<int64_t>();
}
case DataType::Double: {
if (concrete_value.is_int()) {
return false;
}
auto val = value->getDouble();
return val.has_value() && val.value() == concrete_value.as<double>();
}
default:
TORCH_INTERNAL_ASSERT(false);
}
}
template <typename T>
c10::optional<IntOrDouble> toOptionalIntOrDouble(c10::optional<T> i) {
if (!i) {
return c10::nullopt;
}
return IntOrDouble(i.value());
}
} // namespace
void ExpressionEvaluator::bind(Val* value, const IntOrDouble& concrete_value) {
if (equals(value, concrete_value)) {
return;
}
TORCH_CHECK(!value->isConstScalar(), "Tried to bind to a constant value");
TORCH_CHECK(
value->definition() == nullptr,
"Tried to bind to a value that is computed in the fusion IR");
if (value->isA<NamedScalar>()) {
known_named_scalars_[value->as<NamedScalar>()->name()] = concrete_value;
} else {
known_values_[value] = concrete_value;
}
}
void ExpressionEvaluator::bind(
const std::string& name,
const IntOrDouble& concrete_value) {
known_named_scalars_[name] = concrete_value;
}
c10::optional<IntOrDouble> ExpressionEvaluator::evaluate(Val* value) {
if (evaluator_precomputed_values_ != nullptr) {
return toOptionalIntOrDouble(
evaluator_precomputed_values_->getMaybeValueFor(value));
} else {
auto maybe_concrete_value = getValue(value);
if (!maybe_concrete_value.has_value()) {
if (value->definition() != nullptr) {
OptOutDispatch::handle(value->definition());
maybe_concrete_value = getValue(value);
}
}
return maybe_concrete_value;
}
return c10::nullopt;
}
void ExpressionEvaluator::print() const {
std::cout << "\nEvaluation context\n";
std::cout << "--------------------\n";
for (const auto& kv : known_values_) {
TORCH_INTERNAL_ASSERT(!kv.first->isConstScalar());
std::cout << kv.first << " = " << kv.second << " ; "
<< *kv.first->getValType() << "\n";
}
std::cout << "--------------------\n\n";
}
c10::optional<IntOrDouble> ExpressionEvaluator::getValue(Val* value) {
TORCH_INTERNAL_ASSERT(
value->isAnInt() || value->isADouble(),
"Expression Evaluation does not support values other than integers/doubles at this time.");
if (value->getValType().value() == ValType::Scalar) {
if (value->isAnInt() && value->as<Int>()->value().has_value()) {
return toOptionalIntOrDouble(value->as<Int>()->value());
}
if (value->isADouble() && value->as<Double>()->value().has_value()) {
return toOptionalIntOrDouble(value->as<Double>()->value());
}
}
if (value->isA<NamedScalar>()) {
const auto it = known_named_scalars_.find(value->as<NamedScalar>()->name());
return it != known_named_scalars_.end()
? c10::optional<IntOrDouble>(it->second)
: c10::nullopt;
} else {
const auto it = known_values_.find(value);
return it != known_values_.end() ? c10::optional<IntOrDouble>(it->second)
: c10::nullopt;
}
}
void ExpressionEvaluator::handle(UnaryOp* uop) {
using namespace IntOrDouble_functions;
const auto in = evaluate(uop->in());
if (in.has_value()) {
switch (uop->getUnaryOpType()) {
case UnaryOpType::Neg:
known_values_[uop->out()] = -*in;
break;
case UnaryOpType::Set:
known_values_[uop->out()] = *in;
break;
case UnaryOpType::Cast:
if (uop->out()->getDataType() == DataType::Int) {
known_values_[uop->out()] = in->cast<int64_t>();
} else if (uop->out()->getDataType() == DataType::Double) {
known_values_[uop->out()] = in->cast<double>();
} else {
TORCH_INTERNAL_ASSERT(false, "dtype not supported in evaluator");
}
break;
case UnaryOpType::Abs:
known_values_[uop->out()] = abs(*in);
break;
default:
TORCH_CHECK(
!"Unexpected operator type ",
uop->getUnaryOpType(),
" in ",
uop->toString());
}
}
}
void ExpressionEvaluator::handle(BinaryOp* bop) {
using namespace IntOrDouble_functions;
const auto lhs = evaluate(bop->lhs());
const auto rhs = evaluate(bop->rhs());
if (lhs.has_value() && rhs.has_value()) {
switch (bop->getBinaryOpType()) {
case BinaryOpType::Add:
known_values_[bop->out()] = *lhs + *rhs;
break;
case BinaryOpType::Sub:
known_values_[bop->out()] = *lhs - *rhs;
break;
case BinaryOpType::Mul:
known_values_[bop->out()] = *lhs * *rhs;
break;
case BinaryOpType::Div:
TORCH_CHECK(*rhs != 0);
known_values_[bop->out()] = *lhs / *rhs;
break;
case BinaryOpType::Mod:
TORCH_CHECK(*rhs != 0);
known_values_[bop->out()] = *lhs % *rhs;
break;
case BinaryOpType::CeilDiv:
TORCH_CHECK(*rhs != 0);
known_values_[bop->out()] = ceildiv(*lhs, *rhs);
break;
case BinaryOpType::And:
known_values_[bop->out()] = *lhs && *rhs;
break;
case BinaryOpType::Max:
known_values_[bop->out()] = max(*lhs, *rhs);
break;
case BinaryOpType::Min:
known_values_[bop->out()] = min(*lhs, *rhs);
break;
default:
TORCH_CHECK(!"Unexpected operator type");
}
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,68 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dynamic_type.h>
#include <ir_interface_nodes.h>
#include <iter_visitor.h>
#include <c10/util/Optional.h>
#include <string>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class FusionPrecomputedValues;
//! Calculate Fusion IR expressions
class TORCH_CUDA_CU_API ExpressionEvaluator : private OptOutDispatch {
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit ExpressionEvaluator(Fusion* fusion) : fusion_(fusion) {}
//! Returns the associated fusion object
Fusion* fusion() const {
return fusion_;
}
//! Bind a concrete value to an IR variable
void bind(Val* value, const IntOrDouble& concrete_value);
//! Bind a concrete value to a named scalar
void bind(const std::string& name, const IntOrDouble& concrete_value);
//! Try to evaluate a Fusion IR value
c10::optional<IntOrDouble> evaluate(Val* value);
//! Debugging helper, prints all the currently known values
void print() const;
void bindPrecomputedValues(FusionPrecomputedValues* precomputed_values) {
evaluator_precomputed_values_ = precomputed_values;
}
auto precomputedValues() {
return evaluator_precomputed_values_;
}
private:
c10::optional<IntOrDouble> getValue(Val* value);
void handle(UnaryOp*) final;
void handle(BinaryOp*) final;
// TODO: handle swizzle
private:
std::unordered_map<const Val*, IntOrDouble> known_values_;
std::unordered_map<std::string, IntOrDouble> known_named_scalars_;
Fusion* fusion_ = nullptr;
FusionPrecomputedValues* evaluator_precomputed_values_ = nullptr;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
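// Illustrative usage sketch, not part of the original nvfuser sources:
// assuming the declarations above, binding and evaluating Fusion IR values
// would have looked roughly like this; `fusion`, `symbolic_extent` and
// `derived_value` are hypothetical caller-provided IR nodes.
namespace {
void expressionEvaluatorExample(
    torch::jit::fuser::cuda::Fusion* fusion,
    torch::jit::fuser::cuda::Val* symbolic_extent,
    torch::jit::fuser::cuda::Val* derived_value) {
  using namespace torch::jit::fuser::cuda;
  ExpressionEvaluator evaluator(fusion);
  // Bind a concrete runtime value to a symbolic input (e.g. a tensor extent).
  evaluator.bind(symbolic_extent, IntOrDouble(static_cast<int64_t>(128)));
  // Values defined in terms of bound inputs are evaluated by dispatching over
  // their defining expressions (UnaryOp/BinaryOp).
  c10::optional<IntOrDouble> result = evaluator.evaluate(derived_value);
  if (result.has_value()) {
    // use *result ...
  }
}
} // namespace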

View File

@ -1,723 +0,0 @@
#include <arith.h>
#include <codegen.h>
#include <disjoint_set.h>
#include <fusion.h>
#include <fusion_segmenter.h>
#include <instrumentation.h>
#include <ir_all_nodes.h>
#include <ir_cloner.h>
#include <ir_printer.h>
#include <ir_utils.h>
#include <iter_visitor.h>
#include <kernel.h>
#include <lower2device.h>
#include <lower_bank_conflict.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
static thread_local Fusion* ACTIVE_FUSION = nullptr; // NOLINT
FusionGuard::FusionGuard(Fusion* fusion) {
prev_fusion = ACTIVE_FUSION;
ACTIVE_FUSION = fusion;
}
FusionGuard::~FusionGuard() {
ACTIVE_FUSION = prev_fusion;
}
Fusion* FusionGuard::getCurFusion() {
return ACTIVE_FUSION;
}
void FusionGuard::setCurFusion(Fusion* fusion) {
ACTIVE_FUSION = fusion;
}
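namespace {
// Illustrative usage sketch, not part of the original nvfuser sources:
// FusionGuard is an RAII guard over the thread-local ACTIVE_FUSION pointer
// above. IR nodes created inside the guarded scope are registered with the
// guarded fusion, and the previously active fusion is restored on scope exit.
void fusionGuardExample(Fusion* fusion) {
  FusionGuard fg(fusion);
  TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() == fusion);
  // ... build IR for `fusion` here ...
  // The previously active fusion (possibly nullptr) becomes active again when
  // fg goes out of scope.
}
} // namespace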
void swap(Fusion& a, Fusion& b) noexcept {
FUSER_PERF_SCOPE("Fusion swap");
using std::swap;
swap(static_cast<IrContainer&>(a), static_cast<IrContainer&>(b));
swap(a.inputs_, b.inputs_);
swap(a.outputs_, b.outputs_);
swap(a.io_alias_, b.io_alias_);
swap(a.permuted_input_map_, b.permuted_input_map_);
swap(a.permuted_output_map_, b.permuted_output_map_);
}
std::unique_ptr<SegmentedFusion> Fusion::segment(
const KernelArgumentHolder& args) {
FUSER_PERF_SCOPE("Segment Fusion");
return SegmentCandidateFinder::segment(this, args);
}
IrCloner Fusion::copy(const Fusion* from, Fusion* to) {
to->clear();
auto ir_cloner = IrContainer::copy(from, to);
for (auto val : from->vals_) {
ir_cloner.clone(val)->setDefinition(ir_cloner.clone(val->definition_));
ir_cloner.clone(val)->setUses(ir_cloner.clone(val->uses_));
}
to->inputs_ = ir_cloner.clone(from->inputs_);
to->outputs_ = ir_cloner.clone(from->outputs_);
for (auto inp : to->inputs_) {
inp->setIsFusionInput(true);
}
for (auto out : to->outputs_) {
out->setIsFusionOutput(true);
}
// TODO: put this into ir_cloner instead
for (const auto& entry : from->io_alias_) {
Val* copied_output = ir_cloner.clone(entry.first);
Val* copied_input = ir_cloner.clone(entry.second);
to->io_alias_[copied_output] = copied_input;
}
to->permuted_input_map_ = from->permuted_input_map_;
to->permuted_output_map_ = from->permuted_output_map_;
to->all_tv_uses_valid_ = from->all_tv_uses_valid_;
// This should never be true on copy, but copying for completeness.
to->is_during_update_uses_ = from->is_during_update_uses_;
return ir_cloner;
}
// Clang tidy complains when using default constructor for IrContainer instead
// of copy constructor. Fusion::copy has a call to IrContainer::copy, so it's
// redundant to use the IrContainer copy constructor, but it is harmless since
// Fusion::copy starts by calling clear().
Fusion::Fusion(const Fusion& other) : IrContainer(other) {
FUSER_PERF_SCOPE("Fusion copy");
Fusion::copy(&other, this);
}
Fusion::Fusion(Fusion&& other) noexcept {
FUSER_PERF_SCOPE("Fusion move");
swap(*this, other);
}
Fusion& Fusion::operator=(const Fusion& other) {
FUSER_PERF_SCOPE("Fusion copy assign");
Fusion copy(other);
clear();
swap(*this, copy);
return *this;
}
Fusion& Fusion::operator=(Fusion&& other) noexcept {
FUSER_PERF_SCOPE("Fusion move assign");
clear();
swap(*this, other);
return *this;
}
Fusion::~Fusion() {
clear();
}
void Fusion::clear() noexcept {
FUSER_PERF_SCOPE("Fusion clear");
IrContainer::clear();
inputs_.clear();
outputs_.clear();
io_alias_.clear();
permuted_input_map_.clear();
permuted_output_map_.clear();
all_tv_uses_valid_ = false;
is_during_update_uses_ = false;
}
void Fusion::removeExpr(Expr* expr) {
assertInContainer(expr, "Cannot remove expr ");
// If we hit this error too frequently, we could lighten the restrictions so
// that removing something that doesn't exist simply does nothing. For now,
// we're going with the strictest model which errors.
for (auto out : expr->outputs()) {
out->setDefinition(nullptr);
}
for (auto inp : expr->inputs()) {
auto uses_copy = inp->uses();
auto it = std::find(uses_copy.begin(), uses_copy.end(), expr);
if (it != uses_copy.end()) {
uses_copy.erase(it);
inp->setUses(uses_copy);
}
}
IrContainer::removeExpr(expr);
}
void Fusion::removeVal(Val* val) {
assertInContainer(val, "Cannot remove val ");
TORCH_CHECK(
!val->isFusionInput(),
"Cannot remove val as it is an input of the fusion.");
TORCH_CHECK(
!val->isFusionOutput(),
"Cannot remove val as it is an output of the fusion.");
Expr* orig = val->definition();
if (orig != nullptr)
removeExpr(val->definition());
for (Expr* use : unordered_uses(val)) {
removeExpr(use);
}
IrContainer::removeVal(val);
}
void Fusion::addInput(Val* input) {
assertInContainer(input, "Cannot register input ");
TORCH_INTERNAL_ASSERT(
input->getDataType() != DataType::Index,
"Data type Index is a local compile time data type only, it cannot be used as an input in case it was generated from another kernel.");
if (input->getValType().value() == ValType::TensorView) {
auto tv = input->as<TensorView>();
tv->setMemoryType(MemoryType::Global);
} else if (input->getValType().value() == ValType::Scalar) {
TORCH_CHECK(
!input->isConst(),
"Immediate scalar value cannot be added as an input. It is not necessary to pass it as an input.");
}
inputs_.push_back(input);
input->setIsFusionInput(true);
all_tv_uses_valid_ = false;
}
void Fusion::addOutput(Val* output) {
// We currently don't support explicitly outputting aliased inputs. This is
// because they are already marked as outputs for in-place update. It's tricky
// to allow marking them explicitly as real outputs, since that requires us to
// register/identify outputs not only by `Val*` pointer, but also by indices;
// it also requires us to magically arrange `outputs_` entries in the proper
// order, which isn't intuitive for `outputs_` in the fusion.
// I think we can solve this by marking addOutput on io_alias_ keys after the
// fusion is fully defined. Tracking this in #1488
// Apparently we can't do this either at this time; segmentation
// unfortunately would call addOutput after we marked the io_alias_ map.
// TORCH_CHECK(io_alias_.count(output) == 0,
// "can't register aliased output as real output");
assertInContainer(output, "Cannot register output ");
if (output->getValType().value() == ValType::TensorView) {
auto tv = output->as<TensorView>();
tv->setMemoryType(MemoryType::Global);
}
outputs_.push_back(output);
output->setIsFusionOutput(true);
all_tv_uses_valid_ = false;
}
void Fusion::removeInput(Val* input) {
auto find_input = std::find(inputs_.begin(), inputs_.end(), input);
if (find_input != inputs_.end()) {
inputs_.erase(find_input);
}
input->setIsFusionInput(false);
all_tv_uses_valid_ = false;
}
void Fusion::removeOutput(Val* output) {
auto find_output = std::find(outputs_.begin(), outputs_.end(), output);
if (find_output != outputs_.end()) {
outputs_.erase(find_output);
}
output->setIsFusionOutput(false);
all_tv_uses_valid_ = false;
}
void Fusion::replaceOutput(Val* output, Val* replacement) {
auto find_output = std::find(outputs_.begin(), outputs_.end(), output);
TORCH_CHECK(find_output != outputs_.end(), "Unable to find output in Fusion");
if (find_output != outputs_.end()) {
std::replace_if(
outputs_.begin(),
outputs_.end(),
[&output](Val* v) { return v == output; },
replacement);
if (replacement->getValType().value() == ValType::TensorView) {
replacement->setIsFusionOutput(true);
replacement->as<TensorView>()->setMemoryType(MemoryType::Global);
}
if (output->getValType().value() == ValType::TensorView) {
output->setIsFusionOutput(false);
output->as<TensorView>()->setMemoryType(MemoryType::Local);
}
resetTvUses();
}
// Temporary WAR for issue #1112
// (https://github.com/csarofeen/pytorch/issues/1112)
if (io_alias_.count(output) != 0) {
auto input = io_alias_[output];
io_alias_.erase(output);
io_alias_[replacement] = input;
}
}
std::vector<Expr*> Fusion::exprs() {
return StmtSort::getExprs(this);
}
std::vector<Val*> Fusion::inputsOf(Val* val) {
return InputsOf::output(this, val);
}
void Fusion::validateInputs() {
std::unordered_set<Val*> all_inputs;
for (Val* out : outputs()) {
for (Val* input : inputsOf(out)) {
all_inputs.insert(input);
}
}
std::unordered_set<Val*> input_dims;
auto inp_tvs = ir_utils::filterByType<TensorView>(inputs());
for (auto tv : inp_tvs) {
for (auto id : tv->getMaybeRFactorDomain()) {
input_dims.emplace(id->extent());
}
}
for (Val* input : all_inputs) {
if (!input->isConstScalar()) {
TORCH_CHECK(
input->isFusionInput() ||
// TODO: Switch:
inContainer(input),
// to: input_dims.find(input) != input_dims.end(),
// https://github.com/csarofeen/pytorch/issues/1365
"Could not figure out how ",
input->toString(),
" is generated, however it was not specified as an input.");
}
}
}
void Fusion::print() {
FUSER_PERF_SCOPE("Fusion::print");
FusionGuard fg(this);
std::cout << "\n%kernel {\n";
IrMathPrinter op_exprs(std::cout);
op_exprs.handle(this);
std::cout << "\nTransformPrinter : \n";
IrTransformPrinter t_exprs(std::cout);
t_exprs.handle(this);
std::cout << "}\n\n";
}
void Fusion::printKernel(DataType index_type) {
FUSER_PERF_SCOPE("Fusion::printKernel");
TORCH_INTERNAL_ASSERT(
!this->isA<kir::Kernel>(),
"Cannot \"print kernel\" of a kernel container. ",
"This would require lowering during lowering.");
std::cout << codegen::generateCudaKernel(GpuLower(this, index_type).kernel());
}
std::unordered_map<std::string, std::pair<int, int>> Fusion::bankConflictInfo(
DataType index_type) {
GpuLower lower(this, index_type);
auto kernel = lower.kernel();
auto info = getBankConflictInfo(kernel);
// The container of exprs goes out of scope, so we return a map keyed by strings here
std::unordered_map<std::string, std::pair<int, int>> result;
result.reserve(info.size());
for (auto i : info) {
result[i.first->toString()] = i.second;
}
return result;
}
void Fusion::printMath(bool from_outputs_only) {
FUSER_PERF_SCOPE("Fusion::printMath");
FusionGuard fg(this);
auto exprs_for_print = exprs();
std::cout << "Inputs:" << std::endl;
for (auto inp : inputs()) {
std::cout << " " << inp << ", " << inp->getDataType().value() << std::endl;
}
std::cout << "Outputs:" << std::endl;
for (auto out : outputs()) {
std::cout << " " << out << ", " << out->getDataType().value() << std::endl;
}
// If we want everything in the fusion, grab all values without uses to
// traverse from.
if (!from_outputs_only) {
std::vector<Val*> leaf_vals;
for (auto val : deterministic_vals()) {
if (val->uses().empty()) {
leaf_vals.push_back(val);
}
}
exprs_for_print = StmtSort::getExprs(this, leaf_vals);
}
std::cout << "\n%kernel_math {\n";
for (auto expr : exprs_for_print) {
std::cout << expr;
}
std::cout << "}\n\n";
}
std::vector<Val*> Fusion::inputsAndCreated() {
auto result = inputs_;
for (auto expr : exprs()) {
auto tv_inputs = ir_utils::filterByType<TensorView>(expr->inputs());
if (tv_inputs.empty()) {
for (auto v : expr->outputs()) {
result.emplace_back(v);
}
}
}
return result;
}
void Fusion::printTransforms() {
FUSER_PERF_SCOPE("Fusion::printTransforms");
FusionGuard fg(this);
IrTransformPrinter t_exprs(std::cout);
t_exprs.handle(this);
}
void Fusion::registerVal(Val* val) {
if (inContainer(val)) {
return;
}
if (val->fusion()) {
TORCH_CHECK(
val->fusion() == this, val, " was not found in the active fusion.");
}
IrContainer::registerVal(val);
}
void Fusion::registerExpr(Expr* expr) {
if (inContainer(expr)) {
return;
}
if (expr->fusion()) {
TORCH_CHECK(
expr->fusion() == this, expr, " was not found in the active fusion.");
}
IrContainer::registerExpr(expr);
bool has_tv = false;
for (Val* input : expr->inputs()) {
has_tv = has_tv || input->isA<TensorView>();
assertInContainer(input, "Input to expr is invalid, ");
auto uses_copy = input->uses();
if (std::find(uses_copy.begin(), uses_copy.end(), expr) ==
uses_copy.end()) {
uses_copy.push_back(expr);
input->setUses(uses_copy);
}
}
// Kernel is the only container type that is non-ssa. This is mainly (maybe
// only) because of initialization expressions which would overwrite tensor
// view definitions.
bool is_ssa = !this->isA<kir::Kernel>();
for (Val* output : expr->outputs()) {
has_tv = has_tv || output->isA<TensorView>();
assertInContainer(output, "Output to expr is invalid, ");
if (output->definition() != nullptr && is_ssa) {
removeExpr(output->definition());
}
if (is_ssa || (!is_ssa && output->definition() == nullptr)) {
output->setDefinition(expr);
}
}
if (has_tv) {
resetTvUses();
}
}
void Fusion::resetTvUses() {
FUSER_PERF_SCOPE("Fusion::resetTvUses");
is_during_update_uses_ = true;
// getExprs only uses definition, so even if we've modified uses already to
// remove dead exprs, this could reinsert them. getExprs is also bounded by
// inputs as registered inputs will return nullptr as their definition.
const auto all_tvs = ir_utils::filterByType<TensorView>(vals_);
const auto used_exprs = StmtSort::getExprs(this);
for (auto tv : all_tvs) {
tv->setUses({});
}
// Same as in register expr
for (auto expr : used_exprs) {
for (Val* input : expr->inputs()) {
auto uses_copy = input->uses();
if (std::find(uses_copy.begin(), uses_copy.end(), expr) ==
uses_copy.end()) {
uses_copy.push_back(expr);
input->setUses(uses_copy);
}
}
}
all_tv_uses_valid_ = true;
is_during_update_uses_ = false;
}
std::vector<Val*> Fusion::usedMathVals() {
// Note that using fusion->inputs() as the argument for the first
// parameter of getAllValsBetween does not grab all used vals as
// there can be vals that are created inside a fusion without using
// anything from inputs. See, for example, tv0 in the
// FusionOuterSplit test.
const auto inputs = InputsOf::outputs(this, outputs());
auto used_math_vals = DependencyCheck::getAllValsBetween(
{inputs.begin(), inputs.end()}, outputs());
// When an expr has multiple outputs and only some of them are
// used, the rest aren't included in used_math_vals as they are not
// used. However, we want them to be included as they must show up
// in the fusion.
std::vector<Val*> vals_to_add;
std::unordered_set<Val*> added_vals;
for (auto val : used_math_vals) {
auto def = val->definition();
if (def == nullptr || def->outputs().size() < 2) {
continue;
}
for (auto out : def->outputs()) {
if (std::find(used_math_vals.begin(), used_math_vals.end(), out) ==
used_math_vals.end()) {
if (!added_vals.count(out)) {
vals_to_add.push_back(out);
added_vals.insert(out);
}
}
}
}
used_math_vals.insert(
used_math_vals.end(), vals_to_add.begin(), vals_to_add.end());
return used_math_vals;
}
std::vector<Val*> Fusion::terminatingMathVals() {
VectorOfUniqueEntries<Val*> result;
auto used_vals = usedMathVals();
for (auto v : used_vals) {
// Locate the vals that are not expr outputs but have valid definitions.
if (unordered_uses(v).empty() && v->definition() != nullptr) {
result.pushBack(v);
}
}
return result.vector();
}
std::unordered_set<Expr*> Fusion::unordered_uses(const Val* val) const {
return std::unordered_set<Expr*>(val->uses().begin(), val->uses().end());
}
Expr* Fusion::definition(const Val* val) const {
assertInContainer(val, "Cannot detect the definition of val, ");
return val->definition();
}
// Indicate to kernel to set itself up to generate random numbers
bool Fusion::isStochastic() {
for (auto expr : exprs()) {
if (expr->getExprType() == ExprType::RNGOp) {
return true;
}
}
return false;
}
std::vector<Val*> Fusion::getTerminatingOutputs() const {
FUSER_PERF_SCOPE("getTerminatingOutputs");
auto is_reachable_to_output = [](Val* val) {
// traverse to consumers of val and see if there is an output
std::deque<Val*> consumers;
for (auto use : val->uses()) {
for (auto consumer : use->outputs()) {
consumers.push_back(consumer);
}
}
while (!consumers.empty()) {
auto consumer = consumers.back();
consumers.pop_back();
if (consumer->isFusionOutput()) {
return true;
}
// consumer is not an output; proceed to its consumers
for (auto use : consumer->uses()) {
for (auto consumer_of_consumer : use->outputs()) {
consumers.push_back(consumer_of_consumer);
}
}
}
return false;
};
std::vector<Val*> terminating_outputs;
for (auto out : outputs()) {
// If there is another output reachable from this output, it's not
// terminating.
if (is_reachable_to_output(out)) {
continue;
}
terminating_outputs.push_back(out);
}
return terminating_outputs;
}
bool Fusion::isAliasCompatible(Val* left, Val* right) {
// Nullptr check
if (left == nullptr || right == nullptr) {
return false;
}
// DataType check
if (!left->getDataType().has_value() || !right->getDataType().has_value() ||
left->getDataType().value() != right->getDataType().value()) {
return false;
}
// ValType check
if (!left->getValType().has_value() || !right->getValType().has_value() ||
left->getValType().value() != right->getValType().value()) {
return false;
}
// Check same number of dimensions if both values are TensorViews
if (ir_utils::isTV(left) && ir_utils::isTV(right)) {
return left->as<TensorView>()->nDims() == right->as<TensorView>()->nDims();
}
return false;
}
void Fusion::aliasOutputToInput(Val* output, Val* input) {
// Because we may replace `output` with a cast below when `input` is itself a cast.
TORCH_INTERNAL_ASSERT(
!output->isFusionOutput(),
"Do NOT add aliased output to fusion output outside of `aliasOutputToInput");
if (!input->isFusionInput()) {
auto input_expr = input->definition();
// TORCH_INTERNAL_ASSERT(input_def.etype() == ExprType::UnaryOp, "expected
// unary op for aliased input");
TORCH_INTERNAL_ASSERT(
input_expr->isA<UnaryOp>(), "expected unary op for aliased input");
auto input_uop = input_expr->as<UnaryOp>();
TORCH_INTERNAL_ASSERT(
input_uop->getUnaryOpType() == UnaryOpType::Cast,
"expected aliased input to be output of cast op");
input = input_uop->in();
}
TORCH_INTERNAL_ASSERT(
input->getDataType().has_value() && output->getDataType().has_value(),
"requires DataType to be available for aliased output to input");
if (input->getDataType().value() != output->getDataType().value()) {
output = castOp(input->getDataType().value(), output);
}
// TODO: output should be marked at the end of fusion definition #1488
addOutput(output);
TORCH_INTERNAL_ASSERT(
isAliasCompatible(input, output),
"The input and output values are not alias-compatible.");
io_alias_[output] = input;
}
Val* Fusion::getOutputAlias(Val* output) {
auto search = io_alias_.find(output);
if (search != io_alias_.end()) {
return search->second;
}
return nullptr;
}
std::unordered_set<int> Fusion::getOutputAliasIndices() const {
if (io_alias_.empty()) {
return {};
}
std::unordered_set<int> alias_indices;
for (const auto i : c10::irange(outputs_.size())) {
if (io_alias_.count(outputs_[i]) != 0) {
alias_indices.insert(i);
}
}
return alias_indices;
}
std::vector<std::pair<int, int>> Fusion::getInputAliasIndices() const {
if (io_alias_.empty()) {
return {};
}
std::vector<std::pair<int, int>> alias_indices;
for (const auto i : c10::irange(outputs_.size())) {
if (io_alias_.count(outputs_[i]) != 0) {
bool found = false;
for (const auto j : c10::irange(inputs_.size())) {
if (io_alias_.at(outputs_[i]) == inputs_[j]) {
alias_indices.emplace_back(i, j);
found = true;
break;
}
}
TORCH_INTERNAL_ASSERT(
found,
"io_alias_ mapping failure, alias output is not present in inputs");
}
}
// can't assert here, we could have segmented fusion where not all alias
// outputs are present
return alias_indices;
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
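
The copy and move operations implemented above can be exercised directly. A minimal sketch (the IR built while the guard is active is omitted):

Fusion original;
{
  FusionGuard fg(&original);
  // ... define tensors and expressions while `original` is the active fusion (omitted) ...
}
Fusion deep_copy(original);          // copy constructor -> Fusion::copy clones all Vals/Exprs
Fusion taken(std::move(deep_copy));  // move constructor swaps contents; deep_copy is left empty
original.printMath();                // print the arith exprs reachable from the outputs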

View File

@ -1,288 +0,0 @@
#pragma once
#include <ATen/core/ivalue.h>
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <ir_base_nodes.h>
#include <ir_container.h>
#include <iter_visitor.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Usage: FusionGuard and Fusion are required user interfaces for any operation
//! underlying the code generator. In order to create values, expressions, and
//! generate code a Fusion instance must be active. It is the responsibility of
//! the user to create a Fusion instance and register it with the fusion guard.
//! The simplest example of this is:
//!
//! Fusion fusion;
//! FusionGuard fg(&fusion);
//!
//! Once a fusion is active all values and operations will be registered with
//! it.
//!
//! FusionGuard and Fusion are critical to the lifetime model of the IR system.
//! FusionGuard is a convenient way to set what base container instance holds
//! the defined IR. Statements that are defined are registered through the
//! FusionGuard with a particular Fusion. FusionGuard provides convenient
//! methods to access the active fusion so it doesn't need to be passed around
//! constantly. Any IR node derived classes from Statement must register with
//! Fusion to avoid memory leaks.
//!
//! Fusion is generally thought of as a translated fusion group from the JIT. It
//! is likely a single kernel, although, we don't have to stick to this in the
//! future and could in theory generate multiple kernels with an executor to run
//! them.
//!
//! Fusion also allows users to set input/output values that will allow us to
//! figure out how to hook up runtime data to and from the JIT as well as
//! provide us mechanisms for dependency analysis and DCE including safety
//! checks.
class Fusion;
class TensorView;
class WelfordResult;
class SegmentCandidateFinder;
class SegmentedFusion;
class KernelArgumentHolder;
//! FusionGuard is our "context manager". It holds the active fusion and
//! allows it to be accessed anywhere through FusionGuard::getCurFusion()
class TORCH_CUDA_CU_API FusionGuard {
public:
Fusion* prev_fusion;
//! Set the active fusion so it can be manipulated.
explicit FusionGuard(Fusion* fusion);
~FusionGuard();
static Fusion* getCurFusion();
static void setCurFusion(Fusion* fusion);
};
//! Fusion is mutable but unique. Nodes cannot be copied in any way from one
//! Fusion to another. If anything like that is desired, it would require
//! duplicating all associated values and exprs. Fusion is considered to be SSA,
//! though this could also change in the future if there is a good reason to do
//! so.
//!
//! The Fusion owns the whole IR graph (Vals and Exprs)
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Fusion : public IrContainer {
typedef std::unordered_map<int, std::vector<int64_t>> PermutationMap;
public:
Fusion() = default;
Fusion(const Fusion& other);
Fusion(Fusion&& other) noexcept;
Fusion& operator=(const Fusion& other);
Fusion& operator=(Fusion&& other) noexcept;
~Fusion() override;
friend void swap(Fusion& a, Fusion& b) noexcept;
void clear() noexcept;
//! Break dependency chains associated with Expr, remove references to expr,
//! and delete expr
void removeExpr(Expr* expr) override;
//! Completely remove val from the fusion, break all dependencies associated
//! with it
void removeVal(Val* val) override;
//! Register input as an input of the fusion
void addInput(Val* input);
//! Register output as an output of the fusion
void addOutput(Val* output);
//! Deregister input as an input of the fusion
void removeInput(Val* input);
//! Deregister output as an output of the fusion
void removeOutput(Val* output);
//! Replace output with another value
void replaceOutput(Val* output, Val* replacement);
//! Assert that all leaves found from outputs are registered as an input
void validateInputs();
//! Print this fusion to the console
void print();
//! Print Arith exprs
//! \param from_outputs_only Only print exprs reachable from outputs
void printMath(bool from_outputs_only = true);
//! Print transformations used in fusion (can be very verbose)
void printTransforms();
//! Lower the fusion and print a kernel
void printKernel(DataType index_type = DataType::Int);
//! Lower the fusion and evaluate bank conflict info
std::unordered_map<std::string, std::pair<int, int>> bankConflictInfo(
DataType index_type = DataType::Int);
//! Return a list of topologically sorted expressions. This only includes
//! exprs required to generate registered outputs.
std::vector<Expr*> exprs();
//! Return a vector of fusion inputs that feed this Val
std::vector<Val*> inputsOf(Val* val);
//! Return all Vals in math expressions that cannot be eliminated.
//!
//! It is generally equivalent to vals that are used to generate
//! outputs, however, when a multi-output expression exists, and only
//! some of the outputs are used, the remaining unused outputs are
//! also included as they must show up in the final code.
std::vector<Val*> usedMathVals();
//! Returns all vals that are produced by used math expressions and
//! also do not have further consumers.
//!
//! In the case of an active multi-output expression, the returned vector
//! will include the expression outputs that did not lead to a fusion
//! output.
std::vector<Val*> terminatingMathVals();
//! Return all Exprs that use val
std::unordered_set<Expr*> unordered_uses(const Val* val) const;
//! Return the Expr that produces val
Expr* definition(const Val* val) const;
//! Indicate to kernel to set itself up to generate random numbers
bool isStochastic();
//! Run fusion segmentation algorithm to create a segmented fusion
std::unique_ptr<SegmentedFusion> segment(const KernelArgumentHolder& args);
const auto& inputs() const {
return inputs_;
}
std::vector<Val*> inputsAndCreated();
const auto& outputs() const {
return outputs_;
}
std::vector<Val*> getTerminatingOutputs() const;
// Aliases an output to an input value; this is a WAR to allow in-place updates on
// an input tensor.
// Note: this is not always safe and should be used with extra caution.
// Currently the only place it's used is in the running-stats update for batch
// normalization.
// TODO: segmentation should be made aware of aliases, so we always include
// the input tensor in the segment where the output is produced.
void aliasOutputToInput(Val* output, Val* input);
Val* getOutputAlias(Val* output);
std::unordered_set<int> getOutputAliasIndices() const;
std::vector<std::pair<int, int>> getInputAliasIndices() const;
// mark input at index to be permuted by permutation
void setPermutationOnInput(int index, std::vector<int64_t> permutation) {
permuted_input_map_.insert({index, permutation});
}
// mark output at index to be restored by permutation
void setPermutationOnOutput(int index, std::vector<int64_t> permutation) {
permuted_output_map_.insert({index, permutation});
}
// return a map of indices to permutation, which indicates all input tensors
// that need to be permuted
const PermutationMap& getPermutationInputMap() const {
return permuted_input_map_;
}
// return a map of indices to permutation, which indicates all output tensors
// that need to be permuted
const PermutationMap& getPermutationOutputMap() const {
return permuted_output_map_;
}
bool isTVUseInfoValid() {
return all_tv_uses_valid_;
}
bool isUpdatingTVUseInfo() {
return is_during_update_uses_;
}
const auto& ioAlias() const {
return io_alias_;
}
protected:
friend SegmentCandidateFinder;
friend SegmentedFusion;
friend class TranslateApplicableWelford;
friend Val;
static IrCloner copy(const Fusion* from, Fusion* to);
//! Register the Val with this fusion
virtual void registerVal(Val* val) override;
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. If this container is not a Kernel, it will remove previous
//! definitions of outputs and register this Expr as the definition. Otherwise
//! it will update the definition if not previously set, but will not remove old
//! definitions.
virtual void registerExpr(Expr* expr) override;
//! Clear Expr's from TV uses that are not required to produce outputs from
//! inputs. The only other place this is used (other than Fusion) is in
//! Val::uses()
void resetTvUses();
private:
// Determine if the two values are compatible for aliasing
// Same DataType, ValType, and number of dimensions
bool isAliasCompatible(Val* left, Val* right);
private:
// Fusion inputs and outputs
std::vector<Val*> inputs_;
std::vector<Val*> outputs_;
// io alias pointing from output to input
std::unordered_map<Val*, Val*> io_alias_;
// See Note [ Permutation support in nvfuser ]
// map from indices of input tensor to permutation
PermutationMap permuted_input_map_;
// map from indices of output tensor to permutation
PermutationMap permuted_output_map_;
// Records whether the current use data in the IR nodes is valid;
// the states are either all valid or all invalid
bool all_tv_uses_valid_ = false;
bool is_during_update_uses_ = false;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
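
A hedged sketch of the aliasing API declared above. `fusion`, `tv_in`, and `tv_out` are assumed to have been built elsewhere (tensor creation helpers are not part of this header) with matching data type and rank:

fusion.addInput(tv_in);
// Records the alias and also registers tv_out as an output (see aliasOutputToInput above);
// tv_out will update tv_in in place at runtime.
fusion.aliasOutputToInput(tv_out, tv_in);

std::unordered_set<int> aliased_outputs = fusion.getOutputAliasIndices();
for (const auto& [out_idx, in_idx] : fusion.getInputAliasIndices()) {
  // the out_idx-th output writes in place into the in_idx-th input
}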

File diff suppressed because it is too large

View File

@ -1,628 +0,0 @@
#pragma once
#include <fusion.h>
#include <ir_base_nodes.h>
#include <kernel_cache.h>
#include <scheduler/all_schedulers.h>
#include <scheduler/registry.h>
#include <utils.h>
#include <deque>
#include <list>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class SegmentedGroup;
class SegmentCandidateFinder;
// A directed edge on the DAG.
// A wrapper for a value connecting two segmented groups, which are made up
// of Exprs. Multiple edges can exist between the same pair of segmented groups.
struct SegmentedEdge {
SegmentedEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val)
: from(from), to(to), val(val) {}
SegmentedGroup* from;
SegmentedGroup* to;
Val* val;
void print() const;
};
std::ostream& operator<<(std::ostream& os, const SegmentedEdge* edge);
//! Groups together expressions which create a segmented group
//! Can be used to produce fusions
class TORCH_CUDA_CU_API SegmentedGroup {
public:
SegmentedGroup(SegmentedFusion* segmented_fusion)
: segmented_fusion_(segmented_fusion) {}
SegmentedGroup(Expr* expr, SegmentedFusion* segmented_fusion)
: segmented_fusion_(segmented_fusion) {
exprs_.push_back(expr);
}
//! Checks if this group takes original fusion's input
bool isInputGroup() {
return !input_vals.empty();
};
//! Checks if this group is used anywhere in the segmented fusion
bool isConnected() const {
return !producer_edges.empty() || !consumer_edges.empty() ||
!output_vals.empty();
}
//! returns the id assigned by segment pass
int groupId() const {
return group_id_;
}
//! Returns inputs that this group shares with the original fusion
const auto& inputs() const {
return input_vals;
}
//! Returns outputs that this group shares with the original fusion
const auto& outputs() const {
return output_vals;
}
//! Returns the schedule heuristic associated with this group
ScheduleHeuristic heuristic() const {
return heuristic_;
}
//! Returns the exprs that make up this group
const auto& exprs() const {
return exprs_;
}
//! Debug print function
void print() const;
//! Returns the segmented fusion that this group is in
SegmentedFusion* segmentedFusion() const {
return segmented_fusion_;
}
//! Utility to re-collect the operators included in this
//! segmented group after updating the group boundary.
void resetExprList();
//! Try to get a scheduler entry for this group with
//! the given runtime info.
//! Returns a new scheduler with the same heuristics
//! for this group if possible.
//! Note that the schedule params can be different.
//! Returns a nullopt if this group cannot be scheduled
//! with the same heuristics.
c10::optional<std::unique_ptr<SchedulerEntry>> getMaybeSchedulerEntry(
SchedulerRuntimeInfo& runtime_info);
public:
//! "Ancestor nodes", towards inputs of segmentedDAG
std::vector<SegmentedEdge*> producer_edges;
//! "Descendent nodes", towards outputs of segmentedDAG
std::vector<SegmentedEdge*> consumer_edges;
//! Composite Fusion inputs in this group
std::vector<Val*> input_vals;
//! Composite Fusion outputs in this group
std::vector<Val*> output_vals;
private:
friend class SegmentCandidateFinder;
friend class SegmentedFusion;
friend class FusionKernelRuntime;
friend class TranslateApplicableWelford;
//! unique identifier of group in the segmented fusion
int group_id_ = -1;
//! The scheduler to use for compiling this group
ScheduleHeuristic heuristic_ = ScheduleHeuristic::None;
//! Exprs that make up the group
std::vector<Expr*> exprs_;
//! Maximum path distance from an input segmented group required for
//! Theorem 4.2
int level_ = -1;
//! traversal marker, has this node already been processed
bool visited_ = false;
//! Did we select another group to merge with
SegmentedGroup* merge_with_ = nullptr;
//! if we selected another group to merge, which edge is to be contracted
SegmentedEdge* merge_through_ = nullptr;
//! Has this node been merged?
bool merged_ = false;
private:
//! Utility to convert edge vector to value vector
std::vector<Val*> edgesToVals(const std::vector<SegmentedEdge*>& se_v);
//! Reset method to call at the beginning of each
//! merge node iteration
void clearTraversalInfo();
//! To be called at the very end of fusion segmentation;
//! no more segment merging should be done beyond this point
void finalize();
//! Return all segmented groups connected with *this
std::vector<SegmentedGroup*> getNeighbors();
//! Utility struct to represent a group connection
//! both the group to connect with and the edge
//! to connect through
struct NeighborGroup {
NeighborGroup(SegmentedGroup* g, SegmentedEdge* e) : group(g), edge(e) {}
SegmentedGroup* group;
SegmentedEdge* edge;
};
//! TODO: May want to sort this based on size of connections between this and
//! neighbors as well as if the connection is an output of the fusion (has to
//! be saved to gmem anyways)
std::vector<NeighborGroup> getNeighborGroups();
//! Look at all neighbors of this and return who this could merge with based
//! on level values of this, neighbors, and merged neighbors of neighbors
std::vector<NeighborGroup> getMergeCandidates();
//! Assign schedule heuristic to this group
void setHeuristic(ScheduleHeuristic sh) {
heuristic_ = sh;
}
//! Assign Id for this group
void setID(int id) {
TORCH_INTERNAL_ASSERT(group_id_ == -1);
group_id_ = id;
}
//! SegmentedFusion this group belongs to
SegmentedFusion* segmented_fusion_;
};
std::ostream& operator<<(std::ostream& os, const SegmentedGroup* group);
//! Auxiliary class for storing heuristics. The managed data is either
//! a single scheduler entry for complete fusion,
//! or a vector of schedulers, one for each segment, for segmented fusion.
class TORCH_CUDA_CU_API FusionHeuristics {
using SchedulerEntryOwningPtr = std::unique_ptr<SchedulerEntry>;
public:
//! Constructor for segmented fusion case. Created with empty list and
//! uses emplaceBack for inserting heuristics in order
explicit FusionHeuristics() = default;
//! Constructor for complete fusion case, generates the scheduler entry
//! for the fusion owning the given expression
explicit FusionHeuristics(
ScheduleHeuristic schedule_heuristic,
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr) {
heuristics_.emplace_back(SchedulerEntry::makeEntry(
schedule_heuristic, runtime_info.fusion(), runtime_info, data_cache));
is_segmented_ = false;
}
FusionHeuristics(const FusionHeuristics&) = delete;
FusionHeuristics& operator=(const FusionHeuristics&) = delete;
//! Place a scheduler entry on the list. Applies to segmented fusion only.
void emplaceBack(SchedulerEntryOwningPtr&& pt) {
TORCH_INTERNAL_ASSERT(is_segmented_);
heuristics_.emplace_back(std::move(pt));
}
//! Returns the list of schedulers for a segmented fusion.
const std::vector<SchedulerEntryOwningPtr>& heuristicsList() const {
return heuristics_;
}
//! Returns the single scheduler for a complete fusion.
SchedulerEntry* singleKernelHeuristics() {
TORCH_INTERNAL_ASSERT(!is_segmented_);
return heuristics_.begin()->get();
}
private:
std::vector<SchedulerEntryOwningPtr> heuristics_;
bool is_segmented_ = true;
};
//! Exported Interface for representing segmented fusion graph
//! this class owns the segmented groups
class TORCH_CUDA_CU_API SegmentedFusion {
public:
explicit SegmentedFusion(std::unique_ptr<Fusion> fusion);
//! Factory function for the un-segmented case, directly
//! constructs a "SegmentedFusion", with the given Fusion
//! as the only group.
static std::unique_ptr<SegmentedFusion> fromCompleteFusion(
std::unique_ptr<Fusion> fusion,
ScheduleHeuristic heuristic);
//! Is the fusion segmented?
bool isSegmented() const {
return !groups_.empty();
}
std::vector<SegmentedGroup*>& groups() {
return groups_;
}
std::vector<SegmentedEdge*>& edges() {
return edges_;
}
const std::vector<SegmentedGroup*>& cgroups() const {
return groups_;
}
const std::vector<SegmentedEdge*>& cedges() const {
return edges_;
}
//! Returns the original un-segmented fusion
Fusion* completeFusion() const {
return complete_fusion_.get();
}
const auto& inputs() const {
return complete_fusion_->inputs();
}
const auto& outputs() const {
return complete_fusion_->outputs();
}
Val* findAlias(Val* val) const {
auto alias_it = complete_fusion_->ioAlias().find(val);
if (alias_it != complete_fusion_->ioAlias().end()) {
return alias_it->second;
}
return nullptr;
}
//! Make a clone of the group and convert to fusion
std::unique_ptr<Fusion> makeFusion(SegmentedGroup* sg);
//! Make heuristics for all groups in this segmented fusion
std::unique_ptr<FusionHeuristics> makeInitialHeuristics(
const KernelArgumentHolder& inputs);
//! Inline Debug print for segmented fusion
std::string toString(int verbosity) const;
//! Debug drawing for graphviz
void draw();
//! Debug print for segmented fusions
void print() const;
//! API for adding groups
SegmentedGroup* newGroup();
//! API shortcut for adding a singleton group
SegmentedGroup* newGroup(Expr* expr);
//! API for adding edges
SegmentedEdge* newEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val);
HeuristicSummary* getCachedHeuristicDataFor(SegmentedGroup* group);
private:
//! Unique name for segmented fusion
int segmented_fusion_name_;
//! States representing segmentation
std::vector<SegmentedEdge*> edges_;
std::vector<SegmentedGroup*> groups_;
//! Owning object to explicitly manage groups and edges
class Impl {
public:
explicit Impl(SegmentedFusion* sf) : owning_fusion_(sf) {}
SegmentedGroup* makeGroup();
SegmentedGroup* makeGroup(Expr*);
SegmentedEdge* makeEdge(SegmentedGroup* from, SegmentedGroup* to, Val* val);
void cleanUnused();
private:
using GroupPtr = std::unique_ptr<SegmentedGroup>;
using EdgePtr = std::unique_ptr<SegmentedEdge>;
std::vector<GroupPtr> groups_;
std::vector<EdgePtr> edges_;
SegmentedFusion* owning_fusion_;
};
Impl impl_;
//! A Copy of original full fusion
std::unique_ptr<Fusion> complete_fusion_;
//! A set of intermediate tensors that need to be cast to fp16
std::unordered_set<TensorView*> force_fp16_tv_set_;
DataType force_half_precision_type_;
//! Static traversal information to be used for fast heuristics lookup
std::unordered_map<SegmentedGroup*, std::unique_ptr<HeuristicSummary>>
heuristic_summary_cache_;
// TODO: this class needs cleanup
protected:
friend class SegmentCandidateFinder;
//! Make a heuristics entry for a group and parameters
std::unique_ptr<SchedulerEntry> makeInitialSchedulerEntry(
SegmentedGroup* sg,
SchedulerRuntimeInfo& runtime_info);
//! Cleanup function to be called at the end of the fusion
//! segment pass
void finalize();
//! Collect all the intermediate tensors between segmented
//! groups that will be cast to fp16
void annotateFP16IntermediateTensors();
//! Keep heuristic checking intermediate data
void setCachedHeuristicDataFor(
SegmentedGroup* group,
std::unique_ptr<HeuristicSummary> data);
//! Utility to give unique name for each segmented fusion
static size_t segmentedFusionName() {
static size_t counter = 0;
return counter++;
}
};
//! This is a base class for segmenter analysis.
//! It provides the minimal implementation in the header so that
//! a unique_ptr can use this base class;
//! actual implementations of analyses are in the .cpp files.
//! TODO: In the next refactor PR, we should put the segment candidate
//! finder in the .cpp file completely since the API doesn't require these
//! details
class SegmenterAnalysis : public PolymorphicBase {};
class GroupDependencyAnalysis;
// Manual node merging passes
class CombineReductions;
//! Options to configure/debug candidate finder
struct TORCH_CUDA_CU_API SegmentCandidateFinderOptions {
bool run_translate_welford = true;
bool run_combine_reductions = true;
bool run_herrmann_merge = true;
bool run_final_merge = true;
};
//! SegmentCandidateFinder
//! Responsible for going through DAG and proposing things we could try to
//! fuse together, calls "canGenerateCode" on these proposed segments to see
//! if they are valid and we can generate code for them.
//! FusionSegment
//! A group of exprs that are segmented together
//! FusionSegmentConnections
//! Holds vals and what they connect. In other words it's a val that is an
//! output of a FusionSegment "from" and an input of FusionSegment "to".
//! There's nothing preventing from a val being between segments twice.
//! TODO: make sure there's nothing wrong with segmentation on nodes that
//! have the same value input twice. i.e. (B = A*A)
//! Selecting segments to propose is based on Theorem 4.2 in the paper, which
//! ensures that after segmentation the segmented graph will be a DAG (assuming the
//! Fusion is already a DAG). The segmentation code relies on assumptions of DAG-ness
//! during segmentation, meaning proposed merging of groups must maintain the
//! DAG property of the graph.
//!
//! Julien Herrmann, Yusuf Özkaya, Bora Uçar, Kamer Kaya, Umit Catalyurek.
//! Multilevel Algorithms for Acyclic Partitioning of Directed Acyclic Graphs.
//! SIAM Journal on Scientific Computing, Society for Industrial and Applied
//! Mathematics, 2019, 41 (4), pp. A2117-A2145. doi:10.1137/18M1176865.
//! hal-02306566
class TORCH_CUDA_CU_API SegmentCandidateFinder {
public:
// Perform segmentation on a copy of the given fusion
static std::unique_ptr<SegmentedFusion> segment(
const Fusion* fusion,
const KernelArgumentHolder& inputs,
SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) {
auto fusion_copy = std::make_unique<Fusion>(*fusion);
if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) {
std::cout << "Segment the fusion (Original Fusion Un-modified): "
<< std::endl;
fusion_copy->printMath();
}
SegmentCandidateFinder scf(std::move(fusion_copy), inputs, options);
return std::move(scf.segmented_fusion_);
}
// Perform segmentation on and take ownership of the given fusion
static std::unique_ptr<SegmentedFusion> segment(
std::unique_ptr<Fusion> fusion,
const KernelArgumentHolder& inputs,
SegmentCandidateFinderOptions options = SegmentCandidateFinderOptions()) {
SegmentCandidateFinder scf(std::move(fusion), inputs, options);
if (isDebugDumpEnabled(DebugDumpOption::FusionSegments)) {
std::cout << "Segment the fusion (Original Fusion Un-modified): "
<< std::endl;
scf.completeFusion()->printMath();
}
return std::move(scf.segmented_fusion_);
}
static bool TranslateWelfordInFusion(
Fusion* fusion,
const KernelArgumentHolder& runtime_inputs);
private:
// Perform segmentation on and take ownership of the given fusion
SegmentCandidateFinder(
std::unique_ptr<Fusion> fusion,
const KernelArgumentHolder& inputs,
SegmentCandidateFinderOptions options);
void resetTraversal();
void resetLevels();
SegmentedGroup* mergeNodes();
bool codeGenSupportedMerge(SegmentedGroup* group1, SegmentedGroup* group2);
void findSegments();
std::unordered_set<SegmentedEdge*> disconnectGroup(SegmentedGroup* group);
std::vector<SegmentedGroup*>& groups() {
TORCH_INTERNAL_ASSERT(
segmented_fusion_ != nullptr, "Segment finder not owning any fusion");
return segmented_fusion_->groups();
}
std::vector<SegmentedEdge*>& edges() {
TORCH_INTERNAL_ASSERT(
segmented_fusion_ != nullptr, "Segment finder not owning any fusion");
return segmented_fusion_->edges();
}
Fusion* completeFusion() {
TORCH_INTERNAL_ASSERT(
segmented_fusion_ != nullptr, "Segment finder not owning any fusion");
return segmented_fusion_->completeFusion();
}
SchedulerRuntimeInfo& runtimeInfo() {
return runtime_info_;
}
ExpressionEvaluator& expressionEvaluator() {
return runtime_info_.expressionEvaluator();
}
//! Additional merging iteration that cleans up the rest of
//! the merging opportunities.
//! The algorithm of Herrmann et al. is fast and safe for finding merge candidates,
//! but it can become too conservative in our use cases because we place
//! additional qualifiers on valid merges beyond having to generate DAGs,
//! i.e. canSchedule. So we need a brute-force final merging iteration as a
//! clean-up pass. The cost isn't expected to be high since the graph at this
//! stage is already quite merged. Example cf. test_gpu.cpp:
//! FusionDAGMerging_CUDA
//!
//! This merging algorithm is based on Theorem 4.1 of Herrmann et al.,
//! to check if a producer-consumer pair can be merged into one group,
//! it's enough to check if any other consumer of the producer also
//! produces the consumer.
void finalMerge();
//! Duplicate and add all exprs producing the used
//! scalar values in group
void resolveScalarsInGroup(SegmentedGroup* group);
//! Duplicate and add all exprs from "inputs" in the group, to complete
//! inputs. These expressions are simply unary ops of inputs that we want to
//! recompute for each segment, instead of computing and producing a segmented
//! val. For example if we have:
//! tv1 = tv0 * 2;
//! tv3 = tv1 + tv2;
//! tv4 = tv1 + tv4
//! If we segmented on tv1, we would be producing an output for tv1 for the 2
//! groups that have tv3 or tv4; instead, we could easily recompute tv1 from
//! tv0.
void resolveInputsInGroup(SegmentedGroup* group);
//! Remove all scalar edges in group
//! (TODO: need to structure this better so we don't have to do this)
void removeScalarEdges();
//! Utility function to merge a vector of groups in one step,
//! need to check for DAG condition before using this method
SegmentedGroup* mergeAllGivenGroups(
const std::vector<SegmentedGroup*>& groups);
//! Utility to remove a group and corresponding edges
//! TODO: remove inline versions of this as much as possible
void eraseGroups(std::unordered_set<SegmentedGroup*>& groups_to_erase);
void finalize();
//! Return the resulting heuristic corresponding to the merged
//! group built by merging the two groups connected by edge
ScheduleHeuristic deriveHeuristic(SegmentedGroup* edge);
GroupDependencyAnalysis* getGroupDependency();
protected:
//! These are the merge node heuristic passes; they should
//! eventually have a dedicated interface
//! instead of us continually adding friends
friend class CombineReductions;
//! options to configure and debug the segment process
SegmentCandidateFinderOptions options_;
std::deque<SegmentedGroup*> to_visit_;
std::vector<SegmentedGroup*> next_to_visit_;
std::unordered_set<SegmentedGroup*> clean_up_groups_;
std::unordered_set<SegmentedEdge*> clean_up_edges_;
std::vector<SegmentedGroup*> to_merge_;
std::unique_ptr<SegmentedFusion> segmented_fusion_;
std::unique_ptr<SegmenterAnalysis> group_dependency_;
SchedulerRuntimeInfo runtime_info_;
//! Note:
//! Segmenter should eventually rely only on runtime_info_ for
//! safe caching. runtime_inputs_ is only used in translateWelford
//! to initialize expression evaluators on copies of the original
//! fusion, which doesn't use any un-cached info and is safe.
//!
//! Directly using runtime_inputs_ in other cases is in general
//! risky.
//!
//! To get rid of runtime_inputs_ we need mechanisms
//! to copy expression evaluator values from fusion
//! to a copy, or even better to a copy of a
//! sub-graph of original fusion.
//! TODO:
//! implement the expression evaluator transfer and
//! remove runtime_inputs_ in a follow up.
const KernelArgumentHolder& runtime_inputs_;
};
// TODO: Make as member functions on classes instead of global scope
TORCH_CUDA_CU_API std::string toString(const SegmentedGroup* group);
TORCH_CUDA_CU_API std::string toString(const SegmentedEdge* edge);
TORCH_CUDA_CU_API std::string toString(const SegmentedFusion* segmented_fusion);
TORCH_CUDA_CU_API std::string toString(
const SegmentCandidateFinderOptions& segment_options);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
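
A hedged usage sketch of the segmentation entry point declared above. `fusion` is assumed to be a std::unique_ptr<Fusion> and `args` a KernelArgumentHolder built elsewhere:

SegmentCandidateFinderOptions options;  // all four passes enabled by default (see struct above)
std::unique_ptr<SegmentedFusion> segmented =
    SegmentCandidateFinder::segment(std::move(fusion), args, options);
if (segmented->isSegmented()) {
  segmented->print();                   // debug print of groups and edges
  for (SegmentedGroup* group : segmented->groups()) {
    group->print();                     // per-group exprs, edges, and heuristic
  }
}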

File diff suppressed because it is too large

View File

@ -1,204 +0,0 @@
#include <ir_builder.h>
#include <ir_utils.h>
#include <root_domain_map.h>
#include <transform_iter.h>
#include <grouped_reduction.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
// Return if ref and other are transformed in the same way.
bool hasMatchingTransformations(TensorView* ref, TensorView* other) {
std::unordered_map<IterDomain*, IterDomain*> ref_2_other;
for (const auto i : c10::irange(ref->getRootDomain().size())) {
ref_2_other.emplace(
ref->getRootDomain().at(i), other->getRootDomain().at(i));
}
auto replay =
BestEffortReplay(
other->domain()->domain(), ref->domain()->domain(), ref_2_other)
.getReplay();
for (const auto i : c10::irange(ref->nDims())) {
auto ref_id = ref->axis(i);
auto other_id = other->axis(i);
auto it = replay.find(ref_id);
if (it == replay.end() || it->second != other_id) {
return false;
}
}
return true;
}
// Validate grouping of reductions and return a new max producer position
void validateReductionGrouping(
const std::vector<Val*>& inputs,
const std::vector<Val*>& outputs) {
TORCH_INTERNAL_ASSERT(inputs.size() == outputs.size());
TORCH_INTERNAL_ASSERT(!inputs.empty());
auto fusion = dynamic_cast<Fusion*>(outputs[0]->container());
TORCH_INTERNAL_ASSERT(
fusion != nullptr, "Grouping of reductions must be done within a Fusion");
ExactRootDomainMap exact_map(fusion);
// Pick the first output TV as a reference and compare it with the
// rest. Do not allow grouping if any mismatch is detected.
auto ref_tv = outputs[0]->as<TensorView>();
const auto ref_domain = ref_tv->getRootDomain();
const auto num_root_dims = ref_domain.size();
const auto num_dims = ref_tv->nDims();
const auto ref_ca_pos = ref_tv->getComputeAtPosition();
for (const auto i : c10::irange(inputs.size())) {
auto output_tv = outputs.at(i)->as<TensorView>();
const auto& output_domain = output_tv->getRootDomain();
if (ref_tv == output_tv) {
continue;
}
TORCH_INTERNAL_ASSERT(
output_domain.size() == num_root_dims,
"Invalid grouped reduction due to mismatched number of root dimensions. "
"Expected: ",
num_root_dims,
". Detected: ",
output_domain.size(),
". Invalid output tensor: ",
output_tv->toString());
TORCH_INTERNAL_ASSERT(
output_tv->nDims() == num_dims,
"Invalid grouped reduction due to mismatched number of dimensions. "
"Expected: ",
num_dims,
". Detected: ",
output_tv->nDims(),
". Invalid output tensor: ",
output_tv->toString());
for (const auto i : c10::irange(num_root_dims)) {
auto ref_id = ref_domain.at(i);
auto output_id = output_domain.at(i);
// If an IterDomain is broadcast, require that the corresponding
// IterDomains in the other outputs are also broadcast. This may not be
// necessary, but we are not completely certain.
TORCH_INTERNAL_ASSERT(
ref_id->isBroadcast() == output_id->isBroadcast(),
"Invalid grouped reduction due to mismatched broadcast root domains. ",
"Reference domain: ",
ref_id->toString(),
". Mismatched domain: ",
output_id->toString(),
". Invalid tensor: ",
output_tv->toString());
if (ref_id->isBroadcast()) {
continue;
}
TORCH_INTERNAL_ASSERT(
ref_id->isReduction() == output_id->isReduction(),
"Invalid grouped reduction due to mismatched reduction root domains. ",
"Reference domain: ",
ref_id->toString(),
". Mismatched domain: ",
output_id->toString(),
". Invalid tensor: ",
output_tv->toString());
TORCH_INTERNAL_ASSERT(
exact_map.areMapped(ref_id, output_id) || ref_id->sameAs(output_id),
"Invalid grouped reduction due to mismatched root domains. ",
"Reference domain: ",
ref_id->toString(),
". Mismatched domain: ",
output_id->toString(),
". Invalid tensor: ",
output_tv->toString());
}
TORCH_INTERNAL_ASSERT(
hasMatchingTransformations(ref_tv, output_tv),
"Invalid grouped reduction due to mismatched transformations. ",
"Reference tensor: ",
ref_tv->toString(),
". Mismatched tensor: ",
output_tv->toString());
// Must have the same computeAt position
TORCH_INTERNAL_ASSERT(
output_tv->getComputeAtPosition() == ref_ca_pos,
"Invalid grouped reduction due to mismatched computeAt position. ",
"Reference tensor: ",
ref_tv->toString(),
". Mismatched tensor: ",
output_tv->toString());
}
// Must not have any data dependency from outputs to inputs
const auto all_dep_vals = DependencyCheck::getAllValsBetween(
{outputs.begin(), outputs.end()}, inputs);
if (!all_dep_vals.empty()) {
std::stringstream ss;
ss << "Invalid dependency:";
for (auto val : all_dep_vals) {
ss << " " << val->toString();
}
TORCH_INTERNAL_ASSERT(all_dep_vals.empty(), ss.str());
}
}
} // namespace
void groupReductions(const std::vector<TensorView*>& reduction_outputs) {
TORCH_CHECK(!reduction_outputs.empty(), "No tensor is given");
auto container = reduction_outputs[0]->container();
const auto num_reductions = reduction_outputs.size();
std::vector<BinaryOpType> op_types(num_reductions);
std::vector<Val*> init_vals(num_reductions);
std::vector<Val*> outputs(num_reductions);
std::vector<Val*> inputs(num_reductions);
for (const auto i : c10::irange(num_reductions)) {
auto reduction_out = reduction_outputs.at(i);
TORCH_CHECK(
reduction_out->definition() != nullptr,
"Invalid tensor to group: ",
reduction_out->toString(),
". Definition not found");
auto rop = dynamic_cast<ReductionOp*>(reduction_out->definition());
TORCH_CHECK(
rop != nullptr,
"Invalid tensor to group: ",
reduction_out->toString(),
". Not an output of a ReductionOp: ",
reduction_out->definition()->toString());
// Fused reduction is only enabled during the lowering, so at this
// point it should be false.
TORCH_INTERNAL_ASSERT(
!rop->isAllreduce(), "Invalid ReductionOp: ", rop->toString());
op_types.at(i) = rop->getReductionOpType();
init_vals.at(i) = rop->init();
outputs.at(i) = rop->out();
inputs.at(i) = rop->in();
}
validateReductionGrouping(inputs, outputs);
IrBuilder::create<GroupedReductionOp>(
container, op_types, init_vals, outputs, inputs);
for (auto output : ir_utils::filterByType<TensorView>(outputs)) {
output->updateMaxProducerPosition();
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,41 +0,0 @@
#pragma once
#include <ir_all_nodes.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Horizontally fuse multiple reductions.
//!
//! Given a list of tensors produced by ReductionOp, create a new
//! GroupedReductionOp expression that takes the input tensors of the
//! original reductions and produces the given tensors, replacing
//! their defining expressions.
//!
//! GroupedReductionOp works just like ReductionOp with a potential
//! benefit of aggregating synchronizations across individual
//! reductions. See the reduction::gridReduce2 runtime function for a
//! two-input version of grid reduction.
//!
//! The grouped reductions must follow several constraints, which
//! include:
//! - There must not exist any data dependency between individual
//! reductions.
//! - All reduction output tensors must have the same number of
//! dimensions, the same transformations and the same axes to
//! reduce.
//!
//! Note that Welford is not allowed yet, though it should be
//! technically straightforward to support horizontal fusions of
//! welford ops. Unclear how common it would be in practice, though.
//!
//! \param reduction_outputs Tensors produced by ReductionOp
TORCH_CUDA_CU_API void groupReductions(
const std::vector<TensorView*>& reduction_outputs);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
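
A hedged sketch of the horizontal fusion described above. `tv_r0` and `tv_r1` are assumed to be TensorViews produced by two independent ReductionOps over matching domains (their construction is outside this header):

fusion.addOutput(tv_r0);
fusion.addOutput(tv_r1);
// Replaces the two ReductionOps with a single GroupedReductionOp defining both outputs.
groupReductions({tv_r0, tv_r1});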

File diff suppressed because it is too large

View File

@ -1,447 +0,0 @@
#pragma once
#include <iter_visitor.h>
#include <root_domain_map.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
/*
* Index compute takes in a list of indices typically generated from the
* surrounding for-loop nest. The number of indices is intended to match the
* number of dimensions of the incoming TensorView, which may have fewer or more
* dimensions than its root due to split/merge operations.
* Split/merge operations are then replayed backwards to produce resulting
* indices (based on input indices) that match the root dimensions.
*
* For example with GLOBAL tensor:
* TV[I, K]
* TV[Io, Ii{4}, K] = TV.split(I, factor=4)
* ALLOC: NONE
* INDEX: indexCompute {i, j, k} -> {i * 4 + j, k}
* FLATTENED_INDEX: {i * 4 + j, k} -> {(i * 4 + j) * K + k}
* PREDICATE: {i * 4 + j, k} -> i * 4 + j < I
*
*
* For example with SHARED tensor:
*
* global_TV[I, K]
* global_TV[Io, Ii{4}, K] = global_TV.split(I, factor=4)
* smem_TV.compute_at(global_TV, 1)
* global_TV.parallelize(1, threadIDx.x)
*
* ALLOC: alloc(smem_TV, 4 x K)
* INDEX: indexCompute(smem_TV, {threadIdx.x, k}) -> {threadIdx.x, k}
* FLATTENED_INDEX: {threadIdx.x * 4 + j, k} -> {(threadIdx.x * 4 + j) * K + k}
* PREDICATE: {threadIdx.x * 4 + j, k} -> threadIdx.x * 4 + j < I // Same as if
* global
*
*
* For example with LOCAL tensor:
* global_TV[I, K, L]
* global_TV[Io, Ii{4}, K, L] = global_TV.split(I, factor=4)
* reg_TV.compute_at(global_TV, 2)
* global_TV.parallelize(1, threadIDx.x)
* global_TV{i, j, k, l} -> { i * 4 + j, k, l }
* global_TV{ i * 4 + j, k, l } -> { (i * 4 + j) * K * L + k * L + l}
*
* ALLOC: alloc(reg_TV, K x L)
* INDEX: {k, l} -> {k, l}
* FLATTENED_INDEX: {k, l} -> {k * L + l}
* PREDICATE: i * 4 + j < I && k < K && l < L -> // Same as if global
*
* These indices can then be flattened later based on strides.
*/
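// Illustration only (not part of the original header): the GLOBAL-tensor example
// above written out as plain index arithmetic, with K the inner extent and 4 the
// split factor. The corresponding predicate is (i * 4 + j) < I.
inline long long example_flattened_index(long long i, long long j, long long k, long long K) {
  const long long root_i = i * 4 + j;  // replay the Split backwards: {i, j} -> {i * 4 + j}
  return root_i * K + k;               // flatten {root_i, k} with stride K
}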
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class ContigIDs;
class LoopIndexing;
struct IndexFromIdGraph;
class IndexCompute : public BackwardVisitor {
protected:
using BackwardVisitor::handle;
void handle(Split*) override;
void handle(Merge*) override;
void handle(Expr*) override;
void handle(Swizzle2D*) override;
// return extent_map_[id] if exists, else return id->extent()
Val* getExtent(IterDomain* id) const;
//! True if a domain is not used to index
bool isZero(IterDomain* id) const;
//! True if any dependent of a domain is not used to index
bool hasZeroMerged(IterDomain* id) const;
//! Returns the concrete ID from the compute at EXACT mode map if
//! concrete_id_pass == true, otherwise returns id passed in.
//! Helps unify the expr handling logic in reference domain and concrete id
//! based traversal.
IterDomain* maybeGetExactMapConcreteID(IterDomain* id);
//! (Concrete indexing pass only)
//! Collect permissive index binding from the given expression.
//! See also permissive_map_ and LoopIndexing::getBackwardOutOfLineExprList.
void collectIndexIntoPermissiveMap(const LoopIndexing& loop_indexing);
//! (Concrete indexing pass only)
//! Iterate through id_expr's input and pull index vals from permissive
//! map, when both of the following are true:
//! 1. the output id is missing in index_map_.
//! 2. the output id is found in permissive map.
void updateIndexMapFromPermissiveMap(const Expr* id_expr);
// Tensor domain we're mapping back to root
const TensorDomain* td_; // NOLINT
// Map we update as we propagate backward, containing all IDs in the
// propagation. Initial indices are mapped with this map at tv->domain()
// and are back propagated to tv->getRootDomain(). This index_map_ keeps the
// indices at intermediate IterDomain's in that back propagation.
std::unordered_map<IterDomain*, Val*> index_map_; // NOLINT
// Map from IterDomain to their broadcasted extent. If a TV has I0*I1 but its
// producer has B0*I1 this map will contain a mapping from the ID{B0*I1} to
// the extent I0*I1. Also contains updated extents if we merge in a 0 index.
// See zero_merged_in_.
std::unordered_map<IterDomain*, Val*> extent_map_; // NOLINT
// Keeps track of domains that do not contribute to indexing
std::unordered_set<IterDomain*> zero_domains_; // NOLINT
// This set keeps track of IterDomain's that have had a zero index merged into
// them. This happens if we do something like tv->axis(0)->split(4) and then
// tv->computeAt(1, ...). If this tensor is in smem or lmem, the backward
// indexing would be (0, i); when we do the backward computation, that zero
// and i would attempt to be merged together. We handle indices like these
// specially.
std::unordered_set<IterDomain*> zero_merged_in_;
// IDs that are a result of contiguous merges
std::unordered_set<IterDomain*> contig_ids_;
// Map from root to indexed domains
std::unordered_map<IterDomain*, IterDomain*> root_to_indexed_id_;
// Indicates whether we should propagate an index down a particular IterDomain
// path when there is a choice
std::unordered_set<IterDomain*> preferred_paths_;
// Map from IterDomains to halo-extended extents
std::unordered_map<IterDomain*, Val*> halo_extent_map_;
// Temporary flag which tells IndexCompute to use concrete id's from the exact
// map rather than the actual IDs used in the ID expressions.
bool concrete_id_pass_ = false;
// Mode of swizzle that are activated in this index compute
// instance. Will treat swizzles of different mode as no-op.
// Currently data mode swizzles are handled the same as before in the IndexSwizzle
// pass, while loop mode swizzles are handled early on in concrete indexing
// pass. See also [Note on swizzle mode]
SwizzleMode swizzle_mode_ = SwizzleMode::NoSwizzle;
// (Concrete id pass only)
// Contains the indexing math that could be resolved with only the
// iterdomains on the right of the consumer_tv's ca axis, i.e. the
// ones that correspond to the loops that consumer_tv would not
// share with any of its consumers.
// These indexing vals should be kept separate from index_map_ and
// should only be used when the indexing traversal follows the
// order defined in LoopIndexingAnalysis::traverseFromDomainVals.
std::unordered_map<IterDomain*, Val*> permissive_index_map_;
public:
const std::unordered_map<IterDomain*, Val*>& indexMap() const {
return index_map_;
}
const std::unordered_map<IterDomain*, Val*>& extentMap() const {
return extent_map_;
}
const std::unordered_set<IterDomain*>& zeroDomains() const {
return zero_domains_;
}
const std::unordered_set<IterDomain*>& zeroMergedIn() const {
return zero_merged_in_;
}
const std::unordered_map<IterDomain*, IterDomain*>& rootToContigID() const {
return root_to_indexed_id_;
}
// Propagate back from _td using initial_index_map
IndexCompute(
const TensorDomain* _td,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> _extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> _zero_merged_in,
std::unordered_set<IterDomain*> preferred_paths = {},
std::unordered_map<IterDomain*, Val*> halo_extent_map = {});
IndexCompute(
const TensorDomain* _td,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> _extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> _zero_merged_in,
const ContigIDs& contig_finder,
std::unordered_set<IterDomain*> preferred_paths = {},
std::unordered_map<IterDomain*, Val*> halo_extent_map = {});
// Entry point used for using concrete id based traversal. This traversal is
// assumed to start at leaf IDs provided by initial_index_map.
IndexCompute(
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> preferred_paths,
std::unordered_map<IterDomain*, Val*> concrete_halo_extent_map);
// Updates index_map, extent_map, and zero_merged_in based on id_map and
// returns a new IndexCompute ready to be used.
IndexCompute updateIndexCompute(
const TensorDomain* new_td,
const std::unordered_map<IterDomain*, IterDomain*>& id_map,
const ContigIDs& contig_finder) const;
// Interface to run index traversal through loop indexing analysis result to
// be used with the entry point for concrete id based traversal.
void run(const LoopIndexing& loop_indexing);
virtual void run();
};
//! Apply swizzle and update root indices accordingly
class IndexSwizzle : public IndexCompute {
public:
IndexSwizzle(
const TensorView* tv,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> zero_merged_in);
IndexSwizzle(
const TensorView* tv,
const TensorDomain* domain,
std::unordered_map<IterDomain*, Val*> initial_index_map,
std::unordered_map<IterDomain*, Val*> extent_map,
std::unordered_set<IterDomain*> zero_domains,
std::unordered_set<IterDomain*> zero_merged_in);
void run() override;
protected:
using IndexCompute::handle;
void handle(Expr* e) override;
void handle(Swizzle2D* swizzle_2d) override;
private:
const TensorView* tv_ = nullptr;
SwizzleType swizzle_type_ = SwizzleType::NoSwizzle;
std::vector<IterDomain*> ids_to_swizzle_;
std::unordered_set<IterDomain*> swizzled_ids_;
};
//! Predicate information of a root or contiguous merged domain
class RootPredicateInfo {
friend class Index;
public:
const auto& startPredicate() const {
return start_predicate_;
}
auto& startPredicate() {
return start_predicate_;
}
const auto& startOffset() const {
return start_offset_;
}
const auto& stopPredicate() const {
return stop_predicate_;
}
const auto& stopOffset() const {
return stop_offset_;
}
const auto& rootIds() const {
return root_ids_;
}
//! Return a false RootPredicateInfo, i.e., both start and stop
//! predicates are false.
static RootPredicateInfo getFalseInfo();
private:
// predicate for lower end
Bool* start_predicate_ = nullptr;
// predicate for upper end
Bool* stop_predicate_ = nullptr;
// Offset of the start predicate
Val* start_offset_ = nullptr;
// Offset of the stop predicate
Val* stop_offset_ = nullptr;
// Track which roots have been handled by the generated predicates
std::unordered_set<IterDomain*> root_ids_;
};
// Simple interface for IndexCompute
// If getComputeAtAxis and more generally TensorView const model is fixed, we
// can make the below tensorviews const.
class Index {
private:
// Producer indexing if it's in shared or local memory
static std::vector<Val*> getNonGlobalProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Consumer indexing if it's in shared or local memory
static std::vector<Val*> getNonGlobalConsumerStridedIndices(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Producer if it's in global memory
static std::vector<Val*> getGlobalProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Consumer indexing if it's in global memory
static std::vector<Val*> getGlobalConsumerStridedIndices(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// get the strides of a tensor used for the index lowering
static std::vector<Val*> getStrides(const TensorView* tv);
// get the root indices of a tensor used for the index lowering
static std::vector<Val*> getRootIndices(
const TensorView* tv,
const std::vector<kir::ForLoop*>& loops,
const IndexFromIdGraph& index_from_id_graph);
public:
// Indexing functions
// Consumer = Producer
// i.e. T0 = T1... -> T0 is the consumer, T1 is the producer
// Producer indexing dispatch
static kir::TensorIndex* getProducerIndex(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
// Consumer index dispatch
static kir::TensorIndex* getConsumerIndex(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
//! Returns a vector of strided indices mapped onto the (rfactor)
//! root domain of a producer tensor. The size of the returned
//! vector is guaranteed to be equal to the number of axes of the
//! indexing root domain.
static std::vector<Val*> getProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
//! Returns a vector of strided indices mapped onto the (rfactor)
//! root domain of a consumer tensor. The size of the returned
//! vector is guaranteed to be equal to the number of axes of the
//! indexing root domain.
static std::vector<Val*> getConsumerStridedIndices(
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
//! Returns the logical index linearized from a multi-dimension address into a
//! linear memory address of a consumer tensor. The returned index is intended to
//! be used for the computation of some tensor factories, such as: arange and
//! rand (for Philox pseudo random sequences)
static std::vector<Val*> getLinearLogicalIndex(
TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops);
//! Returns a vector of logical indices mapped onto the (rfactor)
//! root domain of a consumer tensor. The returned index is intended
//! to be used for the computation of some tensor factories, such as:
//! eye
static std::vector<Val*> getPerDimLogicalIndex(
TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops);
//! Takes a consumer tensorview and loop nest and generates predicates
//! associated with the concrete roots of the loop nest. Returns a list of
//! predicates, and a list of concrete roots they're associated with. It
//! is assumed that no predicate is required if index[i] is an index
//! directly from a for loop. This will not catch all cases if we actually
//! have static size information for example:
//!
//! TV[I].split(4)
//! would produce the code:
//! for(i : I/4)
//! for(j : 4)
//! if( i * 4 + j < TV.size(0))
//! TV[i * 4 + j]...
//!
//! However if we had TV.size[0] = 16 at "compile time" then we wouldn't
//! need the predicate. This will be caught by canOmitPredicate in the
//! predicate lowering
//!
//! unswitch_or_vec_loop is the for loop from which to start the unswitch-like
//! predicate. This is not a bool value because, if we have an unswitch loop
//! with a vectorized loop inside, we only want to base the "unswitch"-like
//! predicate on the vectorized loop.
static std::vector<RootPredicateInfo> getReferenceRootPredicates(
TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops,
kir::ForLoop* unswitch_or_vec_loop,
bool padding_predicate);
};
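// Illustrative lowering-side sketch (hypothetical helper, not part of the
// original header): for an expression T0 = T1 ..., T0 is the consumer and T1
// the producer, so indexing would be dispatched roughly as follows.
inline void indexConsumerAndProducerExample(
    TensorView* t1_producer,
    const TensorView* t0_consumer,
    const std::vector<kir::ForLoop*>& loops) {
  kir::TensorIndex* consumer_index = Index::getConsumerIndex(t0_consumer, loops);
  kir::TensorIndex* producer_index =
      Index::getProducerIndex(t1_producer, t0_consumer, loops);
  (void)consumer_index;
  (void)producer_index;
}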
// Used for local and shared index mapping. Returns a map from loops
// to loop indices as well as a set of loops that do not contribute to
// indexing.
// TODO: could be cleaned up further.
std::pair<
std::unordered_map<kir::ForLoop*, Val*>,
std::unordered_set<kir::ForLoop*>>
indexMapFromTV(
const TensorView* tv,
const std::vector<kir::ForLoop*>& loops,
kir::ForLoop* alloc_loop,
bool as_consumer,
kir::ForLoop* double_buffer_loop = nullptr);
//! Set "pragma unroll" required for loops that indexing of Local
//! tensors depends on.
//!
//! \param tv Indexed tensor
//! \param alloc_loop Allocation loop of tv
//! \param loops The current loop structure
//! \param id_map Producer-to-consumer map in case of indexing as producer
void ensureStaticIndexing(
const TensorView* tv,
kir::ForLoop* alloc_loop,
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, IterDomain*>& id_map = {});
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,306 +0,0 @@
#include <inlining.h>
#include <ir_utils.h>
#include <root_domain_map.h>
#include <transform_iter.h>
#include <utility>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
MaxPosCalculator::MaxPosCalculator(
const std::unordered_set<IterDomain*>& uninlinable_ids)
: uninlinable_ids_(uninlinable_ids) {
buildUnmappableDims();
}
void MaxPosCalculator::buildUnmappableDims() {
ComputeAtRootDomainMap root_map;
root_map.build();
auto all_tvs = ir_utils::allTvs(FusionGuard::getCurFusion());
for (auto tv : all_tvs) {
auto consumers = ir_utils::consumerTvsOf(tv);
for (auto consumer : consumers) {
// Grab dimensions in producer and consumer that are mappable to each other
// based on the computeAtRootDomainMap. This will tell us which dimensions
// can be inlined based on avoiding trying to inline non-trivial
// reduction structures.
auto mappable_roots =
root_map.getMappableDims(tv->domain(), consumer->domain());
for (auto tv_root_id : tv->getMaybeRFactorDomain()) {
if (mappable_roots.find(tv_root_id) == mappable_roots.end() &&
!tv_root_id->isTrivialReduction()) {
unmappable_dims_.emplace(tv_root_id);
}
}
}
}
}
bool MaxPosCalculator::isAllowedID(
IterDomain* id,
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const {
bool allowed = true;
if (!allow_reduction) {
allowed = allowed && !id->isReduction();
}
if (uninlinable_ids_.count(id)) {
return false;
}
if (!allow_vectorize) {
// Avoid inlining if marked as Vectorize or Group. In the case of
// BestEffort and MostInlined modes, avoid Unroll as well.
bool is_vectorize = isParallelTypeVectorize(id->getParallelType()) ||
id->getParallelType() == ParallelType::Group ||
(best_effort && id->getParallelType() == ParallelType::Unroll);
allowed = allowed && !is_vectorize;
}
if (!allow_unmappable) {
auto root_dom = tv->getMaybeRFactorDomain();
std::unordered_set<Val*> root_dom_set(root_dom.begin(), root_dom.end());
auto all_vals = DependencyCheck::getAllValsBetween(root_dom_set, {id});
bool is_unmappable = false;
for (auto val : all_vals) {
auto id = val->as<IterDomain>();
if (root_dom_set.count(val) > 0 && unmappable_dims_.count(id) > 0) {
is_unmappable = true;
break;
}
}
allowed = allowed && !is_unmappable;
}
return allowed;
}
size_t MaxPosCalculator::getMaxPosSelf(
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const {
auto dom = tv->domain()->domain();
auto iter = std::find_if(dom.begin(), dom.end(), [=](IterDomain* id) {
return !isAllowedID(
id,
tv,
best_effort,
allow_reduction,
allow_vectorize,
allow_unmappable);
});
return std::distance(dom.begin(), iter);
}
// Return the max position in producer that can be inlined to consumer
// Cannot inline:
// Vectorized dimensions in consumer
// Unrolled dimensions in consumer
size_t MaxPosCalculator::getMaxProducerPosFromConsumer(
TensorView* producer,
TensorView* consumer,
bool best_effort) const {
auto pairwise_root_map = PairwiseRootDomainMap(producer, consumer);
auto replay_CasP =
BestEffortReplay::replayCasP(consumer, producer, -1, pairwise_root_map);
auto p2c_replay_map = replay_CasP.getReplay();
for (size_t producer_pos = 0; producer_pos < producer->nDims();
producer_pos++) {
// If the producer position does not match the consumer, then we cannot
// inline into this position; otherwise the max producer position of
// the consumer will become invalid and expression sort will fail.
if (TransformReplay::getMatchedLeafPosWithoutReplayCasP(
consumer, producer, producer_pos + 1) < 0) {
return producer_pos;
}
auto map_it = p2c_replay_map.find(producer->axis(producer_pos));
if (map_it != p2c_replay_map.end()) {
auto c_id = map_it->second;
if (!isAllowedID(c_id, consumer, best_effort, true, false, true)) {
return producer_pos;
}
}
}
return producer->nDims();
}
size_t MaxPosCalculator::getMaxPosAll(
TensorView* tv,
bool best_effort,
bool check_siblings) {
auto max_pos = getMaxPosSelf(tv, best_effort, false, false, false);
for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
max_pos = std::min<size_t>(
max_pos, getMaxProducerPosFromConsumer(tv, consumer_tv, best_effort));
}
if (check_siblings) {
for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
max_pos = std::min<size_t>(
max_pos, getMaxPosAll(sibling_tv, best_effort, false));
}
}
return max_pos;
}
void inlineMost(const std::unordered_set<IterDomain*>& uninlinable_ids) {
inlineMost(ir_utils::allTvs(FusionGuard::getCurFusion()), uninlinable_ids);
}
void inlineMost(
const std::vector<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
if (tvs.empty()) {
return;
}
MaxPosCalculator calc(uninlinable_ids);
for (auto tv : tvs) {
tv->inlineAt(-1, true, &calc);
}
}
void inlineMost(
const std::unordered_set<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
if (tvs.empty()) {
return;
}
MaxPosCalculator calc(uninlinable_ids);
for (auto tv : tvs) {
tv->inlineAt(-1, true, &calc);
}
}
namespace {
// Find the positions in the selected tensors that are mapped to the given
// position in the reference tensor.
class FindMappedPositions : public MaxInfoSpanningTree::Propagator {
std::unordered_map<TensorView*, size_t>& output_;
public:
FindMappedPositions(
std::unordered_map<TensorView*, size_t>& output,
TensorView* reference,
int64_t reference_pos);
~FindMappedPositions() override = default;
virtual void propagateC2P(TensorView* from, TensorView* to) override;
virtual void propagateP2C(TensorView* from, TensorView* to) override;
virtual void propagateSibling(TensorView* from, TensorView* to) override;
};
FindMappedPositions::FindMappedPositions(
std::unordered_map<TensorView*, size_t>& output,
TensorView* reference,
int64_t reference_pos)
: output_(output) {
if (reference_pos < 0) {
reference_pos += int64_t(reference->nDims()) + 1;
}
TORCH_CHECK(
reference_pos >= 0 && reference_pos <= int64_t(reference->nDims()),
"Invalid axis received ",
reference_pos,
" but should be > -",
reference->nDims(),
" and <= ",
reference->nDims(),
".");
output_[reference] = reference_pos;
}
void FindMappedPositions::propagateC2P(TensorView* from, TensorView* to) {
int from_pos = output_.at(from);
auto to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
// If there is no matching position found, we compute the highest matched
// position as the closest approximation
while (to_pos < 0) {
from_pos--;
to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayPasC(to, from, from_pos);
}
output_[to] = to_pos;
}
void FindMappedPositions::propagateP2C(TensorView* from, TensorView* to) {
int from_pos = output_.at(from);
auto to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
// If there is no matching position found, we compute the highest matched
// position as the closest approximation
while (to_pos < 0) {
from_pos--;
to_pos =
TransformReplay::getMatchedLeafPosWithoutReplayCasP(to, from, from_pos);
}
output_[to] = to_pos;
}
void FindMappedPositions::propagateSibling(TensorView* from, TensorView* to) {
auto from_pos = output_.at(from);
TORCH_CHECK(
TransformReplay::fullSelfMatching(to, from),
"Transformations in siblings ",
from,
" and ",
to,
" does not match with each other.");
output_[to] = from_pos;
}
std::unordered_map<TensorView*, size_t> getPositionsMappedTo(
TensorView* reference_tv,
int64_t reference_pos) {
std::unordered_map<TensorView*, size_t> mapped_positions;
MaxRootDomainInfoSpanningTree tree(reference_tv, reference_pos);
FindMappedPositions propagator(mapped_positions, reference_tv, reference_pos);
tree.traverse(&propagator);
return mapped_positions;
}
} // namespace
void inlineAllAt(
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
auto mapped_positions = getPositionsMappedTo(reference_tv, reference_pos);
MaxPosCalculator calc(uninlinable_ids);
for (auto pair : mapped_positions) {
pair.first->inlineAt(pair.second, best_effort, &calc);
}
}
void inlineSelectedAt(
const std::unordered_set<TensorView*>& selected,
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort,
const std::unordered_set<IterDomain*>& uninlinable_ids) {
auto mapped_positions = getPositionsMappedTo(reference_tv, reference_pos);
MaxPosCalculator calc(uninlinable_ids);
for (auto pair : mapped_positions) {
if (selected.count(pair.first) > 0) {
pair.first->inlineAt(pair.second, best_effort, &calc);
}
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,100 +0,0 @@
#pragma once
#include <ir_interface_nodes.h>
#include <maxinfo_propagator.h>
#include <transform_replay.h>
#include <memory>
#include <unordered_set>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class MaxPosCalculator {
// Root domains in a producer that are unmappable to any of its consumers
std::unordered_set<IterDomain*> unmappable_dims_;
// User set IterDomains to not inline, used in schedulers to avoid inlining
// trivial reductions
std::unordered_set<IterDomain*> uninlinable_ids_;
// Iterate through all TVs and collect the dimensions of each TV that don't
// map to all its consumer TVs.
void buildUnmappableDims();
// Utility function that returns whether an id of tv is a valid iter domain to inline
// within. This is used in getMaxPos{PasC,CasP}. Different variations of the
// bool values are used if checking max position of PasC, CasP, or checking
// for a max "self" position.
bool isAllowedID(
IterDomain* id,
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const;
public:
// Returns the maximum position up to which tv can be inlined.
size_t getMaxPosSelf(
TensorView* tv,
bool best_effort,
bool allow_reduction,
bool allow_vectorize,
bool allow_unmappable) const;
// Returns the maximum position producer can be inlined based on consumer
// given the set ComputeAtMode
size_t getMaxProducerPosFromConsumer(
TensorView* producer,
TensorView* consumer,
bool best_effort) const;
// Checks producers, consumers, and siblings to see what the maximum position
// in tv is that can be shared across both directions.
size_t getMaxPosAll(
TensorView* tv,
bool best_effort = false,
bool check_siblings = true);
MaxPosCalculator(const std::unordered_set<IterDomain*>& uninlinable_ids = {});
};
// Inline to the rightmost allowed position for all tensors in the current
// fusion.
TORCH_CUDA_CU_API void inlineMost(
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the rightmost allowed position for the selected tensors in the
// current fusion.
TORCH_CUDA_CU_API void inlineMost(
const std::vector<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the rightmost allowed position for the selected tensors in the
// current fusion.
TORCH_CUDA_CU_API void inlineMost(
const std::unordered_set<TensorView*>& tvs,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the position corresponding to the reference position in the
// reference tensor for all tensors in the current fusion.
TORCH_CUDA_CU_API void inlineAllAt(
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort = false,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
// Inline to the position corresponding to the reference position in the
// reference tensor for selected tensors in the current fusion.
TORCH_CUDA_CU_API void inlineSelectedAt(
const std::unordered_set<TensorView*>& selected,
TensorView* reference_tv,
int64_t reference_pos,
bool best_effort = false,
const std::unordered_set<IterDomain*>& uninlinable_ids = {});
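// Illustrative scheduler-side sketch (hypothetical, not part of the original
// header): inline the whole fusion as deeply as allowed while keeping the
// reference tensor's innermost IterDomain uninlined, then inline everything
// mapped to position 2 of the reference on a best-effort basis. axis(-1) is
// assumed here to return the innermost IterDomain of the tensor.
inline void inlineFusionExample(TensorView* reference_tv) {
  std::unordered_set<IterDomain*> uninlinable = {reference_tv->axis(-1)};
  inlineMost(uninlinable);
  inlineAllAt(reference_tv, /*reference_pos=*/2, /*best_effort=*/true);
}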
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,76 +0,0 @@
#include <instrumentation.h>
#include <c10/macros/Export.h>
#ifdef _WIN32
#include <c10/util/win32-headers.h>
#else
#include <pthread.h>
#include <unistd.h>
#endif
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace inst {
Trace::Trace() {
const char* trace_filename = getenv("PYTORCH_NVFUSER_TRACE");
if (trace_filename != nullptr) {
log_file_ = fopen(trace_filename, "w");
TORCH_CHECK(log_file_ != nullptr, "Can't open trace file");
// Disable the file stream buffering, since it may result
// in torn writes in multi-threaded tracing
setbuf(log_file_, nullptr);
// Print the trace prologue
// (including a dummy TRACE_START event)
fprintf(log_file_, "{\n\"traceEvents\": [\n");
start_timestamp_ = Clock::now();
logEvent('I', "TRACE_START");
}
if (isOptionDisabled(DisableOption::Nvtx)) {
record_nvtx_range_ = false;
}
}
Trace::~Trace() {
if (log_file_ != nullptr) {
// Print trace epilogue
logEvent('I', "TRACE_END", ' ');
fprintf(log_file_, "],\n\"displayTimeUnit\": \"ms\"\n}\n");
fclose(log_file_);
}
}
void Trace::logEvent(char ph, const char* name, char sep) {
const std::chrono::duration<double> d = Clock::now() - start_timestamp_;
const double elapsed = d.count() * 1e6;
#ifdef _WIN32
const unsigned int pid = GetCurrentProcessId();
const unsigned int tid = GetCurrentThreadId();
#else
const unsigned int pid = getpid();
const unsigned int tid = std::hash<pthread_t>{}(pthread_self());
#endif // _WIN32
fprintf(
log_file_,
"{ \"name\": \"%s\", \"ph\": \"%c\", \"pid\": %u, \"tid\": %u, \"ts\": %.0f }%c\n",
name,
ph,
pid,
tid,
elapsed,
sep);
}
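// Example of a single record emitted by logEvent (all values illustrative):
//   { "name": "runFusion", "ph": "B", "pid": 1234, "tid": 5678, "ts": 1042 },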
} // namespace inst
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,105 +0,0 @@
#pragma once
#include <utils.h>
#include <nvToolsExt.h>
// NOLINTNEXTLINE(modernize-deprecated-headers)
#include <stdio.h>
#include <chrono>
#include <cstdio>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace inst {
//! An optional record of selected timestamped operations, events and counters
//!
//! This class is not intended to be used directly. Instead, the operations
//! to be traced are marked (for example using the FUSER_PERF_SCOPE macro)
//!
//! In order to enable tracing, the `PYTORCH_NVFUSER_TRACE` environment
//! variable is set to point to a trace file (ex `test.trace`). The file name
//! may be a relative or an absolute path.
//!
//! The trace uses the Chrome Tracing (Catapult) format, which is a well
//! documented JSON based format supported by multiple tools:
//! https://chromium.googlesource.com/catapult/+/HEAD/tracing/README.md
//!
//! An easy way to view traces is to type `about://tracing` in Chrome or
//! Chromium.
//!
class TORCH_CUDA_CU_API Trace : public NonCopyable {
public:
using Clock = std::chrono::steady_clock;
public:
static Trace* instance() {
static Trace trace;
return &trace;
}
void beginEvent(const char* name) {
if (log_file_ != nullptr) {
logEvent('B', name);
}
if (record_nvtx_range_) {
nvtxRangePushA(name);
}
}
void endEvent(const char* name) {
if (record_nvtx_range_) {
nvtxRangePop();
}
if (log_file_ != nullptr) {
logEvent('E', name);
}
}
private:
Trace();
~Trace();
void logEvent(char ph, const char* name, char sep = ',');
private:
FILE* log_file_ = nullptr;
Clock::time_point start_timestamp_;
bool record_nvtx_range_ = true;
};
//! \internal Automatic scope for a perf marker
//! (normally used through the FUSER_PERF_SCOPE macro)
class TORCH_CUDA_CU_API TraceScope : public NonCopyable {
public:
explicit TraceScope(const char* event_name) : event_name_(event_name) {
Trace::instance()->beginEvent(event_name_);
}
~TraceScope() {
Trace::instance()->endEvent(event_name_);
}
private:
const char* event_name_ = nullptr;
};
#define FUSER_MACRO_CONCAT2(a, b) a##b
#define FUSER_MACRO_CONCAT(a, b) FUSER_MACRO_CONCAT2(a, b)
#define FUSER_ANONYMOUS(prefix) FUSER_MACRO_CONCAT(prefix, __COUNTER__)
//! Defines a scope we want to measure and record in a perf trace
//!
//! \param name The name of the scope, normally a simple string literal
//!
#define FUSER_PERF_SCOPE(name) \
torch::jit::fuser::cuda::inst::TraceScope FUSER_ANONYMOUS(_perf_scope_)(name)
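// Usage sketch (hypothetical function, not part of the original header): the
// scope is traced from construction to destruction of the TraceScope object,
// so the whole body below shows up as one event in the Chrome trace and as
// one NVTX range.
inline void tracedWorkExample() {
  FUSER_PERF_SCOPE("tracedWorkExample");
  // ... work to be measured goes here ...
}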
} // namespace inst
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,8 +0,0 @@
#pragma once
#include <ir_base_nodes.h>
#include <ir_interface_nodes.h>
#include <ir_internal_nodes.h>
// TODO: remove this once the Kernel IR split is complete
#include <kernel_ir.h>

View File

@ -1,378 +0,0 @@
#include <dispatch.h>
#include <expr_evaluator.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
#include <ir_cloner.h>
#include <ir_printer.h>
#include <kernel.h>
#include <kernel_ir.h>
#include <kernel_ir_dispatch.h>
#include <mutator.h>
#include <torch/csrc/jit/ir/ir.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
Statement::Statement(IrBuilderPasskey passkey) {
ir_container_ = passkey.ir_container_;
}
Statement::Statement(const Statement* src, IrCloner* ir_cloner) {
ir_container_ = ir_cloner->container();
}
void Statement::setName(IrContainerPasskey, StmtNameType name) {
name_ = name;
}
void Statement::setName(IrBuilderPasskey, StmtNameType name) {
name_ = name;
}
Val* Statement::asVal() {
TORCH_INTERNAL_ASSERT(isVal(), "Cannot cast to Val as this is not a Val.");
return this->as<Val>();
}
Expr* Statement::asExpr() {
TORCH_INTERNAL_ASSERT(isExpr(), "Cannot cast to Expr as this is not a Expr.");
return this->as<Expr>();
}
std::string Statement::toString() const {
std::stringstream ss;
IrPrinter ir_printer(ss);
ir_printer.handle(this);
return ss.str();
}
std::string Statement::toInlineString() const {
std::stringstream ss;
IrPrinter ir_printer(ss);
ir_printer.print_inline(this);
return ss.str();
}
Fusion* Statement::fusion() const {
TORCH_INTERNAL_ASSERT(
ir_container_->isA<Fusion>(), "Statement does not belong to a fusion.");
return ir_container_->as<Fusion>();
}
kir::Kernel* Statement::kernel() const {
TORCH_INTERNAL_ASSERT(
ir_container_->isA<kir::Kernel>(),
"Statement does not belong to a kernel.");
return ir_container_->as<kir::Kernel>();
}
// When we create a Val we immediately register it with the active fusion.
Val::Val(IrBuilderPasskey passkey, ValType _vtype, DataType _dtype)
: Statement(passkey), vtype_(_vtype), dtype_(_dtype) {}
// NOTE: we don't clone the definition_ and uses_ here
// since they may introduce cloning cycles. Instead, we copy
// the original pointers and we'll fix them up later as part of the
// Fusion copy. Neither definition_ nor uses_ is copied through
// this constructor, leaving them to be resolved by later stages
//
Val::Val(const Val* src, IrCloner* ir_cloner)
: Statement(src, ir_cloner), vtype_(src->vtype_), dtype_(src->dtype_) {}
const std::vector<Expr*>& Val::uses() const {
if (vtype_ == ValType::TensorView) {
if (!fusion()->isTVUseInfoValid() && !fusion()->isUpdatingTVUseInfo()) {
fusion()->resetTvUses();
}
}
return uses_;
}
// Converts the data type of TensorView or Scalar representing index
// values. The data type of the original input should be
// DataType::Index, but DataType::Int is also allowed as it is used
// for index expressions.
void Val::resolveIndexDtype() {
TORCH_INTERNAL_ASSERT(
vtype_ == ValType::TensorView || vtype_ == ValType::Scalar,
"Resolving index type is currently only supported on tensor view or scalar values. "
"Value type: ",
vtype_);
TORCH_INTERNAL_ASSERT(
dtype_ == DataType::Index || dtype_ == DataType::Int,
"Can only resolve index type if a Val has an Index or Int DataType. ",
"Data type: ",
dtype_);
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(),
"Index type can only be resolved at compile time.");
dtype_ = container()->as<kir::Kernel>()->indexType();
}
namespace {
// Traverse definition of all values involved in constructing the provided val.
// Check if all values involved are constant values, meaning the provided
// val is also a constant value.
class ConstCheck : private OptOutConstDispatch {
private:
bool is_const_ = true;
// Returns true if all Vals in the history of the provided Val are Ints. Since
// our expression evaluator doesn't support any type besides int, it's
// important to check that it is one.
bool is_int_ = true;
void handle(const Bool* b) final {
is_const_ = is_const_ && b->isConst();
}
void handle(const Double* d) final {
is_const_ = is_const_ && d->isConst();
}
void handle(const Int* i) final {
is_const_ = is_const_ && i->isConst();
}
void handle(const NamedScalar* ns) final {
is_const_ = is_const_ && false;
}
void handle(const Expr* expr) final {
for (auto inp : expr->inputs()) {
handle(inp);
}
}
void handle(const Val* val) final {
if (!val->isAnInt()) {
is_int_ = false;
}
if (val->definition() != nullptr) {
handle(val->definition());
} else {
OptOutConstDispatch::handle(val);
}
}
public:
static bool isConst(const Val* val) {
ConstCheck cc;
cc.handle(val);
return cc.is_const_;
}
static bool isConstInt(const Val* val) {
ConstCheck cc;
cc.handle(val);
return cc.is_const_ && cc.is_int_;
}
};
} // namespace
bool Val::isConstScalar() const {
if (!isScalar()) {
return false;
}
return ConstCheck::isConst(this);
}
bool Val::isConstInt() const {
return ConstCheck::isConst(this) && isAnInt();
}
int64_t Val::evaluateInt() {
TORCH_INTERNAL_ASSERT(
ConstCheck::isConst(this),
"Cannot get Int of not const values through IR nodes, must use runtime ExpressionEvaluator.");
if (this->as<Int>()->value().has_value()) {
return this->as<Int>()->value().value();
}
ExpressionEvaluator ee(fusion());
auto evaluated_val = ee.evaluate(this);
TORCH_INTERNAL_ASSERT(
evaluated_val.has_value(),
"Detected a const integer but failed to infer its value.");
return evaluated_val->as<int64_t>();
}
double Val::evaluateDouble() {
TORCH_INTERNAL_ASSERT(
ConstCheck::isConst(this),
"Cannot get Double of not const doubles through IR nodes, must use runtime ExpressionEvaluator.");
if (this->as<Double>()->value().has_value()) {
return this->as<Double>()->value().value();
}
ExpressionEvaluator ee(fusion());
auto evaluated_val = ee.evaluate(this);
TORCH_INTERNAL_ASSERT(
evaluated_val.has_value(),
"Detected a const integer but failed to infer its value.");
return evaluated_val->as<double>();
}
c10::optional<int64_t> Val::getInt() const {
if (isConstScalar() && isAnInt()) {
if (this->getValType() == ValType::Scalar) {
if (this->isA<Int>()) {
return this->as<Int>()->value();
}
}
}
return c10::nullopt;
}
c10::optional<double> Val::getDouble() const {
if (isConstScalar() && isADouble()) {
if (this->getValType() == ValType::Scalar) {
if (this->isA<Double>()) {
return this->as<Double>()->value();
}
}
}
return c10::nullopt;
}
bool Val::isZeroInt() const {
auto int_val = getInt();
return int_val.has_value() && int_val.value() == 0;
}
bool Val::isOneInt() const {
auto int_val = getInt();
return int_val.has_value() && int_val.value() == 1;
}
bool Val::isDefinitionType(ExprType expression_type) const {
if (definition() != nullptr) {
auto def_expr_type = definition()->getExprType();
if (def_expr_type.has_value() && def_expr_type.value() == expression_type) {
return true;
}
}
return false;
}
c10::optional<DataType> Val::getDataType() const {
TORCH_INTERNAL_ASSERT(
dtype_ != DataType::Null, "Value does not have a data type.");
return dtype_;
}
bool Val::isProducerOf(const Val* other) const {
TORCH_INTERNAL_ASSERT(other != nullptr);
TORCH_INTERNAL_ASSERT(container() == other->container());
if (definition() == nullptr) {
return false;
}
return std::any_of(
definition()->inputs().begin(),
definition()->inputs().end(),
[other](const Val* input) { return input == other; });
}
bool Val::isConsumerOf(const Val* other) const {
return other->isProducerOf(this);
}
// We don't register with the active fusion in Expr as this needs to be done
// after inputs and outputs are registered with the Expr
Expr::Expr(IrBuilderPasskey passkey, ExprType etype)
: Statement(passkey), etype_{etype} {}
Expr::Expr(const Expr* src, IrCloner* ir_cloner)
: Statement(src, ir_cloner),
etype_(src->etype_),
inputs_(ir_cloner->clone(src->inputs_)),
outputs_(ir_cloner->clone(src->outputs_)) {}
bool Expr::sameAs(const Statement* other) const {
if (this == other) {
return true;
}
if (!other->isA<Expr>()) {
return false;
}
const Expr* other_expr = other->as<Expr>();
if (getExprType() != other_expr->getExprType()) {
return false;
}
if (inputs().size() != other_expr->inputs().size() ||
outputs().size() != other_expr->outputs().size()) {
return false;
}
for (const auto i : c10::irange(inputs().size())) {
if (!input(i)->sameAs(other_expr->input(i))) {
return false;
}
}
return true;
}
kir::Predicate* Expr::predicate() const {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
return predicate_;
}
void Expr::setPredicate(kir::Predicate* predicate) {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
predicate_ = predicate;
}
Expr* Expr::withPredicate(kir::Predicate* predicate) {
auto result = shallowCopy();
result->setPredicate(predicate);
return result;
}
kir::Predicate* Expr::writePredicate() const {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
return write_predicate_;
}
void Expr::setWritePredicate(kir::Predicate* write_predicate) {
TORCH_INTERNAL_ASSERT(
container()->isA<kir::Kernel>(), "Function invalid for fusion.");
write_predicate_ = write_predicate;
}
Expr* Expr::withWritePredicate(kir::Predicate* predicate) {
auto result = shallowCopy();
result->setWritePredicate(predicate);
return result;
}
void Expr::copyPredicatesFrom(const Expr* expr) {
if (container()->isA<kir::Kernel>()) {
predicate_ = expr->predicate_;
write_predicate_ = expr->write_predicate_;
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,524 +0,0 @@
#pragma once
#include <c10/core/ScalarType.h>
#include <c10/macros/Export.h>
#include <c10/util/Exception.h>
#include <c10/util/Optional.h>
#include <type.h>
#include <utils.h>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <unordered_map>
#include <vector>
// TODO: Add more types (int32, int64)
// TODO: sameAs should have better logic to check against any type and return
// gracefully
/*
* This file defines the base IR structure. Any IR node in this system will
* inherit from one of the following classes: Statement, Expr, Val,
 * IrInputOutput. IR is any information that the code generation stack may need
 * for analysis. By analysis we're referring to anything done in response to a
 * user-facing call of this stack. This could be careful tracking of user calls,
 * and any transformation, including optimizing transformations, user-declared
 * transformations, and lowering the IR.
*/
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
using ValueId = int32_t;
using StmtNameType = unsigned int;
constexpr StmtNameType kInvalidStmName =
std::numeric_limits<unsigned int>::max();
class Fusion;
class FusionGuard;
class Expr;
class Val;
class UnaryOp;
class BinaryOp;
class RNGOp;
class IterDomain;
class IrCloner;
class IrContainer;
class IrBuilderPasskey;
class IrContainerPasskey;
namespace kir {
class Kernel;
class Predicate;
} // namespace kir
// Passkey for container to register names with statements
class ExprPasskey {
friend class Expr;
private:
explicit ExprPasskey() {}
};
TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept;
//! Statement is the highest level node representation. Everything that is
//! considered "IR" will be derived from this class at some point. Both Values
//! and Expr's are a Statement. If there will ever be any more fundamental
//! types, they will also derive from Statement.
//!
//! We use Statements to pass around nodes of unknown compile type. Therefore it
//! is also important for the design to have a dispatch system for a Statement:
//! basically, being able to succinctly traverse down the inheritance stack of
//! a Statement at runtime. This is currently implemented in dispatch.h
class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase {
friend void swap(Fusion&, Fusion&) noexcept;
friend void swap(IrContainer& a, IrContainer& b) noexcept;
public:
Statement() = delete;
// Cloning constructor
Statement(const Statement* src, IrCloner* ir_cloner);
// Dispatch functions, definitions in dispatch.cpp
template <typename T>
static void dispatch(T handler, Statement*);
template <typename T>
static void constDispatch(T handler, const Statement* const);
template <typename T>
static void mutatorDispatch(T mutator, Statement*);
// Accessor functions to types. Vals always have a DataType, Exprs never do
virtual c10::optional<ValType> getValType() const {
return c10::nullopt;
}
virtual c10::optional<DataType> getDataType() const {
return c10::nullopt;
}
virtual c10::optional<ExprType> getExprType() const {
return c10::nullopt;
}
// Shortcut to figure out if it is a value/expression
bool isVal() const {
return getValType() != c10::nullopt;
}
bool isExpr() const {
return getExprType() != c10::nullopt;
}
// Make sure this is a Val and return it as a Val*
Val* asVal();
// Make sure this is an Expr and return it as an Expr*
Expr* asExpr();
// Return the fusion this statement belongs to
Fusion* fusion() const;
// Return the kernel this statement belongs to
kir::Kernel* kernel() const;
// Return the container this statement belongs to
IrContainer* container() const {
return ir_container_;
}
// Return the int that represents its name
StmtNameType name() const {
return name_;
}
// Set the statement's name. Typically the container will set the name;
// however, if we're dealing with cloning, IrBuilder will set the name. This
// maybe should come from IrCloner, but I didn't want to add another
// passkey.
void setName(IrContainerPasskey, StmtNameType name);
void setName(IrBuilderPasskey, StmtNameType name);
virtual bool sameType(const Statement* const other) {
if (isVal() && other->isVal())
return getValType().value() == other->getValType().value();
if (isExpr() && other->isExpr())
return getExprType().value() == other->getExprType().value();
return false;
}
// Return if this statement is the same as another statement
// TODO: should this run through dispatch on this and other?
virtual bool sameAs(const Statement* other) const {
return this == other;
}
std::string toString() const;
std::string toInlineString() const;
protected:
Statement(IrBuilderPasskey);
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
StmtNameType name_ = kInvalidStmName;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
IrContainer* ir_container_ = nullptr;
};
//! A Val represents a "value." These are objects, like tensors, scalars, and
//! memory locations, that are inputs and outputs of computations (represented
//! by Exprs, below)
//!
//! Vals are constant and unique and should always be passed
//! around as a pointer. Val can generally be thought of as representing any
//! type of data. Some examples: a constant size like a convolution filter width,
//! a runtime constant like batch normalization's momentum, a "symbolic" tensor
//! like one passed down from the JIT, or a memory buffer used in device code.
//!
//! Adding a Val:
//! Right now adding a Val is quite involved. Val's can be defined in ir.h or in
//! their own header file. The following is what is currently needed to add a
//! new Val:
//!
//! 1) Definition inheriting from Val
//! - Members must be private or protected
//! - Accessor functions for members
//! - Must call Val constructor, Val constructor registers with fusion
//! - Implementation of bool sameAs(...)
//! - Must implement a "cloning" constructor, ex.
//! Int::Int(const Int* src, IrCloner* ir_cloner)
//! 2) dispatch.h/.cpp must be updated to include dispatch of the new Val
//! 3) Default mutator function should be added to mutator.cpp
//! 4a) Printing functions should be added to ir_iostream.h/.cpp
//! 4b) Graphviz generation must be added to ir_graphviz.h/.cpp
//! 5) An enum value must be added to ValType in type.h
//! 6) A string entry must be added in val_type_string_map
//!
class TORCH_CUDA_CU_API Val : public Statement {
public:
explicit Val(
IrBuilderPasskey,
ValType _vtype,
DataType _dtype = DataType::Null);
Val(const Val* src, IrCloner* ir_cloner);
// Dispatch functions, definitions in dispatch.cpp
template <typename T>
static void dispatch(T handler, Val*);
template <typename T>
static void constDispatch(T handler, const Val* const);
template <typename T>
static void mutatorDispatch(T mutator, Val*);
c10::optional<ValType> getValType() const override {
return vtype_;
}
ValType vtype() const {
return vtype_;
}
DataType dtype() const {
return dtype_;
}
// Throws if no DataType is found. Vals must have a DataType
c10::optional<DataType> getDataType() const override;
bool isScalar() const {
return vtype_ == ValType::Scalar || vtype_ == ValType::NamedScalar;
}
// Returns if all dependencies are constant scalars
bool isConstScalar() const;
// Returns if all dependencies are constant integers
bool isConstInt() const;
bool isAnInt() const {
return isScalar() && dtype_ == DataType::Int;
}
bool isADouble() const {
return isScalar() && dtype_ == DataType::Double;
}
// If this Val is an integer with a direct constant value associated with it,
// will return the value of that constant integer. If this integer has
// defining expressions it will return c10::nullopt. Those values should be
// inferred using evaluateInt.
c10::optional<int64_t> getInt() const;
// If this Val is a double with a direct constant value associated with it,
// will return the value of that constant double. If this double has
// defining expressions it will return c10::nullopt. Those values should be
// inferred using evaluateDouble.
c10::optional<double> getDouble() const;
// If this Val is a constant integer, and its history is comprised only of
// constant values, will return the value of that constant integer. Cannot
// make constant as expression evaluator takes non-constant Vals.
int64_t evaluateInt();
// If this Val is a constant double, and its history is comprised only of
// constant values, will return the value of that constant double. Cannot
// make constant as expression evaluator takes non-constant Vals.
double evaluateDouble();
// Returns if no dependencies and is a constant scalar.
virtual bool isConst() const {
return false;
}
bool isZeroInt() const;
bool isOneInt() const;
// Returns the Expr that this value is an output of, returns nullptr if none
// was found
Expr* definition() const {
if (is_fusion_input_) {
return nullptr;
}
return definition_;
}
// Determine if value definition matches given expression type
bool isDefinitionType(ExprType expression_type) const;
const std::vector<Expr*>& uses() const;
bool isFusionInput() const {
return is_fusion_input_;
}
bool isFusionOutput() const {
return is_fusion_output_;
}
//! Returns true when other is a producer of this
bool isProducerOf(const Val* other) const;
//! Returns true when other is a consumer of this
bool isConsumerOf(const Val* other) const;
bool sameType(const Statement* other) override {
return Statement::sameType(other) &&
getDataType() == other->as<Val>()->getDataType();
}
// TODO: Make this more sophisticated. A value being the same as another value
// should be evaluated based on the DAG that created it, and that DAG's leaf
// nodes
bool sameAs(const Statement* other) const override {
return this == other;
}
void setEvaluatorIndex(int to) {
TORCH_INTERNAL_ASSERT(evaluator_index_ == -1);
evaluator_index_ = to;
}
int evaluatorIndex() const {
return evaluator_index_;
}
// Following is managed by Fusion (or kirIrBuilder) and can change.
// TODO: Protect with a passkey.
void setDefinition(Expr* expr) {
definition_ = expr;
}
void resolveIndexDtype();
protected:
friend Fusion;
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
const ValType vtype_;
// TODO: Add fusion passkey for this
void setIsFusionInput(bool is_fusion_input) {
is_fusion_input_ = is_fusion_input;
}
// TODO: Add fusion passkey for this
void setIsFusionOutput(bool is_fusion_output) {
is_fusion_output_ = is_fusion_output;
}
// TODO: Add fusion or container passkey for this
void setUses(const std::vector<Expr*>& uses) {
uses_ = uses;
}
private:
// There's only one instance where dtype can change, and that's through
// resolving the index data type from nvfuser to either Int or Int32 for
// welford operations.
DataType dtype_;
// Following is managed by Fusion and can change.
bool is_fusion_input_ = false;
bool is_fusion_output_ = false;
Expr* definition_ = nullptr;
std::vector<Expr*> uses_;
// Expr evaluator idx;
int evaluator_index_ = -1;
};
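// A minimal, hypothetical sketch of step 1 of the "Adding a Val" notes above
// (the class name and semantics are invented; the dispatch, mutator, printing,
// graphviz, and type.h updates from the remaining steps are omitted).
// Constructor bodies would live in a .cpp file, as for the other Val
// subclasses.
class HypotheticalFlag : public Val {
 public:
  // Must call the Val constructor, which registers the node with the fusion.
  explicit HypotheticalFlag(IrBuilderPasskey passkey);
  // Cloning constructor, needed so IrCloner can copy the node.
  HypotheticalFlag(const HypotheticalFlag* src, IrCloner* ir_cloner);
  // Two flags are only considered the same if they are the same node.
  bool sameAs(const Statement* other) const override {
    return this == other;
  }
};
// Instances would then be created through the builder, e.g.
//   auto* flag = IrBuilder::create<HypotheticalFlag>();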
//! A Expr represents a "computation." These are functions that takes inputs
//! and produce outputs, inputs and outputs all being Vals. There are
//! specializations of BinaryOp which takes 2 inputs and produces 1 output, and
//! UnaryOp which takes 1 input and produces 1 output. Exprs are unique and
//! immutable. Conceptually, Exprs could always be manipulated using unique
//! pointers, and we could add this later. However, for now Exprs can be
//! replaced in a fusion, but they cannot be modified in place.
//!
//! The IR is static single assignment (SSA). Values can only be defined as an
//! output of an Expr once. If they are re-defined the original definition is
//! deleted from the program, as opposed to an ordered redefinition of the
//! value in the program.
//!
//! Note: Registering an Expr with a Fusion is actually 2 parts, one part is
//! done in the Expr constructor, so that should be called on anything that
//! inherits Expr. The issue with having registration in Expr's constructor is
//! that the constructor of an Expr will set outputs and inputs. This
//! information is important for registration with Fuser, so it can track the
//! dependency chain.
//!
//! Adding an Expr:
//! Right now adding an Expr is quite involved. Expr's can be defined in ir.h
//! or in their own header file. The following is what is currently needed for
//! Expr definitions:
//!
//! 1) Definition inheriting from Expr.
//! - Members must be private or protected
//! - Accessor functions for members
//! - Constructors need to register with the Fusion after inputs/outputs
//! are defined
//! - Implementation of bool sameAs(...)
//! 2) dispatch.h/.cpp must be updated to include dispatch of the new Val
//! 3) Default mutator function should be added to mutator.h/.cpp
//! 4) Printing functions should be added to ir_iostream.h/.cpp
//! 5) Lower case convenience functions should be added to arith.h/.cpp (If
//! user facing)
//! 6) An enum value must be added to ExprType in type.h
//! 7) A string entry must be added in expr_type_string_map
//! 8) Entry added to ir_graphviz .cpp/.h
//!
class TORCH_CUDA_CU_API Expr : public Statement {
public:
explicit Expr(IrBuilderPasskey, ExprType type);
Expr(const Expr* src, IrCloner* ir_cloner);
// Creates a new instance of the expression with all its field copied.
// Note that unlike IrCloner, this function only does a shallow copy
virtual Expr* shallowCopy() const = 0;
c10::optional<ExprType> getExprType() const override {
return etype_;
}
ExprType etype() const {
return etype_;
}
bool sameAs(const Statement* other) const override;
// Input/output accessors
const auto& inputs() const {
return inputs_;
}
const auto& outputs() const {
return outputs_;
}
auto input(size_t index) const {
return inputs_[index];
}
auto output(size_t index) const {
return outputs_[index];
}
// Dispatch functions, definitions in dispatch.cpp
template <typename T>
static void dispatch(T handler, Expr*);
template <typename T>
static void constDispatch(T handler, const Expr* const);
template <typename T>
static void mutatorDispatch(T mutator, Expr*);
// TODO: Protect based on being in kernel container
kir::Predicate* predicate() const;
// Creates a shallow copy of the expression with the given predicate attached.
// TODO: Protect based on being in kernel container
Expr* withPredicate(kir::Predicate* predicate);
// TODO: Protect based on being in kernel container
kir::Predicate* writePredicate() const;
// Creates a shallow copy of the expression with the given write-predicate
// attached.
// TODO: Protect based on being in kernel container
Expr* withWritePredicate(kir::Predicate* write_predicate);
protected:
// TODO: Protect based on being in kernel container
void setPredicate(kir::Predicate* predicate);
// TODO: Protect based on being in kernel container
void setWritePredicate(kir::Predicate* write_predicate);
void copyPredicatesFrom(const Expr* expr);
// TODO: Add Fusion passkey
void addInput(Val* input) {
TORCH_INTERNAL_ASSERT(input != nullptr);
inputs_.push_back(input);
}
// TODO: Add Fusion passkey
void addOutput(Val* output) {
TORCH_INTERNAL_ASSERT(output != nullptr);
outputs_.push_back(output);
}
ExprPasskey exprPasskey() {
return ExprPasskey();
}
private:
ExprType etype_ = ExprType::Invalid;
std::vector<Val*> inputs_;
std::vector<Val*> outputs_;
kir::Predicate* predicate_ = nullptr;
// Only used for reduction-related expressions
kir::Predicate* write_predicate_ = nullptr;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,471 +0,0 @@
#include <fusion.h>
#include <ir_builder.h>
#include <ir_cloner.h>
#include <kernel.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Clone an IR node, forwarding the arguments to the IrCloner constructor.
template <class T>
T* IrBuilder::clone(const T* src, IrCloner* ir_cloner) {
TORCH_INTERNAL_ASSERT(
ir_cloner != nullptr,
"Cannot use create when a cloner object is set. Use clone.");
TORCH_INTERNAL_ASSERT(
ir_cloner->container() != nullptr,
"Cloner doesn't have a valid container to store cloned object.");
T* dest = new T(src, ir_cloner);
const Statement* src_stmt = dynamic_cast<const Statement*>(src);
Statement* dest_stmt = dynamic_cast<Statement*>(dest);
auto dest_container = ir_cloner->container();
auto src_container = src_stmt->container();
dest_container->registerStmt(IrBuilderPasskey(dest_container), dest_stmt);
if (src_container != dest_container) {
dest_stmt->setName(IrBuilderPasskey(dest_container), src_stmt->name());
}
ir_cloner->registerClone(src_stmt, dest_stmt);
return dest;
}
#define IR_BUILDER_INSTANTIATE(T) \
template T* IrBuilder::clone(const T* src, IrCloner* ir_cloner);
// Vals
IR_BUILDER_INSTANTIATE(IterDomain)
IR_BUILDER_INSTANTIATE(TensorDomain)
IR_BUILDER_INSTANTIATE(TensorView)
IR_BUILDER_INSTANTIATE(Bool)
IR_BUILDER_INSTANTIATE(Double)
IR_BUILDER_INSTANTIATE(Int)
IR_BUILDER_INSTANTIATE(ComplexDouble)
IR_BUILDER_INSTANTIATE(NamedScalar)
// Exprs
IR_BUILDER_INSTANTIATE(Split)
IR_BUILDER_INSTANTIATE(Merge)
IR_BUILDER_INSTANTIATE(Swizzle2D)
IR_BUILDER_INSTANTIATE(TransposeOp)
IR_BUILDER_INSTANTIATE(ExpandOp)
IR_BUILDER_INSTANTIATE(ShiftOp)
IR_BUILDER_INSTANTIATE(GatherOp)
IR_BUILDER_INSTANTIATE(ViewAsScalar)
IR_BUILDER_INSTANTIATE(ViewOp)
IR_BUILDER_INSTANTIATE(FullOp)
IR_BUILDER_INSTANTIATE(ARangeOp)
IR_BUILDER_INSTANTIATE(EyeOp)
IR_BUILDER_INSTANTIATE(UnaryOp)
IR_BUILDER_INSTANTIATE(BinaryOp)
IR_BUILDER_INSTANTIATE(TernaryOp)
IR_BUILDER_INSTANTIATE(RNGOp)
IR_BUILDER_INSTANTIATE(ReductionOp)
IR_BUILDER_INSTANTIATE(GroupedReductionOp)
IR_BUILDER_INSTANTIATE(WelfordOp)
IR_BUILDER_INSTANTIATE(LoadStoreOp)
IR_BUILDER_INSTANTIATE(MmaOp)
IR_BUILDER_INSTANTIATE(BroadcastOp)
Val* IrBuilder::newResult(DataType dtype) {
switch (dtype) {
case DataType::Bool:
return IrBuilder::create<Bool>(c10::nullopt);
case DataType::Double:
return IrBuilder::create<Double>(c10::nullopt);
case DataType::Int:
return IrBuilder::create<Int>(c10::nullopt);
default:
TORCH_CHECK(false, "Unexpected data type");
}
}
Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) {
TORCH_CHECK(
lhs != nullptr && rhs != nullptr,
"Either lhs or rhs is a nullptr in newArithmeticExpr.");
TORCH_CHECK(
lhs->dtype() == rhs->dtype(),
"Incompatible operand types: ",
lhs->dtype(),
" and ",
rhs->dtype());
auto result = newResult(lhs->dtype());
IrBuilder::create<BinaryOp>(op_type, result, lhs, rhs);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
return result;
}
Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) {
TORCH_CHECK(
lhs != nullptr && rhs != nullptr,
"Either lhs or rhs is a nullptr in newLogicExpr.");
auto result = IrBuilder::create<Bool>(c10::nullopt);
IrBuilder::create<BinaryOp>(op_type, result, lhs, rhs);
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks)
return result;
}
Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) {
TORCH_CHECK(
pred != nullptr && lhs != nullptr && rhs != nullptr,
"Either pred, lhs, or rhs is a nullptr in whereExpr.");
TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types");
auto result = newResult(lhs->dtype());
IrBuilder::create<TernaryOp>(TernaryOpType::Where, result, pred, lhs, rhs);
return result;
}
Val* IrBuilder::negExpr(Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in negExpr.");
auto result = newResult(val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Neg, result, val);
return result;
}
Val* IrBuilder::notExpr(Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in notExpr.");
auto result = newResult(val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Not, result, val);
return result;
}
Val* IrBuilder::setExpr(Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in setExpr.");
auto result = newResult(val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Set, result, val);
return result;
}
Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in setExprNamedScalar.");
auto result = IrBuilder::create<NamedScalar>(name, val->dtype());
IrBuilder::create<UnaryOp>(UnaryOpType::Set, result, val);
return result;
}
Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) {
TORCH_CHECK(val != nullptr, "val is a nullptr in addressExprNamedScalar.");
auto result = IrBuilder::create<NamedScalar>(name, DataType::Int);
IrBuilder::create<UnaryOp>(UnaryOpType::Address, result, val);
return result;
}
Val* IrBuilder::andExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::And, lhs, rhs);
}
Val* IrBuilder::eqExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::Eq, lhs, rhs);
}
Val* IrBuilder::gtExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::GT, lhs, rhs);
}
Val* IrBuilder::ltExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::LT, lhs, rhs);
}
Val* IrBuilder::leExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::LE, lhs, rhs);
}
Val* IrBuilder::geExpr(Val* lhs, Val* rhs) {
return newLogicExpr(BinaryOpType::GE, lhs, rhs);
}
Val* IrBuilder::addExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Add, lhs, rhs);
}
Val* IrBuilder::subExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Sub, lhs, rhs);
}
Val* IrBuilder::mulExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Mul, lhs, rhs);
}
Val* IrBuilder::divExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Div, lhs, rhs);
}
Val* IrBuilder::ceilDivExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::CeilDiv, lhs, rhs);
}
Val* IrBuilder::modExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Mod, lhs, rhs);
}
Val* IrBuilder::maxExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Max, lhs, rhs);
}
Val* IrBuilder::minExpr(Val* lhs, Val* rhs) {
return newArithmeticExpr(BinaryOpType::Min, lhs, rhs);
}
Val* IrBuilder::swizzle2DIntExpr(
Val* in_x,
Val* in_y,
Val* extent_x,
Val* extent_y,
Swizzle2DType swizzle_type) {
auto result = create<kir::IntPair>();
create<kir::Swizzle2DInt>(
result, in_x, in_y, extent_x, extent_y, swizzle_type);
return result;
}
Val* IrBuilder::pairSelectExpr(Val* in, kir::PairSelect::Selection sel) {
auto int_pair = dynamic_cast<kir::IntPair*>(in);
TORCH_INTERNAL_ASSERT(int_pair != nullptr);
auto result = create<Int>();
create<kir::PairSelect>(result, int_pair, sel);
return result;
}
Val* SimplifyingIrBuilder::negExpr(Val* val) {
if (auto int_val = dynamic_cast<Int*>(val)) {
if (int_val->isConst()) {
return IrBuilder::create<Int>(-int_val->value().value());
}
}
return IrBuilder::negExpr(val);
}
Val* SimplifyingIrBuilder::notExpr(Val* val) {
if (auto bool_val = dynamic_cast<Bool*>(val)) {
if (bool_val->isConst()) {
if (bool_val->value().value()) {
return FusionGuard::getCurFusion()->falseVal();
} else {
return FusionGuard::getCurFusion()->trueVal();
}
}
}
return IrBuilder::notExpr(val);
}
Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) {
if (rhs == 0) {
return lhs;
} else if (lhs == nullptr) {
return IrBuilder::create<Int>(rhs);
} else if (lhs->isConst()) {
return IrBuilder::create<Int>(lhs->value().value() + rhs);
} else if (rhs > 0) {
return IrBuilder::addExpr(lhs, IrBuilder::create<Int>(rhs));
} else {
return IrBuilder::subExpr(lhs, IrBuilder::create<Int>(-rhs));
}
}
Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int* rhs) {
if (rhs == nullptr) {
return lhs;
} else if (lhs == nullptr) {
return rhs;
} else if (lhs->isConst()) {
return addExpr(rhs, lhs->value().value());
} else if (rhs->isConst()) {
return addExpr(lhs, rhs->value().value());
} else {
return IrBuilder::addExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) {
TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr);
if (lhs == nullptr || lhs->isZeroInt()) {
return rhs;
} else if (rhs == nullptr || rhs->isZeroInt()) {
return lhs;
}
auto lhs_int = dynamic_cast<Int*>(lhs);
auto rhs_int = dynamic_cast<Int*>(rhs);
if (lhs_int != nullptr && rhs_int != nullptr) {
return addExpr(lhs_int, rhs_int);
} else {
return IrBuilder::addExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::addExpr(Val* lhs, Int::ScalarType rhs) {
auto lhs_int = dynamic_cast<Int*>(lhs);
if (lhs_int != nullptr) {
return addExpr(lhs_int, rhs);
} else {
return addExpr(lhs, IrBuilder::create<Int>(rhs));
}
}
Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) {
return addExpr(lhs, negExpr(rhs));
}
Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int::ScalarType rhs) {
if (rhs == 0) {
return lhs->container()->zeroVal();
} else if (rhs == 1) {
return lhs;
} else if (lhs == nullptr) {
return IrBuilder::create<Int>(rhs);
} else if (lhs->isConst()) {
return IrBuilder::create<Int>(lhs->value().value() * rhs);
} else {
return IrBuilder::mulExpr(lhs, IrBuilder::create<Int>(rhs));
}
}
Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Int::ScalarType rhs) {
auto lhs_int = dynamic_cast<Int*>(lhs);
if (lhs_int != nullptr) {
return mulExpr(lhs_int, rhs);
} else {
return IrBuilder::mulExpr(lhs, IrBuilder::create<Int>(rhs));
}
}
Val* SimplifyingIrBuilder::mulExpr(Int* lhs, Int* rhs) {
if (rhs == nullptr) {
return lhs;
} else if (lhs == nullptr) {
return rhs;
} else if (lhs->isConst()) {
return mulExpr(rhs, lhs->value().value());
} else if (rhs->isConst()) {
return mulExpr(lhs, rhs->value().value());
} else {
return IrBuilder::mulExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::mulExpr(Val* lhs, Val* rhs) {
TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr);
if (lhs == nullptr || lhs->isOneInt()) {
return rhs;
} else if (rhs == nullptr || rhs->isOneInt()) {
return lhs;
} else if (lhs->isZeroInt() || rhs->isZeroInt()) {
return lhs->container()->zeroVal();
}
auto lhs_int = dynamic_cast<Int*>(lhs);
auto rhs_int = dynamic_cast<Int*>(rhs);
if (lhs_int != nullptr && rhs_int != nullptr) {
return mulExpr(lhs_int, rhs_int);
} else {
return IrBuilder::mulExpr(lhs, rhs);
}
}
Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) {
TORCH_INTERNAL_ASSERT(!(lhs == nullptr && rhs == nullptr));
if (lhs == nullptr) {
return rhs;
} else if (rhs == nullptr) {
return lhs;
}
bool lhs_definitely_true = false;
bool lhs_definitely_false = false;
auto lhs_bool = dynamic_cast<Bool*>(lhs);
if (lhs_bool && lhs_bool->isConst()) {
lhs_definitely_true = lhs_bool->value().value();
lhs_definitely_false = !lhs_bool->value().value();
}
auto rhs_bool = dynamic_cast<Bool*>(rhs);
bool rhs_definitely_true = false;
bool rhs_definitely_false = false;
if (rhs_bool && rhs_bool->isConst()) {
rhs_definitely_true = rhs_bool->value().value();
rhs_definitely_false = !rhs_bool->value().value();
}
if (lhs_definitely_true && rhs_definitely_true) {
return FusionGuard::getCurFusion()->trueVal();
} else if (lhs_definitely_false || rhs_definitely_false) {
return FusionGuard::getCurFusion()->falseVal();
} else if (lhs_definitely_true) {
return rhs;
} else if (rhs_definitely_true) {
return lhs;
}
return IrBuilder::andExpr(lhs, rhs);
}
namespace {
template <typename IrBuilderFunc, typename IntFunc>
Val* minOrMaxExpr(
Int* lhs,
Int* rhs,
IrBuilderFunc ir_builder_func,
IntFunc int_func) {
if (rhs == nullptr) {
return lhs;
} else if (lhs == nullptr) {
return rhs;
} else if (lhs->isConst() && rhs->isConst()) {
return IrBuilder::create<Int>(
int_func(lhs->value().value(), rhs->value().value()));
} else {
return ir_builder_func(lhs, rhs);
}
}
template <typename IrBuilderFunc, typename IntFunc>
Val* minOrMaxExpr(
Val* lhs,
Val* rhs,
IrBuilderFunc ir_builder_func,
IntFunc int_func) {
TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr);
if (lhs == nullptr) {
return rhs;
} else if (rhs == nullptr || lhs == rhs) {
return lhs;
}
auto lhs_int = dynamic_cast<Int*>(lhs);
auto rhs_int = dynamic_cast<Int*>(rhs);
if (lhs_int != nullptr && rhs_int != nullptr) {
return minOrMaxExpr(lhs_int, rhs_int, ir_builder_func, int_func);
} else {
return ir_builder_func(lhs, rhs);
}
}
} // namespace
Val* SimplifyingIrBuilder::maxExpr(Val* lhs, Val* rhs) {
return minOrMaxExpr(
lhs,
rhs,
[](Val* lhs, Val* rhs) { return IrBuilder::maxExpr(lhs, rhs); },
[](int64_t lhs, int64_t rhs) { return std::max(lhs, rhs); });
}
Val* SimplifyingIrBuilder::minExpr(Val* lhs, Val* rhs) {
return minOrMaxExpr(
lhs,
rhs,
[](Val* lhs, Val* rhs) { return IrBuilder::minExpr(lhs, rhs); },
[](int64_t lhs, int64_t rhs) { return std::min(lhs, rhs); });
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
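
As a concrete illustration of the simplification above, the following sketch shows the two main behaviors of SimplifyingIrBuilder::addExpr: constant operands are folded eagerly, and additive identities are dropped. This is not part of the deleted file; it assumes the usual Fusion/FusionGuard setup used elsewhere in this codebase.

// Illustrative sketch only (assumes Fusion/FusionGuard as used in this codebase).
void simplifyingBuilderExample() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1 + 2 is folded directly into a constant Int(3); no BinaryOp is created.
  Val* three = SimplifyingIrBuilder::addExpr(
      IrBuilder::create<Int>(1), IrBuilder::create<Int>(2));

  // i + 0 returns the symbolic operand unchanged.
  Int* i = IrBuilder::create<Int>();
  Val* same = SimplifyingIrBuilder::addExpr(i, IrBuilder::create<Int>(0));

  TORCH_INTERNAL_ASSERT(same == i);
  (void)three;
}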

View File

@ -1,140 +0,0 @@
#pragma once
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_container.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {
class Kernel;
}
class IrCloner;
// Passkey for builder to register properties with statements, and to call
// functions in IrContainer
class TORCH_CUDA_CU_API IrBuilderPasskey {
friend class IrBuilder;
public:
// TODO: Collapse ir_container and Kernel once Kernel inherits from
// IrContainer
IrContainer* const ir_container_ = nullptr;
private:
explicit IrBuilderPasskey(IrContainer* ir_container);
};
//! IR builder interface
class TORCH_CUDA_CU_API IrBuilder {
public:
//! Allocate a new IR node, forwarding the arguments to the appropriate
//! constructor and registering with the container
template <class T, class... Args>
static T* create(Args&&... args) {
auto container = FusionGuard::getCurFusion();
// return create<T>(container, std::forward<Args>(args)...);
TORCH_INTERNAL_ASSERT(
container != nullptr, "Need an active container to build IR.");
T* node = new T(IrBuilderPasskey(container), std::forward<Args>(args)...);
container->registerStmt(IrBuilderPasskey(container), node);
return node;
}
//! Allocate a new IR node, forwarding the arguments to the appropriate
//! constructor and registering with the container
template <class T, class... Args>
static T* create(IrContainer* container, Args&&... args) {
TORCH_INTERNAL_ASSERT(
container != nullptr, "Need an active container to build IR.");
T* node = new T(IrBuilderPasskey(container), std::forward<Args>(args)...);
container->registerStmt(IrBuilderPasskey(container), node);
return node;
}
//! Clone an IR node, forwarding the arguments to the IrCloner constructor.
//! Register clones with IrCloner's target container.
template <class T>
static T* clone(const T* src, IrCloner* ir_cloner);
// Unary operations
static Val* negExpr(Val* val);
static Val* notExpr(Val* val);
static Val* setExpr(Val* val);
static Val* setExprNamedScalar(const std::string& name, Val* val);
static Val* addressExprNamedScalar(const std::string& name, Val* val);
// Binary operations
static Val* andExpr(Val* lhs, Val* rhs);
static Val* eqExpr(Val* lhs, Val* rhs);
static Val* gtExpr(Val* lhs, Val* rhs);
static Val* ltExpr(Val* lhs, Val* rhs);
static Val* leExpr(Val* lhs, Val* rhs);
static Val* geExpr(Val* lhs, Val* rhs);
static Val* addExpr(Val* lhs, Val* rhs);
static Val* subExpr(Val* lhs, Val* rhs);
static Val* mulExpr(Val* lhs, Val* rhs);
static Val* divExpr(Val* lhs, Val* rhs);
static Val* ceilDivExpr(Val* lhs, Val* rhs);
static Val* modExpr(Val* lhs, Val* rhs);
static Val* maxExpr(Val* lhs, Val* rhs);
static Val* minExpr(Val* lhs, Val* rhs);
// Ternary operations
static Val* whereExpr(Val* pred, Val* lhs, Val* rhs);
// Swizzle operations
static Val* swizzle2DIntExpr(
Val* x,
Val* y,
Val* extent_x,
Val* extent_y,
Swizzle2DType swizzle_type);
static Val* pairSelectExpr(Val* in, kir::PairSelect::Selection sel);
private:
static Val* newResult(DataType dtype);
static Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs);
static Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs);
};
//! A wrapper builder with static expression simplification
//!
//! Example:
//! - addExpr(new Int(1), new Int(2)) -> Int(3)
//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo")
//!
//! Designed to be used to simplify predicate and index expressions in
//! generated code. Also, the shift validation may fail without
//! this simplification.
class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder {
public:
static Val* negExpr(Val* val);
static Val* notExpr(Val* val);
static Val* addExpr(Int* lhs, Int::ScalarType rhs);
static Val* addExpr(Val* lhs, Int::ScalarType rhs);
static Val* addExpr(Int* lhs, Int* rhs);
static Val* addExpr(Val* lhs, Val* rhs);
static Val* subExpr(Val* lhs, Val* rhs);
static Val* mulExpr(Int* lhs, Int::ScalarType rhs);
static Val* mulExpr(Val* lhs, Int::ScalarType rhs);
static Val* mulExpr(Int* lhs, Int* rhs);
static Val* mulExpr(Val* lhs, Val* rhs);
static Val* andExpr(Val* lhs, Val* rhs);
static Val* maxExpr(Val* lhs, Val* rhs);
static Val* minExpr(Val* lhs, Val* rhs);
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
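
A minimal sketch of the two create() overloads declared above: the first relies on the container installed by an active FusionGuard, the second registers into an explicitly provided container. Illustrative only; the function name and parameters are not part of the original header.

// Illustrative sketch of the two registration paths.
void irBuilderCreateExample(Fusion* fusion, IrContainer* container) {
  // Path 1: FusionGuard supplies the active container.
  FusionGuard fg(fusion);
  Int* a = IrBuilder::create<Int>(3);

  // Path 2: register into an explicit container, independent of FusionGuard
  // (this is how IrContainer builds its cached zero/one/true/false values).
  Int* b = IrBuilder::create<Int>(container, 5);

  (void)a;
  (void)b;
}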

View File

@ -1,242 +0,0 @@
#include <ir_cloner.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
IrCloner::IrCloner(IrContainer* container) : ir_container_(container) {}
Statement* IrCloner::clone(const Statement* statement) {
if (statement == nullptr) {
return nullptr;
}
// Have we already cloned this node?
const auto it = clones_map_.find(statement);
if (it != clones_map_.end()) {
return it->second;
} else {
// Clone the new node, saving/restoring this->clone_
// since the cloning can be reentrant
auto saved_clone = clone_;
handle(statement);
auto new_node = clone_;
clone_ = saved_clone;
// The base cloning constructor (Statement) should have
// registered the new node. Failure to do so indicates
// that something went horribly wrong.
TORCH_INTERNAL_ASSERT(new_node != nullptr);
TORCH_INTERNAL_ASSERT(clones_map_[statement] == new_node);
return new_node;
}
}
void IrCloner::registerClone(const Statement* src, Statement* clone) {
TORCH_CHECK(src != nullptr);
TORCH_CHECK(clone != nullptr);
TORCH_CHECK(clones_map_.insert({src, clone}).second);
}
void IrCloner::handle(const Statement* s) {
OptInConstDispatch::handle(s);
}
void IrCloner::handle(const Val* v) {
OptInConstDispatch::handle(v);
}
void IrCloner::handle(const Expr* e) {
OptInConstDispatch::handle(e);
}
void IrCloner::handle(const TensorDomain* td) {
clone_ = IrBuilder::clone(td, this);
}
void IrCloner::handle(const IterDomain* id) {
clone_ = IrBuilder::clone(id, this);
}
void IrCloner::handle(const Bool* b) {
clone_ = IrBuilder::clone(b, this);
}
void IrCloner::handle(const Double* d) {
clone_ = IrBuilder::clone(d, this);
}
void IrCloner::handle(const Int* i) {
clone_ = IrBuilder::clone(i, this);
}
void IrCloner::handle(const ComplexDouble* c) {
clone_ = IrBuilder::clone(c, this);
}
void IrCloner::handle(const NamedScalar* named_scalar) {
clone_ = IrBuilder::clone(named_scalar, this);
}
void IrCloner::handle(const TensorView* tv) {
clone_ = IrBuilder::clone(tv, this);
}
void IrCloner::handle(const FullOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ARangeOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const EyeOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const UnaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const BinaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const TernaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const RNGOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const BroadcastOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ReductionOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const GroupedReductionOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const WelfordOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const LoadStoreOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const MmaOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const TransposeOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ExpandOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ShiftOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const GatherOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ViewAsScalar* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const ViewOp* op) {
clone_ = IrBuilder::clone(op, this);
}
void IrCloner::handle(const Split* split) {
clone_ = IrBuilder::clone(split, this);
}
void IrCloner::handle(const Merge* merge) {
clone_ = IrBuilder::clone(merge, this);
}
void IrCloner::handle(const Swizzle2D* swizzle) {
clone_ = IrBuilder::clone(swizzle, this);
}
TensorView* RecomputeTv::recompute(TensorView* tv) {
FusionGuard fg(tv->fusion());
// Disallow recomputation of inputs or outputs. The user would have to be aware
// of these changes and be informed that they happened somehow.
TORCH_INTERNAL_ASSERT(
!tv->isFusionInput(),
"Cannot recompute buffers that are inputs of the fusion.");
// Grab all the expressions used to generate the TensorView
auto exprs = StmtSort::getExprs(tv->fusion(), {tv}, false);
// Run the replicator
RecomputeTv replicator(tv->fusion(), exprs);
// Make const version of pointer for lookup
const auto const_tv = tv;
// Find the recomputed tensor from the cloner
auto clone_it = replicator.clones_map_.find(const_tv);
TORCH_INTERNAL_ASSERT(clone_it != replicator.clones_map_.end());
auto cloned_val = clone_it->second;
TORCH_INTERNAL_ASSERT(
cloned_val->isA<TensorView>(),
"Cloned value is somehow not a tensor view.");
// Return the cloned value
return cloned_val->as<TensorView>();
}
RecomputeTv::RecomputeTv(Fusion* fusion, std::vector<Expr*> exprs)
: IrCloner(fusion), fusion_(fusion) {
// Add inputs to the clones map to prevent cloning them.
for (const auto inp : fusion->inputs()) {
clones_map_[inp] = inp;
}
// Adds all scalar values to clones map to prevent cloning them
for (const auto val : fusion->vals()) {
if (val->getValType().value() == ValType::Scalar ||
val->getValType().value() == ValType::NamedScalar) {
clones_map_[val] = val;
}
}
// Clone the expressions
for (auto expr : exprs) {
IrCloner::handle(expr);
}
}
void RecomputeTv::handle(const TensorDomain* td) {
// Make sure to recompute the history of the iteration domains, explicitly go
// through the expressions and send them to IrCloner.
auto exprs =
StmtSort::getExprs(fusion_, {td->domain().begin(), td->domain().end()});
for (auto expr : exprs) {
IrCloner::handle(expr);
}
IrCloner::handle(td);
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,132 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <ir_builder.h>
#include <unordered_map>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class IrContainer;
//! Clones nodes from an existing Fusion
//!
//! \warning IrCloner machinery is a specialized helper for implementing
//! Fusion copy operations and the limited scope of RecomputeTv below.
//! It is not intended for any other uses.
//!
class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
friend class Statement;
friend class IrBuilder;
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
explicit IrCloner(IrContainer* container);
Statement* clone(const Statement* statement);
template <class T>
T* clone(const T* node) {
return node ? clone(node->template as<Statement>())->template as<T>()
: nullptr;
}
template <class T>
std::vector<T*> clone(const std::vector<T*>& container) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<T*> copy;
copy.reserve(container.size());
for (auto p : container) {
copy.push_back(clone(p));
}
return copy;
}
IrContainer* container() const {
return ir_container_;
}
protected:
void registerClone(const Statement* src, Statement* clone);
void handle(const Statement*) override;
void handle(const Val*) override;
void handle(const Expr*) override;
void handle(const TensorDomain*) override;
void handle(const TensorView*) override;
void handle(const IterDomain*) override;
void handle(const Bool*) override;
void handle(const Double*) override;
void handle(const Int*) override;
void handle(const ComplexDouble*) override;
void handle(const NamedScalar*) override;
void handle(const FullOp*) override;
void handle(const ARangeOp*) override;
void handle(const EyeOp*) override;
void handle(const UnaryOp*) override;
void handle(const BinaryOp*) override;
void handle(const TernaryOp*) override;
void handle(const RNGOp*) override;
void handle(const BroadcastOp*) override;
void handle(const ReductionOp*) override;
void handle(const GroupedReductionOp*) override;
void handle(const WelfordOp*) override;
void handle(const LoadStoreOp*) override;
void handle(const MmaOp*) override;
void handle(const TransposeOp*) override;
void handle(const ExpandOp*) override;
void handle(const ShiftOp*) override;
void handle(const GatherOp*) override;
void handle(const ViewAsScalar*) override;
void handle(const ViewOp*) override;
void handle(const Split*) override;
void handle(const Merge*) override;
void handle(const Swizzle2D*) override;
protected:
// We keep track of the original -> clone map so we don't
// duplicate clones of the same object if referenced multiple times
std::unordered_map<const Statement*, Statement*> clones_map_;
private:
// The destination Fusion container
IrContainer* ir_container_ = nullptr;
// The dispatch interface doesn't allow returning values from
// individual `handle()` methods, so each handler stores its
// result here instead
Statement* clone_ = nullptr;
// Builder to make all the new nodes
IrBuilder builder_;
};
// Replicates all expressions used to generate the provided TensorView. Does not
// replicate inputs. Does not replicate scalar values. In other words the value
// provided will be recomputed from the inputs of the fusion.
class RecomputeTv : private IrCloner {
public:
// Replicates expressions and values in provided expressions.
static TensorView* recompute(TensorView* tv);
private:
RecomputeTv(Fusion* fusion, std::vector<Expr*> exprs);
void handle(const TensorDomain*) final;
Fusion* fusion_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
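
For context, a short sketch of how the RecomputeTv helper above is typically used: it replays the defining expressions of a non-input tensor so that a consumer gets a private copy of that computation. Illustrative only; the surrounding function is not part of the original header.

// Illustrative sketch only.
TensorView* recomputeForConsumer(TensorView* tv_intermediate) {
  // tv_intermediate must not be a fusion input (asserted inside recompute()).
  TensorView* replayed = RecomputeTv::recompute(tv_intermediate);
  // "replayed" is a fresh TensorView whose defining expressions were cloned;
  // fusion inputs and scalar values are shared rather than cloned.
  return replayed;
}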

View File

@ -1,279 +0,0 @@
#include <instrumentation.h>
#include <ir_builder.h>
#include <ir_cloner.h>
#include <ir_container.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
void swap(IrContainer& a, IrContainer& b) noexcept {
FUSER_PERF_SCOPE("Fusion swap");
using std::swap;
// Swap the content
swap(a.vals_up_, b.vals_up_);
swap(a.vals_, b.vals_);
swap(a.exprs_up_, b.exprs_up_);
swap(a.exprs_, b.exprs_);
swap(a.raw_ptrs_, b.raw_ptrs_);
swap(a.val_type_name_map_, b.val_type_name_map_);
swap(a.expr_name_counter_, b.expr_name_counter_);
// Fixup the Statement::fusion_ links for a
for (auto val : a.vals_) {
val->ir_container_ = &a;
}
for (auto expr : a.exprs_) {
expr->ir_container_ = &a;
}
// Fixup the Statement::fusion_ links for b
for (auto val : b.vals_) {
val->ir_container_ = &b;
}
for (auto expr : b.exprs_) {
expr->ir_container_ = &b;
}
}
IrCloner IrContainer::copy(const IrContainer* from, IrContainer* to) {
to->clear();
IrCloner ir_cloner(to);
for (auto val : from->vals_) {
to->vals_.insert(ir_cloner.clone(val));
}
for (auto expr : from->exprs_) {
to->exprs_.insert(ir_cloner.clone(expr));
}
to->val_type_name_map_ = from->val_type_name_map_;
to->expr_name_counter_ = from->expr_name_counter_;
return ir_cloner;
}
IrContainer::IrContainer() = default;
IrContainer::IrContainer(const IrContainer& other) {
FUSER_PERF_SCOPE("IrContainer copy");
IrContainer::copy(&other, this);
}
IrContainer::IrContainer(IrContainer&& other) noexcept {
FUSER_PERF_SCOPE("IrContainer move");
swap(*this, other);
}
IrContainer& IrContainer::operator=(const IrContainer& other) {
FUSER_PERF_SCOPE("IrContainer copy assign");
IrContainer copy(other);
clear();
swap(*this, copy);
return *this;
}
IrContainer& IrContainer::operator=(IrContainer&& other) noexcept {
FUSER_PERF_SCOPE("IrContainer move assign");
clear();
swap(*this, other);
return *this;
}
IrContainer::~IrContainer() {
clear();
}
//! Register the Statement with this container
void IrContainer::registerStmt(IrBuilderPasskey, Statement* stmt) {
if (stmt->isVal()) {
registerVal(stmt->asVal());
} else {
registerExpr(stmt->asExpr());
}
}
//! Register the Val with this container
void IrContainer::registerVal(IrBuilderPasskey, Val* val) {
registerVal(val);
}
//! Register expr with this container.
void IrContainer::registerExpr(IrBuilderPasskey, Expr* expr) {
registerExpr(expr);
}
void IrContainer::registerExpr(ExprPasskey, Expr* expr) {
registerExpr(expr);
}
void IrContainer::removeExpr(Expr* expr) {
TORCH_INTERNAL_ASSERT(
exprs_.find(expr) != exprs_.end(),
"Wanted to remove an expression but it doesn't exist in this container.");
auto expr_in_deque = std::find_if(
exprs_up_.begin(),
exprs_up_.end(),
[expr](std::unique_ptr<Expr>& expr_up) { return expr_up.get() == expr; });
TORCH_INTERNAL_ASSERT(
expr_in_deque != exprs_up_.end(),
"Wanted to remove an expression but its unique ptr is missing.");
exprs_.erase(expr);
exprs_up_.erase(expr_in_deque);
raw_ptrs_.erase((void*)expr);
}
//! Completely remove val from the fusion, break all dependencies associated
//! with it
void IrContainer::removeVal(Val* val) {
// Don't remove shortcuts
if (val == true_val_.get() || val == false_val_.get() ||
val == one_val_.get() || val == zero_val_.get() ||
val == magic_zero_val_.get()) {
return;
}
TORCH_INTERNAL_ASSERT(
vals_.find(val) != vals_.end(),
"Wanted to remove a value but it doesn't exist in this container.");
auto val_in_deque = std::find_if(
vals_up_.begin(), vals_up_.end(), [val](std::unique_ptr<Val>& val_up) {
return val_up.get() == val;
});
TORCH_INTERNAL_ASSERT(
val_in_deque != vals_up_.end(),
"Wanted to remove a value but its unique ptr is missing.");
vals_.erase(val);
vals_up_.erase(val_in_deque);
raw_ptrs_.erase((void*)val);
}
//! Register the Val with this container
void IrContainer::registerVal(Val* val) {
if (inContainer(val)) {
return;
}
vals_up_.emplace_back(std::unique_ptr<Val>(val));
vals_.emplace(vals_up_.back().get());
val->setName(IrContainerPasskey(), getValName(vals_up_.back()->vtype()));
raw_ptrs_.emplace((void*)vals_up_.back().get());
}
//! Register expr with this container.
void IrContainer::registerExpr(Expr* expr) {
if (inContainer(expr)) {
return;
}
exprs_up_.emplace_back(std::unique_ptr<Expr>(expr));
exprs_.emplace(exprs_up_.back().get());
expr->setName(IrContainerPasskey(), getExprName());
raw_ptrs_.emplace((void*)exprs_up_.back().get());
}
void IrContainer::clear() noexcept {
FUSER_PERF_SCOPE("IrContainer clear");
vals_.clear();
vals_up_.clear();
exprs_.clear();
exprs_up_.clear();
raw_ptrs_.clear();
val_type_name_map_.clear();
expr_name_counter_ = 0;
}
bool IrContainer::inContainer(const Statement* stmt) const {
const void* const_void = (const void*)(stmt);
void* nonconst_void = const_cast<void*>(const_void); // NOLINT
if (raw_ptrs_.find(nonconst_void) == raw_ptrs_.end()) {
return false;
}
TORCH_INTERNAL_ASSERT(
stmt->container() == this,
"Container claims to own stmt, but stmt disagrees.");
Statement* nonconst_stmt = const_cast<Statement*>(stmt); // NOLINT
if (stmt->isExpr()) {
TORCH_INTERNAL_ASSERT(
exprs_.find(nonconst_stmt->as<Expr>()) != exprs_.end(),
"Somehow container claims to and not to own an Expr.");
}
if (stmt->isVal()) {
TORCH_INTERNAL_ASSERT(
vals_.find(nonconst_stmt->as<Val>()) != vals_.end(),
"Somehow container claims to and not to own an Val.");
}
return true;
}
// Shortcuts for frequently used vals
Int* IrContainer::zeroVal() {
if (!zero_val_) {
auto zero_val = IrBuilder::create<Int>(this, 0);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == zero_val);
zero_val_ = std::unique_ptr<Int>(vals_up_.back().release()->as<Int>());
vals_up_.pop_back();
}
return zero_val_.get();
}
Int* IrContainer::oneVal() {
if (!one_val_) {
auto one_val = IrBuilder::create<Int>(this, 1);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == one_val);
one_val_ = std::unique_ptr<Int>(vals_up_.back().release()->as<Int>());
vals_up_.pop_back();
}
return one_val_.get();
}
Bool* IrContainer::falseVal() {
if (!false_val_) {
auto false_val = IrBuilder::create<Bool>(this, false);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == false_val);
false_val_ = std::unique_ptr<Bool>(vals_up_.back().release()->as<Bool>());
vals_up_.pop_back();
}
return false_val_.get();
}
Bool* IrContainer::trueVal() {
if (!true_val_) {
auto true_val = IrBuilder::create<Bool>(this, true);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == true_val);
true_val_ = std::unique_ptr<Bool>(vals_up_.back().release()->as<Bool>());
vals_up_.pop_back();
}
return true_val_.get();
}
NamedScalar* IrContainer::magicZeroVal() {
if (!magic_zero_val_) {
auto magic_zero =
IrBuilder::create<NamedScalar>(kMagicZeroName, DataType::Int);
TORCH_INTERNAL_ASSERT(vals_up_.back().get() == magic_zero);
magic_zero_val_ = std::unique_ptr<NamedScalar>(
vals_up_.back().release()->as<NamedScalar>());
vals_up_.pop_back();
}
return magic_zero_val_.get();
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
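
A small usage sketch of the ownership queries and cached shortcut values implemented above (illustrative only; the function is not part of the original file):

// Illustrative sketch only.
void containerOwnershipExample(IrContainer* container) {
  // Values created through the explicit-container overload are registered
  // with, and owned by, that container.
  Int* v = IrBuilder::create<Int>(container, 7);
  TORCH_INTERNAL_ASSERT(container->inContainer(v));
  container->assertInContainer(v, "Freshly created value missing:");

  // The shortcut values are created lazily, cached, and never removed by
  // removeVal(); repeated calls return the same node.
  TORCH_INTERNAL_ASSERT(container->zeroVal() == container->zeroVal());
}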

View File

@ -1,174 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <ir_base_nodes.h>
#include <utils.h>
#include <deque>
#include <unordered_map>
#include <unordered_set>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class IrBuilderPasskey;
class ExprPasskey;
class OptOutMutator;
class Int;
class Bool;
class NamedScalar;
// Passkey for container to register names with statements
class IrContainerPasskey {
friend class IrContainer;
private:
explicit IrContainerPasskey() {}
};
class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase {
public:
IrContainer();
IrContainer(const IrContainer& other);
IrContainer(IrContainer&& other) noexcept;
IrContainer& operator=(const IrContainer& other);
IrContainer& operator=(IrContainer&& other) noexcept;
virtual ~IrContainer();
bool inContainer(const Statement* stmt) const;
void assertInContainer(const Statement* stmt, const std::string& msg) const {
TORCH_CHECK(
inContainer(stmt), msg, " it was not found in the active container.");
}
//! Return in insertion order
const std::deque<Val*> deterministic_vals() const noexcept {
std::deque<Val*> vals_deque;
std::transform(
vals_up_.begin(),
vals_up_.end(),
std::back_inserter(vals_deque),
[](const std::unique_ptr<Val>& val_up) { return val_up.get(); });
return vals_deque;
}
//! Register the Statement with this container
virtual void registerStmt(IrBuilderPasskey, Statement* stmt);
//! Register the Val with this container
virtual void registerVal(IrBuilderPasskey, Val* val);
//! Register expr with this container.
virtual void registerExpr(IrBuilderPasskey, Expr* expr);
//! Allow expr's to register themselves with a container, this is only used
//! for broadcastOp so it can register itself in its constructor so root maps
//! can be built.
virtual void registerExpr(ExprPasskey, Expr* expr);
//! Return the set of Exprs registered with this fusion. Warning: This will
//! return exprs outside inputs/outputs, so can be unsafe for use with
//! segmented fusions.
const std::unordered_set<Expr*>& unordered_exprs() const noexcept {
return exprs_;
}
//! Return the set of Vals registered with this fusion
const std::unordered_set<Val*>& vals() const noexcept {
return vals_;
}
// Shortcuts for frequently used vals
Int* zeroVal();
Int* oneVal();
Bool* falseVal();
Bool* trueVal();
NamedScalar* magicZeroVal();
protected:
static IrCloner copy(const IrContainer* from, IrContainer* to);
friend void swap(IrContainer& a, IrContainer& b) noexcept;
// Let mutator remove Exprs.
friend OptOutMutator;
virtual void removeExpr(Expr* expr);
//! Completely remove val from the fusion, break all dependencies associated
//! with it
virtual void removeVal(Val* val);
//! Register the Val with this container
virtual void registerVal(Val* val);
//! Register expr with this container.
virtual void registerExpr(Expr* expr);
StmtNameType getValName(ValType vtype) {
if (val_type_name_map_.find(vtype) == val_type_name_map_.end()) {
val_type_name_map_[vtype] = 0;
}
return val_type_name_map_[vtype]++;
}
StmtNameType getExprName() {
return expr_name_counter_++;
}
void clear() noexcept;
// Deque of unique pointer is the memory owning data structure
std::deque<std::unique_ptr<Val>> vals_up_;
// A convenient set to return when we just need an unordered set to do
// something like check if a Val is in this container
std::unordered_set<Val*> vals_;
// Deque of unique pointer is the memory owning data structure
std::deque<std::unique_ptr<Expr>> exprs_up_;
// A convenient set to return when we just need an unordered set to do
// something like check if an Expr is in this container
std::unordered_set<Expr*> exprs_;
// Used to implement a generic "inContainer" that can be passed an invalid
// pointer. Specifically a pointer to a Statement owned by another container
// that has been freed. We can't check normally with the unordered_sets we
// already have because it would require a const_cast from a constant
// expr/val, or a dynamic cast from a Statement.
std::unordered_set<void*> raw_ptrs_;
// Values names counters
std::unordered_map<ValType, StmtNameType, TypeHash> val_type_name_map_;
// Expression names counter
StmtNameType expr_name_counter_ = 0;
// Manually store some persistent, frequently used nodes. It's very
// challenging to do this in any way other than manually, as detecting whether
// a container has one of these vals is tricky. Specifically, if the container
// doesn't own the node, it's hard to tell from the outside whether it may have
// been removed and then re-registered. It can also be tricky to know when
// we're using a different container: as FusionCopy_test demonstrates, deleting
// and then creating containers can result in the same pointer for the
// container.
std::unique_ptr<Bool> true_val_;
std::unique_ptr<Bool> false_val_;
std::unique_ptr<Int> one_val_;
std::unique_ptr<Int> zero_val_;
std::unique_ptr<NamedScalar> magic_zero_val_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
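
A brief sketch of the two value views exposed above: the unordered set for membership checks, and the insertion-ordered deque for reproducible traversal (illustrative only):

// Illustrative sketch only.
void valueViewsExample(const IrContainer* container) {
  // Unordered view: cheap membership checks, unspecified iteration order.
  const auto& all_vals = container->vals();

  // Deterministic view: values in registration order, useful for stable
  // printing and debugging. Every entry is also present in the set above.
  for (Val* v : container->deterministic_vals()) {
    TORCH_INTERNAL_ASSERT(all_vals.count(v) != 0);
  }
}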

View File

@ -1,519 +0,0 @@
#include <ir_graphviz.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_builder.h>
#include <type.h>
#include <fstream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace {
// Private helper, generating node labels for IrGraphGenerator
class IrNodeLabel : private OptInConstDispatch {
using DetailLevel = IrGraphGenerator::DetailLevel;
public:
static std::string gen(
const Statement* node,
DetailLevel detail_level = DetailLevel::Basic) {
IrNodeLabel generator(detail_level);
generator.OptInConstDispatch::handle(node);
return generator.label_.str();
}
private:
explicit IrNodeLabel(DetailLevel detail_level)
: detail_level_(detail_level) {}
~IrNodeLabel() override = default;
void handle(const Bool* b) override {
if (b->isSymbolic()) {
label_ << "b" << b->name();
} else {
if (detail_level_ >= DetailLevel::Explicit) {
label_ << "b" << b->name() << "=";
}
label_ << *b->value();
}
}
void handle(const Double* d) override {
if (d->isSymbolic()) {
label_ << "d" << d->name();
} else {
if (detail_level_ >= DetailLevel::Explicit) {
label_ << "d" << d->name() << "=";
}
label_ << *d->value();
}
}
void handle(const Int* i) override {
if (i->isSymbolic()) {
label_ << "i" << i->name();
} else {
if (detail_level_ >= DetailLevel::Explicit) {
label_ << "i" << i->name() << "=";
}
label_ << *i->value();
}
}
void handle(const NamedScalar* ns) override {
label_ << ns->name();
}
void handle(const IterDomain* id) override {
label_ << id->getIterType();
label_ << id->getParallelType();
label_ << "(";
if (!id->start()->isZeroInt()) {
label_ << IrNodeLabel::gen(id->start()) << " : ";
}
label_ << IrNodeLabel::gen(id->extent());
label_ << ")";
}
void handle(const Split* split) override {
label_ << "Split(inner=" << (split->innerSplit() ? "true" : "false")
<< ", factor=" << IrNodeLabel::gen(split->factor()) << ")";
}
void handle(const Merge* merge) override {
label_ << "Merge";
}
private:
std::stringstream label_;
const DetailLevel detail_level_;
};
// Small color palette from the X11 theme
static const char* getColorFromIndex(size_t index) {
const size_t number_of_colors = 10;
index = index % number_of_colors;
switch (index) {
case 0: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "azure";
case 1: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "pink";
case 2: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "green";
case 3: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "grey";
case 4: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "yellow";
case 5: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "lavender";
case 6: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "cyan";
case 7: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "white";
case 8: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "magenta";
case 9: // NOLINT(cppcoreguidelines-avoid-magic-numbers)
return "red";
default:
break;
}
return "";
}
} // anonymous namespace
void IrGraphGenerator::print(
const Fusion* fusion,
const char* filename,
DetailLevel detail_level,
ExprColorMap* expr_color_map) {
std::ofstream dot_file(filename);
TORCH_CHECK(dot_file.good(), "Failed to open the IR graph file");
dot_file << toGraphviz(fusion, detail_level, expr_color_map);
}
std::string IrGraphGenerator::toGraphviz(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map) {
IrGraphGenerator ir_graph(fusion, detail_level, expr_color_map);
return ir_graph.generate();
}
IrGraphGenerator::IrGraphGenerator(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map)
: detail_level_(detail_level),
fusion_(fusion),
expr_color_map_(expr_color_map) {
// setup inputs & outputs
// (indexes used to quickly check if a value is fusion input or output)
for (const auto* input : fusion->inputs()) {
TORCH_CHECK(inputs_.count(input) == 0);
inputs_.insert(input);
}
for (const auto* output : fusion->outputs()) {
TORCH_CHECK(outputs_.count(output) == 0);
outputs_.insert(output);
}
}
std::string IrGraphGenerator::getid(const Statement* stm) {
const auto it = id_map_.find(stm);
if (it == id_map_.end()) {
// First reference, generate a new id
std::stringstream new_id;
new_id << "stm_" << next_id_++;
id_map_.insert({stm, new_id.str()});
return new_id.str();
} else {
return it->second;
}
}
void IrGraphGenerator::addArc(
const Statement* src,
const Statement* dst,
const std::string& style) {
// We automatically visit (handle) the arc's source and destination
handle(src);
handle(dst);
// generate and queue the arc definition
std::stringstream arc_def;
arc_def << getid(src) << " -> " << getid(dst) << " " << style;
arcs_.push_back(arc_def.str());
}
void IrGraphGenerator::printExpr(const Expr* expr, const std::string& label) {
graph_def_ << " " << getid(expr) << " "
<< "[label=\"" << label << "\", shape=oval, color=blue, "
<< "style=filled, fillcolor=";
if (expr_color_map_ != nullptr && expr_color_map_->count(expr)) {
graph_def_ << getColorFromIndex(expr_color_map_->at(expr));
} else {
graph_def_ << "azure";
}
graph_def_ << "];\n";
}
void IrGraphGenerator::printValue(const Val* val, const std::string& label) {
graph_def_ << " " << getid(val) << " [label=\"" << label
<< "\", shape=rect, color=green, fontsize=10];\n";
}
std::string IrGraphGenerator::generate() {
// IrGraphGenerator instances are not reusable
TORCH_CHECK(graph_def_.str().empty());
TORCH_CHECK(visited_.empty());
// record detail level
graph_def_ << "// detail level: ";
switch (detail_level_) {
case DetailLevel::ComputeOnly:
graph_def_ << "compute only\n";
break;
case DetailLevel::Basic:
graph_def_ << "minimal\n";
break;
case DetailLevel::Explicit:
graph_def_ << "explicit\n";
break;
case DetailLevel::Verbose:
graph_def_ << "verbose\n";
break;
default:
TORCH_CHECK(!"Unexpected detail level");
}
graph_def_ << "digraph fusion_ir {\n"
<< " node [shape=circle, color=gray];\n"
<< " edge [color=black];\n";
// Compute graph
generateComputeGraph();
// Schedule graph
if (detail_level_ > DetailLevel::ComputeOnly) {
generateScheduleGraph();
}
// All expressions & values
// (These are otherwise unreachable (dead) nodes)
if (detail_level_ >= DetailLevel::Verbose) {
for (const auto* expr : fusion_->unordered_exprs()) {
handle(expr);
}
for (const auto* val : fusion_->vals()) {
handle(val);
}
}
// Finally, print all arc definitions
for (const auto& arc : arcs_) {
graph_def_ << " " << arc << ";\n";
}
graph_def_ << "}\n";
// Make sure that all referenced nodes have been visited
for (const auto& kv : id_map_) {
TORCH_CHECK(visited(kv.first));
}
return graph_def_.str();
}
void IrGraphGenerator::generateComputeGraph() {
graph_def_ << " subgraph cluster_compute {\n"
<< " label=\"compute\";\n"
<< " style=dashed;\n";
// Inputs
for (const auto* input : fusion_->inputs()) {
handle(input);
}
// Outputs
for (const auto* output : fusion_->outputs()) {
handle(output);
}
graph_def_ << " }\n";
}
void IrGraphGenerator::generateScheduleGraph() {
graph_def_ << " subgraph cluster_schedule {\n"
<< " label=\"schedule\";\n"
<< " style=dashed;\n";
// Connect TensorView with their TensorDomain
// (this will trigger the traversal of the schedule graph)
for (auto tv : tensor_views_) {
addArc(tv->domain(), tv, "[style=dashed, arrowhead=none]");
if (detail_level_ >= DetailLevel::Explicit) {
// Maybe not the best way to handle the root domain, but should be okay
addArc(
tv,
IrBuilder::create<TensorDomain>(tv->getRootDomain()),
"[style=dashed, color=green, arrowhead=none]");
if (tv->domain()->hasRFactor())
addArc(
tv,
IrBuilder::create<TensorDomain>(tv->domain()->getRFactorDomain()),
"[style=dashed, color=green, arrowhead=none]");
}
}
graph_def_ << " }\n";
}
void IrGraphGenerator::handle(const Statement* s) {
OptInConstDispatch::handle(s);
}
void IrGraphGenerator::handle(const Val* v) {
if (!visited(v)) {
visited_.insert(v);
if (const auto* def = v->definition()) {
handle(def);
}
OptInConstDispatch::handle(v);
}
}
void IrGraphGenerator::handle(const Expr* e) {
if (!visited(e)) {
visited_.insert(e);
OptInConstDispatch::handle(e);
}
}
void IrGraphGenerator::handle(const TensorDomain* td) {
graph_def_ << " " << getid(td) << " [label=\"TensorDomain\", "
<< "shape=note, color=gray, "
<< "style=filled, fillcolor=gray90, fontsize=10];\n";
for (auto iter_domain : td->domain()) {
addArc(iter_domain, td, "[color=gray]");
}
}
void IrGraphGenerator::handle(const IterDomain* id) {
graph_def_ << " " << getid(id) << " [label=\"" << IrNodeLabel::gen(id)
<< "\", shape=cds, color=gray, fontsize=10];\n";
if (!id->start()->isZeroInt()) {
addArc(id->start(), id, "[color=gray]");
}
addArc(id->extent(), id, "[color=gray]");
}
void IrGraphGenerator::handle(const Bool* b) {
printValue(b, IrNodeLabel::gen(b, detail_level_));
}
void IrGraphGenerator::handle(const Double* d) {
printValue(d, IrNodeLabel::gen(d, detail_level_));
}
void IrGraphGenerator::handle(const Int* i) {
printValue(i, IrNodeLabel::gen(i, detail_level_));
}
void IrGraphGenerator::handle(const ComplexDouble* i) {
printValue(i, IrNodeLabel::gen(i, detail_level_));
}
void IrGraphGenerator::handle(const NamedScalar* i) {
printValue(i, IrNodeLabel::gen(i, detail_level_));
}
void IrGraphGenerator::handle(const TensorView* tv) {
std::stringstream label;
label << "{T" << tv->name() << "|";
label << "{";
bool first_axis = true;
for (auto iter_domain : tv->domain()->domain()) {
if (first_axis) {
first_axis = false;
} else {
label << "|";
}
label << IrNodeLabel::gen(iter_domain);
}
label << "}}";
const bool is_input = inputs_.find(tv) != inputs_.end();
const bool is_output = outputs_.find(tv) != outputs_.end();
const char* style = is_input ? "style=filled, fillcolor=palegreen"
: is_output ? "style=filled, fillcolor=lightblue"
: "style=filled, fillcolor=beige";
graph_def_ << " " << getid(tv) << " [label=\"" << label.str()
<< "\", shape=Mrecord, color=brown, " << style << "];\n";
tensor_views_.push_back(tv);
}
void IrGraphGenerator::handle(const FullOp* fop) {
// node
printExpr(fop, "full");
// inputs & outputs
addArc(fop->getFillValue(), fop);
addArc(fop, fop->output(0));
}
void IrGraphGenerator::handle(const ARangeOp* aop) {
// node
printExpr(aop, "arange");
// inputs & outputs
addArc(aop->start(), aop);
addArc(aop->end(), aop);
addArc(aop->step(), aop);
addArc(aop, aop->output(0));
}
void IrGraphGenerator::handle(const EyeOp* eop) {
// node
printExpr(eop, "eye");
// inputs & outputs
addArc(eop, eop->output(0));
}
void IrGraphGenerator::handle(const UnaryOp* uop) {
// node
std::stringstream label;
label << uop->getUnaryOpType();
printExpr(uop, label.str());
// inputs & outputs
addArc(uop->in(), uop);
addArc(uop, uop->out());
}
void IrGraphGenerator::handle(const BinaryOp* bop) {
// node
std::stringstream label;
label << bop->getBinaryOpType();
printExpr(bop, label.str());
// inputs & outputs
addArc(bop->lhs(), bop);
addArc(bop->rhs(), bop, "[color=blue]");
addArc(bop, bop->out());
}
void IrGraphGenerator::handle(const TernaryOp* op) {
// node
std::stringstream label;
label << op->getTernaryOpType();
printExpr(op, label.str());
// inputs & outputs
addArc(op->in1(), op);
addArc(op->in2(), op, "[color=blue]");
addArc(op->in3(), op, "[color=brown]");
addArc(op, op->out());
}
void IrGraphGenerator::handle(const RNGOp* op) {
// node
std::stringstream label;
label << op->getRNGOpType();
printExpr(op, label.str());
// inputs & outputs
addArc(op, op->output(0));
}
void IrGraphGenerator::handle(const BroadcastOp* op) {
printExpr(op, "Broadcast");
addArc(op->in(), op);
addArc(op, op->out());
}
void IrGraphGenerator::handle(const ReductionOp* op) {
// node
std::stringstream label;
label << "Reduction(" << op->getReductionOpType() << ")";
printExpr(op, label.str());
// inputs & outputs
addArc(op->in(), op);
addArc(op->init(), op, "[color=blue]");
addArc(op, op->out());
}
void IrGraphGenerator::handle(const Split* split) {
printExpr(split, IrNodeLabel::gen(split));
addArc(split->in(), split);
addArc(split, split->outer());
addArc(split, split->inner());
}
void IrGraphGenerator::handle(const Merge* merge) {
printExpr(merge, IrNodeLabel::gen(merge));
addArc(merge->outer(), merge);
addArc(merge->inner(), merge);
addArc(merge, merge->out());
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
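
A small sketch of driving the generator above with an ExprColorMap, which assigns each expression one of the ten palette colors defined in getColorFromIndex (illustrative only; the round-robin grouping is made up):

// Illustrative sketch only.
void dumpColoredGraph(const Fusion* fusion) {
  IrGraphGenerator::ExprColorMap color_map;
  size_t group = 0;
  for (const Expr* e : fusion->unordered_exprs()) {
    color_map[e] = group++ % 10; // cycle through the 10-color palette above
  }
  IrGraphGenerator::print(
      fusion, "ir_colored.dot", IrGraphGenerator::DetailLevel::Basic, &color_map);
}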

View File

@ -1,130 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
// Generates a DOT (https://www.graphviz.org) graph
// representation of a fuser IR
//
// Usage:
// 1) Add calls to IrGraphGenerator::print(), for example:
// `IrGraphGenerator::print(&fusion, "ir.dot")`
//
// 2) Call IrGraphGenerator::print() from a debugger. Using gdb for example:
// `call IrGraphGenerator::print(&fusion, "ir.dot",
// IrGraphGenerator::DetailLevel::Explicit)`
//
// Notes:
// - When called from the debugger, the detail_level must be
// explicitly passed in (most debuggers don't support default arguments)
//
// - The output dot file path can't include shell specific notations,
// for example you can't use "~/temp/ir.dot" ("/home/user/temp/ir.dot"
// must be used instead)
//
class TORCH_CUDA_CU_API IrGraphGenerator : private OptInConstDispatch {
public:
enum class DetailLevel {
ComputeOnly, // Only dataflow (compute) nodes
Basic, // Compute + schedule, with minimal details (default)
Explicit, // Additional details (ex. symbolic names for scalar constants)
Verbose, // Includes all values and dead definitions
};
using ExprColorMap = std::unordered_map<const Expr*, size_t>;
public:
static void print(
const Fusion* fusion,
const char* filename,
DetailLevel detail_level = DetailLevel::Basic,
ExprColorMap* expr_color_map = nullptr);
static std::string toGraphviz(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map = nullptr);
private:
IrGraphGenerator(
const Fusion* fusion,
DetailLevel detail_level,
ExprColorMap* expr_color_map = nullptr);
~IrGraphGenerator() override = default;
std::string generate();
void generateComputeGraph();
void generateScheduleGraph();
void handle(const Statement*) override;
void handle(const Val*) override;
void handle(const Expr*) override;
void handle(const TensorDomain*) override;
void handle(const TensorView*) override;
void handle(const IterDomain*) override;
void handle(const Bool*) override;
void handle(const Double*) override;
void handle(const Int*) override;
void handle(const ComplexDouble*) override;
void handle(const NamedScalar*) override;
void handle(const FullOp*) override;
void handle(const ARangeOp*) override;
void handle(const EyeOp*) override;
void handle(const UnaryOp*) override;
void handle(const BinaryOp*) override;
void handle(const TernaryOp*) override;
void handle(const RNGOp*) override;
void handle(const BroadcastOp*) override;
void handle(const ReductionOp*) override;
void handle(const Split*) override;
void handle(const Merge*) override;
// lookup the graph id, creating one if not found
std::string getid(const Statement* stm);
bool visited(const Statement* s) const {
return visited_.find(s) != visited_.end();
}
void addArc(
const Statement* src,
const Statement* dst,
const std::string& style = "");
void printExpr(const Expr* expr, const std::string& label);
void printValue(const Val* val, const std::string& label);
private:
const DetailLevel detail_level_;
const Fusion* const fusion_;
std::stringstream graph_def_;
std::unordered_map<const Statement*, std::string> id_map_;
std::unordered_set<const Statement*> visited_;
std::unordered_set<const Val*> inputs_;
std::unordered_set<const Val*> outputs_;
std::vector<const TensorView*> tensor_views_;
std::vector<std::string> arcs_;
int next_id_ = 1;
ExprColorMap* expr_color_map_ = nullptr;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,600 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <fusion.h>
#include <ir_base_nodes.h>
#include <ir_internal_nodes.h>
#include <mma_type.h>
#include <torch/csrc/jit/ir/ir.h>
//! Nodes in here are intended to be "user facing", users in this sense being
//! those who want to be able to generate CUDA code.
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class WelfordResult;
class ViewTransform;
class IrCloner;
class IrBuilderPasskey;
//! A Bool value
//!
//! This value can be a symbolic value (defined after the kernel
//! is compiled) or a constant value (inlined into the kernel definition).
//!
class TORCH_CUDA_CU_API Bool : public Val {
public:
Bool(IrBuilderPasskey passkey);
explicit Bool(IrBuilderPasskey passkey, bool value);
explicit Bool(IrBuilderPasskey passkey, c10::optional<bool> value);
Bool(const Bool* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<bool> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<bool> maybe_value_;
};
//! A Float64 value. This value can be a symbolic value (defined after the
//! kernel is compiled) or a constant value (inlined into the kernel
//! definition).
class TORCH_CUDA_CU_API Double : public Val {
public:
using ScalarType = double;
Double(IrBuilderPasskey passkey);
explicit Double(IrBuilderPasskey passkey, ScalarType value);
explicit Double(IrBuilderPasskey passkey, c10::optional<ScalarType> value);
Double(const Double* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<ScalarType> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<ScalarType> maybe_value_;
};
//! An Int64 value. If used for indexing it's set as size_t. Otherwise it's an
//! inlined literal in the kernel.
class TORCH_CUDA_CU_API Int : public Val {
public:
using ScalarType = int64_t;
Int(IrBuilderPasskey passkey);
explicit Int(IrBuilderPasskey passkey, ScalarType value);
explicit Int(IrBuilderPasskey passkey, c10::optional<ScalarType> value);
Int(const Int* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<ScalarType> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<ScalarType> maybe_value_;
};
//! A c10::complex<double> value. This value can be a symbolic value (defined
//! after the kernel is compiled) or a constant value (inlined into the kernel
//! definition).
class TORCH_CUDA_CU_API ComplexDouble : public Val {
public:
using ScalarType = c10::complex<double>;
ComplexDouble(IrBuilderPasskey passkey);
explicit ComplexDouble(IrBuilderPasskey passkey, ScalarType value);
explicit ComplexDouble(
IrBuilderPasskey passkey,
c10::optional<ScalarType> value);
ComplexDouble(const ComplexDouble* src, IrCloner* ir_cloner);
bool isSymbolic() const {
return !(maybe_value_.has_value());
}
bool isConst() const final {
return maybe_value_.has_value();
}
c10::optional<ScalarType> value() const {
return maybe_value_;
}
bool sameAs(const Statement* other) const override;
private:
const c10::optional<ScalarType> maybe_value_;
};
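//! A brief sketch of the symbolic-versus-constant distinction these scalar
//! classes describe (illustrative only; assumes an active FusionGuard as
//! elsewhere in this codebase):
//!
//!   void scalarKindsExample() {
//!     // Constant: the value is known at definition time and can be inlined
//!     // into the generated kernel.
//!     Double* half = IrBuilder::create<Double>(0.5);
//!     TORCH_INTERNAL_ASSERT(half->isConst() && !half->isSymbolic());
//!
//!     // Symbolic: the value is bound later (e.g. at kernel launch).
//!     Double* scale = IrBuilder::create<Double>();
//!     TORCH_INTERNAL_ASSERT(scale->isSymbolic() && !scale->value().has_value());
//!   }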
//! Mode during propagation of computeAt. Standard will throw an error if the
//! computeAt position provided can't be satisfied; best effort will lower the
//! computeAt position as needed during traversal; most inlined will increase
//! the computeAt position to the maximum possible through traversal.
enum class ComputeAtMode { Standard, BestEffort, MostInlined };
class TransformPropagator;
struct MostInlinedTransformPropagator;
class TransformIter;
class TransformReplay;
class OptOutMutator;
class TensorDomain;
class MaxPosCalculator;
namespace ir_utils {
class TVDomainGuard;
}
//! TensorView is our primitive Tensor Type used in code generation. It can be
//! thought of as representing physical memory; however, its dimensionality is
//! modified as split/merge/computeAt functions are called. The history of
//! these transformations is kept and used for generating actual code
//! referencing physical memory. Generally, when users are thinking of code
//! generation in reference to a Tensor, this is the class they should be
//! interacting with.
//!
//! The reason we need both TensorView and TensorDomain is that we need to have
//! a record of both what is being computed and how it is being computed. For
//! example we may have the operation:
//!
//! TV3[I, J, K] = TV2[I, J, K] + TV1[I, J, K]
//!
//! The mathematical operations here are on the tensor views TV1, TV2, and
//! TV3. This operation is a pointwise operation. To compute this pointwise
//! operation we iterate over the 3D TensorDomain [I, J, K], where K is the
//! fastest changing dimension.
//!
//! \todo Need to work on the const model for TensorView, making all functions
//! that should be const, const. Gave this a try but it expanded really quickly.
//! getComputeAtAxis not being const, because it can return a TV that some
//! expect to be non-const, is the biggest headache.
//!
class TORCH_CUDA_CU_API TensorView : public Val {
public:
TensorView(
IrBuilderPasskey passkey,
TensorDomain* domain,
DataType dtype,
MemoryType mtype = MemoryType::Local);
explicit TensorView(
IrBuilderPasskey passkey,
const std::shared_ptr<c10::TensorType>& tensor_type);
explicit TensorView(
IrBuilderPasskey passkey,
const std::shared_ptr<Value>& jit_value);
TensorView(const TensorView* src, IrCloner* ir_cloner);
TensorDomain* domain() const {
return domain_;
}
//! This is for a TensorView with an rFactor domain that is an input to a
//! fusion segment. We convert the rfactor domain into a new root domain.
//! Any dynamic-sized rfactor iterDomains are given a new symbolic extent.
//! Concrete integer extents are kept. Output TensorViews of any subsequent
//! expressions that use this TensorView are also updated.
void convertRfactorToRootDomain();
void setContiguity(const std::vector<bool>& contig) {
domain()->setContiguity(contig);
}
void setContiguity(bool contig) {
setContiguity(std::vector<bool>(domain()->contiguity().size(), contig));
}
bool hasReduction() const;
bool hasBlockReduction() const;
bool hasGridReduction() const;
bool hasBroadcast() const;
bool hasRFactor() const;
//! This is the previous hasReduction logic,
//! kept here exclusively for the lower loop pass;
//! will be deprecated when the Fusion IR pass can convert
//! trivial reductions
bool hasAnyReduction() const;
//! Returns true if this tensor is zero dimensional,
//! i.e. a wrapped scalar or an empty placeholder.
bool isZeroDim() const {
return nDims() == 0;
}
//! Returns true if this tensor does not contain
//! any value.
bool isEmptyTensor() const;
c10::optional<unsigned int> getReductionAxis() const;
const std::vector<IterDomain*>& getRootDomain() const;
const std::vector<IterDomain*>& getRFactorDomain() const;
// If rfactor domain exists in domain() return it, otherwise return root
// domain.
const std::vector<IterDomain*>& getMaybeRFactorDomain() const;
IterDomain* axis(int pos) const;
// Does it share outer axes with other tensors?
bool hasComputeAt() const {
return compute_at_pos_ > 0;
}
bool hasMaxProducerPosition() const {
return max_producer_pos_ > 0;
}
size_t nDims() const;
// sets cpu_scalar_ value, which is special handling for CPU based zero-dim
// tensors (i.e. CPU Tensors that only have one value). This is only used on an
// input value, otherwise it is ignored. This special handling is important
// because these "scalars" should be type promoted as a tensor, but we want to
// avoid explicit copying of the data, so we want to pass the data value as a
// standard kernel argument value.
void setCpuScalar(bool is_cpu_scalar);
// returns cpu_scalar_ value, which is special handling for CPU based zero-dim
// tensors (i.e. CPU Tensors that only have one value). This is only used on an
// input value, otherwise it is ignored. This special handling is important
// because these "scalars" should be type promoted as a tensor, but we want to
// avoid explicit copying of the data, so we want to pass the data value as a
// standard kernel argument value.
bool isCpuScalar() const {
return cpu_scalar_;
}
// Returns the position that this tensor is produced at relative to its axes.
unsigned int getComputeAtPosition() const {
return compute_at_pos_;
}
// Returns the maximum position at which producers are computed relative to
// this tensor. This position dictates the clear expectations of producers.
unsigned int getMaxProducerPosition() const {
return max_producer_pos_;
}
//! This is used when we disconnect a tensorview from a reduction
//! operation and connect it to a non-reduction operator. We need
//! to remove the reduction ids on the tv in this case.
//! Currently only used in translate welford, and this function may
//! be refactored or extended if any more use cases appear.
void clearReductionIterDomains();
//! Compute this TensorView relative to a consumer position, -1 will
//! compute tensors inline with each other, 0 doesn't share
//! any loop nests between the tensors. It's an error when the given
//! position is not legally viable. Alternatively, when the mode
//! parameter is ComputeAtMode::BestEffort, the position is lowered
//! one by one until a valid position is found. When
//! ComputeAtMode::MostInlined is given, the position parameter is
//! ignored, and the deepest possible position is searched.
TensorView* computeAt(
TensorView* consumer,
int position,
ComputeAtMode mode = ComputeAtMode::Standard);
//! Compute this tensor to consumer, at local position, -1 will compute
//! tensors inline with each other, 0 doesn't share any loop nests between the
//! tensors. The mode parameter can be used in the same manner as computeAt.
TensorView* computeWith(
TensorView* consumer,
int position,
ComputeAtMode mode = ComputeAtMode::Standard);
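//! A hypothetical usage sketch (illustrative names, not part of the original
//! header): assuming tv0 is a producer of tv1 in an active Fusion,
//!
//!   tv0->computeAt(tv1, -1);  // fully inline tv0 into tv1
//!   tv0->computeAt(tv1, 1, ComputeAtMode::BestEffort);  // share only the
//!                                                       // outermost loop,
//!                                                       // lowering if needed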
// Split "axis" into 2 axes
//! inner_split dictates if the factor section of the split should be inside
//! the remainder or outside.
//! e.g. split(0, 4, inner_split = true) will result in:
//! tv[id{extent}] -> tv[id{ceilDiv(extent, factor)}, id{factor}]
//! e.g. split(0, 4, inner_split = false) will result in:
//! tv[id{extent}] -> tv[id{factor}, id{ceilDiv(extent, factor)}]
//!
//! When trim_out_of_bounds is true, only the inner domain defined by the
//! start and stop positions is split.
TensorView* split(
int axis,
unsigned int factor,
bool inner_split = true,
bool trim_out_of_bounds = false);
// Split "axis" into 2 axes where the inner axes is size of "factor"
// and outer axis is size axis.size() / factor. Factor can be a symbolic
// value instead of constant. This requires setting the symbolic value as an
// input, or using a parallel dim from NamedScalar::getParallelDim
TensorView* split(
int axis,
Val* factor,
bool inner_split = true,
bool trim_out_of_bounds = false);
// Merge axis_o and axis_i into 1 IterDomain
TensorView* merge(int axis_o, int axis_i);
// Merge axis and axis+1 into 1 IterDomain
TensorView* merge(int axis) {
return merge(axis, axis + 1);
}
// Reorder axes according to old2new[old_pos] = new_pos
TensorView* reorder(const std::unordered_map<int, int>& old2new);
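//! A hypothetical scheduling sketch for the calls above (illustrative only);
//! each call is shown independently on a fresh 2-D tv[I0, I1]:
//!
//!   tv->split(1, 4);                // tv[I0, ceilDiv(I1, 4), 4]
//!   tv->split(1, 4, false);         // outer split: tv[I0, 4, ceilDiv(I1, 4)]
//!   tv->merge(0);                   // merge axes 0 and 1 into one IterDomain
//!   tv->reorder({{0, 1}, {1, 0}});  // old2new map: swap the first two axes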
//! Swizzle indices to improve memory access efficiency.
//!
//! Swizzle::Transpose is a pattern commonly used to avoid bank
//! conflicts in shared memory. It takes two axes and shifts the
//! second axis by the first axis as ((axis1 + axis2) % extent). The
//! memory type must be Shared.
//!
//! \input type Swizzle pattern such as transpose.
//! \input axes Axes to swizzle
TensorView* swizzle(SwizzleType type, const std::vector<int>& axes);
//! Swizzle the rectangular tile defined by the iterdomains corresponding
//! to the 2 given indices.
TensorView* swizzle(
Swizzle2DType swizzle_type,
int x,
int y,
SwizzleMode swizzle_mode = SwizzleMode::Data);
// WARNING: rFactor does not return this TensorView, it returns a new
// TensorView consumed by this!
//
// Take reduction axes out of this domain, and create a new
// domain. New domain will be used to create this domain.
//
// For example:
// TV1[I0, R1, R2, I3] = TV0[I0, I1, I2, I3]
//
// After:
// TV1->rfactor({1}), TV1 is transformed to -> TV1[I0, R2, I3]
//
// The TensorView returned is: TV2[I0, R1, I2, I3]
//
// The reduction will now be set as:
// TV2[I0, R1, I2, I3] = TV0[I0, I1, I2, I3]
// TV1[I0, R2, I3] = TV2[I0, R1, I2, I3]
//
TensorView* rFactor(const std::vector<int>& axes);
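// A hypothetical sketch of the example above (illustrative names only):
//
//   // TV1[I0, R1, R2, I3] = TV0[I0, I1, I2, I3]
//   TensorView* tv2 = tv1->rFactor({1});
//   // tv2 is the returned TV2[I0, R1, I2, I3]; tv1 becomes TV1[I0, R2, I3]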
//! Multi-output version of rFactor, semantically similar with
//! the reduction version except that the rfactor is done
//! for all outputs in a consistent way
std::vector<TensorView*> rFactor(
const std::vector<int>& axes,
const std::vector<TensorView*>& tvs);
//! Create a TensorView before the original tensor. A common use case is to
//! write results into shared memory or registers before moving to global
//! memory. Analogous to TVM Cache_Write
//!
//! @param cache_op: memory operator to use for the inserted op between
//! the data tensor and the cache tensor
TensorView* cacheBefore(
c10::optional<LoadStoreOpType> cache_op = c10::nullopt);
//! Create a TensorView after the original tensor. A common use case is to
//! read tensor into shared memory or registers. Analogous to TVM Cache_Read
//!
//! @param cache_op: memory operator to use for the inserted op between
//! the data tensor and the cache tensor
TensorView* cacheAfter(
c10::optional<LoadStoreOpType> cache_op = c10::nullopt);
// For a fusion output with other uses, we want to avoid writing to global
// memory and then reading the output again. We write to global memory
// separately after an operation. We replace this fusion output with the
// direct write TensorView.
TensorView* cacheFork();
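// A hypothetical caching sketch (illustrative names only): stage a fusion
// input in shared memory and write a fusion output through a cache tensor.
//
//   TensorView* in_cache = input_tv->cacheAfter();    // read input into cache
//   in_cache->setMemoryType(MemoryType::Shared);
//   TensorView* out_cache = output_tv->cacheBefore(); // write via cache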
MemoryType getMemoryType() const {
return memory_type_;
}
void setMemoryType(MemoryType mt);
SwizzleType swizzleType() const {
return swizzle_type_;
}
const std::vector<IterDomain*>& axesToSwizzle() const {
return axes_to_swizzle_;
}
// Apply double buffering transformation
void doubleBuffer();
// Apply circular buffering transformation
void circularBuffer(unsigned int number_of_stage);
// Returns true if this tensor is double buffered.
bool isDoubleBuffered() const {
return is_double_buffered_;
}
// Returns true if this tensor is circular buffered.
bool isCircularBuffered() const {
return is_circular_buffered_;
}
// Returns the depth of circular buffering if applicable.
unsigned int circularBufferDepth() const {
TORCH_INTERNAL_ASSERT(
is_circular_buffered_, toString(), "not circular buffered");
return circular_buffer_stage_;
}
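// A hypothetical sketch (illustrative only): pipeline a cache tensor with
// either double buffering or circular buffering (shown as alternatives).
//
//   smem_tv->doubleBuffer();     // isDoubleBuffered() becomes true
//   smem_tv->circularBuffer(4);  // or: circular buffering with 4 stages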
//! Transforms the innermost iterdomains according to the given mma swizzle;
//! this should be used on the tvs that are either inputs/outputs of an
//! MmaOp, or any tvs that are involved in prolog/epilog fusions and need to
//! have a matching thread swizzle with the mma operand/result.
//! For more detail on usage, see [WarpMmaSwizzler] in scheduler/mma_utils.h.
void applyMmaSwizzle(MmaOptions options);
//! Returns if this tensor view has swizzle operator on its tensor domain.
//! This is the temporary flag for indicating that the new swizzle
//! implementation is used and will be removed in follow ups.
bool hasSwizzleOp() const {
return has_swizzle_op_;
}
friend TORCH_CUDA_CU_API TransformPropagator;
friend TORCH_CUDA_CU_API MostInlinedTransformPropagator;
friend TORCH_CUDA_CU_API TransformReplay;
friend TORCH_CUDA_CU_API OptOutMutator;
friend class InlineBatchingGuard;
friend class ir_utils::TVDomainGuard;
// Inline the computation of this tensor into its consumer at the given
// position. If this tensor is already inlined at a higher position, then this
// call is a no-op. If the rightmost dimensions before `pos` are
// broadcasts, then we will not inline into these broadcast dimensions. If
// best_effort, then we will inline into the highest allowed position that is
// <= `pos`.
void inlineAt(
int64_t pos,
bool best_effort = false,
MaxPosCalculator* calc = nullptr);
// Update the max producer position of the current tensor. This is required
// when we modify producer-consumer relationship of a scheduled tensor, for
// example, grouping multiple reductions.
void updateMaxProducerPosition();
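// A hypothetical sketch (illustrative only) of inlineAt, which sets the
// computeAt position directly on this tensor:
//
//   tv->inlineAt(2);                        // inline into its consumer at position 2
//   tv->inlineAt(2, /*best_effort=*/true);  // or the highest legal position <= 2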
protected:
void setDomain(TensorDomain* td) {
domain_ = td;
}
private:
int normalizeAxisPos(int pos) const {
if (pos < 0) {
pos += nDims();
}
return pos;
}
//! A helper function to maintain the consistency of schedules of
//! multiple outputs when doing rfactor on multi-output reduction ops.
TensorView* multiOutputRfactorHelper(
TensorView* tv,
const std::vector<int>& axes);
private:
TensorDomain* domain_ = nullptr;
unsigned int compute_at_pos_ = 0;
unsigned int max_producer_pos_ = 0;
MemoryType memory_type_ = MemoryType::Local;
SwizzleType swizzle_type_ = SwizzleType::NoSwizzle;
std::vector<IterDomain*> axes_to_swizzle_;
bool is_double_buffered_ = false;
//! Indicates if the tensor is circular buffered.
bool is_circular_buffered_ = false;
//! Indicates the circular buffering stage depth if applicable.
unsigned int circular_buffer_stage_ = 0;
// special handling for CPU based zero-dim tensors (i.e. CPU Tensors that only
// have one value). This is only used on an input value, otherwise it is
// ignored. This special handling is important because these "scalars" should
// be type promoted as a tensor, but we want to avoid explicit copying of the
// data, so we want to pass the data value as a standard kernel argument
// value.
bool cpu_scalar_ = false;
//! Indicates if this tensor view has swizzle operator on its tensor domain.
//! This is the temporary flag for indicating that the new swizzle
//! implementation is used and will be removed in follow ups.
bool has_swizzle_op_ = false;
};
//! A simple TensorView builder
//!
//! Example usage:
//!
//! auto tv = TensorViewBuilder()
//! .ndims(ndims)
//! .dtype(dtype)
//! .contiguity(contiguity)
//! .build();
//!
class TORCH_CUDA_CU_API TensorViewBuilder {
public:
//! Set the number of dimensions of the tensor (default 0, meaning scalar)
TensorViewBuilder& ndims(size_t ndims);
//! Set the data type of the tensor (default DataType::Float)
TensorViewBuilder& dtype(DataType dtype);
//! Set the contiguity information (default non-contiguous)
TensorViewBuilder& contiguity(std::vector<bool> contiguity);
//! Set the shape (default 0 dimensional, i.e. scalar)
TensorViewBuilder& shape(std::vector<Val*> shape);
TensorViewBuilder& shape(const std::vector<int64_t>& shape);
//! Creates a new TensorView with the specified options
TensorView* build() const;
private:
size_t ndims_ = 0;
DataType dtype_ = DataType::Float;
std::vector<bool> contiguity_;
std::vector<Val*> shape_;
};
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,162 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <c10/util/irange.h>
#include <iostream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class Fusion;
namespace kir {
class Kernel;
class Scope;
} // namespace kir
//! Define pretty printing functions for IR nodes
//!
//! This class is intended for debug printing, so it attempts
//! to handle invalid states as well.
//!
class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch {
static constexpr char const* kTab = " ";
public:
explicit IrPrinter(std::ostream& os) : os_(os) {}
// Indent the generated code
std::ostream& indent() {
for (const auto i : c10::irange(indent_size_)) {
(void)i; // Suppress unused variable warning
os_ << " ";
}
return os_;
}
void resetIndent() {
indent_size_ = 0;
}
bool printInline() const {
return print_inline_;
}
using OptInConstDispatch::handle;
virtual void handle(Fusion* f);
// handle calls some non const fusion ops,
// even though fusion should remain unchanged.
// Need to look into this.
virtual void handle(const Fusion* f) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
handle(const_cast<Fusion*>(f));
}
virtual void handle(Fusion& f) {
handle(&f);
}
virtual void handle(const kir::Kernel* kernel);
virtual void handle(kir::Kernel& kernel);
void handleScope(const kir::Scope& scope);
void handle(const Statement* s) final;
void handle(const Val* v) final;
void handle(const Expr* e) final;
void handle(const IterDomain*) final;
void handle(const TensorDomain*) final;
void handle(const TensorView*) final;
void handle(const Bool*) final;
void handle(const Double*) final;
void handle(const Int*) final;
void handle(const ComplexDouble*) final;
void handle(const NamedScalar*) final;
void handle(const FullOp*) final;
void handle(const ARangeOp*) final;
void handle(const EyeOp*) final;
void handle(const UnaryOp*) final;
void handle(const BinaryOp*) final;
void handle(const TernaryOp*) final;
void handle(const RNGOp*) final;
void handle(const ReductionOp*) final;
void handle(const GroupedReductionOp*) final;
void handle(const WelfordOp*) final;
void handle(const GroupedWelfordOp*) final;
void handle(const LoadStoreOp*) final;
void handle(const MmaOp*) final;
void handle(const BroadcastOp*) final;
void handle(const TransposeOp*) final;
void handle(const ExpandOp*) final;
void handle(const ShiftOp*) final;
void handle(const GatherOp*) final;
void handle(const ViewAsScalar*) final;
void handle(const ViewOp*) final;
void handle(const kir::Predicate*) final;
void handle(const kir::TensorIndex*) final;
void handle(const kir::IntPair*) final;
void handle(const kir::GridBroadcast*) final;
void handle(const kir::GridReduction*) final;
void handle(const kir::GroupedGridReduction*) final;
void handle(const kir::GridWelford*) final;
void handle(const kir::GroupedGridWelford*) final;
void handle(const kir::ForLoop*) final;
void handle(const kir::IfThenElse*) final;
void handle(const kir::Allocate*) final;
void handle(const kir::BlockSync*) final;
void handle(const kir::GridSync*) final;
void handle(const kir::CpAsyncWait*) final;
void handle(const kir::CpAsyncCommit*) final;
void handle(const kir::InitMagicZero*) final;
void handle(const kir::UpdateMagicZero*) final;
void handle(const kir::AllocateFusedReduction*) final;
void handle(const kir::Swizzle2DInt*) final;
void handle(const kir::PairSelect*) final;
// IR math printer overrides these to prevent them from printing, keep
// override
void handle(const Split*) override;
void handle(const Merge*) override;
void handle(const Swizzle2D*) override;
void print_inline(const Statement* stmt) {
bool prev = print_inline_;
print_inline_ = true;
handle(stmt);
print_inline_ = prev;
}
protected:
std::ostream& os() {
return os_;
}
private:
std::ostream& os_;
bool print_inline_ = false;
int indent_size_ = 0;
};
TORCH_CUDA_CU_API std::ostream& operator<<(
std::ostream& os,
const Statement* stmt);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion* f);
TORCH_CUDA_CU_API std::ostream& operator<<(std::ostream& os, Fusion& f);
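// A hypothetical usage sketch (illustrative only): print IR for debugging,
// either through the printer directly or via the operator<< overloads above.
//
//   IrPrinter printer(std::cout);
//   printer.handle(fusion);        // dump the whole fusion
//   std::cout << some_statement;   // print a single Statement*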
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,59 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <ir_iostream.h>
#include <iter_visitor.h>
#include <iostream>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Prints computation Fusion IR nodes
//!
//! IrMathPrinter and IrTransformPrinter allow the splitting up of fusion print
//! functions. IrMathPrinter, as its name implies, focuses solely on what tensor
//! computations are taking place. Resulting TensorView math will reflect the
//! series of split/merge/computeAts that have taken place; however, these
//! nodes will not be displayed in what is printed. IrTransformPrinter does not
//! print any mathematical functions and only lists the series of
//! split/merge calls that were made. Both of these printing methods are
//! quite verbose on purpose, so as to show accurately what is represented in
//! the IR of a fusion.
//
//! \sa IrTransformPrinter
//!
class TORCH_CUDA_CU_API IrMathPrinter : public IrPrinter {
public:
IrMathPrinter(std::ostream& os) : IrPrinter(os) {}
void handle(const Split* const) override {}
void handle(const Merge* const) override {}
void handle(const Swizzle2D* const) override {}
void handle(Fusion* f) override {
IrPrinter::handle(f);
}
};
//! Prints transformation (schedule) Fusion IR nodes
//!
//! \sa IrMathPrinter
//!
class TORCH_CUDA_CU_API IrTransformPrinter : public IrPrinter {
public:
IrTransformPrinter(std::ostream& os) : IrPrinter(os) {}
void handle(Fusion* f) override;
private:
void printTransforms(TensorView* tv);
};
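// A hypothetical usage sketch (illustrative only), assuming an active
// Fusion* fusion:
//
//   IrMathPrinter math_printer(std::cout);
//   math_printer.handle(fusion);        // tensor math only, no split/merge nodes
//
//   IrTransformPrinter transform_printer(std::cout);
//   transform_printer.handle(fusion);   // only the split/merge transforms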
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

File diff suppressed because it is too large

View File

@ -1,341 +0,0 @@
#pragma once
#include <ir_all_nodes.h>
#include <type.h>
#include <iterator>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace ir_utils {
// Replace values in fusion using ValReplacementMutator
void replaceValue(
Fusion*,
const std::unordered_map<Val*, Val*>& replacement_map);
template <typename FilterType, typename Iterator>
class FilterIterator {
public:
using iterator_category = std::forward_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = FilterType*;
using pointer = value_type*;
using reference = value_type&;
FilterIterator(Iterator begin, Iterator end) : current_(begin), end_(end) {
advance();
}
FilterType* operator*() const {
return (*current_)->template as<FilterType>();
}
FilterType* operator->() const {
return (*this);
}
FilterIterator& operator++() {
++current_;
advance();
return *this;
}
FilterIterator operator++(int) {
const auto before_increment = *this;
++current_;
advance();
return before_increment;
}
bool operator==(const FilterIterator& other) const {
TORCH_INTERNAL_ASSERT(
end_ == other.end_,
"Comparing two FilteredViews that originate from different containers");
return current_ == other.current_;
}
bool operator!=(const FilterIterator& other) const {
return !(*this == other);
}
private:
void advance() {
current_ = std::find_if(current_, end_, [](const auto& val) {
return dynamic_cast<const FilterType*>(val) != nullptr;
});
}
private:
Iterator current_;
Iterator end_;
};
// An iterable view to a given container of Val pointers. Only returns
// Vals of a given Val type.
// NOTE: Add a non-const iterator if needed.
template <typename FilterType, typename InputIt>
class FilteredView {
public:
using value_type = FilterType*;
using const_iterator = FilterIterator<FilterType, InputIt>;
FilteredView(InputIt first, InputIt last) : input_it_(first), last_(last) {}
const_iterator cbegin() const {
return const_iterator(input_it_, last_);
}
const_iterator begin() const {
return cbegin();
}
const_iterator cend() const {
return const_iterator(last_, last_);
}
const_iterator end() const {
return cend();
}
bool empty() const {
return begin() == end();
}
std::vector<value_type> vector() const {
return std::vector<value_type>(begin(), end());
}
private:
const InputIt input_it_;
const InputIt last_;
};
template <typename FilterType, typename InputIt>
auto filterByType(InputIt first, InputIt last) {
return FilteredView<FilterType, InputIt>(first, last);
}
template <typename FilterType, typename ContainerType>
auto filterByType(const ContainerType&& inputs) = delete;
template <typename FilterType, typename ContainerType>
auto filterByType(const ContainerType& inputs) {
return filterByType<FilterType>(inputs.cbegin(), inputs.cend());
}
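// A hypothetical usage sketch (illustrative only): iterate only the
// TensorView entries of a container of Val*; any container exposing
// cbegin()/cend() works.
//
//   std::vector<Val*> vals = /* ... */;
//   for (TensorView* tv : filterByType<TensorView>(vals)) {
//     // only Vals that dynamic_cast to TensorView* are visited
//   }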
//! Returns a list of new-to-old mappings.
//!
//! This function canonicalizes the dimensions and validates that no two old
//! dimensions are mapped to the same new dimension.
std::vector<int64_t> normalizeNew2Old(
const std::vector<int64_t>& new2old_in,
size_t ndims);
//! Returns a list of new-to-old mappings.
//!
//! The input map does not need to be complete. Missing axes are
//! assumed not to be affected.
//!
//! This is used to preprocess broadcast and transpose arguments.
//!
//! Example: (N := ndims)
//! {{0, 1}} -> [1, 0, ...., N-1]
//! Transposes the first two axes with no other change.
//!
//! {{0, -1}} -> [N-1, ...., 0]
//! Swaps the first and last axes.
std::vector<int> normalizeOld2New(
const std::unordered_map<int, int>& old2new_in,
size_t ndims);
// Replace all uses of reference with substitute in expr. Return the Expr.
// Warning: Invalidates provided Expr.
// Warning: Removes connection of reference through provided Expr.
// Warning: Creates new Expr connecting substitute.
// Reference is found through direct pointer comparison.
Expr* replaceValInExpr(Expr* expr, Val* reference, Val* substitute);
//! Replace Vals in an index Val as specified by replacement_map while
//! cloning the given index Val. The index val is assumed to represent
//! a tensor index consisting of Ints and arithmetic expressions.
//!
//! This is similar to replaceValInExpr but is different as Vals are
//! cloned such that other exprs using the same leaf Vals are not
//! modified. TODO: Consider cleaning up the multiple replacement
//! routines.
Val* replaceValInIndexVal(
Val* index,
const std::unordered_map<Val*, Val*>& replacement_map);
// Makes rfactor generic with reduction ops and Welford
TORCH_CUDA_CU_API TensorView* rfactorHelper(
TensorView* red_tv,
const std::vector<int>& axes);
// Return immediate producers of val, this function can be used on any Val and
// will return producers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(Val* val);
// Return immediate consumers of val, this function can be used on any Val and
// will return consumers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(Val* val);
// Return immediate siblings of val, this function can be used on any Val and
// will return siblings through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> siblingValsOf(Val* val);
// Return immediate producers of vals, this function can be used on any vals and
// will return producers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> producerValsOf(
const std::vector<Val*>& vals);
// Return immediate consumers of vals, this function can be used on any vals and
// will return consumers through Exprs.
//
// Warning: returned val's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses val->definition() or val->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<Val*> consumerValsOf(
const std::vector<Val*>& vals);
// Return immediate producers of tv, this function will return all immediate
// producers of tv through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> producerTvsOf(TensorView* tv);
// Return immediate consumers of tv, this function will return all immediate
// consumers of tv through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> consumerTvsOf(TensorView* tv);
// Return immediate siblings of tv, this function will return all immediate
// siblings of tv through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> siblingTvsOf(TensorView* tv);
// Return immediate producers of tvs, this function will return all immediate
// producers of tvs through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> producerTvsOf(
const std::vector<TensorView*>& tvs);
// Return immediate consumers of tvs, this function will return all immediate
// consumers of tvs through Exprs.
//
// Warning: returned tv's are not guaranteed to be between fusion inputs and
// outputs. This function simply uses tv->definition() or tv->uses() which is
// limited to not go through fusion inputs/outputs, but if on a path that isn't
// strictly between fusion inputs/outputs, it could effectively return dead
// code.
TORCH_CUDA_CU_API std::vector<TensorView*> consumerTvsOf(
const std::vector<TensorView*>& tvs);
// Returns producers of tv that are inputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> inputTvsOf(TensorView* tv);
// Returns consumers of tv that are outputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> outputTvsOf(TensorView* tv);
// Returns producers of tvs that are inputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> inputTvsOf(
std::vector<TensorView*> tvs);
// Returns consumers of tvs that are outputs of fusion
TORCH_CUDA_CU_API std::vector<TensorView*> outputTvsOf(
std::vector<TensorView*> tvs);
// returns all tensor views in fusion that are used between outputs and inputs.
TORCH_CUDA_CU_API std::vector<TensorView*> allTvs(Fusion* fusion);
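// A hypothetical usage sketch (illustrative only) of the traversal helpers
// above, assuming tv is a TensorView in an active Fusion* fusion:
//
//   for (TensorView* producer : producerTvsOf(tv)) { /* ... */ }
//   for (TensorView* consumer : consumerTvsOf(tv)) { /* ... */ }
//   std::vector<TensorView*> every_tv = allTvs(fusion);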
// returns all tensor views in fusion that are used between outputs and inputs
// except the specified set.
TORCH_CUDA_CU_API std::vector<TensorView*> allTvsExcept(
Fusion* fusion,
const std::unordered_set<TensorView*>& except);
TORCH_CUDA_CU_API std::vector<Expr*> getReductionOps(
Fusion* fusion,
bool ignore_trivial = true);
// Returns the initialization value of tv or nullptr if not initialized.
TORCH_CUDA_CU_API Val* getReductionInitValOf(TensorView* tv);
// Returns if Expr is a reduction op
TORCH_CUDA_CU_API bool isReductionOp(const Expr*);
// Returns if Expr is a reduction op with TensorView or TensorIndex
TORCH_CUDA_CU_API bool isReductionTvOp(const Expr*);
// Returns all non-trivial view operations. We shouldn't have trivial view
// operations, but this function simply makes sure that if we ever do, we
// don't pull them in.
TORCH_CUDA_CU_API std::vector<ViewOp*> getViewOps(Fusion*);
template <typename T>
std::string toString(const T& nodes) {
std::stringstream ss;
for (const Statement* stmt : nodes) {
if (ss.tellp() != 0) {
ss << ", ";
}
ss << stmt->toString();
}
return ss.str();
}
} // namespace ir_utils
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,869 +0,0 @@
#include <iter_visitor.h>
#include <fusion.h>
#include <ir_all_nodes.h>
#include <ir_iostream.h>
#include <ir_utils.h>
#include <type.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
/* ITER VISITOR */
namespace {
// Remove any stmt in stmts that is in visited
void remove_visited(
std::vector<Statement*>& stmts,
const std::unordered_set<Statement*>& visited) {
std::deque<std::vector<Statement*>::iterator> to_erase;
for (auto it = stmts.begin(); it != stmts.end(); it++) {
if (visited.find(*it) != visited.end()) {
to_erase.push_back(it);
}
}
while (!to_erase.empty()) {
stmts.erase(to_erase.back());
to_erase.pop_back();
}
}
class MemberStatements : public OptOutDispatch {
public:
// Return all members of the stmt if it's a Val. For expressions it returns
// nothing.
static std::vector<Statement*> next(Statement* stmt) {
MemberStatements find_next(stmt);
return find_next.next_stmts_;
}
private:
MemberStatements() = default;
MemberStatements(Statement* stmt) {
handle(stmt);
}
using OptOutDispatch::handle;
void handle(Val* val) final {
FusionGuard::getCurFusion()->assertInContainer(
val,
"IterVisitor.cpp::MemberStatements::handle(Val*) Cannot traverse val, ");
OptOutDispatch::handle(val);
}
void handle(IterDomain* stmt) final {
next_stmts_.push_back(stmt->start());
next_stmts_.push_back(stmt->extent());
next_stmts_.push_back(stmt->stopOffset());
}
void handle(TensorDomain* stmt) final {
next_stmts_.insert(
next_stmts_.end(), stmt->domain().begin(), stmt->domain().end());
}
void handle(TensorView* tv) final {
next_stmts_.push_back(tv->domain());
}
std::vector<Statement*> next_stmts_;
};
} // namespace
std::vector<Statement*> IterVisitor::next(Statement* stmt) {
if (stmt->isVal()) {
return next(stmt->as<Val>());
} else {
return next(stmt->as<Expr>());
}
}
std::vector<Statement*> IterVisitor::next(Val* v) {
FusionGuard::getCurFusion()->assertInContainer(v, "Cannot traverse val, ");
if (v->definition() != nullptr) {
return {v->definition()};
}
return {};
}
std::vector<Statement*> IterVisitor::next(Expr* expr) {
FusionGuard::getCurFusion()->assertInContainer(
expr, "Cannot traverse expr, ");
std::vector<Statement*> next_stmts{
expr->inputs().begin(), expr->inputs().end()};
return next_stmts;
}
// This handle function is called on every Statement* in topological order,
// starting from outputs to inputs.
void IterVisitor::handle(Statement* s) {
OptOutDispatch::handle(s);
}
// This handle function is called on every Expr* in topological order,
// starting from outputs to inputs.
void IterVisitor::handle(Expr* e) {
OptOutDispatch::handle(e);
}
// This handle function is called on every Val* in topological order,
// starting from outputs to inputs.
void IterVisitor::handle(Val* v) {
OptOutDispatch::handle(v);
}
// Implementation details:
// We start with an entry in stmt_stack that is the outputs we want to
// process. We cannot process these outputs until all Stmts in their history
// have been processed, as those Stmts contain all dependencies to produce
// these values. What we will do is traverse towards inputs until we hit a
// leaf node. Once we hit a leaf node that node will be visited, then we will
// take it off the stack. Once a stack entry is empty, we know everything
// needed to visit stmt_stack.back().back() has been visited. We then visit
// that node, mark it as visited and remove it from the stack.
//
// To prevent traversing all paths through a DAG (unless we want to) we have a
// function to remove visited nodes from being re-added to the stack
// (remove_visited).
void IterVisitor::traverseBetween(
Fusion* fusion,
const std::unordered_set<Val*>& from,
const std::vector<Val*>& to,
bool traverse_all_paths,
bool traverse_into_members) {
FusionGuard fg(fusion);
std::unordered_set<Statement*> visited;
stmt_stack.clear();
stmt_stack.emplace_back(to.rbegin(), to.rend());
bool all_inputs_visited = false;
while (!stmt_stack.empty()) {
auto& current_inputs = stmt_stack.back();
// If current_inputs is empty, pop a level of the stmt_stack and mark the
// level we pop to as having all inputs processed; the layer we just processed
// contained all the inputs required for that Stmt.
if (current_inputs.empty()) {
stmt_stack.pop_back();
all_inputs_visited = true;
continue;
}
// Get the very last entry in the stack to process
const auto& stmt = current_inputs.back();
// If we just popped a stmt_stack level, we can finally visit it!
if (all_inputs_visited) {
// stmt may have already been visited.
if (traverse_all_paths || visited.find(stmt) == visited.end()) {
// Mark visited
visited.insert(stmt);
// Actually visit stmt
handle(stmt);
}
// Remove last value just visited
current_inputs.pop_back();
// Mark that we need to visit a new Stmt's.
all_inputs_visited = false;
} else {
// We're not ready to process this node, so add all its inputs to be
// checked. Visit input nodes.
std::vector<Statement*> next_stmts;
if ((stmt->isVal() && from.find(stmt->asVal()) == from.end()) ||
stmt->isExpr()) {
next_stmts = next(stmt);
}
if (traverse_into_members) {
auto members = MemberStatements::next(stmt);
next_stmts.insert(next_stmts.end(), members.begin(), members.end());
}
// We may want to retraverse nodes, in that case revisit everything!
if (!traverse_all_paths) {
// If we don't want to retraverse, remove nodes we already visited.
remove_visited(next_stmts, visited);
}
if (next_stmts.empty()) {
// If there's nothing to visit because it was all already visited, mark
// to process
all_inputs_visited = true;
} else {
// Add all these new stmts to visit to the stack.
stmt_stack.emplace_back(next_stmts.rbegin(), next_stmts.rend());
// We have new things to visit,
all_inputs_visited = false;
}
}
}
}
void IterVisitor::traverseTo(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_all_paths,
bool traverse_into_members) {
traverseBetween(fusion, {}, to, traverse_all_paths, traverse_into_members);
}
void IterVisitor::traverseHelper(Fusion* fusion, bool traverse_all_paths) {
FusionGuard fg(fusion);
auto term_val_outs = fusion->getTerminatingOutputs();
if (!term_val_outs.empty()) {
traverseTo(fusion, term_val_outs, traverse_all_paths);
}
}
void IterVisitor::traverse(Fusion* fusion) {
traverseHelper(fusion, false);
}
void IterVisitor::traverseAllPaths(Fusion* fusion) {
traverseHelper(fusion, true);
}
namespace {
// TODO: We also have InputsOf; we should pick one and remove the other.
class Inputs : public IterVisitor {
private:
//! Optional list of input vals. While traversing to inputs if a value in the
//! all_inputs list is found, that value will be added to the inputs_ and
//! traversal will not go into its definition. Otherwise traversal follows
//! definition paths until hitting a definition that is a nullptr (i.e. a
//! terminating input).
const std::vector<Val*>& all_inputs_;
std::vector<Val*> inputs_;
Inputs(const std::vector<Val*>& all_inputs) : all_inputs_(all_inputs) {}
std::vector<Statement*> next(Val* v) override {
if (std::find(inputs_.begin(), inputs_.end(), v) != inputs_.end()) {
return {};
}
return IterVisitor::next(v);
}
void handle(Val* val) override {
// If there's no definition to val, or val is created inside the fusion, or
// val is within the provided inputs
if (val->definition() == nullptr || val->definition()->inputs().empty() ||
std::find(all_inputs_.begin(), all_inputs_.end(), val) !=
all_inputs_.end()) {
// if not already placed in the inputs
if (std::find(inputs_.begin(), inputs_.end(), val) == inputs_.end()) {
inputs_.push_back(val);
}
}
}
public:
static std::vector<Val*> getInputs(
const std::vector<Val*>& of,
const std::vector<Val*>& all_inputs) {
if (of.empty()) {
return {};
}
Inputs inps(all_inputs);
inps.traverseTo(of[0]->fusion(), of);
return inps.inputs_;
}
};
} // namespace
std::vector<Val*> IterVisitor::getInputsTo(
const std::vector<Val*>& vals,
const std::vector<Val*>& inputs) {
return Inputs::getInputs(vals, inputs);
}
namespace {
class AllVals : public IterVisitor {
private:
std::unordered_set<Val*> vals;
void handle(Val* val) final {
vals.emplace(val);
}
public:
// Return all values in history of all values in from
static std::unordered_set<Val*> get(
Fusion* fusion,
const std::vector<Val*>& from) {
AllVals av;
av.traverseTo(fusion, from, false);
return av.vals;
}
};
} // namespace
/* BACKWARDS VISITOR */
std::vector<Statement*> BackwardVisitor::next(Statement* stmt) {
if (stmt->isVal()) {
return next(stmt->as<Val>());
} else if (stmt->isExpr()) {
return next(stmt->as<Expr>());
} else {
TORCH_INTERNAL_ASSERT(
false, "BackwardVisitor could not detect type in next_dispatch.");
}
}
std::vector<Statement*> BackwardVisitor::next(Expr* expr) {
return std::vector<Statement*>(
expr->outputs().begin(), expr->outputs().end());
}
std::vector<Statement*> BackwardVisitor::next(Val* val) {
// Going to sort based on relative topological position
std::map<size_t, Statement*> exprs;
for (auto expr : FusionGuard::getCurFusion()->unordered_uses(val)) {
// Make sure it's an expr we can traverse
if (traversal_exprs_.find(expr) != traversal_exprs_.end()) {
exprs[traversal_exprs_[expr]] = expr;
}
}
std::vector<Statement*> next_stmts(exprs.size());
std::transform(
exprs.begin(),
exprs.end(),
next_stmts.begin(),
[](std::pair<size_t, Statement*> pair) { return pair.second; });
return next_stmts;
}
void BackwardVisitor::handle(Statement* stmt) {
OptOutDispatch::handle(stmt);
}
void BackwardVisitor::handle(Expr* expr) {
OptOutDispatch::handle(expr);
}
void BackwardVisitor::handle(Val* val) {
OptOutDispatch::handle(val);
}
void BackwardVisitor::traverseTo(
Fusion* fusion,
const std::vector<Val*>& from,
bool traverseAllPaths) {
FusionGuard fg(fusion);
// Reset members
stmt_stack_.clear();
traversal_exprs_.clear();
if (from.empty()) {
return;
}
auto vals = AllVals::get(fusion, from);
auto exprs = StmtSort::getExprs(fusion, from);
{
size_t pos = 0;
for (auto expr : exprs)
traversal_exprs_[expr] = pos++;
}
// All stmts we've called handle on
std::unordered_set<Statement*> visited_stmts_;
if (must_cover_all_expr_outputs_) {
for (auto traversal_pair : traversal_exprs_) {
for (auto out : traversal_pair.first->outputs()) {
TORCH_INTERNAL_ASSERT(
vals.find(out) != vals.end(),
"Invalid backward traversal found. Some output paths were not provided:",
out);
}
}
}
auto inputs = InputsOf::getInputsTo(from);
stmt_stack_.emplace_back(inputs.begin(), inputs.end());
// The rest is basically copy-pasted from IterVisitor:
while (!stmt_stack_.empty()) {
auto next_stmts = next(stmt_stack_.back().back());
// Remove statements we already visited if we're not traversing all paths
if (!traverseAllPaths) {
remove_visited(next_stmts, visited_stmts_);
}
// Traverse down until we get to a leaf
while (!next_stmts.empty()) {
stmt_stack_.emplace_back(next_stmts.rbegin(), next_stmts.rend());
next_stmts = next(stmt_stack_.back().back());
// Remove statements we already visited if we're not traversing all paths
if (!traverseAllPaths) {
remove_visited(next_stmts, visited_stmts_);
}
}
// Traverse back up
// Mark visited
visited_stmts_.emplace(stmt_stack_.back().back());
// Handle
handle(stmt_stack_.back().back());
// Remove
stmt_stack_.back().pop_back();
while (!stmt_stack_.empty() && stmt_stack_.back().empty()) {
stmt_stack_.pop_back();
if (!stmt_stack_.empty()) {
// Mark visited
visited_stmts_.emplace(stmt_stack_.back().back());
// Handle
handle(stmt_stack_.back().back());
// Remove
stmt_stack_.back().pop_back();
}
}
}
}
/* DEPENDENCY CHECKING */
namespace {
// Looks for and returns all values in between dependencies and vals, including
// them.
struct Dependencies : public IterVisitor {
private:
//! A given set of dependency Vals
const std::unordered_set<Val*> dependencies_;
//! Vals that are found between dependencies_ and of. Topologically
//! ordered.
std::vector<Val*> vals_;
//! Exprs that are found between dependencies_ and of. Topologically
//! ordered.
std::vector<Expr*> exprs_;
//! A set version of vals_
std::unordered_set<Val*> dependent_vals_;
//! A set version of exprs_
std::unordered_set<Expr*> dependent_exprs_;
private:
std::vector<Statement*> next(Val* v) override {
if (dependencies_.find(v) != dependencies_.end()) {
return std::vector<Statement*>();
}
return IterVisitor::next(v);
}
void handle(Val* val) override {
// val is included if:
// 1. it is one of the dependencies, or
// 2. its defining expression is included in the dependent expr set
if (dependencies_.find(val) != dependencies_.end()) {
TORCH_INTERNAL_ASSERT(
dependent_vals_.find(val) == dependent_vals_.end(),
"Trying to add already added val: ",
val);
vals_.push_back(val);
dependent_vals_.insert(val);
} else {
auto def = val->definition();
if (def != nullptr &&
dependent_exprs_.find(def) != dependent_exprs_.end()) {
TORCH_INTERNAL_ASSERT(
dependent_vals_.find(val) == dependent_vals_.end(),
"Trying to add already added val: ",
val);
vals_.push_back(val);
dependent_vals_.insert(val);
}
}
}
void handle(Expr* expr) override {
// Track which exprs are dependent on the dependencies_ vals.
if (std::any_of(
expr->inputs().begin(), expr->inputs().end(), [&](Val* input_val) {
return dependent_vals_.find(input_val) != dependent_vals_.end();
})) {
if (!dependent_exprs_.count(expr)) {
exprs_.push_back(expr);
dependent_exprs_.insert(expr);
}
}
}
Dependencies(
std::unordered_set<Val*> _dependencies,
const std::vector<Val*>& of)
: dependencies_(std::move(_dependencies)) {
traverseTo(of[0]->fusion(), of, false);
};
public:
static std::vector<Val*> getAllVals(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
if (of.empty()) {
return {};
}
Dependencies deps(dependencies, of);
return deps.vals_;
}
static std::vector<Expr*> getAllExprs(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
if (of.empty()) {
return {};
}
Dependencies deps(dependencies, of);
return deps.exprs_;
}
};
// Looks for and returns all output values with dependencies on `of`.
struct FindOutputs : public IterVisitor {
const std::unordered_set<Val*>& of_;
std::unordered_set<Val*> outs_;
void handle(Val* val) override {
if (of_.find(val) != of_.end()) {
Statement* out_stmt = stmt_stack.front().back();
TORCH_INTERNAL_ASSERT(out_stmt->isVal());
auto out_val = out_stmt->as<Val>();
if (of_.find(out_val) == of_.end()) {
outs_.emplace(out_val);
}
}
}
// TODO: Simply traverse through uses from of. Would be a lot faster than
// tracing all paths like this.
FindOutputs(const std::unordered_set<Val*>& _of) : of_(_of) {
auto fusion = (*of_.begin())->fusion();
traverseTo(fusion, fusion->outputs(), true);
};
static std::unordered_set<Val*> getAllOutputsOf(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
FindOutputs finder(of);
return finder.outs_;
}
};
// Looks for and returns all values that depend on `of`.
class DependentVals : public IterVisitor {
private:
// Which nodes to find dependencies of
const std::unordered_set<Val*>& of_;
// Dependencies we have so far
std::unordered_set<Val*> outs_;
// Boundary where we want to stop searching beyond
// TODO: Based on the todo below, shouldn't we stop just at the definition of?
// If we really wanted to make this traverse left, wouldn't we first check
// which outputs are outputs dependent on of?
std::unordered_set<Val*> boundary_;
std::vector<Statement*> next(Val* v) override {
if (boundary_.find(v) != boundary_.end())
return std::vector<Statement*>();
return IterVisitor::next(v);
}
void handle(Val* val) override {
if (val->isFusionInput() || val->definition() == nullptr ||
of_.count(val) || outs_.count(val)) {
return;
}
for (auto v : val->definition()->inputs()) {
if (of_.count(v) || outs_.count(v)) {
outs_.emplace(val);
return;
}
}
}
// optimization to limit search path
// TODO: Is this valid? Couldn't something like:
// out0 = of + val0
// out1 = out0 + val1
// out2 = TernaryOp(out1, val0, of)
// Hide the dep of out1 on of?
void createBoundary() {
for (auto v_of : of_) {
for (auto v_expr : v_of->uses()) {
for (auto v_in : v_expr->inputs()) {
boundary_.emplace(v_in);
}
}
}
}
DependentVals(const std::unordered_set<Val*>& _of) : of_(_of) {
createBoundary();
auto fusion = (*of_.begin())->fusion();
traverseTo(fusion, fusion->outputs(), false);
};
public:
static std::unordered_set<Val*> getAllDependentVals(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
DependentVals dependencies(of);
return dependencies.outs_;
}
};
class DependencyChains : public IterVisitor {
public:
std::deque<std::deque<Val*>> dep_chains;
bool is_dependency = false;
std::unordered_set<Val*> dependencies_;
void handle(Val* val) override {
if (dependencies_.find(val) != dependencies_.end()) {
is_dependency = true;
std::deque<Val*> deps;
for (auto stack : stmt_stack) {
if (stack.back()->isVal()) {
deps.push_back(stack.back()->as<Val>());
}
}
// Order as dependency -> of
dep_chains.emplace_back(deps.rbegin(), deps.rend());
}
}
DependencyChains(Val* _dependency, Val* _of, bool all_chains_ = false)
: dependencies_({_dependency}) {
traverseTo(_of->fusion(), {_of}, all_chains_);
}
DependencyChains(Val* _dependency, bool all_chains_ = false)
: dependencies_({_dependency}) {
if (all_chains_) {
traverseAllPaths(_dependency->fusion());
} else {
traverse(_dependency->fusion());
}
}
DependencyChains(
std::unordered_set<Val*> _dependencies,
bool all_chains_ = false)
: dependencies_(std::move(_dependencies)) {
if (dependencies_.empty()) {
return;
}
if (all_chains_) {
traverseAllPaths((*dependencies_.begin())->fusion());
} else {
traverse((*dependencies_.begin())->fusion());
}
}
static std::deque<Val*> getDependencyChain(Val* dependency, Val* of) {
DependencyChains dp(dependency, of, false);
if (dp.dep_chains.empty()) {
return std::deque<Val*>();
}
return dp.dep_chains[0];
}
// I don't think this is actually hooked up, but leaving for now.
static std::deque<std::deque<Val*>> getDependencyChains(
Val* dependency,
Val* of) {
DependencyChains dp(dependency, of, true);
if (dp.dep_chains.empty()) {
return std::deque<std::deque<Val*>>();
}
return dp.dep_chains;
}
static std::deque<std::deque<Val*>> getAllUseChains(Val* dependency) {
DependencyChains dp(dependency, true);
if (dp.dep_chains.empty()) {
return std::deque<std::deque<Val*>>();
}
return dp.dep_chains;
}
static std::deque<std::deque<Val*>> getAllUseChains(
const std::unordered_set<Val*>& dependencies) {
DependencyChains dp(dependencies, true);
if (dp.dep_chains.empty()) {
return std::deque<std::deque<Val*>>();
}
return dp.dep_chains;
}
};
} // namespace
bool DependencyCheck::isDependencyOf(Val* dependency, Val* of) {
return !DependencyChains::getDependencyChain(dependency, of).empty();
}
std::deque<Val*> DependencyCheck::getSingleDependencyChain(
Val* dependency,
Val* of) {
return DependencyChains::getDependencyChain(dependency, of);
}
std::deque<std::deque<Val*>> DependencyCheck::getAllDependencyChains(
Val* dependency,
Val* of) {
return DependencyChains::getDependencyChains(dependency, of);
}
std::deque<std::deque<Val*>> DependencyCheck::getAllUseChains(Val* producer) {
return DependencyChains::getAllUseChains(producer);
}
std::vector<Val*> DependencyCheck::getAllValsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
return Dependencies::getAllVals(dependencies, of);
}
std::vector<Expr*> DependencyCheck::getAllExprsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of) {
return Dependencies::getAllExprs(dependencies, of);
}
std::unordered_set<Val*> DependencyCheck::getAllOutputsOf(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
FusionGuard fg((*of.begin())->fusion());
return FindOutputs::getAllOutputsOf(of);
}
std::unordered_set<Val*> DependencyCheck::getAllDependentVals(
const std::unordered_set<Val*>& of) {
if (of.empty()) {
return std::unordered_set<Val*>();
}
FusionGuard fg((*of.begin())->fusion());
return DependentVals::getAllDependentVals(of);
}
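// A hypothetical usage sketch (illustrative only) of the DependencyCheck API
// defined above, assuming val_a and val_b belong to an active Fusion:
//
//   if (DependencyCheck::isDependencyOf(val_a, val_b)) {
//     // ordered from val_a (the dependency) towards val_b
//     auto chain = DependencyCheck::getSingleDependencyChain(val_a, val_b);
//   }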
void StmtSort::handle(Statement* stmt) {
stmts.push_back(stmt);
}
std::vector<Expr*> StmtSort::getExprs(Fusion* fusion, bool traverse_members) {
auto terminating_outputs = fusion->getTerminatingOutputs();
return StmtSort::getExprs(fusion, terminating_outputs, traverse_members);
}
std::vector<Expr*> StmtSort::getExprs(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members) {
auto stmts = StmtSort::getStmts(fusion, to, traverse_members);
auto filter = ir_utils::filterByType<Expr>(stmts.begin(), stmts.end());
std::vector<Expr*> exprs(filter.begin(), filter.end());
return exprs;
}
std::vector<Expr*> StmtSort::getExprsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members) {
auto stmts = StmtSort::getStmtsBetween(fusion, from, to, traverse_members);
auto filter = ir_utils::filterByType<Expr>(stmts.begin(), stmts.end());
std::vector<Expr*> exprs(filter.begin(), filter.end());
return exprs;
}
std::vector<Statement*> StmtSort::getStmts(
Fusion* fusion,
bool traverse_members) {
auto terminating_outputs = fusion->getTerminatingOutputs();
return StmtSort::getStmts(fusion, terminating_outputs, traverse_members);
}
std::vector<Statement*> StmtSort::getStmts(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members) {
StmtSort es;
es.traverseTo(fusion, to, false, traverse_members);
return es.stmts;
}
std::vector<Statement*> StmtSort::getStmtsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members) {
StmtSort es;
es.traverseBetween(
fusion, {from.begin(), from.end()}, to, false, traverse_members);
return es.stmts;
}
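// A hypothetical usage sketch (illustrative only) of StmtSort, assuming an
// active Fusion* fusion:
//
//   auto exprs = StmtSort::getExprs(fusion);  // topologically ordered Exprs
//   auto stmts = StmtSort::getStmts(fusion, /*traverse_members=*/true);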
void InputsOf::handle(Val* v) {
if (v->definition() == nullptr || v->definition()->inputs().empty()) {
if (grabbed_inputs.emplace(v).second) {
ordered_inputs.push_back(v);
}
}
}
std::vector<Val*> InputsOf::output(Fusion* fusion, Val* output_) {
return outputs(fusion, {output_});
}
std::vector<Val*> InputsOf::outputs(
Fusion* fusion,
const std::vector<Val*>& outputs_) {
InputsOf io;
io.traverseTo(fusion, outputs_, false);
return io.ordered_inputs;
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,349 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <dispatch.h>
#include <type.h>
#include <deque>
#include <unordered_set>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
class Fusion;
class Statement;
class Expr;
class Val;
/*
* IterVisitor starts from leaf nodes, fusion outputs, or the provided values.
* It walks the DAG backwards from the starting nodes to roots. Each node in
* the DAG will be called with handle(Statement*) in topological order from
* inputs of the fusion to outputs of the fusion.
*
* TODO: We may want a BFS version of this code to extract ILP, not implemented
* yet.
*
* TODO: We may want to have ordering of outputs to inputs. I'm not sure why we
* would want this, but seems like it would be a reasonable request.
*/
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch {
public:
~IterVisitor() override = default;
IterVisitor() = default;
IterVisitor(const IterVisitor& other) = default;
IterVisitor& operator=(const IterVisitor& other) = default;
IterVisitor(IterVisitor&& other) = default;
IterVisitor& operator=(IterVisitor&& other) = default;
protected:
// Functions return nodes in reverse order to be added to the to_visit queue
// These functions will start at outputs and propagate up through the DAG
// to inputs based on depth first traversal. Next could be called on a node
// multiple times.
virtual std::vector<Statement*> next(Statement* stmt);
virtual std::vector<Statement*> next(Val* v);
virtual std::vector<Statement*> next(Expr* expr);
// This handle function is called on every Statement* in topological order,
// starting from outputs to inputs.
void handle(Statement* s) override;
// This handle function is called on every Expr* in topological order,
// starting from outputs to inputs.
void handle(Expr* e) override;
// This handle function is called on every Val* in topological order,
// starting from outputs to inputs.
void handle(Val* v) override;
// The entire stack during traversal. stmt_stack.back().back() is the node
// that is being called in handle(). stmt_stack.back() contains siblings (not
// guaranteed to be all siblings throughout traversal). stmt_stack.front()
// contains the outputs we started with (not guaranteed to be all outputs
// throughout traversal).
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
std::vector<std::vector<Statement*>> stmt_stack;
void traverseHelper(Fusion* fusion, bool traverse_all_paths = false);
public:
//! Traverses nodes in Fusion from inputs in topological order to "to". i.e.
//! from inputs towards outputs.
//! \param traverseAllPaths = false only call handle on each Statement* once
//! traverseAllPaths = true traverses all paths between expressions/values.
//! Calls handle on a Statement* for every path from inputs to "to".
//! \param traverseIntoMembers = When hitting nodes like TensorView,
//! TensorDomain, or IterDomain where there are members of the nodes that are
//! Val's a value of "true" will also traverse into those member Val's, a
//! value of "false" will not traverse into the members.
void traverseTo(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_all_paths = false,
bool traverse_into_members = false);
//! Traverses nodes in Fusion from inputs in topological order to "to". i.e.
//! from inputs towards outputs.
//! \param traverseAllPaths = false only call handle on each Statement* once
//! traverseAllPaths = true traverses all paths between expressions/values.
//! Calls handle on a Statement* for every path from inputs to "to".
//! \param traverseIntoMembers = When hitting nodes like TensorView,
//! TensorDomain, or IterDomain where there are members of the nodes that are
//! Val's a value of "true" will also traverse into those member Val's, a
//! value of "false" will not traverse into the members.
//! \param from: Specified values to start traversing. If a "from" Val is not
//! on a path from inputs to a "to" node, it will not be visited. If there's a
//! path from inputs to "to" that doesn't go through "from", that input and the
//! path from it will also be traversed.
void traverseBetween(
Fusion* fusion,
const std::unordered_set<Val*>& from,
const std::vector<Val*>& to,
bool traverse_all_paths = false,
bool traverse_into_members = false);
// Iterates from terminating outputs registered with the fusion. Terminating
// means the value is not used to generate any other value used in producing
// registered outputs.
void traverse(Fusion* fusion);
// Same as traverse but it traverses every edge, meaning it will traverse
// values more than once.
void traverseAllPaths(Fusion* fusion);
//! Get inputs to vals. Possible input vals can be optionally
//! given. If not, vals with no producers are returned.
//
// TODO: This doesn't seem to fit with IterVisitor. Should probably be moved
// out of the class.
static std::vector<Val*> getInputsTo(
const std::vector<Val*>& vals,
const std::vector<Val*>& inputs = {});
};
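// Usage sketch (illustrative only; assumes a populated Fusion named `fusion`
// is in scope): a subclass overrides the protected handle(Val*) hook and then
// calls the public traverse(), which fires the handle functions in topological
// order from inputs towards the fusion's terminating outputs.
//
//   class CollectVals : public IterVisitor {
//    public:
//     std::vector<Val*> vals;
//    protected:
//     using IterVisitor::handle;
//     void handle(Val* v) override {
//       vals.push_back(v); // called once per Val on the default single-path walk
//     }
//   };
//
//   CollectVals collector;
//   collector.traverse(&fusion);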
/*
* Backward visitor: calls handle in reverse order, from outputs to inputs.
* It would be really nice to unify this with IterVisitor; however,
* the challenge there is that we specify traversal from outputs towards inputs
* because it implicitly provides DCE. However, if users are not careful, they
* could miss necessary outputs to do a backward traversal.
*
* BackwardVisitor checks that all outputs of an Expr are visited before visiting
* the Expr. If we don't provide nodes to start from on all backward paths of
* those outputs, we will never visit the Expr.
*
* The first step of BackwardVisitor is to make sure we've specified enough
* outputs to guarantee that we will traverse all outputs of all exprs during
* the backward traversal. In cases where we don't require visiting all
* outputs of some exprs (an example being the `N` output of welford ops),
* `must_cover_all_expr_outputs` is added to disable the check, and in
* this case the visitor pass needs to be aware that:
* 1. Exprs with any output that has a use chain that ends with a final
* consumer in the `from` list `will be` visited.
* 2. Vals that don't have a use chain that ends with a final
* consumer in the `from` list `will not be` visited, even though their
* definition expr might be visited. An example: if the `N` output
* of a welford op is unused but other outputs are, the welford op
* will be visited but the `N` output will not.
*
*/
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API BackwardVisitor : public OptOutDispatch {
protected:
virtual ~BackwardVisitor() override = default;
BackwardVisitor(bool must_cover_all_expr_outputs = true)
: must_cover_all_expr_outputs_(must_cover_all_expr_outputs) {}
BackwardVisitor(const BackwardVisitor& other) = default;
BackwardVisitor& operator=(const BackwardVisitor& other) = default;
BackwardVisitor(BackwardVisitor&& other) = default;
BackwardVisitor& operator=(BackwardVisitor&& other) = default;
// Functions return nodes in reverse order to be added to the to_visit queue.
// These functions will start at outputs and propagate up through the DAG
// to inputs based on depth first traversal. Next could be called on a node
// multiple times.
virtual std::vector<Statement*> next(Statement* stmt);
virtual std::vector<Statement*> next(Expr* expr);
virtual std::vector<Statement*> next(Val* val);
// This handle function is called on every Statement* in topological order,
// starting from outputs to inputs.
// NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
virtual void handle(Statement* stmt) override;
// This handle function is called on every Expr* in topological order,
// starting from outputs to inputs.
// NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
virtual void handle(Expr* expr) override;
// This handle function is called on every Val* in topological order,
// starting from outputs to inputs.
// NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions)
virtual void handle(Val* val) override;
// All exprs that need to be visited in this traversal. Labeled in topological
// order (size_t).
std::unordered_map<Expr*, size_t> traversal_exprs_;
// The entire stack during traversal. stmt_stack.back().back() is the node
// that is being called in handle(). stmt_stack.back() contains siblings (not
// guaranteed to be all siblings throughout traversal). stmt_stack.front()
// contains the inputs we started with (not guaranteed to be all outputs
// throughout traversal).
std::deque<std::deque<Statement*>> stmt_stack_;
// Starts at nodes provided in from, traverses from these nodes to inputs.
// Calls handle on all Statement*s in topologically sorted order.
// traverseAllPaths = false: only call handle on each Statement* once.
// traverseAllPaths = true: traverses all paths from nodes in from to inputs;
// calls handle on a Statement* for every path from "from" nodes to inputs.
void traverseTo(
Fusion* fusion,
const std::vector<Val*>& from,
bool traverseAllPaths = false);
bool must_cover_all_expr_outputs_ = true;
};
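// Usage sketch (illustrative only; assumes a Fusion named `fusion` whose
// registered outputs cover all backward paths, e.g. passing Fusion::outputs()
// as the starting points):
//
//   class CountBackwardExprs : public BackwardVisitor {
//    public:
//     size_t visited_exprs = 0;
//     void run(Fusion* fusion) {
//       traverseTo(fusion, fusion->outputs());
//     }
//    protected:
//     using BackwardVisitor::handle;
//     void handle(Expr* /*expr*/) override {
//       ++visited_exprs;
//     }
//   };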
class TORCH_CUDA_CU_API DependencyCheck {
public:
// Returns if "dependency" is a dependency of "of".
static bool isDependencyOf(Val* dependency, Val* of);
// Finds a Val* path from "of" to "dependency". Returns that path.
// deque.back() is "of", deque[0] is dependency if a chain exists.
static std::deque<Val*> getSingleDependencyChain(Val* dependency, Val* of);
// Finds all Val* paths from "of" to "dependency". Returns those paths.
// deque[i].back() is "of", and deque[i][0] is "dependency". Returns an
// empty deque if no dependency found.
static std::deque<std::deque<Val*>> getAllDependencyChains(
Val* dependency,
Val* of);
// Finds all Val* paths from all leaf nodes to "dependency". Returns those
// paths. deque[i].back() are leaf nodes, and deque[i][0] is "dependency".
// Returns an empty deque if there are no uses of dependency found.
static std::deque<std::deque<Val*>> getAllUseChains(Val* dependency);
// Grab all values that exist between and including provided
// vals. Returned values are topologically ordered and unique.
static std::vector<Val*> getAllValsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of);
// Returns all dependent exprs that exist between
// the provided vals
static std::vector<Expr*> getAllExprsBetween(
const std::unordered_set<Val*>& dependencies,
const std::vector<Val*>& of);
// Return registered outputs of the fusion that are a dependency of any val of
static std::unordered_set<Val*> getAllOutputsOf(
const std::unordered_set<Val*>& of);
// Return all Vals that depend on the given Vals
static std::unordered_set<Val*> getAllDependentVals(
const std::unordered_set<Val*>& of);
};
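// Usage sketch (illustrative only; `producer` and `consumer` are assumed to be
// Val*s registered with the same Fusion):
//
//   if (DependencyCheck::isDependencyOf(producer, consumer)) {
//     // chain.front() is `producer`, chain.back() is `consumer`
//     std::deque<Val*> chain =
//         DependencyCheck::getSingleDependencyChain(producer, consumer);
//   }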
// Expr sort will take a fusion and return a topologically sorted list of
// expressions.
class StmtSort : public IterVisitor {
protected:
StmtSort() = default;
std::vector<Statement*> stmts;
void handle(Statement* stmt) override;
public:
// If traverse_members is true, it will also extract all member nodes in the
// sorted statement list in the fusion, i.e. all IterDomains, extents, and
// their associated expressions.
static std::vector<Statement*> getStmts(
Fusion* fusion,
bool traverse_members = false);
// Returns ordered Statements required to produce "to", including "to".
static std::vector<Statement*> getStmts(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members = false);
// Returns ordered Statements required to produce from, including from.
// Stops traversal once hitting any Statements in to. Includes Statements in
// to.
//
// Warning: this doesn't necessarily prevent statements before `to` from being
// returned. e.g.
// i1 = i0
// i2 = i1
// i3 = i2
// i4 = i3 + i1
// getExprs(fusion, {i4}, {i3})
// will return the definition and values {i0, i1, i4}
// i3 is dependent on i1, but since i4 is as well, the traversal will go down
// the i4->i1->i0 path, even though the i4->i3-//>i2->i1 path is blocked.
//
// If traverse_members is true, it will also extract all member nodes in the
// sorted expr list in the fusion, i.e. all expressions on IterDomains,
// extents, etc.
static std::vector<Statement*> getStmtsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members = false);
// Same as getStmts version but filters to only return the Expr*s
static std::vector<Expr*> getExprs(
Fusion* fusion,
bool traverse_members = false);
// Same as getStmts version but filters to only return the Expr*s
static std::vector<Expr*> getExprs(
Fusion* fusion,
const std::vector<Val*>& to,
bool traverse_members = false);
// Same as getStmts version but filters to only return the Expr*s
static std::vector<Expr*> getExprsBetween(
Fusion* fusion,
const std::vector<Val*>& from,
const std::vector<Val*>& to,
bool traverse_members = false);
};
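// Usage sketch (illustrative only; assumes a populated Fusion `fusion` and a
// Val* `out` registered with it):
//
//   // Every Expr in the fusion, topologically sorted.
//   std::vector<Expr*> all_exprs = StmtSort::getExprs(&fusion);
//   // Only the Exprs needed to produce `out`.
//   std::vector<Expr*> out_exprs = StmtSort::getExprs(&fusion, {out});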
class InputsOf : public IterVisitor {
private:
std::unordered_set<Val*> grabbed_inputs;
std::vector<Val*> ordered_inputs;
void handle(Val* v) final;
public:
static std::vector<Val*> output(Fusion* fusion, Val* output_);
static std::vector<Val*> outputs(
Fusion* fusion,
const std::vector<Val*>& outputs_);
};
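// Usage sketch (illustrative only; `out` is assumed to be an output Val* of
// `fusion`): collects the input Vals that `out` transitively depends on.
//
//   std::vector<Val*> input_vals = InputsOf::output(&fusion, out);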
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,428 +0,0 @@
#include <instrumentation.h>
#include <ir_iostream.h>
#include <kernel.h>
#include <kernel_expr_evaluator.h>
#include <kernel_ir_dispatch.h>
#include <lower2device.h>
#include <ATen/cuda/CUDAContext.h>
#include <iostream>
#include <unordered_set>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container)
: ir_container_(ir_container) {}
namespace kir {
namespace {
//! Scan all primary expressions in the Kernel IR and build
//! lists of specialized nodes and other interesting information
class KernelIrScanner : private IrVisitor {
public:
explicit KernelIrScanner(const Kernel* kernel) {
IrVisitor::handle(kernel->topLevelExprs());
const auto gpu_lower = GpuLower::current();
for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) {
auto extent = split->in()->extent();
auto factor = split->factor();
summary_.splits_to_validate.emplace_back(extent, factor);
}
}
const auto& summary() const {
return summary_;
}
private:
using IrVisitor::handle;
void handle(Expr* expr) final {
IrVisitor::handle(expr);
for (auto inp : expr->inputs()) {
handle(inp);
}
for (auto out : expr->outputs()) {
handle(out);
}
}
void handle(BlockSync* sync) final {
// TODO: Move to a dedicated validation pass
// which is not on the common execution/compilation path
if (sync->isWarHazardSync()) {
++summary_.war_hazard_syncs_count;
}
}
void handle(GridSync* sync) final {
summary_.has_cooperative_grid_reduction = true;
}
void handle(Allocate* allocate) final {
switch (allocate->memoryType()) {
case MemoryType::Global:
summary_.global_allocations.push_back(allocate);
break;
case MemoryType::Shared:
summary_.dynamic_smem_allocations.push_back(allocate);
break;
case MemoryType::Local:
if (!ExpressionEvaluator::isConst(allocate->size())) {
summary_.has_dynamic_local_memory_allocations = true;
summary_.dynamic_lmem_allocations.emplace_back(allocate);
}
break;
}
}
void handle(RNGOp* rng_op) final {
summary_.max_rng_offsets =
std::max<int>(summary_.max_rng_offsets, rng_op->getRNGOffset());
}
void handle(TensorIndex* tensor_index) final {
const auto tv = tensor_index->view();
const auto domain = tv->domain();
// Do we have any reductions?
summary_.has_block_reductions =
summary_.has_block_reductions || domain->hasBlockReduction();
// Update the largest smem data type
if (domain->hasBlockReduction() || domain->hasGridReduction() ||
tv->getMemoryType() == MemoryType::Shared) {
const auto data_type = tv->dtype();
const size_t type_size = dataTypeSize(data_type);
if (type_size > max_smem_type_size_) {
max_smem_type_size_ = type_size;
summary_.largest_smem_data_type = data_type;
}
}
}
void handle(WelfordOp* welford_op) final {
summary_.has_welford = true;
TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA<TensorIndex>());
auto out_dom = welford_op->outAvg()->as<TensorIndex>()->view()->domain();
summary_.has_block_welford =
summary_.has_block_welford || out_dom->hasBlockReduction();
}
void handle(GridWelford* grid_welford) final {
summary_.has_welford = true;
summary_.has_grid_welford = true;
summary_.has_grid_reductions = true;
if (grid_welford->welford_op()->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GridReduction* grid_reduction) final {
summary_.has_grid_reductions = true;
if (grid_reduction->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GroupedGridReduction* grid_reduction) final {
summary_.has_grid_reductions = true;
if (grid_reduction->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GroupedGridWelford* grid_welford) final {
summary_.has_welford = true;
summary_.has_grid_welford = true;
summary_.has_grid_reductions = true;
if (grid_welford->isAllreduce()) {
summary_.has_cooperative_grid_reduction = true;
}
}
void handle(GridBroadcast* grid_broadcast) final {
summary_.has_cooperative_grid_reduction = true;
handle(grid_broadcast->broadcast_op());
}
void handle(BroadcastOp* bop) final {
const ParallelTypeBitmap parallel_types =
GpuLower::current()->threadPredMap().getParallelBroadcastDomains(
bop->out()->as<TensorIndex>()->view());
summary_.broadcast_parallel_types.emplace(bop, parallel_types);
// Do we have block broadcasts?
summary_.has_block_broadcasts =
summary_.has_block_broadcasts || parallel_types.hasTID();
// Do we have grid broadcasts?
summary_.has_grid_broadcasts =
summary_.has_grid_broadcasts || parallel_types.hasBID();
}
private:
size_t max_smem_type_size_ = 0;
KernelSummary summary_;
};
//! Make sure tensors have valid allocations even when parallelized
//! loops potentially have larger iteration counts than the number of
//! threads.
//!
//! When an IterDomain of a tensor is parallelized, the IterDomain
//! may not contribute to the allocation of the tensor. For example,
//! it is assumed that an allocation of a local-memory tensor does not
//! need to account for a parallelized IterDomain. This is true
//! when it is guaranteed that each thread only needs to execute the
//! loop body once. However, if not, the allocation is invalid as it
//! only has space for one value per thread.
//!
//! ValidateAllocation checks all tensor allocations and sees if any
//! tensor may have a parallelized loop whose iteration count may
//! be larger than the number of threads. If so, an error is thrown if
//! the tensor is not allocated on thread-shared memories. Note that
//! when allocated on a shared memory (i.e., MemoryType::Shared or
//! MemoryType::Global for tensors parallelized with threadIdx, or
//! MemoryType::Global for tensors parallelized with blockIdx), it is
//! assumed that allocation is properly extended for the iteration
//! count.
class ValidateAllocation : private OptOutConstDispatch {
public:
static void validate(const Kernel* kernel) {
ValidateAllocation validate_allocation(kernel);
}
private:
explicit ValidateAllocation(const Kernel* kernel) {
live_allocations_.emplace_back(std::vector<const Allocate*>());
for (const auto& expr : kernel->topLevelExprs()) {
OptOutConstDispatch::handle(expr);
}
live_allocations_.pop_back();
TORCH_INTERNAL_ASSERT(live_allocations_.empty());
}
void handle(const Allocate* allocate) final {
TORCH_INTERNAL_ASSERT(!live_allocations_.empty());
live_allocations_.back().push_back(allocate);
}
// for_loop is parallelized and its stop value is not guaranteed to
// be <= the number of threads, which breaks an assumption made
// during allocation lowering if it's thread-parallel and not
// allocated on shared or global memories, or if it's block-parallel
// and not allocated on global memory.
void validate(const ForLoop* for_loop) {
const auto loop_id = for_loop->iter_domain();
for (const auto& allocations : live_allocations_) {
for (const auto& allocate : allocations) {
const auto tv = dynamic_cast<TensorView*>(allocate->buffer());
if (tv == nullptr) {
continue;
}
for (const auto& axis : tv->domain()->domain()) {
if (!GpuLower::current()->caMap()->areMapped(
loop_id, axis, IdMappingMode::LOOP)) {
continue;
}
if (isParallelTypeThreadDim(loop_id->getParallelType())) {
TORCH_INTERNAL_ASSERT(
tv->getMemoryType() == MemoryType::Shared ||
tv->getMemoryType() == MemoryType::Global,
"Tensor t",
tv->name(),
" must be allocated on SMEM or GMEM.");
} else if (isParallelTypeBlockDim(loop_id->getParallelType())) {
TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global);
}
}
}
}
}
void handle(const ForLoop* for_loop) final {
if (for_loop->stop() != for_loop->iter_domain()->extent() &&
isParallelTypeThread(for_loop->iter_domain()->getParallelType())) {
validate(for_loop);
}
live_allocations_.emplace_back(std::vector<const Allocate*>());
for (const auto& expr : for_loop->body().exprs()) {
OptOutConstDispatch::handle(expr);
}
live_allocations_.pop_back();
}
void handle(const IfThenElse* ite) final {
for (const auto& expr : ite->thenBody().exprs()) {
OptOutConstDispatch::handle(expr);
}
for (const auto& expr : ite->elseBody().exprs()) {
OptOutConstDispatch::handle(expr);
}
}
private:
std::vector<std::vector<const Allocate*>> live_allocations_;
};
} // namespace
// TODO(kir): Kernel IR validation
void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
top_level_exprs_ = std::move(top_level_exprs);
warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
profile_ = GpuLower::current()->profile();
ValidateAllocation::validate(this);
analyze();
// Make sure this is after analyze as it sets summary_
summary_.vectorized_accesses = GpuLower::current()->vectorizedAccesses();
summary_.vectorized_set_info = GpuLower::current()->vectorizedSetInfo();
summary_.sync_map = GpuLower::current()->syncMap();
summary_.parallel_dimension_map_ =
GpuLower::current()->parallelDimensionMap();
}
void Kernel::analyze() {
FUSER_PERF_SCOPE("Kernel::analyze");
const KernelIrScanner ir_scanner(this);
summary_ = ir_scanner.summary();
}
void Kernel::print() const {
IrPrinter ir_printer(std::cout);
ir_printer.handle(this);
}
//! Register the Val with this fusion
void Kernel::registerVal(Val* val) {
if (inContainer(val)) {
return;
}
if (val->kernel()) {
TORCH_CHECK(
val->kernel() == this,
val->toString(),
" was not found in the active kernel.");
}
Fusion::registerVal(val);
}
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. We add expr to our general expr_set_.
void Kernel::registerExpr(Expr* expr) {
if (inContainer(expr)) {
return;
}
if (expr->kernel()) {
TORCH_CHECK(
expr->kernel() == this,
expr->toString(),
" was not found in the active kernel.");
}
for (Val* input : expr->inputs()) {
TORCH_INTERNAL_ASSERT(
inContainer(input),
"Input\n",
input->toString(),
" to expr,\n",
expr->toString(),
",\n is invalid because it is not in the same kernel.");
}
for (Val* output : expr->outputs()) {
TORCH_INTERNAL_ASSERT(
inContainer(output),
"Output\n",
output->toString(),
" to expr,\n",
expr->toString(),
",\n is invalid because it is not in the same kernel.");
}
// Registering an expr is explicitly non-SSA when coming from a kernel. This is
// detected inside Fusion::registerExpr.
Fusion::registerExpr(expr);
}
std::vector<Expr*>& KernelInternalProxy::topLevelExprs() {
return kernel_->top_level_exprs_;
}
void KernelPerformanceProfile::registerExpr(const Expr* expr) {
if (expr_entry_map_.find(expr) != expr_entry_map_.end()) {
return;
}
auto slot = getNewIndex();
expr_entry_map_.emplace(expr, slot);
}
int KernelPerformanceProfile::getNewIndex() {
return num_profile_entries_++;
}
bool KernelPerformanceProfile::isProfiled(const Expr* expr) const {
return expr_entry_map_.find(expr) != expr_entry_map_.end();
}
c10::optional<int> KernelPerformanceProfile::getIndex(const Expr* expr) const {
auto it = expr_entry_map_.find(expr);
if (it == expr_entry_map_.end()) {
return c10::optional<int>();
} else {
return it->second;
}
}
std::array<int, 2> KernelPerformanceProfile::getIndicesInProfileBuffer(
const Expr* expr) const {
TORCH_INTERNAL_ASSERT(
isProfiled(expr), "Not a profiled expression: ", expr->toString());
int cycle_index = getIndex(expr).value() * 2;
int count_index = cycle_index + 1;
return {cycle_index, count_index};
}
std::string KernelPerformanceProfile::toString(const at::Tensor& buffer) const {
std::stringstream ss;
ss << "Kernel performance profile:\n";
if (!buffer.defined()) {
ss << "No profile found\n";
return ss.str();
}
double kilo_freq = at::cuda::getCurrentDeviceProperties()->clockRate;
ss << std::setprecision(3) << std::fixed;
for (const auto& kv : expr_entry_map_) {
auto expr = kv.first;
auto index = kv.second;
auto out_tv = ir_utils::getTvOutput(expr);
double cycles = static_cast<double>(buffer[index][0].item<int64_t>());
auto count = buffer[index][1].item<int64_t>();
auto cycles_per_call = count == 0 ? 0.0 : cycles / count;
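// clockRate is reported in kHz, i.e. cycles per millisecond, so dividing by it
// yields milliseconds; the factor of 1000 below converts that to microseconds.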
auto us_per_call = cycles_per_call / kilo_freq * 1000.0;
ss << expr->getExprType().value() << ", T" << out_tv->name() << ", "
<< us_per_call << " us, " << count << "\n";
}
return ss.str();
}
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

View File

@ -1,257 +0,0 @@
#pragma once
#include <c10/macros/Export.h>
#include <fusion.h>
#include <ir_base_nodes.h>
#include <ir_builder.h>
#include <lower_sync_information.h>
#include <lower_warp_reduce.h>
#include <parallel_dimension_map.h>
#include <utils.h>
#include <vectorization_info.h>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {
//! Summary of interesting facts about the kernel
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct KernelSummary {
//! Count of WAR (write-after-read) hazard barriers
int war_hazard_syncs_count = 0;
//! List of global buffers
std::vector<const kir::Allocate*> global_allocations;
//! List of dynamic shared memory buffers
std::vector<const kir::Allocate*> dynamic_smem_allocations;
//! List of static shared memory buffers
std::vector<const kir::Allocate*> static_smem_allocations;
//! Largest RNG offset seen; indicates the need to generate random numbers
int max_rng_offsets = -1;
//! Do we have any block reductions?
bool has_block_reductions = false;
//! Do we have any grid reductions?
bool has_grid_reductions = false;
//! Do we have any grid reduction in a loop, or grid reductions dependent on
//! grid reductions
bool has_cooperative_grid_reduction = false;
//! Do we have any block broadcasts?
bool has_block_broadcasts = false;
//! Do we have any grid broadcasts?
bool has_grid_broadcasts = false;
//! Do we have any welford op?
bool has_welford = false;
//! Do we have any block welford ops?
bool has_block_welford = false;
//! Do we have any grid welford ops?
bool has_grid_welford = false;
//! Largest shared memory buffer base type
DataType largest_smem_data_type = DataType::Null;
//! Do we have allocations of dynamic local memory?
bool has_dynamic_local_memory_allocations = false;
//! List of dynamic local memory buffers.
//! Only used for debugging.
std::vector<const kir::Allocate*> dynamic_lmem_allocations;
//! ceilDiv extents that must be divisible
std::vector<std::pair<const Val*, const Val*>> splits_to_validate;
//! Effective ParallelTypes of broadcast ops
std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
broadcast_parallel_types;
//! Track which tensor views are inputs or outputs of a vectorized operation
//! and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses;
// Sync map is needed to figure out if global memory buffers need to be marked
// as volatile because they're used for communication.
SyncMap sync_map;
// Parallel dimension map needed to set the correct properties of grid buffers
// (is a dim inactive)
ParallelDimensionMap parallel_dimension_map_;
//! Track information on vectorized set operations for runtime validation
std::vector<VectorizedSetInfo> vectorized_set_info;
};
class TORCH_CUDA_CU_API KernelPerformanceProfile {
public:
//! Register an expression to profile
void registerExpr(const Expr* expr);
//! Query if an expression is profiled
bool isProfiled(const Expr* expr) const;
//! Get the number of profiled expressions
int getNumberOfProfileEntries() const {
return num_profile_entries_;
}
//! Set the backing buffer of profile.
void setBuffer(TensorView* buffer) {
buffer_ = buffer;
}
//! Get the backing buffer
TensorView* getBuffer() const {
return buffer_;
}
//! Get the indices of the profile of an expression in the backing buffer
std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;
std::string toString(const at::Tensor& buffer) const;
private:
//! Get the new profile index
int getNewIndex();
//! Get the profile index
c10::optional<int> getIndex(const Expr* expr) const;
private:
int num_profile_entries_ = 0;
//! Backing buffer of Nx2 integer tensor, where N is the number of profiled
//! regions. Each region has two integer values, one representing
//! the cycles spent, and another the count.
TensorView* buffer_ = nullptr;
//! Map profiled expressions to profile entry offsets
std::unordered_map<const Expr*, int> expr_entry_map_;
// TODO: Allow profiling of ForLoops
//! Map profiled ForLoop to profile entry offsets
// std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};
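// Usage sketch (illustrative only; `expr` is assumed to be an Expr* from the
// kernel being profiled):
//
//   KernelPerformanceProfile profile;
//   profile.registerExpr(expr);
//   if (profile.isProfiled(expr)) {
//     // {cycle slot, count slot} within the Nx2 backing buffer
//     std::array<int, 2> slots = profile.getIndicesInProfileBuffer(expr);
//   }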
class KernelInternalProxy;
//! Container for a lowered Kernel IR
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Kernel final : public Fusion {
friend KernelInternalProxy;
public:
// Kernel starts by grabbing all the nodes from the provided fusion.
// Kernel is not SSA: if a definition is not set, we should update it, but
// not remove a previous definition if it is set. This is primarily because
// when we do something like generate an initialization statement for a
// reduction TV, we may want to continue to do fusion-like analysis on the
// original expression.
// TODO: Assert index type is int or int32
Kernel(Fusion* fusion, DataType index_type = DataType::Int)
: Fusion(*fusion), index_type_(index_type) {}
Kernel() = delete;
// No move or copy semantics
Kernel(const Kernel&) = delete;
Kernel& operator=(const Kernel&) = delete;
//! Finalize a kernel definition
//!
//! At this point we have a complete kernel definition and we can
//! run analysis passes to build a KernelSummary.
void finalize(std::vector<Expr*> top_level_exprs);
const std::vector<Expr*>& topLevelExprs() const {
return top_level_exprs_;
}
const KernelSummary& summary() const {
return summary_;
}
DataType indexType() const {
return index_type_;
}
//! Checks if parallel type is padded
bool isParallelTypePadded(ParallelType ptype) const {
return ptype == ParallelType::TIDx &&
warp_padded_parallel_info_.is_tidx_padded;
}
const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const {
return warp_padded_parallel_info_;
}
const KernelPerformanceProfile& profile() const {
return profile_;
}
//! Debug dump of the Kernel IR
void print() const;
protected:
//! Register the Val with this fusion
void registerVal(Val* val) override;
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency tracking
//! of Vals. We add expr to our general expr_set_.
void registerExpr(Expr* expr) override;
private:
// Analyze the kernel IR and caches the summary of interesting data
void analyze();
// Top level statements
std::vector<Expr*> top_level_exprs_;
// Summary of interesting kernel data
KernelSummary summary_;
// Is this kernel being compiled with int32 or int64 indexing. This
// information is required to resolve DataType::Index
DataType index_type_ = DataType::Int;
WarpPaddedParallelInfo warp_padded_parallel_info_;
KernelPerformanceProfile profile_;
};
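// Usage sketch (illustrative only; in practice construction and finalization
// are driven by GpuLower, which supplies the lowered top-level Exprs shown here
// as `lowered_exprs`):
//
//   Kernel kernel(&fusion);                    // copies nodes from `fusion`
//   kernel.finalize(std::move(lowered_exprs)); // runs analysis, builds summary
//   const KernelSummary& summary = kernel.summary();
//   bool needs_cooperative_launch = summary.has_cooperative_grid_reduction;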
//! A special debugging proxy for Kernel.
//!
//! Should not be used for other than testing and debugging.
class TORCH_CUDA_CU_API KernelInternalProxy {
public:
KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {}
std::vector<Expr*>& topLevelExprs();
private:
Kernel* kernel_ = nullptr;
};
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch

Some files were not shown because too many files have changed in this diff.