Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Summary:
TL;DR:
* Remove config.split_cat_fx_passes and move the split-cat passes behind config.pattern_matcher (True by default).
* Add config.experimental_patterns (False by default).
* Going forward, general/universal patterns should be behind config.pattern_matcher; customized/immature patterns should be behind config.experimental_patterns.

More details at: https://docs.google.com/document/d/1P8uJTpOTdQpUbw56UxHol40tt-EPFTq1Qu38072E9aM/edit

Test Plan: Existing unit tests

Reviewed By: jansel, jackiexu1992

Differential Revision: D46752606

Pull Request resolved: https://github.com/pytorch/pytorch/pull/104208
Approved by: https://github.com/williamwen42
import os
import sys

import torch

# add some debug printouts
debug = False

# Whether to disable a progress bar for autotuning
disable_progress = True

# Whether to enable printing the source code for each future
verbose_progress = False

# use cpp wrapper instead of python wrapper
cpp_wrapper = False

# dead code elimination
dce = False

# assume weight tensors are fixed size
static_weight_shapes = True

# put correctness assertions in generated code
size_asserts = os.environ.get("TORCHINDUCTOR_SIZE_ASSERTS", "1") == "1"
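# Most of the TORCHINDUCTOR_* settings in this file follow the same convention:
# the environment variable is read once at import time and compared against "1".
# A minimal sketch of both ways to flip a flag ("train.py" is a placeholder;
# kept in a comment so install_config_module below does not treat it as config):
#
#   # from the shell, before the process starts:
#   #   TORCHINDUCTOR_SIZE_ASSERTS=0 python train.py
#
#   # or from Python after import (env vars only matter at import time):
#   import torch._inductor.config as inductor_config
#   inductor_config.size_asserts = False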
# enable loop reordering based on input orders
pick_loop_orders = True

# reuse a kernel input as the output
inplace_buffers = True

# reuse a buffer for an unrelated purpose
allow_buffer_reuse = True

# codegen benchmark harness
benchmark_harness = True

# fuse pointwise into templates
epilogue_fusion = True

# do epilogue fusions before other fusions
epilogue_fusion_first = False

# enable pattern match+replace optimizations
pattern_matcher = True

# enable experimental patterns for match+replace optimizations
experimental_patterns = False
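# Per the change summary above, the split-cat passes now run behind
# `pattern_matcher` (on by default), while unmatured patterns sit behind
# `experimental_patterns` (off by default). A hedged sketch of opting in:
#
#   import torch._inductor.config as inductor_config
#   inductor_config.experimental_patterns = True  # opt into experimental passes
#   # or disable all pattern match+replace optimizations:
#   inductor_config.pattern_matcher = False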
# enable reordering pass
reordering = True

# inductor engine name
dll_name = "inductor_engine.so"

# enable slow autotuning passes to select algorithms
max_autotune = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE") == "1"

# enable slow autotuning passes to select pointwise/reduction algorithms
max_autotune_pointwise = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE") == "1"

# enable slow autotuning passes to select gemm algorithms
max_autotune_gemm = os.environ.get("TORCHINDUCTOR_MAX_AUTOTUNE_GEMM") == "1"

# enable searching the global and local caches regardless of `max_autotune`
search_autotune_cache = os.environ.get("TORCHINDUCTOR_SEARCH_AUTOTUNE_CACHE") == "1"

# If this is False, we disable creating a subprocess for autotuning
autotune_in_subproc = os.environ.get("TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC") == "1"
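# A sketch of enabling the slow autotuning passes without editing code, via
# the env vars read above (shell usage; "train.py" is a placeholder):
#
#   TORCHINDUCTOR_MAX_AUTOTUNE=1 python train.py          # all algorithms
#   TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 python train.py     # gemm only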
coordinate_descent_tuning = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_TUNING") == "1"
)
coordinate_descent_check_all_directions = (
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_CHECK_ALL_DIRECTIONS") == "1"
)
coordinate_descent_search_radius = int(
    os.environ.get("TORCHINDUCTOR_COORDINATE_DESCENT_RADIUS", "1")
)

layout_optimization = os.environ.get("TORCHINDUCTOR_LAYOUT_OPTIMIZATION", "1") == "1"

# Whether to keep the output strides the same as eager after layout optimization.
keep_output_stride = os.environ.get("TORCHINDUCTOR_KEEP_OUTPUT_STRIDE", "1") == "1"

# Enabling this will let the compiler print warning messages if a generated triton
# kernel has inputs with mixed layouts. This is helpful for perf debugging,
# since a kernel with mixed layout inputs may run much slower than one whose inputs
# have uniform layouts.
warn_mix_layout = os.environ.get("TORCHINDUCTOR_WARN_MIX_LAYOUT") == "1"
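# For reference, "mixed layouts" means kernel inputs whose strides disagree,
# e.g. a contiguous (NCHW) tensor combined with a channels-last one. A small
# illustration (assumes nothing beyond torch itself):
#
#   x = torch.randn(8, 3, 32, 32)                   # contiguous strides
#   y = x.to(memory_format=torch.channels_last)     # same shape, different strides
#   # a generated kernel reading both x and y has mixed layout inputs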
# control store vs recompute heuristic
# For fanouts, rematerialization can lead to exponential blowup, so we use a
# smaller threshold
realize_reads_threshold = 4
realize_bytes_threshold = 2000

# Threshold to prevent excessive accumulation of ops in one buffer during lowering
realize_acc_reads_threshold = 8

# fallback to eager for random/dropout; this is slow but useful for debugging
fallback_random = False

# automatically create fallbacks when encountering an unhandled op
implicit_fallbacks = True

# fuse even in cases without common reads
aggressive_fusion = False

# how many nodes to allow into a single fusion
max_fusion_size = 64

# replace small reductions with pointwise ops; disable with `= 1`
unroll_reductions_threshold = 8

# Add extra comments to output code (causes compile cache misses)
comment_origin = False

# Convert 1x1 convs into matmuls
conv_1x1_as_mm = False

# Enable split reductions for better utilization when the dimension
# being reduced over is large (by splitting it)
split_reductions = True

benchmark_kernel = os.environ.get("TORCHINDUCTOR_BENCHMARK_KERNEL", "0") == "1"

# Enable constant and index_expr folding
constant_and_index_propagation = True

# constant folding on the joint graph
joint_graph_constant_folding = True

# Enable indirect_indexing asserts for decompositions and lowerings
debug_index_asserts = False


def is_fbcode():
    return not hasattr(torch.version, "git_version")


# warnings intended for PyTorch developers, disable for point releases
is_nightly_or_source = "dev" in torch.__version__ or "git" in torch.__version__
developer_warnings = is_fbcode() or is_nightly_or_source


def decide_compile_threads():
    """
    Here is the precedence for deciding compile_threads:
    1. The user can override it via TORCHINDUCTOR_COMPILE_THREADS. One may want
       to disable async compiling by setting this to 1 to make pdb happy.
    2. Set to 1 on the win32 platform or in an fbcode build.
    3. Otherwise, decide based on the number of CPU cores.
    """
    if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ:
        return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"])
    elif sys.platform == "win32" or is_fbcode():
        return 1
    else:
        return min(
            32,
            len(os.sched_getaffinity(0))
            if hasattr(os, "sched_getaffinity")
            else os.cpu_count(),
        )


compile_threads = decide_compile_threads()
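# Following the precedence documented in decide_compile_threads(), async
# compilation can be disabled for debugging, e.g. to keep pdb usable
# (shell sketch; "train.py" is a placeholder):
#
#   TORCHINDUCTOR_COMPILE_THREADS=1 python train.py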
# gemm autotuning global cache dir
if is_fbcode():
    global_cache_dir = "fb/cache"
else:
    global_cache_dir = None

# If a kernel is fused, its name is generated from the origin node op names;
# for larger kernels, limit this
kernel_name_max_ops = 10

# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "1") == "1"

# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"

# Mark the wrapper call in PyTorch profiler
profiler_mark_wrapper_call = False

# Generate hook calls to torch._inductor.hooks.run_intermediate_hooks for
# every intermediate that we can correlate with an intermediate
# from the original FX graph
generate_intermediate_hooks = False

# Populate the traceback field on IRNode; good for debugging why origin_node is
# not populated, or finding out where an IRNode was constructed
debug_ir_traceback = False

# used for debugging to make sure config is properly set
_raise_error_for_testing = False

_profile_var = os.environ.get("TORCHINDUCTOR_PROFILE", "")
profile_bandwidth = _profile_var != ""
profile_bandwidth_regex = "" if _profile_var == "1" else _profile_var
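# Reading the logic above: setting TORCHINDUCTOR_PROFILE to "1" enables
# bandwidth profiling for all kernels, while any other non-empty value is kept
# as a regex filter (see profile_bandwidth_regex). A hedged shell sketch:
#
#   TORCHINDUCTOR_PROFILE=1 python train.py                  # profile everything
#   TORCHINDUCTOR_PROFILE="triton_.*" python train.py        # regex-filtered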
disable_cpp_codegen = is_fbcode()


# Freezing will attempt to inline weights as constants in optimization
# and run constant folding and other optimizations on them. After freezing, weights
# can no longer be updated.
freezing: bool = os.environ.get("TORCHINDUCTOR_FREEZING", "0") == "1"

# Make freezing invalidate the eager Parameters of nn modules, to avoid memory overhead
# of potentially keeping multiple copies of weights.
freezing_discard_parameters: bool = False
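# A sketch of a typical inference setup using freezing ("infer.py" is a
# placeholder; weights cannot be updated after freezing, per the note above):
#
#   TORCHINDUCTOR_FREEZING=1 python infer.py
#   # or, in Python:
#   import torch._inductor.config as inductor_config
#   inductor_config.freezing = True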
# config specific to codegen/cpp.py
class cpp:
    # set to torch.get_num_threads()
    threads = -1

    # Do not generate loops when the condition doesn't hold, like:
    # for(long i0=4096; i0<4096; i0+=1)
    no_redundant_loops = True

    # Assume the number of threads is dynamic, don't specialize on the thread number.
    # Kernels don't recompile on thread number changes with this flag on.
    # For single-threaded workloads, turning it on would incur a slight
    # performance degradation.
    dynamic_threads = False

    simdlen = None
    min_chunk_size = 4096
    cxx = (
        None,  # download gcc12 from conda-forge if conda is installed
        # "g++-12",
        # "g++-11",
        # "g++-10",
        # "clang++",
        os.environ.get("CXX", "g++"),
        # "g++.par",
    )
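    # The tuple above is an ordered list of compiler candidates: `None` means
    # "download gcc12 from conda-forge if conda is installed", and the CXX
    # environment variable (defaulting to g++, read at import time) is the
    # fallback. A sketch of pointing the cpp backend at a specific compiler:
    #
    #   CXX=clang++ python train.py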
    # Allow kernel performance profiling via PyTorch profiler
    enable_kernel_profile = False

    # enable weight prepacking to get better performance; may lead to a large memory footprint
    weight_prepack = True

    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY = None
    inject_log1p_bug_TESTING_ONLY = None

    # If None, autodetect whether or not AVX512/AVX2 can be used. Otherwise,
    # force usage as specified, without testing.
    vec_isa_ok = None

    # similar to config.triton.descriptive_names
    descriptive_names = "original_aten"

    # how many nodes to allow into a single horizontal fusion
    max_horizontal_fusion_size = 16


# config specific to codegen/triton.py
class triton:
    # Use cudagraphs on output code
    cudagraphs = False

    # Use cudagraph trees for memory pooling if `cudagraphs` is True
    cudagraph_trees = not is_fbcode()

    # assertions not on the fast path, steady state
    slow_path_cudagraph_asserts = True

    # TODO - need to debug why this prevents cleanup
    cudagraph_trees_history_recording = False

    # assertions on the fast path
    fast_path_cudagraph_asserts = False

    # skip warmup for cudagraph trees
    skip_cudagraph_warmup = False

    # Synchronize before and after every compiled graph.
    debug_sync_graph = False

    # Synchronize after every kernel launch, to help pinpoint bugs
    debug_sync_kernel = False

    # Always load full blocks (rather than broadcasting inside the block)
    dense_indexing = False

    # limit tiling dimensions
    max_tiles = 2

    # use triton.autotune for pointwise ops with complex layouts;
    # this should only be disabled for debugging/testing
    autotune_pointwise = True

    # max autotune gemm with cublasLt
    autotune_cublasLt = True

    # should we stop a fusion to allow better tiling?
    tiling_prevents_pointwise_fusion = True
    tiling_prevents_reduction_fusion = True

    # assert that indirect indexing does not read / write out of bounds
    assert_indirect_indexing = True

    # should we give different names to kernels?
    # Note: This is orthogonal to descriptive_names - this decides whether
    # our triton kernel names should all be `triton_` (to maximize caching) or
    # whether they should be unique.
    unique_kernel_names = os.environ.get("TORCHINDUCTOR_UNIQUE_KERNEL_NAMES") == "1"

    # should we put op names in kernel names?
    # False: No special names (just triton__1, triton__2, etc.)
    # "torch": Maps to the fx op in the Dynamo graph (module name, method name, etc.)
    # "original_aten": Maps to the highest-level aten op (i.e. pre-decompositions)
    # "inductor_node": Maps to the node name in the FX graph passed to Inductor
    descriptive_names = "original_aten"
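    # A hedged sketch of combining the two naming knobs via the `patch` helper
    # that install_config_module adds at the bottom of this file (dotted keys
    # address nested config classes; values shown are illustrative):
    #
    #   import torch._inductor.config as inductor_config
    #   with inductor_config.patch(
    #       {"triton.unique_kernel_names": True, "triton.descriptive_names": "inductor_node"}
    #   ):
    #       ...  # compile here; kernel names become unique and descriptive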
    # use alternate codegen for smaller reductions
    persistent_reductions = (
        os.environ.get("TORCHINDUCTOR_PERSISTENT_REDUCTIONS", "1") == "1"
    )

    # hint to Triton when arguments are divisible by 16
    divisible_by_16 = True

    # these are not enforced, but they are used by asserts in triton_heuristics.py
    # NOTE: mobilevit_s in timm_models required X to be set to the higher value 2048
    max_block = {"X": 2048, "Y": 1024, "Z": 1024}

    # Store the generated cubin files for cpp wrapper code to load
    store_cubin = False

    # the max number of spills we allow for the configs we benchmark.
    # Setting this to 0 means we skip a config if it spills even a single
    # register.
    # Setting it to a larger value allows a config that spills a small number
    # of registers to be benchmarked.
    #
    # NOTE: triton will always report >0 register spills for kernels using sin/cos.
    # (check this issue https://github.com/openai/triton/issues/1756 )
    # So far we see a fixed 8 spilled registers for kernels using sin/cos.
    # Raise the threshold to 16 to be safe.
    # We should revisit this once we understand more of the source of register spills.
    spill_threshold: int = 16

    # Inject a bug into our relu implementation; useful for testing our repro
    # extraction and minification functionality.
    # Valid values: "compile_error", "runtime_error", "accuracy"
    inject_relu_bug_TESTING_ONLY = None


# create a directory containing lots of debug information
class trace:
    # master switch for all debugging flags below
    enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"
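    # Per the class comment, this master switch makes a debug directory with
    # the artifacts toggled below. Shell sketch ("train.py" is a placeholder):
    #
    #   TORCH_COMPILE_DEBUG=1 python train.py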
    # Save python logger call >=logging.DEBUG
    debug_log = False

    # Save python logger call >=logging.INFO
    info_log = False

    # Save input FX graph (post decomps, pre optimization)
    fx_graph = True

    # Save FX graph after transformations
    fx_graph_transformed = True

    # Save TorchInductor IR before fusion pass
    ir_pre_fusion = True

    # Save TorchInductor IR after fusion pass
    ir_post_fusion = True

    # Copy generated code to trace dir
    output_code = True

    # SVG figure showing post-fusion graph
    graph_diagram = False

    # Store cProfile (see snakeviz to view)
    compile_profile = False

    # Upload the .tar.gz file
    # Needs to be overridden based on specific environment needs
    upload_tar = None


_save_config_ignore = {
    # workaround: "Can't pickle <function ...>"
    "trace.upload_tar",
}


from .._dynamo.config_utils import install_config_module

# adds patch, save_config, etc
install_config_module(sys.modules[__name__])
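# A hedged sketch of the API this adds (names per the comment above; the
# values shown are illustrative):
#
#   import torch._inductor.config as inductor_config
#   with inductor_config.patch({"max_autotune": True, "triton.cudagraphs": True}):
#       ...  # config is patched inside the block and restored on exit
#   inductor_config.save_config()  # snapshot the current config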