Compare commits

...

4 Commits

Author SHA1 Message Date
f9851af59b Add Attention ops to CI (#165915)
This pull request introduces a new attention operator microbenchmark workflow in CI, enabling automated benchmarking and reporting for attention-related operations. The main change adds a new GitHub Actions workflow that feeds attention benchmarks into the existing PyTorch operator microbenchmark [dashboard](https://hud.pytorch.org/benchmark/v3/dashboard/pytorch_operator_microbenchmark?renderGroupId=main&time.start=2025-10-27T00%3A00%3A00.000Z&time.end=2025-10-29T01%3A00%3A00.000Z&filters.device=cuda&filters.arch=NVIDIA+A100-SXM4-40GB&filters.deviceName=cuda%7C%7CNVIDIA+A100-SXM4-40GB&filters.operatorName=&lcommit.commit=665df0bc7288996d638fcc3da750f8cb2addd6d0&lcommit.workflow_id=18888994873&lcommit.date=2025-10-29T00%3A00%3A00Z&lcommit.branch=refs%2Ftags%2Fciflow%2Fop-benchmark%2F165915&rcommit.commit=665df0bc7288996d638fcc3da750f8cb2addd6d0&rcommit.workflow_id=18888994873&rcommit.date=2025-10-29T00%3A00%3A00Z&rcommit.branch=refs%2Ftags%2Fciflow%2Fop-benchmark%2F165915&lbranch=refs%2Ftags%2Fciflow%2Fop-benchmark%2F165915&rbranch=refs%2Ftags%2Fciflow%2Fop-benchmark%2F165915)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/165915
Approved by: https://github.com/jbschlosser
2025-11-13 05:30:04 +00:00
eeebf9f664 [dynamo] [3.14] Update broken numpy test (#167681)
This is related to upgrading NumPy versions, not Python 3.14 specifically. See https://github.com/numpy/numpy/pull/27148
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167681
Approved by: https://github.com/williamwen42
ghstack dependencies: #167619
2025-11-13 04:27:55 +00:00
d9a50bf9a8 [dynamo] [3.14] Support np._CopyMode (#167619)
Upgrading scipy to 1.16 introduced errors related to the `copy` parameter of
`np.array`. Add special handling for `np._CopyMode.IF_NEEDED`; it is still not
handled fully correctly, but it matches the existing behavior when `copy=None`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167619
Approved by: https://github.com/williamwen42
2025-11-13 04:27:55 +00:00
2984331c87 [inductor][NFC][2/X] extract do_autotuning/autotune/benchmark from AlgorithmSelectorCache.__call__ (#167489)
Summary: see https://github.com/pytorch/pytorch/pull/167487 for context

Test Plan: CI

Differential Revision: D86714833

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167489
Approved by: https://github.com/aorenste
2025-11-13 03:29:39 +00:00
8 changed files with 380 additions and 167 deletions

View File

@@ -1680,6 +1680,22 @@ test_operator_microbenchmark() {
  done
}

test_attention_microbenchmark() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  TEST_DIR=$(pwd)

  # Install attention-gym dependency
  echo "Installing attention-gym..."
  python -m pip install git+https://github.com/meta-pytorch/attention-gym.git@main
  pip show triton

  cd "${TEST_DIR}"/benchmarks/transformer

  $TASKSET python score_mod.py --config configs/config_basic.yaml \
    --output-json-for-dashboard "${TEST_REPORTS_DIR}/attention_microbenchmark.json"
}

if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@@ -1737,6 +1753,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
  fi
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
  test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *attention_microbenchmark* ]]; then
  test_attention_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then

View File

@@ -0,0 +1,73 @@
name: attention_op_microbenchmark

on:
  push:
    tags:
      - ciflow/op-benchmark/*
  workflow_dispatch:
  schedule:
    # Run at 07:00 UTC every day
    - cron: 0 7 * * *

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

permissions:
  id-token: write
  contents: read

jobs:
  attn-microbenchmark-build:
    if: github.repository_owner == 'pytorch'
    uses: ./.github/workflows/_linux-build.yml
    with:
      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '8.0 9.0'
      test-matrix: |
        { include: [
          { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
          { config: "attention_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
        ]}
    secrets: inherit

  attn-microbenchmark-test:
    name: attn-microbenchmark-test
    uses: ./.github/workflows/_linux-test.yml
    needs: attn-microbenchmark-build
    with:
      timeout-minutes: 500
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
      docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
    secrets: inherit

  # B200 runner
  opmicrobenchmark-build-b200:
    if: github.repository_owner == 'pytorch'
    name: opmicrobenchmark-build-b200
    uses: ./.github/workflows/_linux-build.yml
    with:
      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
      test-matrix: |
        { include: [
          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
        ]}
    secrets: inherit

  opmicrobenchmark-test-b200:
    name: opmicrobenchmark-test-b200
    uses: ./.github/workflows/_linux-test.yml
    needs: opmicrobenchmark-build-b200
    with:
      timeout-minutes: 500
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
      test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
    secrets: inherit

View File

@@ -125,6 +125,17 @@ AttentionType = Literal[
]
DtypeString = Literal["bfloat16", "float16", "float32"]
SpeedupType = Literal["fwd", "bwd"]
# Operator Name mapping
backend_to_operator_name = {
    "math": "math attention kernel",
    "efficient": "efficient attention kernel",
    "cudnn": "cudnn attention kernel",
    "fav2": "flash attention 2 kernel",
    "fav3": "flash attention 3 kernel",
    "fakv": "flash attention kv cache kernel",
    "og-eager": "eager attention kernel",
    "flex": "flex attention kernel",
}
def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
@@ -1265,12 +1276,14 @@ def _output_json_for_dashboard(
model: ModelInfo
metric: MetricInfo
operator_name = backend_to_operator_name.get(backend, backend)
# Benchmark extra info
benchmark_extra_info = {
"input_config": input_config,
"device": device,
"arch": device_arch,
"operator_name": backend,
"operator_name": operator_name,
"attn_type": config.attn_type,
"shape": str(config.shape),
"max_autotune": config.max_autotune,
@@ -1288,7 +1301,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
"attn_type": config.attn_type,
},
),
@@ -1315,7 +1328,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
},
),
metric=MetricInfo(
@@ -1341,7 +1354,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
},
),
metric=MetricInfo(
@@ -1371,7 +1384,7 @@ def _output_json_for_dashboard(
type="attention-benchmark",
origins=["pytorch"],
extra_info={
"operator_name": backend,
"operator_name": operator_name,
},
),
metric=MetricInfo(
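The repeated `operator_name` changes above swap the raw backend key for the human-readable name from `backend_to_operator_name`, falling back to the backend string itself when no mapping exists. A minimal sketch of that lookup (`resolve_operator_name` is an illustrative helper, not a function in score_mod.py):

```python
# Dashboard-facing names; anything not in the map passes through unchanged.
backend_to_operator_name = {
    "math": "math attention kernel",
    "flex": "flex attention kernel",
}

def resolve_operator_name(backend: str) -> str:
    # Same .get(backend, backend) fallback used in _output_json_for_dashboard.
    return backend_to_operator_name.get(backend, backend)

assert resolve_operator_name("flex") == "flex attention kernel"
assert resolve_operator_name("some-new-backend") == "some-new-backend"
```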

View File

@@ -683,6 +683,16 @@ class TestNumPyInterop(TestCase):
        ):
            f(xs)

    def test_copy_mode(self):
        def f(x):
            return np.array(x, copy=np._CopyMode.IF_NEEDED)

        opt_f = torch.compile(backend="eager", fullgraph=True)(f)
        x = np.array([1, 2, 3])
        # Should run without throwing an exception
        y = opt_f(x)
        self.assertEqual(y, f(x))


instantiate_device_type_tests(TestNumPyInterop, globals())

View File

@@ -310,7 +310,7 @@ class TestHistogram(TestCase):
        )
        # these should not crash
        np.histogram([np.array(0.5) for i in range(10)] + [0.500000000000001])
        np.histogram([np.array(0.5) for i in range(10)] + [0.500000000000002])
        np.histogram([np.array(0.5) for i in range(10)] + [0.5])

    @xpassIfTorchDynamo_np # (reason="bins='auto'")

View File

@@ -18,6 +18,7 @@ Key classes include:
"""

import dataclasses
import enum
import functools
import inspect
import itertools
@@ -1604,11 +1605,16 @@ class NumpyVariable(VariableTracker):
        return self.value

    def as_proxy(self):
        if config.trace_numpy and isinstance(self.value, type):
            # This handles numpy dtype attributes such as np.float32
            # We return a string as we don't want to serialize non-PyTorch objects in the output FX graph
            # In torch/_numpy we normalize strings to their dtypes when the input is a dtype, as NumPy does
            return self.value.__name__
        if config.trace_numpy:
            # Can replace with EnumType once we drop 3.10 support
            if isinstance(self.value, enum.EnumMeta):
                # This is mostly for np._CopyMode
                return self.value
            if isinstance(self.value, type):
                # This handles numpy dtype attributes such as np.float32
                # We return a string as we don't want to serialize non-PyTorch objects in the output FX graph
                # In torch/_numpy we normalize strings to their dtypes when the input is a dtype, as NumPy does
                return self.value.__name__
        return super().as_proxy()
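A note on why the `EnumMeta` check precedes the generic `type` check: enum classes are themselves instances of `type` (via their metaclass), so checking `type` first would turn `np._CopyMode` into the string `"_CopyMode"` instead of returning the enum class itself. A small illustrative sketch of that distinction:

```python
import enum
import numpy as np

# np._CopyMode is built by the Enum machinery, so its type is enum.EnumMeta,
# and EnumMeta itself subclasses type -- both isinstance checks succeed.
print(isinstance(np._CopyMode, enum.EnumMeta))  # True
print(isinstance(np._CopyMode, type))           # True

# np.float32 is an ordinary class, not an enum class.
print(isinstance(np.float32, enum.EnumMeta))    # False
print(isinstance(np.float32, type))             # True
```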

View File

@@ -2740,163 +2740,10 @@ class AlgorithmSelectorCache(PersistentCache):
inputs_key = create_inputs_key(input_nodes)
# TODO(nmacchioni): remove this hacky way to tell if we ran benchmarking
has_autotuned = False
def benchmark(choices, hint_override: Optional[int] = None):
nonlocal has_autotuned
# TODO(nmacchioni): remove this hacky way to tell if we ran benchmarking
has_autotuned = True
counters["inductor"]["select_algorithm_autotune"] += 1
# TODO(nmacchioni): remove this layer of abstraction
# construct `benchmark_fn` which should pick between in-process and sub-process autotuning
benchmark_fn = self.make_benchmark_fn(
choices, input_nodes, layout, input_gen_fns, hint_override=hint_override
)
# `benchmark_fn(choices)` will execute each choice, and return a dict[choice, timing] which
# maps each choice to its runtime, calculated by the specified benchmarker, in milliseconds
return benchmark_fn(choices)
def autotune(choices, hint_override: Optional[int] = None):
log.debug("Starting autotuning")
with dynamo_timed(
f"{name}_template_autotuning",
log_pt2_compile_event=True,
dynamo_compile_column_us="compile_time_autotune_time_us",
metadata=_autotune_metadata(input_nodes),
):
benchmark_results = benchmark(choices, hint_override=hint_override)
if config.max_autotune_report_choices_stats:
_log_autotune_choices_stats(
f"{name}_template_autotuning", benchmark_results
)
return benchmark_results
if config.autotune_in_subproc:
# Initialize the suprocess pool so it will warmup early.
torch._inductor.autotune_process.get_tuning_process_pool()
def do_autotuning(choices, precompile_fn, hint_override: Optional[int] = None):
precompile_start_ts = time.time()
with dynamo_timed(
f"{name}_template_precompiling",
log_pt2_compile_event=True,
dynamo_compile_column_us="compile_time_autotune_time_us",
):
precompile_fn()
precompile_elapse = time.time() - precompile_start_ts
log.debug("Precompilation elapsed time: %.02fs", precompile_elapse)
# Prune anything that failed to compile
choices = [c for c in choices if not c.failed]
if len(choices) == 0:
raise self.create_no_valid_choices(
name, "All choices failed to compile for backend."
)
candidates = self.prescreen_choices(
choices, name, inputs_key, self.prescreening_cache
)
prescreening_elapse: Optional[float] = None
if candidates:
prescreening_start_ts = time.time()
timings = self.lookup(
candidates,
name,
inputs_key,
lambda choices: autotune(choices, hint_override=hint_override),
hint_override=hint_override,
)
choices = self.prune_choices_postscreen(
choices, timings, name, inputs_key, self.prescreening_cache
)
prescreening_elapse = time.time() - prescreening_start_ts
log.debug("Prescreening elapsed time: %.02fs", prescreening_elapse)
autotune_start_ts = time.time()
if best_config_future is not None:
best_config = await_sync(best_config_future)
important_keys = [
"ACC_TYPE",
"ALLOW_TF32",
"BLOCK_K",
"BLOCK_M",
"BLOCK_N",
"EVEN_K",
"GROUP_M",
"USE_FAST_ACCUM",
"num_stages",
"num_warps",
"num_consumer_groups",
"num_buffers_warp_spec",
]
choices = [
choice
for choice in choices
if all(
f"{k}={best_config[k]}" in choice.description
for k in important_keys
)
for k in important_keys
]
log.info("Filtered to %d choices based on best_config", len(choices))
timings = self.lookup(
choices,
name,
inputs_key,
lambda choices: autotune(choices, hint_override=hint_override),
hint_override=hint_override,
)
autotune_elapse = time.time() - autotune_start_ts
log.debug("Autotuning elapsed time: %.02fs", autotune_elapse)
if timings and all(
not math.isfinite(timing) for timing in timings.values()
):
raise NoValidChoicesError
if (
has_autotuned
or log.getEffectiveLevel() == logging.DEBUG
or config.trace.log_autotuning_results
):
self.log_results(
name,
input_nodes,
timings,
autotune_elapse,
precompile_elapse,
prescreening_elapse,
hint_override=hint_override,
)
def profiler_bench_function():
# we're not running through the normal caching autotuner method here because we want to avoid returning
# the cached value.
# Avoid benchmarking in a separate process because it's not easy to signal to the TuningProcess that we
# should use the profiler.
with config.patch(
profile_bandwidth_with_do_bench_using_profiling=True,
autotune_in_subproc=False,
):
return benchmark(choices)
for feedback_fn in self.feedback_saver_fns:
# re-benchmarking the same choices with profiler is a bit expensive, so pass it in as a thunk.
feedback_fn(
timings,
name,
input_nodes,
choices,
profiler_bench_function,
)
return timings
precompile_fn = self.make_precompile_fn(
choices,
name,
@@ -2913,8 +2760,16 @@ class AlgorithmSelectorCache(PersistentCache):
if not hasattr(c, "hint_override")
or c.hint_override == hint_override
]
timings = do_autotuning(
filtered_choices, precompile_fn, hint_override=hint_override
timings = self.do_autotuning(
name,
input_nodes,
layout,
input_gen_fns,
inputs_key,
filtered_choices,
precompile_fn,
hint_override=hint_override,
best_config_future=best_config_future,
)
min_extern_choice = float("inf")
for choice, timing in timings.items():
@@ -2950,7 +2805,16 @@ class AlgorithmSelectorCache(PersistentCache):
)
)
timings = do_autotuning(choices, precompile_fn)
timings = self.do_autotuning(
name,
input_nodes,
layout,
input_gen_fns,
inputs_key,
choices,
precompile_fn,
best_config_future=best_config_future,
)
# if timings is empty, we really have no choice but to return a semi-random
# choice. returning the first `ExternKernelCaller` is probably the safest bet
# in this case, since it will generally be the ATen kernel. if there are no
@@ -2986,6 +2850,229 @@ class AlgorithmSelectorCache(PersistentCache):
return node, choice
return node
def benchmark(
self,
choices,
input_nodes,
layout,
input_gen_fns,
hint_override: Optional[int] = None,
):
counters["inductor"]["select_algorithm_autotune"] += 1
# TODO(nmacchioni): remove this layer of abstraction
# construct `benchmark_fn` which should pick between in-process and sub-process autotuning
benchmark_fn = self.make_benchmark_fn(
choices, input_nodes, layout, input_gen_fns, hint_override=hint_override
)
# `benchmark_fn(choices)` will execute each choice, and return a dict[choice, timing] which
# maps each choice to its runtime, calculated by the specified benchmarker, in milliseconds
return benchmark_fn(choices)
def autotune(
self,
name,
input_nodes,
layout,
input_gen_fns,
choices,
hint_override: Optional[int] = None,
):
log.debug("Starting autotuning")
with dynamo_timed(
f"{name}_template_autotuning",
log_pt2_compile_event=True,
dynamo_compile_column_us="compile_time_autotune_time_us",
metadata=_autotune_metadata(input_nodes),
):
benchmark_results = self.benchmark(
choices, input_nodes, layout, input_gen_fns, hint_override=hint_override
)
if config.max_autotune_report_choices_stats:
_log_autotune_choices_stats(
f"{name}_template_autotuning", benchmark_results
)
return benchmark_results
def do_autotuning(
self,
name,
input_nodes,
layout,
input_gen_fns,
inputs_key,
choices,
precompile_fn,
hint_override: Optional[int] = None,
best_config_future=None,
):
"""Execute the autotuning process for kernel algorithm selection.
This method orchestrates the complete autotuning pipeline including precompilation,
prescreening, benchmarking, and feedback collection to select the optimal kernel
implementation for given inputs.
Args:
name: Name identifier for the operation being autotuned (e.g., 'mm', 'convolution').
input_nodes: List of input IR nodes used for benchmarking.
layout: Layout information specifying device and memory format for the operation.
input_gen_fns: Optional dict mapping argument indices to functions that generate
torch.Tensor inputs from ir.Buffer for benchmarking. If provided, these are
used instead of random tensors.
inputs_key: Cache key representing the input characteristics (sizes, strides, dtypes).
choices: List of ChoiceCaller objects representing candidate kernel implementations.
precompile_fn: Callable that precompiles all kernel choices before benchmarking.
hint_override: Optional index to override which choice is selected, used for testing
or forced selection.
best_config_future: Optional future containing pre-determined best configuration to
filter choices by specific config parameters.
Returns:
dict: Mapping from ChoiceCaller to benchmark timing in seconds. Choices with
non-finite timings (inf/nan) indicate failures.
Raises:
NoValidChoicesError: When all choices fail to compile or benchmark, or when all
timing results are non-finite.
"""
precompile_start_ts = time.time()
with dynamo_timed(
f"{name}_template_precompiling",
log_pt2_compile_event=True,
dynamo_compile_column_us="compile_time_autotune_time_us",
):
precompile_fn()
precompile_elapse = time.time() - precompile_start_ts
log.debug("Precompilation elapsed time: %.02fs", precompile_elapse)
# Prune anything that failed to compile
choices = [c for c in choices if not c.failed]
if len(choices) == 0:
raise self.create_no_valid_choices(
name, "All choices failed to compile for backend."
)
candidates = self.prescreen_choices(
choices, name, inputs_key, self.prescreening_cache
)
prescreening_elapse: Optional[float] = None
if candidates:
prescreening_start_ts = time.time()
timings = self.lookup(
candidates,
name,
inputs_key,
lambda choices: self.autotune(
name,
input_nodes,
layout,
input_gen_fns,
choices,
hint_override=hint_override,
),
hint_override=hint_override,
)
choices = self.prune_choices_postscreen(
choices, timings, name, inputs_key, self.prescreening_cache
)
prescreening_elapse = time.time() - prescreening_start_ts
log.debug("Prescreening elapsed time: %.02fs", prescreening_elapse)
autotune_start_ts = time.time()
if best_config_future is not None:
best_config = await_sync(best_config_future)
important_keys = [
"ACC_TYPE",
"ALLOW_TF32",
"BLOCK_K",
"BLOCK_M",
"BLOCK_N",
"EVEN_K",
"GROUP_M",
"USE_FAST_ACCUM",
"num_stages",
"num_warps",
"num_consumer_groups",
"num_buffers_warp_spec",
]
choices = [
choice
for choice in choices
if all(
f"{k}={best_config[k]}" in choice.description
for k in important_keys
)
for k in important_keys
]
log.info("Filtered to %d choices based on best_config", len(choices))
has_autotuned: bool = False
def track_has_autotuned(choices):
nonlocal has_autotuned
has_autotuned = True
return self.autotune(
name,
input_nodes,
layout,
input_gen_fns,
choices,
hint_override=hint_override,
)
timings = self.lookup(
choices,
name,
inputs_key,
track_has_autotuned,
hint_override=hint_override,
)
autotune_elapse = time.time() - autotune_start_ts
log.debug("Autotuning elapsed time: %.02fs", autotune_elapse)
if timings and all(not math.isfinite(timing) for timing in timings.values()):
raise NoValidChoicesError
if (
has_autotuned
or log.getEffectiveLevel() == logging.DEBUG
or config.trace.log_autotuning_results
):
self.log_results(
name,
input_nodes,
timings,
autotune_elapse,
precompile_elapse,
prescreening_elapse,
hint_override=hint_override,
)
def profiler_bench_function():
# we're not running through the normal caching autotuner method here because we want to avoid returning
# the cached value.
# Avoid benchmarking in a separate process because it's not easy to signal to the TuningProcess that we
# should use the profiler.
with config.patch(
profile_bandwidth_with_do_bench_using_profiling=True,
autotune_in_subproc=False,
):
return self.benchmark(choices, input_nodes, layout, input_gen_fns)
for feedback_fn in self.feedback_saver_fns:
# re-benchmarking the same choices with profiler is a bit expensive, so pass it in as a thunk.
feedback_fn(
timings,
name,
input_nodes,
choices,
profiler_bench_function,
)
return timings
def create_no_valid_choices(self, name: str, reason: str) -> NoValidChoicesError:
backend_config = (
"max_autotune_gemm_backends"

View File

@@ -230,6 +230,12 @@ def _coerce_to_tensor(obj, dtype=None, copy=False, ndmin=0):
    if ndim_extra > 0:
        tensor = tensor.view((1,) * ndim_extra + tensor.shape)

    # special handling for np._CopyMode
    try:
        copy = bool(copy)
    except ValueError:
        # TODO handle _CopyMode.IF_NEEDED correctly
        copy = False

    # copy if requested
    if copy:
        tensor = tensor.clone()
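The `try`/`except ValueError` above works because of how `np._CopyMode` defines `__bool__`: `ALWAYS` coerces to `True`, `NEVER` to `False`, and `IF_NEEDED` raises. A quick sketch of the behavior this hunk relies on (assuming a NumPy version that ships `np._CopyMode`):

```python
import numpy as np

print(bool(np._CopyMode.ALWAYS))   # True
print(bool(np._CopyMode.NEVER))    # False

try:
    bool(np._CopyMode.IF_NEEDED)
except ValueError as err:
    # _coerce_to_tensor falls back to copy=False here, which matches the
    # existing copy=None behavior rather than true copy-if-needed semantics.
    print("ValueError:", err)
```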