Revert "[inductor] verify determinism with inductor benchmark script (#164904)"

This reverts commit a3c700656f9a666eb33074b60333a23eb7e99a15.

Reverted https://github.com/pytorch/pytorch/pull/164904 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but there seem to be some vLLM failures coming out of this ([comment](https://github.com/pytorch/pytorch/pull/164904#issuecomment-3388443678))
PyTorch MergeBot
2025-10-10 06:23:07 +00:00
parent 38095fbd13
commit d2cb183344
9 changed files with 35 additions and 172 deletions


@@ -50,7 +50,6 @@ from torch._dynamo.testing import (
    reset_rng_state,
    same,
)
from torch._dynamo.utils import bitwise_same
from torch._logging.scribe import open_source_signpost
@@ -2322,40 +2321,6 @@ class BenchmarkRunner:
        new_result = process_fn(new_result)
        fp64_outputs = process_fn(fp64_outputs)

        if (
            self.args.save_model_outputs_to
            and self.args.compare_model_outputs_with
            and self.args.save_model_outputs_to
            == self.args.compare_model_outputs_with
        ):
            log.warning(
                "args.save_model_outputs_to and args.compare_model_outputs_with point to the same path. "
                "Result will be undefined."
            )

        if self.args.save_model_outputs_to:
            print(f"Save model outputs to: {self.args.save_model_outputs_to}")
            torch.save(new_result, self.args.save_model_outputs_to)

        if self.args.compare_model_outputs_with:
            print(
                f"Load model outputs from {self.args.compare_model_outputs_with} to compare"
            )
            saved_result = torch.load(self.args.compare_model_outputs_with)
            is_bitwise_same = bitwise_same(saved_result, new_result)
            if not is_bitwise_same:
                print(
                    "The result is not bitwise equivalent to the previously saved result"
                )
                return record_status(
                    "not_bitwise_equivalent", dynamo_start_stats=start_stats
                )
            print(
                "The result is bitwise equivalent to the previously saved result"
            )
            del saved_result

        if not same(
            correct_result,
            new_result,
@@ -3396,17 +3361,6 @@ def parse_args(args=None):
        help="Enables caching precompile, serializing artifacts to DynamoCache between runs",
    )
    parser.add_argument(
        "--save-model-outputs-to",
        default="",
        help="Specify the path to save model output to so we can load later for comparison",
    )
    parser.add_argument(
        "--compare-model-outputs-with",
        default="",
        help="Specify the path for the saved model outputs to compare against",
    )
    group_latency = parser.add_mutually_exclusive_group()
    group_latency.add_argument(
        "--cold-start-latency",
@@ -3686,43 +3640,6 @@ def write_csv_when_exception(args, name: str, status: str, device=None):
    write_outputs(output_filename, headers, row)


def setup_determinism_for_accuracy_test(args):
    if args.only is not None and args.only not in {
        "alexnet",
        "Background_Matting",
        "pytorch_CycleGAN_and_pix2pix",
        "pytorch_unet",
        "Super_SloMo",
        "vgg16",
        # https://github.com/pytorch/pytorch/issues/96724
        "Wav2Vec2ForCTC",
        "Wav2Vec2ForPreTraining",
        "sam",
        "sam_fast",
        "resnet50_quantized_qat",
        "mobilenet_v2_quantized_qat",
        "detectron2_maskrcnn",
        "detectron2_maskrcnn_r_101_c4",
        "detectron2_maskrcnn_r_101_fpn",
        "detectron2_maskrcnn_r_50_c4",
        "detectron2_maskrcnn_r_50_fpn",
        "detectron2_fasterrcnn_r_101_c4",
        "detectron2_fasterrcnn_r_101_dc5",
        "detectron2_fasterrcnn_r_101_fpn",
        "detectron2_fasterrcnn_r_50_c4",
        "detectron2_fasterrcnn_r_50_dc5",
        "detectron2_fasterrcnn_r_50_fpn",
    }:
        # some of the models do not support use_deterministic_algorithms
        torch.use_deterministic_algorithms(True)
        if args.devices == ["xpu"]:
            torch.use_deterministic_algorithms(True, warn_only=True)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.mkldnn.deterministic = True


def run(runner, args, original_dir=None):
    # Pass the parsed args object to benchmark runner object
    torch._dynamo.reset()
@@ -3788,9 +3705,36 @@ def run(runner, args, original_dir=None):
        # TODO - Using train mode for timm_models and HF models. Move to train mode for Torchbench as well.
        args.use_eval_mode = True
        inductor_config.fallback_random = True
        setup_determinism_for_accuracy_test(args)
        if args.only is not None and args.only not in {
            "alexnet",
            "Background_Matting",
            "pytorch_CycleGAN_and_pix2pix",
            "pytorch_unet",
            "Super_SloMo",
            "vgg16",
            # https://github.com/pytorch/pytorch/issues/96724
            "Wav2Vec2ForCTC",
            "Wav2Vec2ForPreTraining",
            "sam",
            "sam_fast",
            "resnet50_quantized_qat",
            "mobilenet_v2_quantized_qat",
            "detectron2_maskrcnn",
            "detectron2_maskrcnn_r_101_c4",
            "detectron2_maskrcnn_r_101_fpn",
            "detectron2_maskrcnn_r_50_c4",
            "detectron2_maskrcnn_r_50_fpn",
            "detectron2_fasterrcnn_r_101_c4",
            "detectron2_fasterrcnn_r_101_dc5",
            "detectron2_fasterrcnn_r_101_fpn",
            "detectron2_fasterrcnn_r_50_c4",
            "detectron2_fasterrcnn_r_50_dc5",
            "detectron2_fasterrcnn_r_50_fpn",
        }:
            # some of the models do not support use_deterministic_algorithms
            torch.use_deterministic_algorithms(True)
            if args.devices == ["xpu"]:
                torch.use_deterministic_algorithms(True, warn_only=True)

        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

        if args.only is not None and args.only in {
            "nvidia_deeprecommender",
@@ -3799,10 +3743,14 @@ def run(runner, args, original_dir=None):
            torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
            torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.allow_tf32 = False
        torch.backends.cudnn.benchmark = False
        torch.backends.cuda.matmul.allow_tf32 = False
        torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(False)
        torch.backends.mkldnn.deterministic = True

        # Remove randomness when torch manual seed is called
        patch_torch_manual_seed()
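
For context, the flow this revert removes amounted to: run the model, optionally `torch.save` its outputs, and on a later run `torch.load` the saved outputs and require exact equality. A minimal standalone sketch of that round trip (function and parameter names here are illustrative; the real script threads the paths through the two CLI flags and compares with `bitwise_same`):

```python
import torch


def check_reproducibility(
    new_result: torch.Tensor, save_to: str = "", compare_with: str = ""
) -> bool:
    """Sketch of the reverted save/compare flow."""
    if save_to:
        # First run: record this run's outputs on disk.
        torch.save(new_result, save_to)
    if compare_with:
        # Later run: load the recorded outputs and demand exact equality.
        saved = torch.load(compare_with)
        # torch.equal stands in for bitwise_same here: no tolerance at all.
        return torch.equal(saved, new_result)
    return True
```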


@@ -24,22 +24,8 @@ class DeterministicTest(TestCase):
        super().setUp()
        self._exit_stack = contextlib.ExitStack()
        self._exit_stack.enter_context(fresh_cache())
        self._exit_stack.enter_context(
            getattr(torch.backends, "__allow_nonbracketed_mutation")()  # noqa: B009
        )
        self.old_flags = [
            torch.backends.cudnn.deterministic,
            torch.backends.cudnn.benchmark,
            torch.backends.mkldnn.deterministic,
        ]

    def tearDown(self) -> None:
        (
            torch.backends.cudnn.deterministic,
            torch.backends.cudnn.benchmark,
            torch.backends.mkldnn.deterministic,
        ) = self.old_flags
        self._exit_stack.close()
        super().tearDown()
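
The restored `setUp`/`tearDown` pair is a plain save-and-restore of global backend flags around each test. A minimal sketch of the same pattern with `contextlib.ExitStack` (standalone, not the test harness itself):

```python
import contextlib

import torch

stack = contextlib.ExitStack()
# Register the restores up front; close() puts every flag back.
for obj, name in [
    (torch.backends.cudnn, "deterministic"),
    (torch.backends.cudnn, "benchmark"),
    (torch.backends.mkldnn, "deterministic"),
]:
    stack.callback(setattr, obj, name, getattr(obj, name))

torch.backends.cudnn.deterministic = True  # a "test body" mutating the flags
stack.close()  # the tearDown step: original values are restored
```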


@@ -2912,15 +2912,6 @@ def rmse(ref: torch.Tensor, res: torch.Tensor) -> torch.Tensor:
    return torch.sqrt(torch.mean(torch.square(ref - res)))


def bitwise_same(ref: Any, res: Any, equal_nan: bool = False) -> bool:
    return same(
        ref,
        res,
        tol=0.0,
        equal_nan=equal_nan,
    )


def same(
    ref: Any,
    res: Any,
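
`bitwise_same` is simply `same` with `tol=0.0`: exact equality rather than a tolerance band. The difference in one self-contained example, using `torch.allclose` and `torch.equal` as stand-ins for the tolerant and exact checks:

```python
import torch

a = torch.tensor([1.0, 2.0, 3.0])
b = a + 1e-6  # numerically negligible drift, but different bits

print(torch.allclose(a, b))  # True:  passes the tolerance-based check
print(torch.equal(a, b))     # False: fails the exact (tol=0.0) check
```

This is exactly the kind of drift the reverted benchmark flags were meant to catch: two runs that are "accurate" by the usual tolerance yet not bitwise reproducible.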


@@ -4274,6 +4274,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
    def inductor_meta_common():
        inductor_meta = {
            "backend_hash": torch.utils._triton.triton_hash_with_backend(),
            "are_deterministic_algorithms_enabled": torch.are_deterministic_algorithms_enabled(),
            "assert_indirect_indexing": config.assert_indirect_indexing,
            "autotune_local_cache": config.autotune_local_cache,
            "autotune_pointwise": config.triton.autotune_pointwise,
@@ -4287,12 +4288,6 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
            "store_cubin": config.triton.store_cubin,
            "deterministic": config.deterministic,
        }
        if config.write_are_deterministic_algorithms_enabled:
            inductor_meta["are_deterministic_algorithms_enabled"] = (
                torch.are_deterministic_algorithms_enabled()
            )
        if torch.version.hip is not None:
            inductor_meta["is_hip"] = True
        if config.is_fbcode():
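
With the gating removed, `inductor_meta` records the global determinism toggle unconditionally again. The value it stores is just the result of a core torch query:

```python
import torch

torch.use_deterministic_algorithms(True)
# This is the value written into inductor_meta["are_deterministic_algorithms_enabled"]:
print(torch.are_deterministic_algorithms_enabled())  # True
```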


@@ -2447,11 +2447,6 @@ def compile_fx(
        ignore_shape_env=ignore_shape_env,
    )

    if config.deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.backends.mkldnn.deterministic = True  # type: ignore[assignment]

    # Wake up the AsyncCompile subproc pool as early as possible (if there's cuda).
    if any(
        isinstance(e, torch.Tensor) and e.device.type in ("cuda", "xpu")
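
The deleted block made Inductor's `config.deterministic` also flip the cuDNN/oneDNN deterministic flags during `compile_fx`. After the revert, a caller who wants both behaviors sets them by hand; a minimal sketch:

```python
import torch
import torch._inductor.config as inductor_config

inductor_config.deterministic = True  # Inductor-side determinism mode
# What the removed compile_fx block used to set implicitly:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.mkldnn.deterministic = True
```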


@@ -2004,10 +2004,6 @@ _cache_config_ignore_prefix: list[str] = [
# External callable for matmul tuning candidates
external_matmul: list[Callable[[torch.Tensor, torch.Tensor, torch.Tensor], None]] = []

write_are_deterministic_algorithms_enabled = (
    os.getenv("TORCHINDUCTOR_WRITE_ARE_DETERMINISTIC_ALGORITHMS_ENABLED", "1") == "1"
)


class test_configs:
    force_extern_kernel_in_multi_template: bool = False
@@ -2053,14 +2049,6 @@ class test_configs:
        os.getenv("TORCHINDUCTOR_FORCE_FILTER_REDUCTION_CONFIGS") == "1"
    )

    # a testing config to distort benchmarking results
    # - empty string to disable
    # - "inverse" to invert the numbers
    # - "random" to return a random value
    distort_benchmarking_result = os.getenv(
        "TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT", ""
    )


if TYPE_CHECKING:
    from torch.utils._config_typing import *  # noqa: F401, F403
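
Both removed knobs follow the usual Inductor-config pattern: an environment variable read once, at module import. In a checkout that still has them (i.e., before this revert), they must therefore be set before the config module is first imported:

```python
import os

# Must run before the first import of torch._inductor.config,
# since the module evaluates os.getenv at import time.
os.environ["TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT"] = "inverse"

import torch._inductor.config as inductor_config

print(inductor_config.test_configs.distort_benchmarking_result)  # "inverse"
```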


@@ -1,4 +1,3 @@
import functools
import inspect
import time
from functools import cached_property, wraps
@@ -24,40 +23,6 @@ P = ParamSpec("P")
T = TypeVar("T")


def may_distort_benchmarking_result(fn: Callable[..., Any]) -> Callable[..., Any]:
    from torch._inductor import config

    if config.test_configs.distort_benchmarking_result == "":
        return fn

    def distort(
        ms: Union[list[float], tuple[float], float],
    ) -> Union[list[float], tuple[float], float]:
        if isinstance(ms, (list, tuple)):
            return type(ms)(distort(val) for val in ms)  # type: ignore[misc]

        distort_method = config.test_configs.distort_benchmarking_result
        assert isinstance(ms, float)
        if distort_method == "inverse":
            return 1.0 / ms if ms else 0.0
        elif distort_method == "random":
            import random

            return random.random()
        else:
            raise RuntimeError(f"Unrecognized distort method {distort_method}")

    @functools.wraps(fn)
    def wrapper(
        *args: list[Any], **kwargs: dict[str, Any]
    ) -> Union[list[float], tuple[float], float]:
        ms = fn(*args, **kwargs)
        return distort(ms)

    return wrapper


def may_ban_benchmarking() -> None:
    if torch._inductor.config.deterministic:
        raise RuntimeError("""In the deterministic mode of Inductor, we will avoid those
@@ -194,7 +159,6 @@ class TritonBenchmarker(Benchmarker):
            raise NotImplementedError("requires Triton") from e
        return do_bench

    @may_distort_benchmarking_result
    @time_and_count
    def benchmark_gpu(
        self: Self,
@@ -263,7 +227,6 @@ class InductorBenchmarker(TritonBenchmarker):  # noqa: docstring_linter
        ]
    )

    @may_distort_benchmarking_result
    @time_and_count
    def benchmark_gpu(  # type: ignore[override]
        self: Self,
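
The deleted `may_distort_benchmarking_result` is a decorator that rewrites a benchmark's float result, recursing through lists and tuples so the return shape is preserved. A generic, self-contained sketch of that pattern (illustrative names, not PyTorch API):

```python
import functools
from typing import Any, Callable, Union

Ms = Union[float, list, tuple]


def postprocess_result(transform: Callable[[float], float]) -> Callable:
    """Apply `transform` to a float return value, preserving nesting."""

    def deco(fn: Callable[..., Ms]) -> Callable[..., Ms]:
        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Ms:
            def walk(ms: Ms) -> Ms:
                if isinstance(ms, (list, tuple)):
                    return type(ms)(walk(v) for v in ms)
                return transform(ms)

            return walk(fn(*args, **kwargs))

        return wrapper

    return deco


@postprocess_result(lambda ms: 1.0 / ms if ms else 0.0)  # the "inverse" distortion
def fake_benchmark() -> list:
    return [2.0, 4.0]


print(fake_benchmark())  # [0.5, 0.25]
```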


@@ -3020,7 +3020,6 @@ def reduction(
    configs = _maybe_filter_configs_for_tma_restrictions(inductor_meta, configs)
    configs = filter_reduction_configs_for_determinism(inductor_meta, configs)
    return cached_autotune(
        size_hints,
        configs=configs,


@@ -72,7 +72,6 @@ OPTIMUS_EXCLUDE_POST_GRAD = [
    "inductor_autotune_lookup_table",
]

from torch._inductor.runtime.benchmarking import may_distort_benchmarking_result
from torch.fx.experimental.symbolic_shapes import (
    free_symbols,
    free_unbacked_symbols,
@@ -272,7 +271,6 @@ def fp8_bench(fn: Callable[[], Any], warmup: int = 25, rep: int = 100) -> float:
    return res


@may_distort_benchmarking_result
def do_bench_using_profiling(
    fn: Callable[[], Any],
    warmup: int = 25,