[ghstack-poisoned]
This commit is contained in:
Pearu Peterson
2025-09-29 16:51:55 +03:00
8 changed files with 296 additions and 261 deletions

View File

@ -1,6 +1,10 @@
---
name: "⚠️ CI SEV"
about: Tracking incidents for PyTorch's CI infra.
title: ''
labels: ''
assignees: ''
---
> NOTE: Remember to label this issue with "`ci: sev`"

View File

@ -0,0 +1,18 @@
---
name: DISABLE AUTOREVERT
about: Disables autorevert when open
title: "❌​\U0001F519 [DISABLE AUTOREVERT]"
labels: 'ci: disable-autorevert'
assignees: ''
---
This issue, while open, disables the autorevert functionality.
More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md)
## Why are you disabling autorevert?
## Links to any issues/commits/errors that show the source of the problem

View File

@ -1,8 +1,10 @@
---
name: Disable CI jobs (PyTorch Dev Infra only)
about: Use this template to disable CI jobs
title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
labels: "module: ci"
title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]
labels: 'module: ci'
assignees: ''
---
> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once

View File

@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
0fc62aa26a30ed7ca419d285f285cb5ba02c4394

View File

@ -508,6 +508,7 @@ if not IS_WINDOWS:
self.assertNotEqual(result.data_ptr(), expected.data_ptr())
self.assertEqual(result.stride(), expected.stride())
@skipIfTorchDynamo("testing C++ accessor")
def test_my_element_wise_clone(self, device):
# tests tensor accessor
import libtorch_agnostic

View File

@ -1,249 +1,243 @@
{
"EndToEndLSTM (__main__.RNNTest)": 197.77900187174478,
"MultiheadAttention (__main__.ModulesTest)": 137.42000325520834,
"test_AllenaiLongformerBase_repro_cpu_halide (__main__.HalideCpuTests)": 214.1816660563151,
"test__adaptive_avg_pool2d (__main__.CPUReproTests)": 91.37688869900174,
"test_adaptive_max_pool2d1_cpu_halide (__main__.HalideCpuTests)": 116.57933298746745,
"test_after_aot_cpu_runtime_error (__main__.MinifierIsolateTests)": 66.92922253078885,
"test_after_aot_gpu_runtime_error (__main__.MinifierIsolateTests)": 65.68500010172527,
"test_alexnet_prefix_cpu_halide (__main__.HalideCpuTests)": 177.91966756184897,
"test_aot_autograd_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 87.69499969482422,
"test_aot_autograd_symbolic_exhaustive_linalg_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 74.02233378092448,
"test_aot_autograd_symbolic_exhaustive_masked_norm_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.45699946085612,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool1d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 136.27599589029947,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 259.30466715494794,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool3d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 135.36400095621744,
"test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 61.07166544596354,
"test_aot_autograd_symbolic_exhaustive_ormqr_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 64.8491905757359,
"test_aot_autograd_symbolic_exhaustive_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 90.34733327229817,
"test_aot_autograd_symbolic_module_exhaustive_nn_TransformerDecoderLayer_cpu_float32 (__main__.TestEagerFusionModuleInfoCPU)": 140.09266916910806,
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_False_cpu (__main__.AssociativeScanTests)": 65.17999935150146,
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_True_cpu (__main__.AssociativeScanTests)": 73.75112533569336,
"test_avg_pool3d_backward2_cpu (__main__.CpuTests)": 646.9324035644531,
"test_avg_pool3d_backward2_cuda (__main__.GPUTests)": 142.86450004577637,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 493.49299791124133,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 498.72944810655383,
"test_avg_pool3d_backward2_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 133.2033322652181,
"test_avg_pool3d_backward_cpu_halide (__main__.HalideCpuTests)": 61.788333892822266,
"test_backward_nn_functional_multi_head_attention_forward_cpu_float32 (__main__.TestCompositeComplianceCPU)": 69.57333119710286,
"test_backward_nn_functional_multi_head_attention_forward_cuda_float32 (__main__.TestCompositeComplianceCUDA)": 81.06516774495442,
"test_basic_cpu (__main__.EfficientConvBNEvalCpuTests)": 215.5933346218533,
"test_basic_cuda (__main__.EfficientConvBNEvalGpuTests)": 135.41816584269205,
"test_checkpointing_without_reentrant_input_requires_grad_False (__main__.TestAutogradWithCompiledAutograd)": 338.17533026801215,
"test_checkpointing_without_reentrant_input_requires_grad_True (__main__.TestAutogradWithCompiledAutograd)": 423.4767761230469,
"test_collect_callgrind (__main__.TestBenchmarkUtils)": 325.6485578748915,
"test_comprehensive_diff_cuda_complex128 (__main__.TestDecompCUDA)": 111.10633341471355,
"test_comprehensive_diff_cuda_complex64 (__main__.TestDecompCUDA)": 104.33766555786133,
"test_comprehensive_diff_cuda_float32 (__main__.TestDecompCUDA)": 69.72683334350586,
"test_comprehensive_diff_cuda_float64 (__main__.TestDecompCUDA)": 71.48199971516927,
"test_comprehensive_grid_sampler_2d_cpu_bfloat16 (__main__.TestDecompCPU)": 96.58033243815105,
"test_comprehensive_grid_sampler_2d_cpu_float16 (__main__.TestDecompCPU)": 96.65433247884114,
"test_comprehensive_grid_sampler_2d_cpu_float32 (__main__.TestDecompCPU)": 464.92467244466144,
"test_comprehensive_grid_sampler_2d_cpu_float64 (__main__.TestDecompCPU)": 460.3839925130208,
"test_comprehensive_grid_sampler_2d_cuda_bfloat16 (__main__.TestDecompCUDA)": 263.58483632405597,
"test_comprehensive_grid_sampler_2d_cuda_float16 (__main__.TestDecompCUDA)": 298.0318349202474,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestDecompCUDA)": 1310.3350016276042,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 66.3976656595866,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestDecompCUDA)": 1316.084981282552,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 69.58183288574219,
"test_comprehensive_linalg_lu_solve_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.05749893188477,
"test_comprehensive_linalg_lu_solve_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 72.31333287556966,
"test_comprehensive_linalg_solve_triangular_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 74.53133392333984,
"test_comprehensive_linalg_solve_triangular_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 82.40500005086263,
"test_comprehensive_linalg_svd_cuda_complex128 (__main__.TestDecompCUDA)": 69.91749890645345,
"test_comprehensive_linalg_svd_cuda_complex64 (__main__.TestDecompCUDA)": 70.98916562398274,
"test_comprehensive_masked_norm_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 126.90333302815755,
"test_comprehensive_masked_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 112.40283330281575,
"test_comprehensive_masked_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 114.09550094604492,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex128 (__main__.TestDecompCUDA)": 63.223000049591064,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex64 (__main__.TestDecompCUDA)": 67.44083213806152,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDecompCPU)": 62.70066706339518,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float64 (__main__.TestDecompCPU)": 60.468666076660156,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestDecompCUDA)": 116.34999974568684,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestDecompCUDA)": 116.57566579182942,
"test_comprehensive_nn_functional_grid_sample_cpu_float32 (__main__.TestDecompCPU)": 115.4306640625,
"test_comprehensive_nn_functional_grid_sample_cpu_float64 (__main__.TestDecompCPU)": 114.67599741617839,
"test_comprehensive_nn_functional_grid_sample_cuda_bfloat16 (__main__.TestDecompCUDA)": 78.96566772460938,
"test_comprehensive_nn_functional_grid_sample_cuda_float16 (__main__.TestDecompCUDA)": 60.72616704305013,
"test_comprehensive_nn_functional_grid_sample_cuda_float32 (__main__.TestDecompCUDA)": 270.3598327636719,
"test_comprehensive_nn_functional_grid_sample_cuda_float64 (__main__.TestDecompCUDA)": 260.6623306274414,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestDecompCUDA)": 88.48316701253255,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.13166681925456,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestDecompCUDA)": 83.55450057983398,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 80.67749913533528,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestDecompCUDA)": 136.17766698201498,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestDecompCUDA)": 157.4010009765625,
"test_comprehensive_nn_functional_max_pool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 1222.983662923177,
"test_comprehensive_nn_functional_max_pool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 1228.281494140625,
"test_comprehensive_nn_functional_max_pool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 1216.2643432617188,
"test_comprehensive_nn_functional_max_pool3d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 503.51465861002606,
"test_comprehensive_nn_functional_max_pool3d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 523.0736694335938,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 68.91749954223633,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 61.947166442871094,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 63.17983309427897,
"test_comprehensive_nn_functional_unfold_cuda_complex128 (__main__.TestDecompCUDA)": 77.92383321126302,
"test_comprehensive_nn_functional_unfold_cuda_complex64 (__main__.TestDecompCUDA)": 69.46137571334839,
"test_comprehensive_ormqr_cpu_complex64 (__main__.TestDecompCPU)": 62.2076670328776,
"test_comprehensive_ormqr_cuda_complex128 (__main__.TestDecompCUDA)": 139.3495012919108,
"test_comprehensive_ormqr_cuda_complex64 (__main__.TestDecompCUDA)": 124.99983469645183,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestDecompCUDA)": 73.96983273824056,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 73.27383422851562,
"test_comprehensive_ormqr_cuda_float64 (__main__.TestDecompCUDA)": 80.94216791788737,
"test_comprehensive_svd_cuda_complex128 (__main__.TestDecompCUDA)": 73.65583419799805,
"test_comprehensive_svd_cuda_complex64 (__main__.TestDecompCUDA)": 74.30566660563152,
"test_constructor_autograd_SparseBSC_cuda (__main__.TestSparseAnyCUDA)": 112.75583267211914,
"test_constructor_autograd_SparseBSR_cuda (__main__.TestSparseAnyCUDA)": 106.72283299763997,
"test_constructor_autograd_SparseCSC_cuda (__main__.TestSparseAnyCUDA)": 102.85349909464519,
"test_constructor_autograd_SparseCSR_cuda (__main__.TestSparseAnyCUDA)": 73.14683278401692,
"test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass)": 137.8197758992513,
"test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass)": 437.60955386691626,
"test_conv2d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 75.4076665242513,
"test_conv2d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 62.40233357747396,
"test_conv3d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 149.36666870117188,
"test_conv3d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 72.90299987792969,
"test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 81.56499862670898,
"test_conv_unary_fusion_nnc (__main__.TestMkldnnFusion)": 75.13744566175673,
"test_correctness_AdamW_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 82.20433298746745,
"test_correctness_Adam_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 76.78600056966145,
"test_count_nonzero_all (__main__.TestBool)": 655.6186726888021,
"test_cpu_gpu_parity_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 80.43400009940652,
"test_custom_module_lstm (__main__.TestQuantizedOps)": 798.5362040201823,
"test_ddp_uneven_inputs (__main__.TestDistBackendWithSpawn)": 360.75275349617004,
"test_diff_hyperparams_sharding_strategy_str_no_shard (__main__.TestFSDPUseOrigParamsMultipleParamGroups)": 60.4433339436849,
"test_dispatch_symbolic_meta_outplace_all_strides_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestMetaCUDA)": 85.3961664835612,
"test_dtensor_op_db_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDTensorOpsCPU)": 93.10799916585286,
"test_eig_check_magma_cuda_float32 (__main__.TestLinalgCUDA)": 215.1919957002004,
"test_error_detection_and_propagation (__main__.NcclErrorHandlingTest)": 67.04866790771484,
"test_fail_arithmetic_ops.py (__main__.TestTyping)": 64.6271112230089,
"test_fail_creation_ops.py (__main__.TestTyping)": 71.04431086573108,
"test_fn_fwgrad_bwgrad_cumprod_cuda_complex128 (__main__.TestFwdGradientsCUDA)": 88.46849950154622,
"test_fn_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 107.12216822306316,
"test_fuse_large_params_cpu (__main__.CpuTests)": 80.30040054321289,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 162.87633260091147,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 160.84833441840277,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesCodegenGPUTests)": 153.62799580891928,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 120.26516850789388,
"test_grad_nn_Transformer_cpu_float64 (__main__.TestModuleCPU)": 62.87366739908854,
"test_grad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 104.12133407592773,
"test_gradgrad_nn_LSTM_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 117.95999908447266,
"test_gradgrad_nn_LSTM_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 113.97000122070312,
"test_gradgrad_nn_TransformerDecoderLayer_cuda_float64 (__main__.TestModuleCUDA)": 248.1183293660482,
"test_gradgrad_nn_TransformerEncoder_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 180.4351666768392,
"test_gradgrad_nn_TransformerEncoder_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 160.81400299072266,
"test_gradgrad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 694.055165608724,
"test_grid_sampler_2d_cpu_halide (__main__.HalideCpuTests)": 194.28900146484375,
"test_group_norm (__main__.TestQuantizedOps)": 207.3484410179986,
"test_indirect_device_assert (__main__.TritonCodeGenTests)": 329.52866617838544,
"test_inductor_no_recursionerror_on_for_loops_dynamic_shapes (__main__.DynamicShapesReproTests)": 67.15944459703233,
"test_inplace_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 84.40099970499675,
"test_inputs_overlapping_with_mutation_stress_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 132.7371097140842,
"test_jit_cuda_archflags (__main__.TestCppExtensionJIT)": 118.91166687011719,
"test_linalg_solve_triangular_large_cuda_complex128 (__main__.TestLinalgCUDA)": 130.4806671142578,
"test_linalg_solve_triangular_large_cuda_complex64 (__main__.TestLinalgCUDA)": 101.25733184814453,
"test_linear (__main__.TestStaticQuantizedModule)": 131.34678183661566,
"test_linear_binary_cpp_wrapper (__main__.TestCppWrapper)": 124.32133229573567,
"test_linear_binary_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 126.89633433024089,
"test_linear_relu (__main__.TestStaticQuantizedModule)": 128.11266708374023,
"test_lobpcg_ortho_cuda_float64 (__main__.TestLinalgCUDA)": 75.69916741053264,
"test_longformer_chunk_dynamic_shapes (__main__.DynamicShapesReproTests)": 106.60366736518012,
"test_lstm_cpu (__main__.TestMkldnnCPU)": 66.15800094604492,
"test_many_overlapping_inputs_does_not_explode_guards_dynamic_shapes (__main__.DynamicShapesReproTests)": 130.17633226182727,
"test_max_autotune_addmm_max_autotune_gemm_backends_CK_x_shape2 (__main__.TestCKBackend)": 60.61724901199341,
"test_max_autotune_addmm_search_space_EXHAUSTIVE_dynamic_True (__main__.TestMaxAutotuneSubproc)": 82.76533508300781,
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_False_use_aoti_False (__main__.TestCKBackend)": 84.80249977111816,
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_True_use_aoti_False (__main__.TestCKBackend)": 82.48874931409955,
"test_max_pool2d2_cpu_halide (__main__.HalideCpuTests)": 421.6166585286458,
"test_max_pool2d3_cpu_halide (__main__.HalideCpuTests)": 133.6796671549479,
"test_max_pool2d5_cpu_halide (__main__.HalideCpuTests)": 357.6593322753906,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 63.8608890109592,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 64.60900031195746,
"test_proper_exit (__main__.TestDataLoader)": 223.7907740275065,
"test_proper_exit (__main__.TestDataLoaderPersistentWorkers)": 213.6155548095703,
"test_qat_conv2d_unary (__main__.TestQuantizePT2EX86Inductor)": 168.48199971516928,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn1d)": 68.48926869834342,
"test_qat_conv_bn_fusion_no_conv_bias (__main__.TestQuantizePT2EQAT_ConvBn2d)": 68.39782928838963,
"test_qat_mobilenet_v2 (__main__.TestQuantizePT2EQATModels)": 99.70321994357639,
"test_qat_resnet18 (__main__.TestQuantizePT2EQATModels)": 61.103378822063576,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 99.00533294677734,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.10599772135417,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True (__main__.TestPatternMatcher)": 75.0443344116211,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 91.9883321126302,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.07866668701172,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 68.79566701253255,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 90.1106669108073,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 88.92966969807942,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True (__main__.TestPatternMatcher)": 75.10766855875652,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 103.41666666666667,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 96.1106669108073,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False (__main__.TestPatternMatcher)": 77.91766866048177,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 92.16766611735027,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.9856669108073,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 93.22266642252605,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.57533264160156,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 70.04799906412761,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 90.56433359781902,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 92.017333984375,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 94.46166737874348,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.06233215332031,
"test_qrnncell (__main__.TestDynamicQuantizedOps)": 204.8830050362481,
"test_quick_core_backward__unsafe_masked_index_cpu_float64 (__main__.TestDecompCPU)": 584.1243489583334,
"test_quick_core_backward__unsafe_masked_index_cuda_float64 (__main__.TestDecompCUDA)": 1194.274678548177,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cpu_float64 (__main__.TestDecompCPU)": 842.1573282877604,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cuda_float64 (__main__.TestDecompCUDA)": 1500.2438354492188,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cpu_float64 (__main__.TestDecompCPU)": 80.01266479492188,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cuda_float64 (__main__.TestDecompCUDA)": 304.8406728108724,
"test_quick_core_backward_roll_cpu_float64 (__main__.TestDecompCPU)": 123.26833089192708,
"test_quick_core_backward_roll_cuda_float64 (__main__.TestDecompCUDA)": 289.4941685994466,
"test_quick_core_backward_select_scatter_cpu_float64 (__main__.TestDecompCPU)": 78.4913330078125,
"test_quick_core_backward_select_scatter_cuda_float64 (__main__.TestDecompCUDA)": 160.19433085123697,
"test_quick_core_backward_split_cuda_float64 (__main__.TestDecompCUDA)": 76.93316650390625,
"test_quick_core_backward_split_with_sizes_copy_cpu_float64 (__main__.TestDecompCPU)": 95.25599924723308,
"test_quick_core_backward_split_with_sizes_copy_cuda_float64 (__main__.TestDecompCUDA)": 190.9510014851888,
"test_quick_core_backward_std_cuda_float64 (__main__.TestDecompCUDA)": 115.96716562906902,
"test_register_spills_cuda (__main__.BenchmarkFusionCudaTest)": 85.82816696166992,
"test_replicatepad_64bit_indexing_cuda_float16 (__main__.TestNNDeviceTypeCUDA)": 64.81233215332031,
"test_runtime_checks_large_cpu (__main__.AOTInductorTestABICompatibleCpu)": 73.0594991048177,
"test_runtime_checks_large_cpu_with_stack_allocation (__main__.AOTInductorTestABICompatibleCpuWithStackAllocation)": 78.28866704305013,
"test_runtime_checks_large_cuda (__main__.AOTInductorTestABICompatibleGpu)": 203.66749827067056,
"test_save_load_large_string_attribute (__main__.TestSaveLoad)": 118.92166392008464,
"test_sdpa_kernel_ctx_manager2_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": 161.21966722276477,
"test_shuffler_iterdatapipe (__main__.IntegrationTestDataLoaderDataPipe)": 119.33677842881944,
"test_slow_tasks (__main__.TestFunctionalAutogradBenchmark)": 122.50711229112413,
"test_sort_stable_cpu (__main__.CpuTritonTests)": 77.22933451334636,
"test_split_cumsum_cpu (__main__.CpuTritonTests)": 89.92000071207683,
"test_std (__main__.TestQuantizedOps)": 118.49511219395532,
"test_svd_lowrank_cuda_complex128 (__main__.TestLinalgCUDA)": 149.61699732144675,
"test_tensor_split (__main__.TestVmapOperators)": 83.01314294423376,
"test_terminate_handler_on_crash (__main__.TestTorch)": 111.18021970325046,
"test_terminate_signal (__main__.ForkTest)": 131.81088901807865,
"test_terminate_signal (__main__.ParallelForkServerShouldWorkTest)": 131.90911058253712,
"test_terminate_signal (__main__.SpawnTest)": 135.51344219843546,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 71.71866671244304,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 69.4015007019043,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 75.85683250427246,
"test_triton_bsr_softmax_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 144.25,
"test_triton_bsr_softmax_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 142.70416514078775,
"test_triton_bsr_softmax_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 105.90866597493489,
"test_unary_ops (__main__.TestTEFuserDynamic)": 83.01277730200026,
"test_unary_ops (__main__.TestTEFuserStatic)": 84.06699878639645,
"test_upsample_bicubic2d_cpu_halide (__main__.HalideCpuTests)": 97.28433227539062,
"test_variant_consistency_jit_nn_functional_max_pool2d_cpu_float32 (__main__.TestJitCPU)": 96.625,
"test_variant_consistency_jit_nn_functional_max_pool2d_cuda_float32 (__main__.TestJitCUDA)": 78.01066716512044,
"test_views1_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 82.23649978637695,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cpu_float32 (__main__.TestOperatorsCPU)": 100.44966379801433,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestOperatorsCUDA)": 78.67900085449219,
"test_vmapjvpvjp_linalg_lu_solve_cpu_float32 (__main__.TestOperatorsCPU)": 75.2140007019043,
"test_vmapjvpvjp_linalg_lu_solve_cuda_float32 (__main__.TestOperatorsCUDA)": 100.80166753133138,
"test_vmapjvpvjp_linalg_multi_dot_cuda_float32 (__main__.TestOperatorsCUDA)": 96.56916745503743,
"test_vmapjvpvjp_linalg_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 99.54433314005534,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cpu_float32 (__main__.TestOperatorsCPU)": 69.86966705322266,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cuda_float32 (__main__.TestOperatorsCUDA)": 103.45650100708008,
"test_vmapjvpvjp_nn_functional_conv2d_cpu_float32 (__main__.TestOperatorsCPU)": 69.28766759236653,
"test_vmapjvpvjp_nn_functional_max_pool2d_cpu_float32 (__main__.TestOperatorsCPU)": 70.02966690063477,
"test_vmapjvpvjp_nn_functional_max_pool2d_cuda_float32 (__main__.TestOperatorsCUDA)": 100.93566703796387,
"test_vmapjvpvjp_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 94.60433260599773,
"test_vmapjvpvjp_unbind_cuda_float32 (__main__.TestOperatorsCUDA)": 98.65516599019368,
"test_vmapvjpvjp_meshgrid_list_of_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 105.35816828409831,
"test_vmapvjpvjp_meshgrid_variadic_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 74.68983332316081,
"test_vmapvjpvjp_nn_functional_bilinear_cuda_float32 (__main__.TestOperatorsCUDA)": 152.76449966430664
"EndToEndLSTM (__main__.RNNTest)": 193.9463348388672,
"MultiheadAttention (__main__.ModulesTest)": 139.1413319905599,
"test_RNN_cpu_vs_cudnn_no_dropout (__main__.TestNN)": 61.32188834084405,
"test_RNN_cpu_vs_cudnn_with_dropout (__main__.TestNN)": 61.86122176382277,
"test__adaptive_avg_pool2d (__main__.CPUReproTests)": 95.82633293999567,
"test_after_aot_cpu_runtime_error (__main__.MinifierIsolateTests)": 67.38811153835721,
"test_after_aot_gpu_runtime_error (__main__.MinifierIsolateTests)": 65.14966583251953,
"test_aot_autograd_symbolic_exhaustive_linalg_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 68.78299713134766,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool1d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 128.60199737548828,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool2d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 202.05332946777344,
"test_aot_autograd_symbolic_exhaustive_nn_functional_max_pool3d_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 130.45366923014322,
"test_aot_autograd_symbolic_exhaustive_nn_functional_unfold_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 62.74999872843424,
"test_aot_autograd_symbolic_exhaustive_ormqr_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 62.966331481933594,
"test_aot_autograd_symbolic_exhaustive_svd_cpu_float32 (__main__.TestEagerFusionOpInfoCPU)": 100.54332987467448,
"test_aot_autograd_symbolic_module_exhaustive_nn_TransformerDecoderLayer_cpu_float32 (__main__.TestEagerFusionModuleInfoCPU)": 139.6759999593099,
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_False_cpu (__main__.AssociativeScanTests)": 84.69166692097981,
"test_associative_scan_partial_grad_combine_mode_generic_compile_mode_compile_dynamic_shape_reverse_True_cpu (__main__.AssociativeScanTests)": 99.57083257039388,
"test_avg_pool3d_backward2_cpu (__main__.CpuTests)": 1071.9003397623699,
"test_avg_pool3d_backward2_cuda (__main__.GPUTests)": 132.51277838812933,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 490.90610758463544,
"test_avg_pool3d_backward2_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 499.2733323838976,
"test_avg_pool3d_backward2_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 144.38800048828125,
"test_backward_nn_functional_multi_head_attention_forward_cpu_float32 (__main__.TestCompositeComplianceCPU)": 84.0356674194336,
"test_backward_nn_functional_multi_head_attention_forward_cuda_float32 (__main__.TestCompositeComplianceCUDA)": 70.09966659545898,
"test_basic_cpu (__main__.EfficientConvBNEvalCpuTests)": 237.6084442138672,
"test_basic_cuda (__main__.EfficientConvBNEvalGpuTests)": 125.26549911499023,
"test_binary (__main__.StartProcessesListAsBinaryTest)": 1000.1651713053385,
"test_cat_2k_args (__main__.TestTEFuserDynamic)": 103.55858390861087,
"test_cat_2k_args (__main__.TestTEFuserStatic)": 105.87935954456528,
"test_checkpointing_without_reentrant_input_requires_grad_False (__main__.TestAutogradWithCompiledAutograd)": 325.1778869628906,
"test_checkpointing_without_reentrant_input_requires_grad_True (__main__.TestAutogradWithCompiledAutograd)": 413.15588717990454,
"test_collect_callgrind (__main__.TestBenchmarkUtils)": 308.7801106770833,
"test_comprehensive_diff_cuda_complex128 (__main__.TestDecompCUDA)": 111.68166732788086,
"test_comprehensive_diff_cuda_complex64 (__main__.TestDecompCUDA)": 111.85049947102864,
"test_comprehensive_diff_cuda_float32 (__main__.TestDecompCUDA)": 77.57400004069011,
"test_comprehensive_diff_cuda_float64 (__main__.TestDecompCUDA)": 70.30566787719727,
"test_comprehensive_grid_sampler_2d_cpu_bfloat16 (__main__.TestDecompCPU)": 109.21900177001953,
"test_comprehensive_grid_sampler_2d_cpu_float16 (__main__.TestDecompCPU)": 110.39766693115234,
"test_comprehensive_grid_sampler_2d_cpu_float32 (__main__.TestDecompCPU)": 468.9913330078125,
"test_comprehensive_grid_sampler_2d_cpu_float64 (__main__.TestDecompCPU)": 465.79933675130206,
"test_comprehensive_grid_sampler_2d_cuda_bfloat16 (__main__.TestDecompCUDA)": 328.7638422648112,
"test_comprehensive_grid_sampler_2d_cuda_float16 (__main__.TestDecompCUDA)": 273.0831705729167,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestDecompCUDA)": 1329.9698486328125,
"test_comprehensive_grid_sampler_2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 65.87999979654948,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestDecompCUDA)": 1412.2426350911458,
"test_comprehensive_grid_sampler_2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 68.08216667175293,
"test_comprehensive_linalg_lu_solve_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 78.76099904378255,
"test_comprehensive_linalg_lu_solve_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 76.54166666666667,
"test_comprehensive_linalg_solve_triangular_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 79.53366724650066,
"test_comprehensive_linalg_solve_triangular_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 75.6653340657552,
"test_comprehensive_linalg_svd_cuda_complex128 (__main__.TestDecompCUDA)": 73.4104995727539,
"test_comprehensive_linalg_svd_cuda_complex64 (__main__.TestDecompCUDA)": 72.91466649373372,
"test_comprehensive_linalg_vector_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 79.73185811723981,
"test_comprehensive_linalg_vector_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 81.20195234389533,
"test_comprehensive_masked_norm_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 114.71116765340169,
"test_comprehensive_masked_norm_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 124.87133534749348,
"test_comprehensive_masked_norm_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 129.04366556803384,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex128 (__main__.TestDecompCUDA)": 68.48949940999348,
"test_comprehensive_nn_functional_conv_transpose3d_cuda_complex64 (__main__.TestDecompCUDA)": 67.9880002339681,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float32 (__main__.TestDecompCPU)": 73.09500122070312,
"test_comprehensive_nn_functional_gaussian_nll_loss_cpu_float64 (__main__.TestDecompCPU)": 61.695664723714195,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestDecompCUDA)": 128.5021654764811,
"test_comprehensive_nn_functional_gaussian_nll_loss_cuda_float64 (__main__.TestDecompCUDA)": 127.40399932861328,
"test_comprehensive_nn_functional_grid_sample_cpu_float32 (__main__.TestDecompCPU)": 127.78599802652995,
"test_comprehensive_nn_functional_grid_sample_cpu_float64 (__main__.TestDecompCPU)": 109.68999989827473,
"test_comprehensive_nn_functional_grid_sample_cuda_bfloat16 (__main__.TestDecompCUDA)": 71.56066703796387,
"test_comprehensive_nn_functional_grid_sample_cuda_float16 (__main__.TestDecompCUDA)": 67.20216687520345,
"test_comprehensive_nn_functional_grid_sample_cuda_float32 (__main__.TestDecompCUDA)": 315.4568354288737,
"test_comprehensive_nn_functional_grid_sample_cuda_float64 (__main__.TestDecompCUDA)": 281.15333048502606,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestDecompCUDA)": 96.95050048828125,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 79.56900151570638,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestDecompCUDA)": 92.87216695149739,
"test_comprehensive_nn_functional_interpolate_bicubic_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 83.13700103759766,
"test_comprehensive_nn_functional_interpolate_trilinear_cpu_float32 (__main__.TestDecompCPU)": 60.85660006205241,
"test_comprehensive_nn_functional_interpolate_trilinear_cpu_float64 (__main__.TestDecompCPU)": 63.24093297322591,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float32 (__main__.TestDecompCUDA)": 140.31299845377603,
"test_comprehensive_nn_functional_interpolate_trilinear_cuda_float64 (__main__.TestDecompCUDA)": 147.38216400146484,
"test_comprehensive_nn_functional_max_pool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 1217.5511678059895,
"test_comprehensive_nn_functional_max_pool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 1214.899658203125,
"test_comprehensive_nn_functional_max_pool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 1275.4351806640625,
"test_comprehensive_nn_functional_max_pool3d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 503.7261606852214,
"test_comprehensive_nn_functional_max_pool3d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 528.8081563313802,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float16 (__main__.TestInductorOpInfoCUDA)": 70.5486666361491,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 62.99533335367838,
"test_comprehensive_nn_functional_max_unpool2d_cuda_float64 (__main__.TestInductorOpInfoCUDA)": 62.22033246358236,
"test_comprehensive_nn_functional_unfold_cuda_complex128 (__main__.TestDecompCUDA)": 66.1475003560384,
"test_comprehensive_nn_functional_unfold_cuda_complex64 (__main__.TestDecompCUDA)": 77.28883298238118,
"test_comprehensive_ormqr_cpu_complex64 (__main__.TestDecompCPU)": 61.14533233642578,
"test_comprehensive_ormqr_cuda_complex128 (__main__.TestDecompCUDA)": 137.0048370361328,
"test_comprehensive_ormqr_cuda_complex64 (__main__.TestDecompCUDA)": 135.70533243815103,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestDecompCUDA)": 70.40099906921387,
"test_comprehensive_ormqr_cuda_float32 (__main__.TestInductorOpInfoCUDA)": 76.96783447265625,
"test_comprehensive_ormqr_cuda_float64 (__main__.TestDecompCUDA)": 75.45516713460286,
"test_comprehensive_svd_cuda_complex128 (__main__.TestDecompCUDA)": 73.21916580200195,
"test_comprehensive_svd_cuda_complex64 (__main__.TestDecompCUDA)": 78.95233408610027,
"test_constructor_autograd_SparseBSC_cuda (__main__.TestSparseAnyCUDA)": 97.49600092569987,
"test_constructor_autograd_SparseBSR_cuda (__main__.TestSparseAnyCUDA)": 90.06533432006836,
"test_constructor_autograd_SparseCSC_cuda (__main__.TestSparseAnyCUDA)": 78.35216776529948,
"test_constructor_autograd_SparseCSR_cuda (__main__.TestSparseAnyCUDA)": 66.41249910990398,
"test_conv1d_basic (__main__.TestXNNPACKConv1dTransformPass)": 221.83310869004993,
"test_conv1d_with_relu_fc (__main__.TestXNNPACKConv1dTransformPass)": 450.34077623155383,
"test_conv2d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 74.4596659342448,
"test_conv2d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 60.95766576131185,
"test_conv2d_unary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 61.92473398844401,
"test_conv3d_binary_broadcast_shapes_cpu (__main__.TestPatternMatcherGenericCPU)": 147.46733601888022,
"test_conv3d_binary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 66.45500055948894,
"test_conv3d_unary_dynamic_shapes_cpu (__main__.TestDynamicPatternMatcherGenericCPU)": 74.4767339070638,
"test_conv_bn_fuse_cpu (__main__.CpuTests)": 88.3716672261556,
"test_conv_bn_fuse_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 83.09855524698894,
"test_conv_unary_fusion_nnc (__main__.TestMkldnnFusion)": 78.07822291056316,
"test_correctness_AdamW_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 82.23633321126302,
"test_correctness_Adam_use_closure_True_cuda_float32 (__main__.CompiledOptimizerParityTestsCUDA)": 78.74683252970378,
"test_count_nonzero_all (__main__.TestBool)": 605.5543280707466,
"test_diff_hyperparams_sharding_strategy_str_full_shard (__main__.TestFSDPUseOrigParamsMultipleParamGroups)": 60.07566706339518,
"test_dispatch_symbolic_meta_outplace_all_strides_nn_functional_gaussian_nll_loss_cuda_float32 (__main__.TestMetaCUDA)": 86.39066569010417,
"test_eager_sequence_nr_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 151.10684871673584,
"test_eig_check_magma_cuda_float32 (__main__.TestLinalgCUDA)": 280.5350062052409,
"test_fail_arithmetic_ops.py (__main__.TestTyping)": 65.64933268229167,
"test_fail_random.py (__main__.TestTyping)": 69.83713787999646,
"test_fn_fwgrad_bwgrad_cumprod_cuda_complex128 (__main__.TestFwdGradientsCUDA)": 98.06083297729492,
"test_fn_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 84.82183329264323,
"test_fuse_large_params_cpu (__main__.CpuTests)": 94.35925006866455,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 152.93722364637586,
"test_fuse_large_params_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 153.35333591037326,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesCodegenGPUTests)": 150.93633015950522,
"test_fuse_large_params_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 124.4856669108073,
"test_grad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 101.38466898600261,
"test_gradgrad_nn_LSTM_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 129.30833435058594,
"test_gradgrad_nn_LSTM_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 123.59467061360677,
"test_gradgrad_nn_TransformerDecoderLayer_cuda_float64 (__main__.TestModuleCUDA)": 226.4316609700521,
"test_gradgrad_nn_TransformerEncoder_eval_mode_cuda_float64 (__main__.TestModuleCUDA)": 135.91733169555664,
"test_gradgrad_nn_TransformerEncoder_train_mode_cuda_float64 (__main__.TestModuleCUDA)": 152.5710016886393,
"test_gradgrad_nn_Transformer_cuda_float64 (__main__.TestModuleCUDA)": 547.8351593017578,
"test_indirect_device_assert (__main__.TritonCodeGenTests)": 326.15099080403644,
"test_inductor_dynamic_shapes_broadcasting_dynamic_shapes (__main__.DynamicShapesReproTests)": 148.13324947357177,
"test_inductor_no_recursionerror_on_for_loops_dynamic_shapes (__main__.DynamicShapesReproTests)": 69.01822280883789,
"test_inplace_gradgrad_cumprod_cuda_complex128 (__main__.TestBwdGradientsCUDA)": 91.91716639200847,
"test_inputs_overlapping_with_mutation_stress_dynamic_shapes (__main__.DynamicShapesAotAutogradFallbackTests)": 121.67666541205512,
"test_jit_cuda_archflags (__main__.TestCppExtensionJIT)": 118.85133361816406,
"test_linalg_solve_triangular_large_cuda_complex128 (__main__.TestLinalgCUDA)": 125.84166463216145,
"test_linalg_solve_triangular_large_cuda_complex64 (__main__.TestLinalgCUDA)": 102.97733306884766,
"test_linear_binary_cpp_wrapper (__main__.TestCppWrapper)": 116.88899993896484,
"test_linear_binary_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 131.7979990641276,
"test_lobpcg_ortho_cuda_float64 (__main__.TestLinalgCUDA)": 87.50866731007893,
"test_longformer_chunk_dynamic_shapes (__main__.DynamicShapesReproTests)": 106.516111585829,
"test_low_memory_max_pool_dilation_1_dim_3_cpu_halide (__main__.HalideCpuTests)": 585.5493367513021,
"test_low_memory_max_pool_dilation_2_dim_3_cpu_halide (__main__.HalideCpuTests)": 504.03033447265625,
"test_lstm_cpu (__main__.TestMkldnnCPU)": 82.75600051879883,
"test_many_overlapping_inputs_does_not_explode_guards_dynamic_shapes (__main__.DynamicShapesReproTests)": 127.66822052001953,
"test_max_autotune_addmm_max_autotune_gemm_backends_CK_x_shape2 (__main__.TestCKBackend)": 65.35700225830078,
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_False_use_aoti_False (__main__.TestCKBackend)": 88.17566680908203,
"test_max_autotune_precompile_matmul_max_autotune_gemm_backends_CKTILE_autotune_in_subproc_False_use_aoti_True (__main__.TestCKBackend)": 104.73616536458333,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCodegenCpuTests)": 61.47077645195855,
"test_max_pool2d_with_indices_backward4_dynamic_shapes_cpu (__main__.DynamicShapesCpuTests)": 64.25455644395616,
"test_pattern_matcher_multi_user_cpu (__main__.CpuTritonTests)": 147.75366719563803,
"test_proper_exit (__main__.TestDataLoader)": 223.39966583251953,
"test_proper_exit (__main__.TestDataLoaderPersistentWorkers)": 225.20499844021268,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 99.69400024414062,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 93.267333984375,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True (__main__.TestPatternMatcher)": 88.71366628011067,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 99.59566497802734,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 95.48433430989583,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 84.74000295003255,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 94.17366536458333,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 97.67199961344402,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True (__main__.TestPatternMatcher)": 84.49066670735677,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 103.60699971516927,
"test_qlinear_add_int8_mixed_bf16_use_relu_False_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 105.57999674479167,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False (__main__.TestPatternMatcher)": 87.43433634440105,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 95.87666575113933,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 97.8759994506836,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 97.38899993896484,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_False_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 102.58533223470052,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False (__main__.TestPatternMatcher)": 76.88133239746094,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_cpp_wrapper (__main__.TestCppWrapper)": 94.61466725667317,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_False_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 97.26300048828125,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_cpp_wrapper (__main__.TestCppWrapper)": 101.59933217366536,
"test_qlinear_add_int8_mixed_bf16_use_relu_True_is_qat_True_is_dynamic_True_dynamic_shapes_cpp_wrapper (__main__.DynamicShapesCppWrapperCpuTests)": 100.20266723632812,
"test_quick_core_backward__unsafe_masked_index_cpu_float64 (__main__.TestDecompCPU)": 593.1826578776041,
"test_quick_core_backward__unsafe_masked_index_cuda_float64 (__main__.TestDecompCUDA)": 1345.5384928385417,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cpu_float64 (__main__.TestDecompCPU)": 810.4720052083334,
"test_quick_core_backward__unsafe_masked_index_put_accumulate_cuda_float64 (__main__.TestDecompCUDA)": 1663.7119954427083,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cpu_float64 (__main__.TestDecompCPU)": 95.72633361816406,
"test_quick_core_backward_nn_functional_max_unpool3d_grad_cuda_float64 (__main__.TestDecompCUDA)": 326.12732950846356,
"test_quick_core_backward_roll_cpu_float64 (__main__.TestDecompCPU)": 141.55733235677084,
"test_quick_core_backward_roll_cuda_float64 (__main__.TestDecompCUDA)": 282.22650146484375,
"test_quick_core_backward_select_scatter_cpu_float64 (__main__.TestDecompCPU)": 77.04066467285156,
"test_quick_core_backward_select_scatter_cuda_float64 (__main__.TestDecompCUDA)": 166.39016977945963,
"test_quick_core_backward_split_cuda_float64 (__main__.TestDecompCUDA)": 80.13649876912434,
"test_quick_core_backward_split_with_sizes_copy_cpu_float64 (__main__.TestDecompCPU)": 104.67433166503906,
"test_quick_core_backward_split_with_sizes_copy_cuda_float64 (__main__.TestDecompCUDA)": 192.83983357747397,
"test_quick_core_backward_std_cpu_float64 (__main__.TestDecompCPU)": 61.601732889811196,
"test_quick_core_backward_std_cuda_float64 (__main__.TestDecompCUDA)": 124.57199732462566,
"test_register_spills_cuda (__main__.BenchmarkFusionCudaTest)": 98.55700047810872,
"test_replicatepad_64bit_indexing_cuda_float16 (__main__.TestNNDeviceTypeCUDA)": 63.58999888102213,
"test_rosenbrock_sparse_with_lrsched_False_SGD_cuda_float64 (__main__.TestOptimRenewedCUDA)": 88.80024898052216,
"test_rosenbrock_sparse_with_lrsched_True_SGD_cuda_float64 (__main__.TestOptimRenewedCUDA)": 106.82337558269501,
"test_runtime_checks_large_cpu (__main__.AOTInductorTestABICompatibleCpu)": 69.5413335164388,
"test_runtime_checks_large_cpu_with_stack_allocation (__main__.AOTInductorTestABICompatibleCpuWithStackAllocation)": 71.78844494289822,
"test_runtime_checks_large_cuda (__main__.AOTInductorTestABICompatibleGpu)": 202.65066528320312,
"test_save_load_large_string_attribute (__main__.TestSaveLoad)": 126.04400126139323,
"test_sdpa_kernel_ctx_manager2_dynamic_shapes (__main__.DynamicShapesCtxManagerTests)": 160.4792226155599,
"test_shuffler_iterdatapipe (__main__.IntegrationTestDataLoaderDataPipe)": 119.55877685546875,
"test_slow_tasks (__main__.TestFunctionalAutogradBenchmark)": 127.251890818278,
"test_svd_lowrank_cuda_complex128 (__main__.TestLinalgCUDA)": 149.14533384641013,
"test_terminate_handler_on_crash (__main__.TestTorch)": 110.46733378039465,
"test_terminate_signal (__main__.ForkTest)": 129.4945574692554,
"test_terminate_signal (__main__.ParallelForkServerShouldWorkTest)": 129.3840003940794,
"test_terminate_signal (__main__.SpawnTest)": 133.58888869815402,
"test_torchvision_smoke (__main__.TestTensorBoardPytorchGraph)": 68.54926042971404,
"test_train_parity_multi_group (__main__.TestFullyShard1DTrainingCore)": 145.9515012105306,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 102.10916709899902,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 88.42933400472005,
"test_triton_bsr_scatter_mm_blocksize_64_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 72.05966695149739,
"test_triton_bsr_softmax_cuda_bfloat16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 140.50333150227866,
"test_triton_bsr_softmax_cuda_float16 (__main__.TestSparseCompressedTritonKernelsCUDA)": 135.5356648763021,
"test_triton_bsr_softmax_cuda_float32 (__main__.TestSparseCompressedTritonKernelsCUDA)": 101.21150016784668,
"test_unary_ops (__main__.TestTEFuserDynamic)": 85.61999864048428,
"test_unary_ops (__main__.TestTEFuserStatic)": 88.45922152201335,
"test_variant_consistency_jit_nn_functional_max_pool2d_cpu_float32 (__main__.TestJitCPU)": 88.01866658528645,
"test_variant_consistency_jit_nn_functional_max_pool2d_cuda_float32 (__main__.TestJitCUDA)": 73.22766494750977,
"test_views1_dynamic_shapes_cuda (__main__.DynamicShapesGPUTests)": 82.29233296712239,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cpu_float32 (__main__.TestOperatorsCPU)": 104.00999959309895,
"test_vmapjvpvjp_linalg_lstsq_grad_oriented_cuda_float32 (__main__.TestOperatorsCUDA)": 85.15966542561848,
"test_vmapjvpvjp_linalg_lu_solve_cpu_float32 (__main__.TestOperatorsCPU)": 85.22466786702473,
"test_vmapjvpvjp_linalg_lu_solve_cuda_float32 (__main__.TestOperatorsCUDA)": 87.94649950663249,
"test_vmapjvpvjp_linalg_multi_dot_cuda_float32 (__main__.TestOperatorsCUDA)": 84.46383221944173,
"test_vmapjvpvjp_linalg_pinv_singular_cpu_float32 (__main__.TestOperatorsCPU)": 67.13399887084961,
"test_vmapjvpvjp_linalg_solve_triangular_cuda_float32 (__main__.TestOperatorsCUDA)": 64.15857097080776,
"test_vmapjvpvjp_linalg_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 79.29399998982747,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cpu_float32 (__main__.TestOperatorsCPU)": 77.52399826049805,
"test_vmapjvpvjp_max_pool2d_with_indices_backward_cuda_float32 (__main__.TestOperatorsCUDA)": 98.83866755167644,
"test_vmapjvpvjp_nn_functional_max_pool2d_cpu_float32 (__main__.TestOperatorsCPU)": 64.5303332010905,
"test_vmapjvpvjp_nn_functional_max_pool2d_cuda_float32 (__main__.TestOperatorsCUDA)": 109.37966791788737,
"test_vmapjvpvjp_svd_cuda_float32 (__main__.TestOperatorsCUDA)": 100.79700152079265,
"test_vmapjvpvjp_unbind_cpu_float32 (__main__.TestOperatorsCPU)": 64.4817141578311,
"test_vmapjvpvjp_unbind_cuda_float32 (__main__.TestOperatorsCUDA)": 72.93033345540364,
"test_vmapvjpvjp_linalg_lstsq_cuda_float32 (__main__.TestOperatorsCUDA)": 61.30200068155924,
"test_vmapvjpvjp_meshgrid_list_of_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 115.37533315022786,
"test_vmapvjpvjp_meshgrid_variadic_tensors_cuda_float32 (__main__.TestOperatorsCUDA)": 96.93316650390625,
"test_vmapvjpvjp_nn_functional_bilinear_cuda_float32 (__main__.TestOperatorsCUDA)": 181.42449696858725
}

View File

@ -2342,6 +2342,7 @@ def triton_config_reduction(
num_warps=None,
register_intensive=False,
dynamic_scale_rblock=True,
reduction_hint=None,
) -> Config:
"""
Construct a reduction triton config with some adjustment heuristics
@ -2369,7 +2370,12 @@ def triton_config_reduction(
rnumels[prefix] *= 2
if num_warps is None:
num_warps = total_numel() // 128
if reduction_hint == ReductionHint.INNER and not is_fbcode():
# r is contiguous, so ensure that each thread has 8 elements for
# vectorized loads, assuming bf16/fp16
num_warps = r // (32 * 8)
else:
num_warps = total_numel() // 128
max_num_warps = 16 if r <= 8192 else 32
num_warps = _num_warps(
@ -2639,6 +2645,7 @@ def _reduction_configs(
num_stages=num_stages,
register_intensive=register_intensive,
dynamic_scale_rblock=dynamic_scale_rblock,
reduction_hint=reduction_hint,
)
def outer_config_opt():
@ -2690,7 +2697,7 @@ def _reduction_configs(
)
contiguous_config = make_config(
1,
1 if rnumel > 2048 and not is_fbcode() else 2, # 1024 or less is persistent
min(rnumel, MAX_R0_BLOCK),
register_intensive=register_intensive,
)
@ -2920,7 +2927,13 @@ def _persistent_reduction_configs(
if "y" not in size_hints:
configs = [
triton_config_reduction(size_hints, xblock, rnumel, register_intensive=True)
triton_config_reduction(
size_hints,
xblock,
rnumel,
register_intensive=True,
reduction_hint=reduction_hint,
)
for xblock in (1, 8, 32, 128)
if xblock == 1
or (rnumel * xblock <= MAX_PERSISTENT_BLOCK_NUMEL and xblock <= xnumel)
@ -2963,6 +2976,7 @@ def _persistent_reduction_configs(
x_block,
rnumel,
register_intensive=True,
reduction_hint=reduction_hint,
)
]
@ -2974,6 +2988,7 @@ def _persistent_reduction_configs(
size_hints,
2 * (256 // rnumel) if rnumel <= 256 else 1,
rnumel,
reduction_hint=reduction_hint,
)
]
for c in configs:

View File

@ -8,6 +8,7 @@
#include <torch/headeronly/macros/Macros.h>
#include <type_traits>
#include <vector>
namespace torch::stable {
@ -23,7 +24,7 @@ struct RestrictPtrTraits {
template <
typename T,
size_t N,
std::size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class TensorAccessorBase {
@ -38,7 +39,7 @@ class TensorAccessorBase {
// Originally, TensorAccessor is a view of sizes and strides as
// these are ArrayRef instances. Until torch::stable supports an
// ArrayRef-like feature, we store copies of sizes and strides:
for (auto i = 0; i < N; ++i) {
for (std::size_t i = 0; i < N; ++i) {
this->sizes_[i] = sizes_[i];
this->strides_[i] = strides_[i];
}
@ -65,7 +66,7 @@ class TensorAccessorBase {
template <
typename T,
size_t N,
std::size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class TensorAccessor : public TensorAccessorBase<T, N, PtrTraits, index_t> {
@ -117,7 +118,7 @@ class TensorAccessor<T, 1, PtrTraits, index_t>
template <
typename T,
size_t N,
std::size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
@ -140,7 +141,7 @@ class GenericPackedTensorAccessorBase {
const source_index_t* sizes_,
const source_index_t* strides_)
: data_(data_) {
for (auto i = 0; i < N; ++i) {
for (std::size_t i = 0; i < N; ++i) {
this->sizes_[i] = sizes_[i];
this->strides_[i] = strides_[i];
}
@ -171,7 +172,7 @@ class GenericPackedTensorAccessorBase {
template <
typename T,
size_t N,
std::size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits,
typename index_t = int64_t>
class GenericPackedTensorAccessor
@ -254,14 +255,14 @@ class GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>
template <
typename T,
size_t N,
std::size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits>
using PackedTensorAccessor32 =
GenericPackedTensorAccessor<T, N, PtrTraits, int32_t>;
template <
typename T,
size_t N,
std::size_t N,
template <typename U> class PtrTraits = DefaultPtrTraits>
using PackedTensorAccessor64 =
GenericPackedTensorAccessor<T, N, PtrTraits, int64_t>;