pytorch

mirror of https://github.com/pytorch/pytorch.git synced 2025-10-22 14:15:01 +08:00

Author	SHA1	Message	Date
PyTorch UpdateBot	3217cb3fa1	update xla commit hash	2025-10-20 07:42:07 +00:00
Amin Sedaghat	c1eda348be	[cuda] fix triu/tril int32 overflow for large matrices (#164705 ) Fixes #136611 Cast blockIdx.x to int64_t before multiplication to prevent overflow when computing linear_idx for matrices larger than 2^31 elements. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164705 Approved by: https://github.com/eqy, https://github.com/ngimel	2025-10-20 07:17:41 +00:00
Amin Sedaghat	ba93d5636e	[cuda] fix nll_loss2d backward bounds check with reduction=none (#165247 ) Fixes #49882 Add missing bounds check in nll_loss2d backward kernel with reduction=none. Forward kernel already had CUDA_KERNEL_ASSERT for target bounds, now backward kernel matches. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165247 Approved by: https://github.com/ngimel	2025-10-20 06:25:11 +00:00
Animesh Jain	722b2b86c9	[dynamo] Remove duplicated guards (#165806 ) This is by looking at a tlparse of an internal job. We will need deeper audit. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165806 Approved by: https://github.com/jansel	2025-10-20 05:50:33 +00:00
Yuanyuan Chen	e1e8491b31	[1/N] Change C-style casts to static_cast or reinterpret_cast (#165750 ) This series of changes tries to convert C-style casts into their C++ alternatives. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165750 Approved by: https://github.com/Skylion007	2025-10-20 04:36:19 +00:00
Amin Sedaghat	767199fd9b	[flex_attention] replace sliced BlockMask noop with helpful error (#164702 ) Fixes part of #163314 After slicing BlockMask with `[]`, mask_mod was silently replaced with noop_mask. This caused silent incorrect results when users applied transformations to `sliced_mask.mask_mod`. Replace noop with `_sliced_mask_mod_error` that raises RuntimeError with guidance to use `base_mask.mask_mod` instead. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164702 Approved by: https://github.com/drisspg, https://github.com/BoyuanFeng	2025-10-20 03:46:16 +00:00
PyTorch MergeBot	602ace5eb4	Revert "[ATen] Fix CUDA reduction warp shuffle order (#164790 )" This reverts commit 36371b8ec7a1baed255c18451b2c716386a54c95. Reverted https://github.com/pytorch/pytorch/pull/164790 on behalf of https://github.com/clee2000 due to was reverted due to failing internal tests after merge D84992607 ([comment](https://github.com/pytorch/pytorch/pull/164790#issuecomment-3420373755))	2025-10-20 03:06:52 +00:00
PyTorch MergeBot	47804ce467	Revert "12/n : Remove fbandroid_compiler_flags (#165558 )" This reverts commit aead9270f56ebc7302c7f5fa7e5dff959f26608e. Reverted https://github.com/pytorch/pytorch/pull/165558 on behalf of https://github.com/clee2000 due to Diff was actually reverted internally D84832629 ([comment](https://github.com/pytorch/pytorch/pull/165558#issuecomment-3420367955))	2025-10-20 03:03:13 +00:00
Sun, Jiayi	e8cb34dd52	[Inductor] support masked vectorization for the tail_loop for fp8 datatype (#163324 ) Summary: Support masked vectorization for the tail_loop for fp8 datatype. Example: ``` import torch def fn( x, scale, zero_point, quant_min, quant_max, dtype, ): x = torch.ops.quantized_decomposed.dequantize_per_tensor( x, scale, zero_point, quant_min, quant_max, dtype, ) x = torch.relu(x) x = torch.ops.quantized_decomposed.quantize_per_tensor( x, scale, zero_point, quant_min, quant_max, dtype ) return x quant_min = -128 quant_max = 127 dtype = torch.float8_e4m3fn x = torch.clamp(torch.randn((1, 7, 7, 9), dtype=torch.float32) * 100, quant_min, quant_max).to(dtype) zero_point = 100 scale = 0.01 with torch.no_grad(): compiled_fn = torch.compile(fn) compiled_fn(x, scale, zero_point, quant_min, quant_max, dtype) ``` Generated code: - Before ``` cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn', 'at::Float8_e4m3fn'], r''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const at::Float8_e4m3fn* in_ptr0, at::Float8_e4m3fn* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L))) { auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); auto tmp1 = at::vec::convert<float>(tmp0); auto tmp2 = static_cast<float>(100.0); auto tmp3 = at::vec::Vectorized<float>(tmp2); auto tmp4 = tmp1 - tmp3; auto tmp5 = static_cast<float>(0.01); auto tmp6 = at::vec::Vectorized<float>(tmp5); auto tmp7 = tmp4 * tmp6; auto tmp8 = (tmp7); auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0)); auto tmp10 = tmp9 * tmp3; auto tmp11 = tmp10.round(); auto tmp12 = tmp11 + tmp3; auto tmp13 = static_cast<float>(-128.0); auto tmp14 = at::vec::Vectorized<float>(tmp13); auto tmp15 = at::vec::maximum(tmp12, 
tmp14); auto tmp16 = static_cast<float>(127.0); auto tmp17 = at::vec::Vectorized<float>(tmp16); auto tmp18 = at::vec::minimum(tmp15, tmp17); auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18); tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); } if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L))) { for (int64_t x0_tail = static_cast<int64_t>(432L);x0_tail < static_cast<int64_t>(441L); x0_tail++) { auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)]; auto tmp1 = c10::convert<float>(tmp0); auto tmp2 = static_cast<float>(100.0); auto tmp3 = float(tmp1 - tmp2); auto tmp4 = static_cast<float>(0.01); auto tmp5 = float(tmp3 * tmp4); auto tmp6 = c10::convert<float>(tmp5); auto tmp7 = std::max(tmp6, decltype(tmp6)(0)); auto tmp8 = float(tmp7 * tmp2); auto tmp9 = std::nearbyint(tmp8); auto tmp10 = float(tmp9 + tmp2); auto tmp11 = static_cast<float>(-128.0); auto tmp12 = max_propagate_nan(tmp10, tmp11); auto tmp13 = static_cast<float>(127.0); auto tmp14 = min_propagate_nan(tmp12, tmp13); auto tmp15 = c10::convert<at::Float8_e4m3fn>(tmp14); out_ptr0[static_cast<int64_t>(x0_tail)] = tmp15; } } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1)) buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn) # [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1 cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0) del arg0_1 return (buf0, ) ``` - After ``` cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0 = async_compile.cpp_pybinding(['const at::Float8_e4m3fn', 'at::Float8_e4m3fn'], r''' #include 
<torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const at::Float8_e4m3fn* in_ptr0, at::Float8_e4m3fn* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(441L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(432L))) { auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); auto tmp1 = at::vec::convert<float>(tmp0); auto tmp2 = static_cast<float>(100.0); auto tmp3 = at::vec::Vectorized<float>(tmp2); auto tmp4 = tmp1 - tmp3; auto tmp5 = static_cast<float>(0.01); auto tmp6 = at::vec::Vectorized<float>(tmp5); auto tmp7 = tmp4 * tmp6; auto tmp8 = (tmp7); auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0)); auto tmp10 = tmp9 * tmp3; auto tmp11 = tmp10.round(); auto tmp12 = tmp11 + tmp3; auto tmp13 = static_cast<float>(-128.0); auto tmp14 = at::vec::Vectorized<float>(tmp13); auto tmp15 = at::vec::maximum(tmp12, tmp14); auto tmp16 = static_cast<float>(127.0); auto tmp17 = at::vec::Vectorized<float>(tmp16); auto tmp18 = at::vec::minimum(tmp15, tmp17); auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18); tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); } if(C10_UNLIKELY(x0 >= static_cast<int64_t>(432L) && x0 < static_cast<int64_t>(441L))) { auto tmp0 = at::vec::Vectorized<at::Float8_e4m3fn>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L)); auto tmp1 = at::vec::convert<float>(tmp0); auto tmp2 = static_cast<float>(100.0); auto tmp3 = at::vec::Vectorized<float>(tmp2); auto tmp4 = tmp1 - tmp3; auto tmp5 = static_cast<float>(0.01); auto tmp6 = at::vec::Vectorized<float>(tmp5); auto tmp7 = tmp4 * tmp6; auto tmp8 = (tmp7); auto tmp9 = at::vec::clamp_min(tmp8, decltype(tmp8)(0)); auto tmp10 = tmp9 * tmp3; auto tmp11 = tmp10.round(); auto tmp12 = tmp11 + tmp3; auto tmp13 = static_cast<float>(-128.0); auto tmp14 = at::vec::Vectorized<float>(tmp13); auto tmp15 = 
at::vec::maximum(tmp12, tmp14); auto tmp16 = static_cast<float>(127.0); auto tmp17 = at::vec::Vectorized<float>(tmp16); auto tmp18 = at::vec::minimum(tmp15, tmp17); auto tmp19 = at::vec::convert<at::Float8_e4m3fn>(tmp18); tmp19.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(9L)); } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (1, 7, 7, 9), (441, 63, 9, 1)) buf0 = empty_strided_cpu((1, 7, 7, 9), (441, 63, 9, 1), torch.float8_e4m3fn) # [Provenance debug handles] cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0:1 cpp_fused_dequantize_per_tensor_quantize_per_tensor_relu_0(arg0_1, buf0) del arg0_1 return (buf0, ) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163324 Approved by: https://github.com/Xia-Weiwen, https://github.com/mingfeima, https://github.com/jansel ghstack dependencies: #163316	2025-10-20 01:56:00 +00:00
Sun, Jiayi	e9d8973427	[Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316 ) Summary: Support masked vectorization for the tail_loop for float64 datatype. Example: ``` import torch def fn(x): return x * x x = torch.randn((22, 22), dtype=torch.double) with torch.no_grad(): compiled_fn = torch.compile(fn) compiled_fn(x) ``` Generated code: - Before ``` cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double', 'double'], r''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const double* in_ptr0, double* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L))) { auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); auto tmp1 = tmp0 * tmp0; tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); } if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L))) { for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++) { auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)]; auto tmp1 = double(tmp0 * tmp0); out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1; } } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (22, 22), (22, 1)) buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64) # [Provenance debug handles] cpp_fused_mul_0:1 cpp_fused_mul_0(arg0_1, buf0) del arg0_1 return (buf0, ) ``` - After ``` cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double', 'double'], r''' #include 
<torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const double* in_ptr0, double* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L))) { auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); auto tmp1 = tmp0 * tmp0; tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); } if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L))) { auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L)); auto tmp1 = tmp0 * tmp0; tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L)); } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (22, 22), (22, 1)) buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64) # [Provenance debug handles] cpp_fused_mul_0:1 cpp_fused_mul_0(arg0_1, buf0) del arg0_1 return (buf0, ) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316 Approved by: https://github.com/mingfeima, https://github.com/jansel	2025-10-20 01:41:38 +00:00
xinan.lin	61d9a5180e	[Fix XPU CI] [Inductor UT] Fix test cases broken by community. (#165714 ) Fixes #165719, Fixes #165771 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165714 Approved by: https://github.com/jansel	2025-10-19 23:59:04 +00:00
PaulZhang12	8a8329b51f	[ATen] Switch order of blocked reduce when vectorize loads (#165178 ) Performance benchmarking, perf neutral: ``` ================================================================================================================================================================================================================================================ Tensor Shape Operation Full reduce (ms) Non-Contig dim (ms) Contig dim (ms) Full reduce (ms) Non-Contig dim (ms) Contig dim (ms) Full diff % Non-Contig diff % Contig diff % ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (256, 256) mean 0.015684 0.017056 0.008287 0.016015 0.016929 0.008170 -2.07% +0.75% +1.43% (256, 256) sum 0.015774 0.016638 0.007926 0.015811 0.016935 0.008330 -0.23% -1.75% -4.85% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 512) mean 0.013385 0.025742 0.008629 0.013046 0.026005 0.008924 +2.60% -1.01% -3.31% (512, 512) sum 0.013390 0.026059 0.009116 0.013054 0.025696 0.008952 +2.57% +1.41% +1.83% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.014213 0.015467 0.010334 0.013862 0.015082 0.010318 +2.53% +2.55% +0.16% (1024, 1024) sum 0.014179 0.015446 0.010774 0.014132 0.015073 0.010350 +0.33% +2.47% +4.10% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.018234 0.019487 0.014812 0.018482 0.019397 0.014802 -1.34% +0.46% +0.07% (2048, 2048) sum 0.018202 0.019529 0.015195 0.018122 0.019485 0.015129 +0.44% +0.23% +0.44% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 0.033582 0.039378 0.030751 0.033810 0.039673 0.031019 -0.67% -0.74% -0.86% (4096, 4096) sum 0.033604 0.039777 0.030809 0.033530 0.039386 0.031113 +0.22% +0.99% -0.98% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.085824 0.091133 0.084200 0.085431 0.091364 0.084303 +0.46% -0.25% -0.12% (8192, 8192) sum 0.085763 0.091442 0.084180 0.085508 0.091419 0.084595 +0.30% +0.03% -0.49% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 16384) mean 0.146480 0.147666 0.138807 0.146515 0.147987 0.138930 -0.02% -0.22% -0.09% (8192, 16384) sum 0.146446 0.147593 0.138559 0.146151 0.147982 0.139120 +0.20% -0.26% -0.40% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 32768) mean 0.266047 0.265386 0.253837 
0.265648 0.265885 0.253652 +0.15% -0.19% +0.07% (8192, 32768) sum 0.266093 0.265421 0.253890 0.265458 0.265591 0.253567 +0.24% -0.06% +0.13% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 65536) mean 0.498632 0.508976 0.481865 0.498237 0.508777 0.481476 +0.08% +0.04% +0.08% (8192, 65536) sum 0.498917 0.508202 0.481883 0.498104 0.508016 0.481972 +0.16% +0.04% -0.02% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 131072) mean 0.957633 0.968519 0.938172 0.956766 0.968267 0.938196 +0.09% +0.03% -0.00% (8192, 131072) sum 0.956972 0.968140 0.937741 0.957365 0.968404 0.938056 -0.04% -0.03% -0.03% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 262144) mean 1.906661 1.928377 1.861846 1.907327 1.928811 1.862083 -0.03% -0.02% -0.01% (8192, 262144) sum 1.905976 1.928362 1.862399 1.907098 1.928844 1.861782 -0.06% -0.02% +0.03% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 262144) mean 0.956852 0.970101 0.936524 0.957263 0.969809 0.936965 -0.04% +0.03% -0.05% (4096, 262144) sum 0.957117 0.969933 0.936247 0.956675 0.969451 0.936395 +0.05% +0.05% -0.02% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 262144) mean 0.498813 0.511299 0.483415 0.498567 0.511482 0.483376 +0.05% -0.04% +0.01% (2048, 262144) sum 0.498813 0.510834 0.483641 0.498875 0.511036 0.483338 -0.01% -0.04% +0.06% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 262144) mean 0.266157 0.276751 0.255192 0.265966 0.276808 0.255544 +0.07% -0.02% -0.14% (1024, 262144) sum 0.266133 0.276709 0.255528 0.265658 0.276685 0.255287 +0.18% +0.01% +0.09% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 131072) mean 0.085941 0.081184 0.087931 0.085591 0.080832 0.088008 +0.41% +0.44% -0.09% (512, 131072) sum 0.085962 0.081107 0.088045 0.085882 0.081160 0.088024 +0.09% -0.07% +0.02% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000) mean 0.014203 0.045859 0.010310 0.013885 0.046132 0.010621 +2.29% -0.59% -2.93% (1000, 1000) sum 0.014180 0.046165 0.010756 0.013893 0.046109 0.010338 +2.07% +0.12% +4.04% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 129) mean 0.012953 0.016751 0.008536 
0.012977 0.016714 0.008916 -0.18% +0.22% -4.26% (1024, 129) sum 0.013356 0.016806 0.008722 0.013003 0.017071 0.008611 +2.71% -1.55% +1.29% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 257) mean 0.013075 0.016787 0.009102 0.013116 0.016769 0.008679 -0.31% +0.11% +4.87% (1024, 257) sum 0.013092 0.016842 0.008786 0.013126 0.017128 0.008771 -0.26% -1.67% +0.17% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 587) mean 0.013662 0.017412 0.010055 0.013659 0.017019 0.010033 +0.02% +2.31% +0.22% (1024, 587) sum 0.013636 0.017473 0.010163 0.013642 0.017363 0.010101 -0.04% +0.63% +0.61% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 977) mean 0.015276 0.027873 0.012531 0.015241 0.027783 0.012467 +0.23% +0.32% +0.51% (2048, 977) sum 0.015345 0.027949 0.012192 0.015255 0.027839 0.012485 +0.59% +0.40% -2.35% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 128) mean 0.012806 0.014020 0.008291 0.013137 0.014309 0.007908 -2.52% -2.02% +4.84% (1024, 128) sum 0.012769 0.014308 0.007924 0.012788 0.014236 0.008038 -0.15% +0.51% -1.42% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 128) mean 0.014145 0.023049 0.009143 0.014104 0.023298 0.009501 +0.29% -1.07% -3.77% (8192, 128) sum 0.014132 0.023082 0.009638 0.014107 0.023331 0.009244 +0.18% -1.07% +4.26% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 130) mean 0.013420 0.025834 0.008949 0.013368 0.025724 0.008918 +0.39% +0.43% +0.35% (1024, 130) sum 0.013300 0.025940 0.009113 0.013266 0.025419 0.008922 +0.26% +2.05% +2.14% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 130) mean 0.013993 0.017883 0.009661 0.014275 0.018220 0.009596 -1.98% -1.85% +0.68% (8192, 130) sum 0.014026 0.018297 0.010066 0.014326 0.018257 0.009659 -2.09% +0.22% +4.21% ================================================================================================================================================================================================================================================ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165178 Approved by: https://github.com/ngimel ghstack dependencies: #165494, #164790, #165055	2025-10-19 23:39:05 +00:00
drisspg	6b80c94901	[FlexAttention] Fix dynamic shaped heads flex_flash check (#165866 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165866 Approved by: https://github.com/BoyuanFeng ghstack dependencies: #165729	2025-10-19 23:10:16 +00:00
Jagadish Krishnamoorthy	8951df03de	test_scaled_matmul_cuda: fix infer_scale_swizzle (#165788 ) Extend #165747 fix to other cases. Add parentheses to clarify operator precedence. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165788 Approved by: https://github.com/jeffdaily, https://github.com/slayton58	2025-10-19 21:42:01 +00:00
Parshant Sharma	8139f33fa5	[dynamo] Add recompile reason for set_stance fail_on_recompile (#165445 ) Fixes #163500 ### Summary: For `set_stance("fail_on_recompile")` failures will provide the reason why the recompilation occurred ### Impacts: module: dynamo Pull Request resolved: https://github.com/pytorch/pytorch/pull/165445 Approved by: https://github.com/williamwen42	2025-10-19 21:12:19 +00:00
can-gaa-hou	a88587348b	[dynamo] Clean up assert in dynamo [1/N] (#165430 ) Fixes some part of #162852 and #164878. These two issues have some relationship though. * __->__ #165430 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165430 Approved by: https://github.com/Lucaskabela, https://github.com/williamwen42 Co-authored-by: Lucas Kabela <lucasakabela@gmail.com>	2025-10-19 21:00:05 +00:00
PyTorch MergeBot	633a3b7f67	Revert "shrink_group implementation to expose ncclCommShrink API (#164518 )" This reverts commit fa0db212e717b6cb225159cb32ea3d83baa52381. Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3419893217))	2025-10-19 19:20:45 +00:00
Bruce Chang	fa0db212e7	shrink_group implementation to expose ncclCommShrink API (#164518 ) Closes #164529 To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch. This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization. For more info: [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518 Approved by: https://github.com/kwen2501	2025-10-19 18:00:08 +00:00
Yuanyuan Chen	15ff1cd28b	Remove E721 suppression in flake8 (#165855 ) Currently all files pass the E721 check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165855 Approved by: https://github.com/albanD	2025-10-19 17:51:12 +00:00
Tugsbayasgalan Manlaibaatar	c73f5080de	Migrating some more callsites (#163580 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163580 Approved by: https://github.com/avikchaudhuri ghstack dependencies: #165582	2025-10-19 15:52:17 +00:00
Tugsbayasgalan Manlaibaatar	22ae059d32	AOTI util deprecated flow using the new tracer (#165582 ) Reapply of https://github.com/pytorch/pytorch/pull/163260 AOTI utils expect free function sometimes so adjust export API to handle that, haven't seen any methods getting exported. Some AOTI flows also require we populate dynamo_flat_name_to_original_fqn so i just copy how it is done in eval_frame.py. I also cleaned up how we get rid of export_root and fixed some overcomplicated nn_module_stack handling in export code. The logic is simpler now thanks to @anijain2305 . Pull Request resolved: https://github.com/pytorch/pytorch/pull/165582 Approved by: https://github.com/anijain2305	2025-10-19 15:52:16 +00:00
Yu, Guangye	1b121d636e	Fix AllocatorConfig parse roundup division bug (#165304 ) * #165288 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165304 Approved by: https://github.com/albanD ghstack dependencies: #165288, #165289, #165291, #165298	2025-10-19 15:34:44 +00:00
Yu, Guangye	1ba808dd97	Refine CUDA BackendStaticInitializer for allocator select (#165298 ) * #165288 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165298 Approved by: https://github.com/albanD ghstack dependencies: #165288, #165289, #165291	2025-10-19 15:34:44 +00:00
Yu, Guangye	b2f5c25b27	Introduce a generic API torch._C._accelerator_setAllocatorSettings (#165291 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165291 Approved by: https://github.com/albanD ghstack dependencies: #165288, #165289	2025-10-19 15:34:36 +00:00
Yu, Guangye	a1114beed2	Deprecate overlapped functions in CUDAAllocatorConfig (#165289 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165289 Approved by: https://github.com/albanD ghstack dependencies: #165288	2025-10-19 15:34:26 +00:00
Yu, Guangye	4888ed440e	Refine Allocator Config error message friendly (#165288 ) * __->__ #165288 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165288 Approved by: https://github.com/albanD	2025-10-19 15:34:17 +00:00
Nikita Shulga	5d62b63a76	[BE] Use Python-3.14 GE build (#165804 ) 3.14 reached general availability on Oct 7th 2025, so we can remove all pre-release workarounds Pull Request resolved: https://github.com/pytorch/pytorch/pull/165804 Approved by: https://github.com/yangw-dev, https://github.com/Skylion007, https://github.com/cyyever	2025-10-19 11:45:10 +00:00
Aaron Gokaslan	57ba575242	[BE][Ez]: Update torch.is_tensor documentation (#165841 ) TypeIs propagates the isinstance check with the typing system. They are now equivalent. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165841 Approved by: https://github.com/albanD	2025-10-19 09:24:11 +00:00
Aaron Gokaslan	ceb11a584d	[BE]: Update kleidai submodule to v1.15.0 (#165842 ) This mostly just adds a few new kernels, fixes some IMA issues, and improves the performance of previous kernels. Also improves compiler support. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165842 Approved by: https://github.com/albanD	2025-10-19 08:25:03 +00:00
Aaron Gokaslan	33adb276fe	[BE][Ez]: Update Eigen to 5.0.0. C++14 support and more! (#165840 ) Update Eigen pin to 5.0.0. Tons of new features and perf improvements. Most importantly updates minimum from C++03 to C++14 giving a ton of performance optimizations like properly implemented move operators, simplified code, etc. Also improved vectorization particularly on ARM. We really only use this library as a fallback for sparse operators, but still useful to update it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165840 Approved by: https://github.com/albanD	2025-10-19 08:00:06 +00:00
PyTorch UpdateBot	e939651972	[audio hash update] update the pinned audio hash (#165807 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165807 Approved by: https://github.com/pytorchbot	2025-10-19 04:45:20 +00:00
Yuanyuan Chen	3255e7872b	Enable all flake8-logging-format rules (#164655 ) These rules are enabled by removing existing suppressions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655 Approved by: https://github.com/janeyx99, https://github.com/mlazos	2025-10-19 00:59:28 +00:00
Dzmitry Huba	c4f6619330	Enable more DTensor tests in local tensor mode and fix more integration issues (#165716 ) - During op dispatch local tensor is supposed to collect rng state from CPU and CUDA devices so that it can be reset before execution of the op for each such that ops with randomness produce the same result for all ranks (note that we are planning a separate change to add support of per rank rng state). Previously we relied on op input arguments to deduce which devices to get rng state from. Which doesn't work for factory functions such as torch.randn. Hence this change switches to unconditionally collecting rng state from all devices. - Fixing per rank specific computations in _MaskedPartial and Shard placements discovered during test enablement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165716 Approved by: https://github.com/ezyang	2025-10-18 23:33:24 +00:00
andreh7	f18041cca8	Fix missing closing quote in __init__.py documentation (#165827 ) Title says it all. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165827 Approved by: https://github.com/Skylion007	2025-10-18 22:09:18 +00:00
Yuanyuan Chen	35e51893bd	Remove CUDA 11 workarounds for CUB_SUPPORTS_SCAN_BY_KEY and CUB_SUPPORTS_UNIQUE_BY_KEY (#164637 ) `CUB_SUPPORTS_SCAN_BY_KEY` and `CUB_SUPPORTS_UNIQUE_BY_KEY` are true since CUDA 12. This PR removes the old branches and source files. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164637 Approved by: https://github.com/ezyang	2025-10-18 20:05:54 +00:00
Yuanyuan Chen	1f43d17ce6	Fix self assignment (#165816 ) This PR removes assignments of the form `var=var`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165816 Approved by: https://github.com/jansel	2025-10-18 18:51:52 +00:00
Yuanyuan Chen	032bed95cd	Various C++ code fixes in LSAN integration (#165818 ) This PR extracts the C++ code fixes from #154584, which are fixes in enabling LSAN. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165818 Approved by: https://github.com/ezyang	2025-10-18 17:59:23 +00:00
Simon Layton	d14cbb4476	Add NVFP4 two-level scaling to scaled_mm (#165774 ) Summary: * Add second-level scaling dispatch to scaled_mm, tying into optional `alpha` passing * Add two-level tests Test Plan: ``` pytest -svv -k "nvfp4_global_scale" test/test_scaled_matmul_cuda.py ``` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165774 Approved by: https://github.com/drisspg	2025-10-18 13:06:04 +00:00
arkadip-maitra	f510d0dbc0	Clarifying input output angle unit in the docs for trigonometric fun… (#161248 ) …ctions Fixes #[160995](https://github.com/pytorch/pytorch/issues/160995) Modified the docs to clarify that input tensor values for torch.sin, torch.cos and torch.tan should be in radians and the output tensor values for torch.acos, torch.asin and torch.atan is in radians. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161248 Approved by: https://github.com/isuruf Co-authored-by: Isuru Fernando <isuruf@gmail.com>	2025-10-18 11:53:48 +00:00
PyTorch MergeBot	beb6b62e8c	Revert "Enable more DTensor tests in local tensor mode and fix more integration issues (#165716 )" This reverts commit 1b397420f22b22f90a1093233ecd9167656e50cb. Reverted https://github.com/pytorch/pytorch/pull/165716 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165716#issuecomment-3418083391))	2025-10-18 09:15:49 +00:00
Chien-Chin Huang	4740ce7787	[CP] Fix load balancer incorrectly assuming batch dimension exists (#165792 ) https://github.com/pytorch/pytorch/pull/163617 removes the if/else statement to check if the input buffers have the batch dimension. This PR fixes the issue and also adds a test. In the future, we should explicitly ask users to unsqueeze the batch dimension. This is a BC of the existing contract but implicitly infers the batch dimension existence is not safe. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165792 Approved by: https://github.com/XilunWu	2025-10-18 09:11:16 +00:00
Isalia20	ad67170c8b	[MPS] sparse matmuls (#165232 ) Implements matmuls for sparse tensors. With this commit most of the core sparse operations should be implemented. Fixes: https://github.com/pytorch/pytorch/issues/156540 https://github.com/pytorch/pytorch/issues/129842 Should be merged after: https://github.com/pytorch/pytorch/pull/165102 To compare MPS and CPU, you can use this script: ```python import torch import time import matplotlib.pyplot as plt B, I, J, K = 8, 20000, 20000, 20000 num_iterations = 500 nnz_values = [10, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 100000] speedups = [] for nnz in nnz_values: indices = torch.stack([ torch.randint(0, B, (nnz,)), torch.randint(0, I, (nnz,)), torch.randint(0, J, (nnz,)), ]) values = torch.rand(nnz) sparse = torch.sparse_coo_tensor(indices, values, size=(B, I, J), device="mps").coalesce() dense = torch.randn(B, J, 200, device="mps") t1 = time.time() for _ in range(num_iterations): result = torch.bmm(sparse, dense) torch.mps.synchronize() t2 = time.time() mps_time = (t2 - t1) / num_iterations sparse_cpu = sparse.cpu() dense_cpu = dense.cpu() t1 = time.time() for _ in range(num_iterations): result_cpu = torch.bmm(sparse_cpu, dense_cpu) t2 = time.time() cpu_time = (t2 - t1) / num_iterations speedup = cpu_time / mps_time speedups.append(speedup) print(f"nnz={nnz}: MPS={mps_time:.6f}s, CPU={cpu_time:.6f}s, Speedup={speedup:.2f}x") plt.figure(figsize=(10, 6)) plt.plot(nnz_values, speedups, marker='o', linewidth=2, markersize=8) plt.xlabel('Number of Non-Zero Elements (nnz)', fontsize=12) plt.ylabel('Speedup (CPU time / MPS time)', fontsize=12) plt.title('MPS vs CPU Speedup for Sparse-Dense BMM', fontsize=14) plt.grid(True, alpha=0.3) plt.axhline(y=1, color='r', linestyle='--', alpha=0.5) plt.xscale('log') plt.tight_layout() plt.show() ``` ## Tested on M1 Pro <img width="1000" height="600" alt="Figure_1" src="https://github.com/user-attachments/assets/4a2402ec-3dc4-402d-8196-a0426906ca3d" /> Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/165232 Approved by: https://github.com/malfet	2025-10-18 09:04:42 +00:00
Yuanyuan Chen	fdab48a7c1	Enable all PIE rules on ruff (#165814 ) This PR enables all PIE rules on ruff, there are already some enabled rules from this family, the new added rules are ``` PIE796 Enum contains duplicate value: {value} PIE808 Unnecessary start argument in range ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165814 Approved by: https://github.com/ezyang	2025-10-18 07:36:18 +00:00
Nichols A. Romero	a0948d4d23	[ROCm][inductor] autotune support for persistent reduction kernels (#163908 ) After the removal of want_no_x_dim for persistent reduction kernels, we can improve the autotuning setup for persistent reduction kernels. Currently, even with tuning enabled, filtering will only try a single config in many cases. Avoid filtering with autotune mode, and override MAX_BLOCK limit. Also we always include tiny_config when autotuning is enabled. Contributions from several members of the AMD Inductor and Triton teams: @jataylo @iupaikov-amd @AmdSampsa @xiaohuguo2023 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163908 Approved by: https://github.com/jansel, https://github.com/PaulZhang12	2025-10-18 07:33:24 +00:00
Nichols A. Romero	0bbdd6b8db	[ROCm][inductor] heuristic improvements for pointwise kernels (#163197 ) Heuristic improvements for pointwise kernels for MI350. Contributions from several members of the AMD Inductor and Triton teams: @jataylo @AmdSampsa @iupaikov-amd @xiaohuguo2023 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163197 Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/jansel Co-authored-by: AmdSampsa <sampsa.riikonen@amd.com> Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com>	2025-10-18 07:23:41 +00:00
PyTorch MergeBot	24520b8386	Revert "Enable all PIE rules on ruff (#165814 )" This reverts commit c79dfdc6550e872783aa5cb5fc9e86589bf18872. Reverted https://github.com/pytorch/pytorch/pull/165814 on behalf of https://github.com/cyyever due to Need to cover more files ([comment](https://github.com/pytorch/pytorch/pull/165814#issuecomment-3417931863))	2025-10-18 07:21:08 +00:00
Yuanyuan Chen	c79dfdc655	Enable all PIE rules on ruff (#165814 ) This PR enables all PIE rules on ruff, there are already some enabled rules from this family, the new added rules are ``` PIE796 Enum contains duplicate value: {value} PIE808 Unnecessary start argument in range ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165814 Approved by: https://github.com/ezyang	2025-10-18 06:40:12 +00:00
Yuanyuan Chen	e595136187	Enable PLC1802 on ruff (#165813 ) This PR enables ruff check `PLC1802`, which detects len calls on sequences in a boolean test context. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165813 Approved by: https://github.com/ezyang	2025-10-18 05:44:14 +00:00
Yuanyuan Chen	aaac8cb0f5	[1/N] Add strict parameter to Python zip calls (#165531 ) Add `strict=True/False` to zip calls in test utils. `strict=True` is passed when possible. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165531 Approved by: https://github.com/Skylion007	2025-10-18 05:26:33 +00:00
Yuanyuan Chen	0f0b4bf029	[1/N] Remove unused header inclusion (#165763 ) This PR removes unused header inclusion in C++ files. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165763 Approved by: https://github.com/Skylion007	2025-10-18 05:23:11 +00:00
Yuanyuan Chen	b8194268a6	Remove unnecessary noqa suppressions (#164106 ) This PR removes unused `noqa` suppressions in Python code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164106 Approved by: https://github.com/albanD	2025-10-18 04:52:41 +00:00
Maggie Moss	f02e3947f6	Expand type checking to mypy strict files (#165697 ) Expands Pyrefly type checking to check the files outlined in the mypy-strict.ini configuration file: Pull Request resolved: https://github.com/pytorch/pytorch/pull/165697 Approved by: https://github.com/ezyang	2025-10-18 04:34:45 +00:00
Huy Do	9095a9dfae	[CD] Apply the fix from #162455 to aarch64+cu129 build (#165794 ) When trying to bring cu129 back in https://github.com/pytorch/pytorch/pull/163029, I mainly looked at https://github.com/pytorch/pytorch/pull/163029 and missed another tweak coming from https://github.com/pytorch/pytorch/pull/162455 I discover this issue when testing aarch64+cu129 builds in https://github.com/pytorch/test-infra/actions/runs/18603342105/job/53046883322?pr=7373. Surprisingly, there is no test running for aarch64 CUDA build from what I see in `79a37055e7`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165794 Approved by: https://github.com/malfet	2025-10-18 04:16:24 +00:00
Animesh Jain	d9f94e0d7d	[dynamo] Support fx.traceback.annotate as decorator (#165805 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165805 Approved by: https://github.com/Lucaskabela, https://github.com/SherlockNoMad, https://github.com/yushangdi	2025-10-18 03:58:11 +00:00
Simon Layton	23417ae50f	[Submodule] Bump FBGEMM to latest (#165544 ) Summary: * FBGEMM submodule updated to main * CMake updated to reflect necessary changes * Notably pulls in NVFP4 grouped gemm kernels Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165544 Approved by: https://github.com/cyyever, https://github.com/jeffdaily	2025-10-18 03:58:08 +00:00
Yiming Zhou	e4d6c56ffb	Improve dynamo graph capture stack trace for custom ops (#165693 ) For a custom op ``` @torch.library.custom_op("my_lib::foo", mutates_args={}) def foo(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: return x + y ``` ppl could call `torch.ops.my_lib.foo()` or directly call `foo()` in the `forward` of an `nn.Module` These two calling conventions will lead to the same node in the output graph, but different stack traces. When directly calling `foo()`, the displayed stack_trace in the graph will be ``` # File: .../pytorch/torch/_library/custom_ops.py:687 in __call__, code: return self._opoverload(args, *kwargs) ``` This is not useful so we filter it out. ``` python test/functorch/test_aot_joint_with_descriptors.py -k test_custom_op_stack_trace ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165693 Approved by: https://github.com/SherlockNoMad, https://github.com/williamwen42	2025-10-18 03:48:18 +00:00
Laith Sakka	017d2985f3	set unbacked bindings in reinplace pass for newly created nodes during generalize_scatter decomp (#164948 ) Two fixes: 1. in rein_place pass, set unbacked bindings for newly created nodes. 2. In inductor, ComputeBuffer used to miss detecting some used symbols, fixed that. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164948 Approved by: https://github.com/bobrenjc93 ghstack dependencies: #164341	2025-10-18 03:20:30 +00:00
Laith Sakka	c6a8db0b9a	Fix issues with generalized_scatter and setitem allocated unbacked symbols. (#164341 ) Three fixes: 1. When doing t[u0] +=1 if u0 is unbacked we could allocate a new unbacked symbol during the indexing of t[u0] (when we fake trace setitem), namely because meta_select does allocate a new unbacked symbol for the storage offset when we do not know if u0>=0 or u0<0. But the output size/stride of setitem() does not depend on that new symbol. It's self consumed in setitem so we shall ignore it. 2. Also when we trace through generalized_scatter the applications of the views could allocate unbacked symints but those do not affect the final output, we also shall ignore them. 3. Before accessing strides in lowering we shall materialize. Address https://github.com/pytorch/pytorch/issues/114293 and https://github.com/pytorch/pytorch/issues/131911 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164341 Approved by: https://github.com/bobrenjc93	2025-10-18 03:20:30 +00:00
Aaron Gokaslan	de09bab4b6	[BE]: Update cudnn frontend submodule to 1.15.0 (#165776 ) Update cudnn frontend submodule to 1.15.0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165776 Approved by: https://github.com/eqy	2025-10-18 02:23:27 +00:00
jmaczan	c137e222d4	.venv/ in .gitignore (#165418 ) `uv venv` creates venv in `.venv/` directory. So, it's useful to have `.venv/` in `.gitignore`, since perhaps more people are using `uv` in their work. As per comment `3592f5f4e5 (diff-bc37d034bad564583790a46f19d807abfe519c5671395fd494d8cce506c42947)` uv docs that confirms it: https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments Pull Request resolved: https://github.com/pytorch/pytorch/pull/165418 Approved by: https://github.com/ezyang	2025-10-18 02:00:52 +00:00
Shangdi Yu	cf3a787bbc	[annotate] Annotate bw nodes before eliminate dead code (#165782 ) Fixes https://github.com/pytorch/torchtitan/pull/1907 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165782 Approved by: https://github.com/SherlockNoMad	2025-10-18 01:54:31 +00:00
drisspg	de3da77cf7	Thread deterministic config vars to subproc compilation (#165729 ) # Summary TIL (AFTER WAYYYY TOO MUCH INSANITY), that we do not serialize the full set of configs for the subproc compilation. I found this while working on Flex-attention determinism: https://github.com/meta-pytorch/attention-gym/pull/168 might be good to audit if we need to thread through any more Pull Request resolved: https://github.com/pytorch/pytorch/pull/165729 Approved by: https://github.com/shunting314, https://github.com/eellison	2025-10-18 01:25:50 +00:00
Ti-Tai Wang	543ddbf44c	[ONNX] Support renaming in dynamic axes to shapes conversion (#165769 ) Discovered in #165748 This PR also deprecates the conversion. ONNX exporter team does not intend to maintain the conversion in long term. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165769 Approved by: https://github.com/justinchuby	2025-10-18 01:11:20 +00:00
orangeH25	e9f4999985	[Code Clean] Replace std::runtime_error with TORCH_CHECK (#165305 ) Fixes part of #148114 Including: - torch/csrc/distributed Pull Request resolved: https://github.com/pytorch/pytorch/pull/165305 Approved by: https://github.com/FFFrog, https://github.com/albanD	2025-10-18 01:08:44 +00:00
Chris Leonard	29b029648e	Fixed issue with GradTrackingTensor not properly propagating sparse layout (#165765 ) Fixes #164286 Fixed issue with GradTrackingTensor not properly propagating sparse layout. @ezyang @jcaip Pull Request resolved: https://github.com/pytorch/pytorch/pull/165765 Approved by: https://github.com/ezyang	2025-10-18 01:00:53 +00:00
Shivam Raikundalia	a25a649e70	[Mem Snapshot] Add Metadata Field (#165490 ) Summary: The implementation adds the ability to: Set custom metadata strings that will be attached to all subsequent allocations Clear or change the metadata at any point View the metadata in memory snapshots via _dump_snapshot() Test Plan: Added test in test_cuda.py and check manually in snapshot to see that metadata was added. Differential Revision: D84654933 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165490 Approved by: https://github.com/yushangdi	2025-10-17 23:46:02 +00:00
PyTorch MergeBot	69c33898fa	Revert "[Inductor][CuTeDSL] Move load_template up two directories (#165347 ) (#165576 )" This reverts commit febb60323018948b2b9d2cff35b3cc4e0d0c55c8. Reverted https://github.com/pytorch/pytorch/pull/165576 on behalf of https://github.com/seemethere due to This was actually reverted internally, current PR is linked to a stale diff so diff train tools think that this is landed via co-dev when it was actually reverted ([comment](https://github.com/pytorch/pytorch/pull/165576#issuecomment-3417510146))	2025-10-17 23:33:17 +00:00
Dzmitry Huba	1b397420f2	Enable more DTensor tests in local tensor mode and fix more integration issues (#165716 ) - During op dispatch local tensor is supposed to collect rng state from CPU and CUDA devices so that it can be reset before execution of the op for each such that ops with randomness produce the same result for all ranks (note that we are planning a separate change to add support of per rank rng state). Previously we relied on op input arguments to deduce which devices to get rng state from. Which doesn't work for factory functions such as torch.randn. Hence this change switches to unconditionally collecting rng state from all devices. - Fixing per rank specific computations in _MaskedPartial and Shard placements discovered during test enablement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165716 Approved by: https://github.com/ezyang	2025-10-17 23:28:22 +00:00
drisspg	fe80f03726	Add B200 files to labeler and update codeowners (#165767 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165767 Approved by: https://github.com/slayton58	2025-10-17 23:24:17 +00:00
PyTorch MergeBot	e50dc40d28	Revert "Update gm.print_readable to include Annotation (#165397 )" This reverts commit 7a657700131f31577544e93587eb339618677e97. Reverted https://github.com/pytorch/pytorch/pull/165397 on behalf of https://github.com/malfet due to I don't know how/why, but it breaks windows tests, see `2e22b1a61e/1` ([comment](https://github.com/pytorch/pytorch/pull/165397#issuecomment-3417428128))	2025-10-17 22:35:50 +00:00
Wes Bland	2e22b1a61e	[pytorch] Composite backend potential fix for is_backend_available (#165061 ) Summary: `is_backend_available` takes in a string and expects it to only be a backend; if it's given a composite (device:backend) string, it fails. Reviewed By: prashrock Differential Revision: D81886736 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165061 Approved by: https://github.com/H-Huang	2025-10-17 22:06:36 +00:00
Animesh Jain	616c6bdf8f	[dynamo][ac] Config flag to allow eager and compile AC divergence for side-effects (#165775 ) Eager AC/SAC reapplies the mutations (like global dict mutations) in the backward during the recomputation of forward. torch.compile has no easy way to reapply python mutations in the backward. But many users might be ok to skip reapplication of side effects in the backward. They can set this config flag to accept this eager and compile divergence. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165775 Approved by: https://github.com/zou3519 ghstack dependencies: #165734	2025-10-17 22:04:19 +00:00
Animesh Jain	c18ddfc572	[dynamo][easy] Support torch.accelerator.current_accelerator (#165734 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165734 Approved by: https://github.com/Skylion007	2025-10-17 22:04:19 +00:00
Zhengxu Chen	86ebce1766	[precompile] Pass tensor_to_context to backend. (#165702 ) Summary: Fixing a VLLM issue https://github.com/vllm-project/vllm/issues/27040 where aot precompile fails on some models using symbolic shapes in inductor. Test Plan: pp HF_HUB_DISABLE_XET=1 VLLM_ENABLE_V1_MULTIPROCESSING=0 VLLM_USE_AOT_COMPILE=1 vllm bench latency --model microsoft/DialoGPT-small --input-len 128 --output-len 256 --num-iters 50 --dtype float16 Reviewers: Subscribers: Tasks: Tags: Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165702 Approved by: https://github.com/tugsbayasgalan	2025-10-17 21:52:04 +00:00
Nan Zhang	8cb2fb44f2	[Inductor] Support fallback for all gemm like ops (#165755 ) Summary: Fill op_override field for bmm aten ops so they can be converted properly in the wrapper_fxir backend Reviewed By: StellarrZ Differential Revision: D84840948 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165755 Approved by: https://github.com/blaine-rister	2025-10-17 21:08:29 +00:00
zpcore	ab65498d71	Fix `_StridedShard` incorrect split (#165533 ) https://github.com/pytorch/pytorch/pull/164820 introduced a bug that `_StridedShard` will call parent class `Shard`'s `split_tensor` method, thus results in incorrect data locality. (I think @ezyang spotted this issue, but we have no test to capture this) Meanwhile, I notice another bug that when we normalize a `_StridedShard`'s placement, it will also trigger parent class `Shard`'s `split_tensor` method because it will create a Shard class [here](`0c14f55de6/torch/distributed/tensor/_api.py (L783)`). I think we never test `distribute_tensor` for `_StridedShard` before. So I added a test here to compare against ordered shard. Using classmethod because the _split_tensor logic is different between `Shard` and `_StridedShard`. Basically I want to shard on local tensors without initializing the Shard object: ``` local_tensor = _StridedShard._make_shard_tensor(dim, tensor, mesh, mesh_dim, split_factor=split_factor) local_tensor = Shard._make_shard_tensor(dim, tensor, mesh, mesh_dim) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165533 Approved by: https://github.com/XilunWu	2025-10-17 20:54:46 +00:00
PyTorch MergeBot	06d324365c	Revert "Escaped html tags name and target to appear as strings (#165543 )" This reverts commit 080365b7d82a3c99c995cab6dc912b7dfe22aa41. Reverted https://github.com/pytorch/pytorch/pull/165543 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165543#issuecomment-3417102048))	2025-10-17 20:45:48 +00:00
Yuanyuan Chen	6c9c6e0936	Enable C407 of flake8 (#165046 ) This PR enables C407 on flake8. The description is `C407` is `Unnecessary list comprehension - ‘<builtin>’ can take a generator`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165046 Approved by: https://github.com/albanD	2025-10-17 20:15:39 +00:00
Rohit Singh Rathaur	2bcd892c86	[distributed] Replace assert statements in distributed checkpoint with explicit checks (#165256 ) Fixes partially #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165256 Approved by: https://github.com/albanD	2025-10-17 20:14:35 +00:00
Shangdi Yu	75e2a9fae3	[annotate] add annotate_fn function decorator (#165703 ) Example usage: ``` @fx_traceback.annotate_fn({"pp_stage": 1}) def example_function(x): return x * x class SimpleLinear(nn.Module): def __init__(self): super().__init__() self.linear = nn.Linear(3, 2) def forward(self, x): with fx_traceback.annotate({"pp_stage": 0}): y = self.linear(x) y = example_function(y) return y - 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165703 Approved by: https://github.com/SherlockNoMad	2025-10-17 20:10:53 +00:00
Eddie Yan	a16fd6b488	[NVSHMEM][Triton] Fix NVSHMEM triton test for wacky world sizes (#165704 ) Currently assumes divisible by 4? world size Not as slick as the old setup code but more general Pull Request resolved: https://github.com/pytorch/pytorch/pull/165704 Approved by: https://github.com/Skylion007, https://github.com/kwen2501	2025-10-17 19:33:26 +00:00
vishalgoyal316	382b0150de	[docs] Add usage examples to ConvTranspose1d docstring (#165618 ) Fixes #165615 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165618 Approved by: https://github.com/mikaylagawarecki	2025-10-17 19:11:57 +00:00
Kasparas Karlauskas	a664b299ac	Update docs for torch.mode (#165614 ) Currently the docs for `torch.mode` include a note: `This function is not defined for torch.cuda.Tensor yet.` However with `torch==2.7.1+cu126` when I try to get the mode of a Tensor that is in cuda memory, I do not face any issues: ``` >>> a = torch.tensor([0, 2, 1, 1, 1, 3, 3]) >>> a.mode() torch.return_types.mode( values=tensor(1), indices=tensor(4)) >>> a.cuda().mode() torch.return_types.mode( values=tensor(1, device='cuda:0'), indices=tensor(4, device='cuda:0')) ``` Am I misunderstanding the note? If not, I suggest removing it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165614 Approved by: https://github.com/mikaylagawarecki	2025-10-17 19:06:33 +00:00
vishalgoyal316	9c12651417	Improve error message for non-positive groups in convolution (#165669 ) Prevents from segmentation fault for invalid groups value in convolution. Fixes #142835 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165669 Approved by: https://github.com/mikaylagawarecki	2025-10-17 19:06:05 +00:00
Tugsbayasgalan Manlaibaatar	08c97b4a1f	Don't run compile inside kernel invocation (#165687 ) When we call torch.compile during fake tensor prop, we shouldn't actually compile because we can't guarantee that the compiled artifact can be fake tensor prop-d. (for example, inductor backend). Instead we should just skip compiling. However, the inner compile will be triggered when being executed in runtime. Fixes: https://github.com/pytorch/pytorch/issues/151328 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165687 Approved by: https://github.com/zou3519	2025-10-17 19:03:57 +00:00
PyTorch MergeBot	fae74cd52f	Revert "shrink_group implementation to expose ncclCommShrink API (#164518 )" This reverts commit a032510db38e8331afa08f7635d146f9cefdd0ab. Reverted https://github.com/pytorch/pytorch/pull/164518 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164518#issuecomment-3416718767))	2025-10-17 18:55:53 +00:00
Sherlock Huang	7a65770013	Update gm.print_readable to include Annotation (#165397 ) Sample output ``` [rank0]: # Annotation: {'compile_with_inductor': 'flex_attention'} File: /data/users/bahuang/pytorch/torch/nn/attention/flex_attention.py:1490 in flex_attention, code: out, lse, max_scores = flex_attention_hop( [rank0]: score_mod_2 = self.score_mod_2 [rank0]: mask_fn_2 = self.mask_fn_2 [rank0]: flex_attention_1 = torch.ops.higher_order.flex_attention(xq_5, xk_5, xv_3, score_mod_2, (2048, 2048, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_kv_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___q_indices, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_num_blocks, g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___full_q_indices, 128, 128, mask_fn_2), 0.25, {'PRESCALE_QK': False, 'ROWS_GUARANTEED_SAFE': False, 'BLOCKS_ARE_CONTIGUOUS': False, 'WRITE_DQ': True, 'OUTPUT_LOGSUMEXP': True, 'OUTPUT_MAX': False}, (), (g____import_torchtitan_dot_models_dot_attention___flex_attention_block_masks___block_causal___none___mask_mod___closure___0_cell_contents,)); xq_5 = xk_5 = xv_3 = score_mod_2 = mask_fn_2 = None [rank0]: out_2: "bf16[8, 4, 2048, 16]" = flex_attention_1[0]; flex_attention_1 = None ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165397 Approved by: 
https://github.com/yushangdi, https://github.com/anijain2305	2025-10-17 18:35:18 +00:00
Jane Xu	e4454947e2	Widen ops support to take in IntHOArrayRef vs only std::vec (#165152 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165152 Approved by: https://github.com/mikaylagawarecki ghstack dependencies: #164991	2025-10-17 18:32:39 +00:00
Jane Xu	3806e9767b	Refactor out headeronly ArrayRef (#164991 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164991 Approved by: https://github.com/swolchok	2025-10-17 18:32:39 +00:00
PyTorch MergeBot	b08d8c2e50	Revert "[DebugMode][2/N] add nn.Module tracking (#165498 )" This reverts commit 45afaf08a14ab760d86ea80dea6d50cec8626513. Reverted https://github.com/pytorch/pytorch/pull/165498 on behalf of https://github.com/seemethere due to First part of the stack was reverted so will need to revert this too ([comment](https://github.com/pytorch/pytorch/pull/165498#issuecomment-3416618198))	2025-10-17 18:22:48 +00:00
Colin L Reliability Rice	ca5b7f8ded	torch.compile: populate compiler_config (#165581 ) Summary: This starts writing the compiler_config metadata into logger Test Plan: Modified existing test case to make sure this is not null. (Also eyeballed what we're logging to make sure it's reasonable.) Reviewed By: masnesral Differential Revision: D84014636 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165581 Approved by: https://github.com/masnesral	2025-10-17 18:21:18 +00:00
PyTorch MergeBot	9a71d96256	Revert "[DebugMode][1/N] refactor logs into _DebugCalls (#165376 )" This reverts commit 556fc09a9f67f24ca5591ec049c5d0c347c5f62a. Reverted https://github.com/pytorch/pytorch/pull/165376 on behalf of https://github.com/seemethere due to This is failing for internal tests, see D84877379 for more context ([comment](https://github.com/pytorch/pytorch/pull/165376#issuecomment-3416570407))	2025-10-17 18:08:59 +00:00
Luca Wehrstedt	0d4c2b71e8	[DeviceMesh] Simplify unflatten method (#165556 ) By adding a few small helpers (e.g., a `splice` method to `_MeshLayout`, and making `_init_process_groups` static and thus stateless) we can substantially shorten the definition of the unflatten method, and help readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165556 Approved by: https://github.com/fduwjj ghstack dependencies: #165554, #165555	2025-10-17 17:57:51 +00:00
Luca Wehrstedt	d659bbde62	[DeviceMesh] Introduce private constructor instead of _create_mesh_from_ranks (#165555 ) The refactoring of DeviceMesh is heavily constrained by the signature of its constructor, which is a public API which contains some "legacy" concepts which we'd love to get rid of, such as an explicit/materialized `mesh` Tensor. In other languages the solution to this would be to add a private overload of the constructor. Python doesn't natively allow this, but in this PR I managed to build something that approximates it. This new private constructor basically only takes `_layout`, `_global_rank_permutation`, and `mesh_dim_names`. With such a constructor we can effectively simplify a lot of callsites and get rid of the `_create_mesh_from_ranks` helper method. That's a good thing because it was instantiating many DeviceMeshes in a for loop, which always felt unnecessary. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165555 Approved by: https://github.com/fduwjj, https://github.com/fegin ghstack dependencies: #165554	2025-10-17 17:57:51 +00:00
Luca Wehrstedt	58879bfafa	[DeviceMesh] Prefer using _layout over _mesh for all sorts of things (#165554 ) The goal of this PR is to avoid storing the explicit `mesh` Tensor inside each DeviceMesh, and instead compute it on-the-fly when the end user needs it, and try to replace all of its internal usages with `_layout` and the newly-introduced `_global_rank_permutation` Tensor. The name of this attribute is up for debate. The advantage of the `_global_rank_permutation` Tensor is that it is _the same_ Tensor for the root mesh and all its children, so it doesn't need to be copied/reallocated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165554 Approved by: https://github.com/fduwjj	2025-10-17 17:57:51 +00:00
Bruce Chang	a032510db3	shrink_group implementation to expose ncclCommShrink API (#164518 ) Closes #164529 To expose the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch. This is useful when you need to exclude certain GPUs or nodes from a collective operation, for example in fault tolerance scenarios or when dynamically adjusting resource utilization. For more info: [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518 Approved by: https://github.com/Skylion007, https://github.com/syed-ahmed, https://github.com/kwen2501	2025-10-17 17:55:03 +00:00
Simon Layton	39e0a832c9	Fix B200 test fails in scaled_mm (#165747 ) Summary: PR #165528 changes some scale/swizzle inference behavior in scaled_mm tests - mxfp8 tests on Blackwell can get incorrectly classified, resulting in failures. Fix the scale/swizzle inference code to prevent this. Fixes https://github.com/pytorch/pytorch/issues/165743 Test Plan: ``` pytest -svv test/test_scaled_matmul_cuda.py ``` Reviewers: @jagadish-amd @jeffdaily @drisspg Subscribers: @Aidyn-A Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165747 Approved by: https://github.com/eqy, https://github.com/drisspg, https://github.com/jeffdaily	2025-10-17 17:52:19 +00:00
James Wu	dd3b48e85d	Fix bug with serialization after AOTAutogradCache hit (#165474 ) Fixes #165447 On AOTAutogradCache load, the serialization function we pick is just lambda: self, because the object itself is an AOTAutogradCacheEntry. However, this isn't safe, because `wrap_post_compile` will make `self` unserializable, since it needs to load triton kernels and stuff! So instead, on AOTAutogradCache load, we preserve the bytes that were used to load the object to begin with, and return that object on a call to serialize(). This effectively makes it so that we save a copy of the pre-hydrated artifact, without needing to do an eager copy until someone actually calls `serialize`. Test Plan: Run ```py import torch class M(torch.nn.Module): def __init__(self): super().__init__() self.linear1 = torch.nn.Linear(2, 4) self.relu = torch.nn.ReLU() self.linear2 = torch.nn.Linear(4, 8) def forward(self, x): return self.linear2(self.relu(self.linear1(x))) device = "cuda" m = M().to(device) sample_inputs = (torch.randn(2, 2, device=device),) eager_out = m(sample_inputs) with torch._dynamo.config.patch("enable_aot_compile", True): compiled_fn_path = "./m.pt" compiled_fn = torch.compile( m, fullgraph=True ).forward.aot_compile((sample_inputs, {})) compiled_fn.save_compiled_function(compiled_fn_path) torch._dynamo.reset() with torch.compiler.set_stance("fail_on_recompile"): with open(compiled_fn_path, "rb") as f: loaded_fn = torch.compiler.load_compiled_function(f) assert loaded_fn is not None compiled_out = loaded_fn(m, sample_inputs) assert torch.allclose(eager_out, compiled_out) ``` twice, see that it succeeds. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165474 Approved by: https://github.com/yiming0416, https://github.com/zhxchen17	2025-10-17 17:47:24 +00:00
jmaczan	cff1b20771	Patch the flex_attention._get_mod_type to not use inspect.signature when computing num_positional_args (an alternative fix for flex attention graph break on create_block_mask) (#164923 ) The initial fix for inspect.signature did not take the right approach (https://github.com/pytorch/pytorch/pull/164349#pullrequestreview-3306614010). As @williamwen42 suggests (https://github.com/pytorch/pytorch/pull/164349#issuecomment-3379222885), we can for now just get rid of the `inspect.signature` call in flex_attention to resolve this high priority issue (https://github.com/pytorch/pytorch/issues/164247#issuecomment-3378673179). In this PR I did exactly this - limited the scope of the fix to just computing `num_positional_args` in `flex_attention._get_mod_type` based on properties returned by `NestedUserFunctionVariable.const_getattr` (some were missing so I added them). Fixes #164247 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164923 Approved by: https://github.com/williamwen42	2025-10-17 17:44:45 +00:00
Jeff Daily	da8517fa63	[ROCm][CI] upgrade wheels to 7.0.2 and 6.4.4 patch release (#165756 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165756 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-17 17:41:19 +00:00
Pian Pawakapan	45afaf08a1	[DebugMode][2/N] add nn.Module tracking (#165498 ) Uses ModTracker to record nn.Module entries, much like CommDebugMode. Can be switched on with `DebugMode(record_nn_module=True)`: ``` [nn.Mod] Bar [nn.Mod] Bar.abc [nn.Mod] Bar.abc.l1 aten::t(t: f32[4, 4]) aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) [nn.Mod] Bar.abc.l2 aten::t(t: f32[4, 4]) aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4]) [nn.Mod] Bar.xyz aten::t(t: f32[4, 4]) aten::addmm(t: f32[4], t: f32[4, 4], t: f32[4, 4])""" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165498 Approved by: https://github.com/SherlockNoMad ghstack dependencies: #165376	2025-10-17 17:39:48 +00:00
Turner Richmond	080365b7d8	Escaped html tags name and target to appear as strings (#165543 ) Fixes small typo in markdown documentation file - Added escape characters to precede tag pattern. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165543 Approved by: https://github.com/mikaylagawarecki	2025-10-17 17:35:18 +00:00
PyTorch MergeBot	2928c5c572	Revert "Pyrefly suppressions 2 (#165692 )" This reverts commit 43d78423ac224cce432bf34ed9627035169d5433. Reverted https://github.com/pytorch/pytorch/pull/165692 on behalf of https://github.com/seemethere due to This is causing merge conflicts when attempting to land internally, see D84890919 for more details ([comment](https://github.com/pytorch/pytorch/pull/165692#issuecomment-3416397240))	2025-10-17 17:13:04 +00:00
Animesh Jain	630520b346	[dynamo][misc] Replace UserFunctionVariable with VariableTracker build (#165707 ) Audit: To prevent future issues with functools.partial or callable objects. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165707 Approved by: https://github.com/Lucaskabela ghstack dependencies: #165683, #165706	2025-10-17 17:02:18 +00:00
Animesh Jain	1dc9a05d03	[dynamo][user_defined] Replace UserFunctionVariable with VariableTracker build (#165706 ) Audit: To prevent future issues with functools.partial or callable objects. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165706 Approved by: https://github.com/Lucaskabela ghstack dependencies: #165683	2025-10-17 17:02:18 +00:00
Han, Xu	bfcdbd0a97	fix wrong accuracy_status when exception. (#165731 ) While debugging an `XPU` accuracy issue, I found that the script output the wrong accuracy_status. When the `try` block raises an exception, we should process the exception rather than return `fail_accuracy`. Before fixing, it returned as `fail_accuracy`: <img width="1109" height="216" alt="image" src="https://github.com/user-attachments/assets/385c354f-fbf6-48e4-a1be-3e37e987341b" /> After fixing, it returned the exception message: <img width="1101" height="292" alt="image" src="https://github.com/user-attachments/assets/f18c0e3c-8358-4ec7-a6bb-c2e01b69d27f" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165731 Approved by: https://github.com/Stonepia, https://github.com/chuanqi129, https://github.com/Lucaskabela	2025-10-17 16:37:06 +00:00
PyTorch MergeBot	faff826a46	Revert "[ROCm] new implementation of upsample_bilinear2d_backward (#164572 )" This reverts commit 53f9ae0e50d4dcc47f2ca4bf854803f9d4f875ae. Reverted https://github.com/pytorch/pytorch/pull/164572 on behalf of https://github.com/seemethere due to Looks like this is failing in our internal builds, will post a suggestion for a fix but want you to double verify that this behavior is correct ([comment](https://github.com/pytorch/pytorch/pull/164572#issuecomment-3416262676))	2025-10-17 16:27:59 +00:00
PyTorch MergeBot	85c5433d38	Revert "Fix `_StridedShard` incorrect split (#165533 )" This reverts commit dfc8a1c5ddc8401197e9ab546e03b0f745edc27b. Reverted https://github.com/pytorch/pytorch/pull/165533 on behalf of https://github.com/seemethere due to Causing a merge conflict internally, see D84829161 ([comment](https://github.com/pytorch/pytorch/pull/165533#issuecomment-3416143176))	2025-10-17 15:57:01 +00:00
inventshah	935ccdbe75	[MPS] Fix internal assertion in torch.linalg.solve for singular matrices (#165254 ) Fixes #163962 by special casing MPS in the negative status code branch in `_linalg_check_errors`. Checks if info is [`MPSMatrixDecompositionStatus.singular`](https://developer.apple.com/documentation/metalperformanceshaders/mpsmatrixdecompositionstatus/singular) (which has a raw value of -2). I didn't find an official Apple source with this raw value (besides printing the enum value), so I'm not sure if we can (or should) depend on it? Is there a way to directly get the Objective-C enum value in C++? Pull Request resolved: https://github.com/pytorch/pytorch/pull/165254 Approved by: https://github.com/malfet	2025-10-17 15:35:49 +00:00
Isuru Fernando	3af2f0c12a	[inductor] require shape in TritonCSEVariable (#162275 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275 Approved by: https://github.com/mlazos ghstack dependencies: #164158	2025-10-17 14:47:45 +00:00
Nikita Shulga	6ece527fc5	[CI] Add aarch64 operator benchmark (#165585 ) Running on Graviton4 Skip ConvTranspose1d benchmarks if PyTorch is compiled with ACL, due to https://github.com/pytorch/pytorch/issues/165654 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165585 Approved by: https://github.com/huydhn	2025-10-17 14:42:14 +00:00
PaulZhang12	ce29d0d796	[ATen] Vectorize 8 elements on 16 bit data types for sum/mean (#165055 ) Benchmarks for a full reduction + reduction on the contiguous dimension. Vectorized loads do not occur on the non contiguous dimension. Benchmarking done for FP16/BF16, ~6% improvement on average across shapes, up to ~24% for single reduction on contiguous dimension and 46% for full reduce: BF16 ``` Tensor Shape Operation Full reduce (ms) Contiguous dim (ms) Full reduce (ms) Contiguous dim (ms) Full reduce diff % Contiguous diff % ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (256, 256) mean 0.022686 0.008263 0.015498 0.008117 +46.38% +1.80% (256, 256) sum 0.022769 0.008269 0.015628 0.008185 +45.69% +1.03% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 512) mean 0.014116 0.009545 0.012892 0.008839 +9.49% +7.99% (512, 512) sum 0.014110 0.009892 0.012891 0.008878 +9.46% +11.42% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.014727 0.012642 0.014061 0.010519 +4.74% +20.18% (1024, 1024) sum 0.014376 0.012636 0.014069 0.010595 +2.18% +19.26% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.018663 0.018294 0.018171 0.014678 +2.71% +24.64% (2048, 2048) sum 0.018638 0.017931 0.018142 0.014713 +2.73% +21.87% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 
0.034216 0.036953 0.033520 0.030585 +2.08% +20.82% (4096, 4096) sum 0.034196 0.036942 0.033518 0.030676 +2.02% +20.43% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.087763 0.095201 0.085439 0.084960 +2.72% +12.05% (8192, 8192) sum 0.088079 0.095592 0.085353 0.084632 +3.19% +12.95% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 16384) mean 0.148174 0.149705 0.146274 0.138865 +1.30% +7.81% (8192, 16384) sum 0.147820 0.149371 0.146419 0.138752 +0.96% +7.65% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 32768) mean 0.266144 0.260807 0.265953 0.253330 +0.07% +2.95% (8192, 32768) sum 0.266572 0.261163 0.265729 0.253294 +0.32% +3.11% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 65536) mean 0.502034 0.486312 0.498417 0.481246 +0.73% +1.05% (8192, 65536) sum 0.501597 0.486351 0.497735 0.481579 +0.78% +0.99% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 131072) mean 0.971178 0.942988 0.957164 0.938316 +1.46% +0.50% (8192, 131072) sum 0.971189 0.943232 0.956814 0.937816 +1.50% +0.58% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 262144) mean 1.953728 1.877648 1.904937 1.861692 +2.56% +0.86% (8192, 262144) sum 1.953969 1.877538 
1.905990 1.862547 +2.52% +0.80% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 262144) mean 0.970408 0.940965 0.957871 0.936732 +1.31% +0.45% (4096, 262144) sum 0.970919 0.941652 0.957765 0.936676 +1.37% +0.53% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 262144) mean 0.501477 0.486976 0.497964 0.483570 +0.71% +0.70% (2048, 262144) sum 0.501955 0.487213 0.498210 0.483218 +0.75% +0.83% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 262144) mean 0.266536 0.257111 0.265642 0.255439 +0.34% +0.65% (1024, 262144) sum 0.266613 0.257096 0.265427 0.255472 +0.45% +0.64% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 131072) mean 0.087805 0.091200 0.085818 0.087851 +2.32% +3.81% (512, 131072) sum 0.087788 0.091249 0.085373 0.087944 +2.83% +3.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000) mean 0.014503 0.012328 0.013663 0.010190 +6.15% +20.98% (1000, 1000) sum 0.014545 0.012378 0.013662 0.010579 +6.46% +17.01% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 129) mean 0.014163 0.008371 0.012893 0.008828 +9.85% -5.18% (1024, 129) sum 0.014132 0.008751 0.013234 0.008868 +6.79% -1.32% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 257) mean 0.014296 0.009101 0.013334 0.008563 +7.21% +6.28% (1024, 257) sum 0.014302 0.009058 0.013020 0.008672 +9.85% +4.45% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 587) mean 0.014127 0.010997 0.013443 0.009944 +5.09% +10.59% (1024, 587) sum 0.014471 0.011373 0.013123 0.010354 +10.27% +9.84% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 977) mean 0.015607 0.013566 0.015089 0.012152 +3.43% +11.64% (2048, 977) sum 0.015953 0.013580 0.015039 0.011861 +6.08% +14.49% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 128) mean 0.013982 0.008058 0.012747 0.008139 +9.69% -1.00% (1024, 128) sum 0.013967 0.008071 0.012726 0.007859 +9.75% +2.70% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 128) mean 0.014378 0.009627 0.013712 0.009395 +4.86% +2.47% (8192, 128) sum 0.014389 0.009965 0.013718 0.009521 +4.89% +4.66% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 130) mean 0.014156 0.008267 0.012895 0.008833 +9.78% -6.41% (1024, 130) sum 0.013797 0.008277 0.012903 0.008512 +6.93% -2.76% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 130) mean 0.014977 0.010026 0.013911 0.009876 +7.66% +1.52% (8192, 130) sum 0.014994 0.010043 0.014235 0.009604 +5.33% +4.57% ==================================================================================================================================================================================== ``` FP16 ``` Tensor Shape Operation Full reduce (ms) Contiguous dim (ms) Full reduce (ms) Contiguous dim (ms) Full reduce diff % Contiguous diff % ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (256, 256) mean 0.022804 0.008298 0.015888 0.007848 +43.53% +5.73% (256, 256) sum 0.023215 0.008328 0.015677 0.007850 +48.08% +6.09% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 512) mean 0.013777 0.009988 0.012884 0.008512 +6.93% +17.34% (512, 512) sum 0.013775 0.009622 0.012870 0.009028 +7.03% +6.58% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.014740 0.012322 0.013708 0.010239 +7.53% +20.34% (1024, 1024) sum 0.014762 0.012756 0.013722 0.010307 +7.58% +23.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.018700 0.018364 0.018135 0.015078 +3.12% +21.79% (2048, 2048) sum 0.018276 0.018415 0.018471 0.015127 -1.06% +21.74% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 0.034518 0.037000 0.033838 0.030617 +2.01% +20.85% (4096, 4096) sum 0.034569 0.037448 0.033842 0.031100 +2.15% +20.41% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.087675 0.095176 0.085328 0.084105 +2.75% +13.16% (8192, 8192) sum 0.088102 0.095211 0.085707 0.084090 +2.79% +13.23% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 16384) mean 0.147800 0.149263 0.146388 0.138390 +0.96% +7.86% (8192, 16384) sum 0.148147 0.148957 0.146439 0.138801 +1.17% +7.32% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 32768) mean 0.266316 0.260294 0.265829 0.253411 +0.18% +2.72% (8192, 32768) sum 0.266562 0.260717 0.265744 0.253308 +0.31% +2.92% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 65536) mean 0.502035 0.486077 0.498139 0.481374 +0.78% +0.98% (8192, 65536) sum 0.501571 0.485733 0.498353 0.481350 +0.65% +0.91% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 131072) mean 0.971343 0.943016 0.956600 0.938622 +1.54% +0.47% (8192, 131072) sum 0.971463 0.942991 0.957352 0.938334 +1.47% +0.50% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 262144) mean 1.952722 1.877165 1.906406 1.861455 +2.43% +0.84% (8192, 262144) sum 1.952634 1.876388 1.904677 1.861282 +2.52% +0.81% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 262144) mean 0.970697 0.941298 0.956964 0.936160 +1.44% +0.55% (4096, 262144) sum 0.969981 0.941078 0.957016 0.936260 +1.35% +0.51% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 262144) mean 0.501577 0.487208 0.498422 0.483493 +0.63% +0.77% (2048, 262144) sum 0.502029 0.487124 0.497854 0.483643 +0.84% +0.72% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 262144) mean 0.266416 0.257383 0.265928 0.255140 +0.18% +0.88% (1024, 262144) sum 0.266434 0.257081 0.265817 0.255143 +0.23% +0.76% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (512, 131072) mean 0.087858 0.091296 0.085816 0.087745 +2.38% +4.05% (512, 131072) sum 0.088144 0.091314 0.085664 0.087864 +2.90% +3.93% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000) mean 0.014977 0.012393 0.014141 0.010614 +5.91% +16.76% (1000, 1000) sum 0.014589 0.012804 0.014118 0.010320 +3.34% +24.07% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 129) mean 0.014208 0.008383 0.013273 0.008440 +7.04% -0.68% (1024, 129) sum 0.013804 0.008863 0.013265 0.009003 +4.06% -1.56% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 257) mean 0.014378 0.009109 0.013037 0.009038 +10.29% +0.79% (1024, 257) sum 0.014387 0.009113 0.013396 0.008698 +7.40% +4.77% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 587) mean 0.014207 0.011037 0.013182 0.010391 +7.78% +6.22% (1024, 587) sum 0.014588 0.011453 0.013539 0.010049 +7.75% +13.97% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 977) mean 0.016024 0.013614 0.015448 0.011845 +3.73% +14.93% (2048, 977) sum 0.015990 0.014033 0.015406 0.012278 +3.79% +14.29% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 128) mean 0.014037 0.007804 0.013143 0.008242 +6.80% -5.31% (1024, 128) sum 0.014041 0.007847 0.012759 0.007850 +10.05% -0.04% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 128) mean 0.014361 0.009644 0.014075 0.009061 +2.03% +6.43% (8192, 128) sum 0.014366 0.010032 0.013702 0.009181 +4.85% +9.27% 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 130) mean 0.014226 0.008696 0.012894 0.008835 +10.33% -1.57% (1024, 130) sum 0.013830 0.008740 0.013288 0.008989 +4.08% -2.77% ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 130) mean 0.015036 0.010019 0.013917 0.009538 +8.04% +5.04% (8192, 130) sum 0.014652 0.010403 0.013900 0.009565 +5.41% +8.76% ==================================================================================================================================================================================== ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165055 Approved by: https://github.com/ngimel ghstack dependencies: #165494, #164790	2025-10-17 13:39:36 +00:00
Yuanyuan Chen	7231118db3	Turn some const variables into constexpr in C++ code (#165401 ) This PR checks the C++ code and turns some const variables into constexpr. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165401 Approved by: https://github.com/Skylion007	2025-10-17 13:24:46 +00:00
PyTorch MergeBot	5d4da26ed0	Revert "[export] preserve_node_meta by default (#165524 )" This reverts commit fdd560afd1d413a9f814cbf7cc2a72e0d39b0117. Reverted https://github.com/pytorch/pytorch/pull/165524 on behalf of https://github.com/lw due to test/functorch/test_control_flow.py::TestControlFlowTraced::test_cond_symint_closure [GH job link](https://github.com/pytorch/pytorch/actions/runs/18586312291/job/52991654051) [HUD commit link](`fdd560afd1`) ([comment](https://github.com/pytorch/pytorch/pull/165524#issuecomment-3415352522))	2025-10-17 12:27:17 +00:00
PyTorch MergeBot	574c9fc950	Revert "Remove torch.serialization entries from the doc ignore list (#160224 )" This reverts commit 9fe3b2afbeff12080b483af1ee23e1c9d9fb0421. Reverted https://github.com/pytorch/pytorch/pull/160224 on behalf of https://github.com/lw due to [GH job link](https://github.com/pytorch/pytorch/actions/runs/18588004962/job/52997748336) [HUD commit link](`9fe3b2afbe`) ([comment](https://github.com/pytorch/pytorch/pull/160224#issuecomment-3415345175))	2025-10-17 12:24:08 +00:00
PyTorch MergeBot	80d2ca7566	Revert "[annotate] add annotate_fn function decorator (#165703 )" This reverts commit f1d882212afc3a73ce1e319d80b6406f9dc4a0c8. Reverted https://github.com/pytorch/pytorch/pull/165703 on behalf of https://github.com/lw due to [GH job link](https://github.com/pytorch/pytorch/actions/runs/18585518705/job/52989521797) [HUD commit link](`f1d882212a`) ([comment](https://github.com/pytorch/pytorch/pull/165703#issuecomment-3415073467))	2025-10-17 11:23:13 +00:00
Nikita Shulga	4a22139eea	[MPS][BE] Fix unused variable warning (#165726 ) Namely this one ``` /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:19:18: warning: unused variable 'output_sizes' [-Wunused-variable] constant auto& output_sizes = shared_params.output_sizes; ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:85:1: note: in instantiation of function template specialization 'cat<long, float, float>' requested here REGISTER_CAT_FOR_INDEX_TYPE(int64_t); ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:69:3: note: expanded from macro 'REGISTER_CAT_FOR_INDEX_TYPE' REGISTER_CAT_OP_ALL_INPUT_TYPES(I, float); \ ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:55:3: note: expanded from macro 'REGISTER_CAT_OP_ALL_INPUT_TYPES' REGISTER_CAT_OP(I, float, T_out); \ ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/Shape.metal:47:15: note: expanded from macro 'REGISTER_CAT_OP' kernel void cat<I, T_in, T_out>( \ ``` Repeated about 20-30 times Pull Request resolved: https://github.com/pytorch/pytorch/pull/165726 Approved by: https://github.com/Skylion007	2025-10-17 11:16:21 +00:00
Simon Layton	cb6e4d7d82	User-passed alpha to scaled_gemm (#165563 ) Summary: Add optional user-passed `alpha` argument to `at::cuda::blas::scaled_gemm`, necessary for two-level-scaled NVFP4 gemm calls (where the global de-scales are folded into the `alpha` argument). Global de-scales are naturally device tensors, but using cublas' device-pointer mode for `alpha`/`beta` has an interesting lifetime implication - the `alpha` tensor must be valid & correct until the end of the matmul call, not just the launch (as for host values). To enable this, I added device-constant memory for `one` and `zero`, along with a statically-held single-fp32-value tensor, which is valid from the first passed-`alpha` invocation of `scaled_gemm` to the end of the program. User-passed values are copied into this perpetual buffer to ensure lifetime requirements are met. Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165563 Approved by: https://github.com/drisspg, https://github.com/eqy	2025-10-17 09:42:33 +00:00
Jerry Mannil	202f83dc4e	[ROCm][layer_norm] Use __builtin_amdgcn_rcpf(x) instead of 1.f/x (#165589 ) Replace (more) exact calculation with hardware approximation. Benefits: Reduced code size. Improved performance for certain scenarios. Experiments show low reduction in precision. Experiments show no significant performance regressions. bfloat16 as well as float16 related calculations may benefit largely from this change. Co-author: @mhalk @amd-hhashemi Pull Request resolved: https://github.com/pytorch/pytorch/pull/165589 Approved by: https://github.com/jeffdaily	2025-10-17 09:12:30 +00:00
Joel Schlosser	9fe3b2afbe	Remove torch.serialization entries from the doc ignore list (#160224 ) Follows the approach done in #158581 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160224 Approved by: https://github.com/janeyx99	2025-10-17 09:06:09 +00:00
Xilun Wu	d0c24b392c	[APF Logging][Error Trait] To fill the errorTraits for ChildFailedError with signal abort (re-attempt of #165476 ) (#165688 ) Summary Land @guoding83128 's PR https://github.com/pytorch/pytorch/pull/165476 on his behalf due to EasyCLA blocking. Refer his original PR for detail. But in short, elastic leaves 'errorTraits' as unknown when the error dump file is missing, this PR adds a "system terminated error" to such case so the internal scuba table can correctly aggregate. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165688 Approved by: https://github.com/fduwjj	2025-10-17 08:23:27 +00:00
Yu, Guangye	b44fb14906	Remove unused parameter when query extension attribute (#165623 ) # Motivation This code is no longer needed since SYCL compiler 2025.0. We are now using compiler 2025.2 (two tool uplifts later), so it can be safely removed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165623 Approved by: https://github.com/EikanWang ghstack dependencies: #165622	2025-10-17 08:16:13 +00:00
Yu, Guangye	51348c0219	Give a friendly message for older Intel GPU (#165622 ) # Motivation Notify the user if the GPU is older than officially supported. This provides a friendly warning that the GPU may work, but the experience could be unstable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165622 Approved by: https://github.com/EikanWang	2025-10-17 08:16:13 +00:00
Pian Pawakapan	fdd560afd1	[export] preserve_node_meta by default (#165524 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165524 Approved by: https://github.com/malaybag	2025-10-17 07:55:28 +00:00
Yuanyuan Chen	e925dfcc6b	Enable all SIM rules except disabled ones (#164645 ) `SIM` rules are useful for simplifying boolean expressions and enhancing code readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164645 Approved by: https://github.com/ezyang, https://github.com/mlazos	2025-10-17 07:27:11 +00:00
Shangdi Yu	f1d882212a	[annotate] add annotate_fn function decorator (#165703 ) Example usage: ``` @fx_traceback.annotate_fn({"pp_stage": 1}) def example_function(x): return x * x class SimpleLinear(nn.Module): def __init__(self): super().__init__() self.linear = nn.Linear(3, 2) def forward(self, x): with fx_traceback.annotate({"pp_stage": 0}): y = self.linear(x) y = example_function(y) return y - 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165703 Approved by: https://github.com/SherlockNoMad	2025-10-17 07:18:47 +00:00
Animesh Jain	24879f0de9	[dynamo] Use Variable Builder to build the property fget object (#165683 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165683 Approved by: https://github.com/ezyang, https://github.com/williamwen42	2025-10-17 06:29:24 +00:00
PyTorch MergeBot	9e94ec76b8	Revert "Turn some const variables into constexpr in C++ code (#165401 )" This reverts commit 5b2afe4c5dc87786ca65bf22ca9a78f7c21a33a4. Reverted https://github.com/pytorch/pytorch/pull/165401 on behalf of https://github.com/seemethere due to This is breaking test/distributions/test_distributions.py::TestDistributions::test_binomial_sample on HUD, see `5b2afe4c5d` ([comment](https://github.com/pytorch/pytorch/pull/165401#issuecomment-3414023134))	2025-10-17 06:14:09 +00:00
Richard Barnes	364624e209	[codemod][lowrisk] Remove unused exception parameter from some files (#165700 ) Summary: `-Wunused-exception-parameter` has identified an unused exception parameter. This diff removes it. This: ``` try { ... } catch (exception& e) { // no use of e } ``` should instead be written as ``` } catch (exception&) { ``` If the code compiles, this is safe to land. Test Plan: Sandcastle Differential Revision: D84868162 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165700 Approved by: https://github.com/Skylion007	2025-10-17 05:30:06 +00:00
Tushar Jain	7e150467f7	allow providing full fr trace path (#165639 ) Summary: - allow users to specify the full path instead of fr suffixing the rank id - this will be used by torchft to provide the global rank id across all replicas - we can't just prefix the replica id because analysis tool expects the file name to provide a unique integer --- [//]: # (BEGIN SAPLING FOOTER) Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/pytorch/pytorch/pull/165639). * #165638 * #165640 * #165677 * #165642 * __->__ #165639 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165639 Approved by: https://github.com/fduwjj	2025-10-17 04:43:44 +00:00
Maggie Moss	43d78423ac	Pyrefly suppressions 2 (#165692 ) This is the last directory to opt in for the regular mypy.ini file. Will put up a diff to remove unused ignores before making sure we're also type checking all the files in the mypy strict configurations Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (6,884 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165692 Approved by: https://github.com/oulgen	2025-10-17 04:15:25 +00:00
Justin Chu	fcbde24c1c	[ONNX] Remove common imports from torchlib (#165156 ) The Rank and IsScalar functions are no longer used in the torchlib. Requires onnxscript v0.5.4 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165156 Approved by: https://github.com/Skylion007, https://github.com/cyyever	2025-10-17 03:25:34 +00:00
eellison	861cdb887b	use statically_known_leq & *=2 instead of bound_sympy in persistent rblock (#165657 ) While these should be equivalent, we've found instances where they are not, and an error was caused. update until we figure out underlying issue. Differential Revision: [D84835898](https://our.internmc.facebook.com/intern/diff/D84835898) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165657 Approved by: https://github.com/bobrenjc93	2025-10-17 02:48:03 +00:00
Eddie Yan	3154482072	[CUDA][cuBLAS] Only `xFail` `addmm` with reduced precision reductions on non-RTX skus (#165379 ) RTX Blackwells don't behave quite like their datacenter counterparts here Pull Request resolved: https://github.com/pytorch/pytorch/pull/165379 Approved by: https://github.com/Skylion007	2025-10-17 02:45:07 +00:00
Mu-Chu Lee	9fccbdd4f0	Fix incorrect function signature in template (#165567 ) Summary: In https://github.com/pytorch/pytorch/pull/148305 we refactored the grid argument out, but it's not reflected in our template. Test Plan: Included in commit. python test/inductor/test_aot_inductor.py AOTInductorTestABICompatibleGpu.test_cond_symint_input_disable_one_pass_cuda Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/165567 Approved by: https://github.com/desertfire	2025-10-17 02:40:56 +00:00
bobrenjc93	7dabfb07cb	[torchfuzz] add support for --stop-at-first-failure flag (#165529 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165529 Approved by: https://github.com/pianpwk ghstack dependencies: #164749	2025-10-17 02:18:07 +00:00
bobrenjc93	d0add0be43	[torchfuzz] check in some more ignore regexes (#164749 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164749 Approved by: https://github.com/pianpwk	2025-10-17 02:18:07 +00:00
PyTorch MergeBot	11e2084308	Revert "[Mem Snapshot] Add Metadata Field (#165490 )" This reverts commit 5b3ea758951558e7d9f681ae784acb57eaa07910. Reverted https://github.com/pytorch/pytorch/pull/165490 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165490#issuecomment-3413491091))	2025-10-17 02:01:53 +00:00
Aaron Gokaslan	9726553653	[BE][Ez]: Use sys.executable instead of hardcoded Python (#165679 ) Handles edgecase to ensure proper interpreter is called. Inspired by #165633 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165679 Approved by: https://github.com/FindHao	2025-10-17 01:07:40 +00:00
Shangdi Yu	d82527b32a	[Windows] Add AOTI cross-compilation CI (#165573 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165573 Approved by: https://github.com/malfet ghstack dependencies: #165560	2025-10-17 01:05:35 +00:00
Shangdi Yu	5d9b024276	Add mingw to docker (#165560 ) Add mingw to `pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11` docker image to support AOTI cross-compilation This PR will make docker container rebuild, and upgrade python version from 3.13.7 to 3.13.8. and it relies on https://github.com/pytorch/pytorch/pull/165667 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165560 Approved by: https://github.com/malfet	2025-10-17 00:47:01 +00:00
Yuanyuan Chen	5b2afe4c5d	Turn some const variables into constexpr in C++ code (#165401 ) This PR checks the C++ code and turns some const variables into constexpr. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165401 Approved by: https://github.com/Skylion007	2025-10-17 00:40:11 +00:00
Yuanyuan Chen	b2953f5643	[9/N] Apply ruff UP035 rule (#165515 ) This is follow-up of #165214 to continue applying ruff UP035 rule to the code base. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165515 Approved by: https://github.com/Lucaskabela	2025-10-17 00:09:51 +00:00
PyTorch MergeBot	470e2f61c3	Revert "[Fix] Use sys.executable instead of hardcoded python (#165633 )" This reverts commit 37f3ba274a8ccebc6b3409f52cf068a8b23617d4. Reverted https://github.com/pytorch/pytorch/pull/165633 on behalf of https://github.com/malfet due to Looks like it broke test_collect_callgrind in slow workflows, see `e0fe37fa68/1` ([comment](https://github.com/pytorch/pytorch/pull/165633#issuecomment-3413290813))	2025-10-17 00:06:40 +00:00
Kurt Mohler	e0fe37fa68	[MPS] Move `torch.cat` impl to Metal (#165373 ) After this change, all of the cases tested in [this performance measurement script](`10de64c5ac/cat/perf0.py`) take either roughly the same runtime or less. Before: ``` idx: cpu time, mps time, speedup, op, args, kwargs ----------------------------------------- 0: 0.000857 ms, 0.016098 ms, 0.05, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': -1} 1: 0.000858 ms, 0.014861 ms, 0.06, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': 1} 2: 0.000806 ms, 0.015145 ms, 0.05, cat, [[tensor(shape[10, 5]), tensor(shape[5, 5])]], {'dim': 0} 3: 0.000829 ms, 0.015355 ms, 0.05, cat, [[tensor(shape[1, 2, 3]), tensor(shape[1, 2, 3])]], {'dim': -2} 4: 0.000591 ms, 0.000582 ms, 1.02, cat, [[tensor(shape[0]), tensor(shape[0])]], {'dim': 0} 5: 0.001076 ms, 0.022387 ms, 0.05, cat, [[tensor(shape[0]), tensor(shape[5, 5])]], {'dim': 1} 6: 0.000708 ms, 0.022300 ms, 0.03, cat, [[tensor(shape[0, 5]), tensor(shape[5, 5])]], {'dim': 0} 7: 0.000640 ms, 0.014367 ms, 0.04, cat, [[tensor(shape[1]), tensor(shape[1])]], {} 8: 0.000777 ms, 0.027506 ms, 0.03, cat, [[tensor(shape[2, 2, 2, 2])], 1], {} 9: 0.003383 ms, 0.269277 ms, 0.01, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 10: 0.526138 ms, 0.650852 ms, 0.81, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 11: 0.444091 ms, 0.628630 ms, 0.71, cat, "[[tensor(shape[1, 3, 2]), tensor(shape[2, 3, 2]), tensor(shape[3, 3, 2]), tensor(shape[1, 3, 2]), te...", {'dim': 0} 12: 2.011870 ms, 0.989525 ms, 2.03, cat, [[tensor(shape[1000000, 3, 2]), tensor(shape[1000000, 3, 2])]], {'dim': 0} 13: 3.100653 ms, 0.948178 ms, 3.27, cat, [[tensor(shape[3, 1000000, 2]), tensor(shape[3, 1000000, 2])]], {'dim': 1} 14: 3.112174 ms, 0.954174 ms, 3.26, cat, [[tensor(shape[3, 2, 1000000]), tensor(shape[3, 2, 1000000])]], {'dim': 2} ``` 
After: ``` idx: cpu time, mps time, speedup, op, args, kwargs ----------------------------------------- 0: 0.000790 ms, 0.013111 ms, 0.06, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': -1} 1: 0.000800 ms, 0.014419 ms, 0.06, cat, [[tensor(shape[5, 5]), tensor(shape[5, 5])]], {'dim': 1} 2: 0.000748 ms, 0.010019 ms, 0.07, cat, [[tensor(shape[10, 5]), tensor(shape[5, 5])]], {'dim': 0} 3: 0.000767 ms, 0.010063 ms, 0.08, cat, [[tensor(shape[1, 2, 3]), tensor(shape[1, 2, 3])]], {'dim': -2} 4: 0.000591 ms, 0.000591 ms, 1.00, cat, [[tensor(shape[0]), tensor(shape[0])]], {'dim': 0} 5: 0.001220 ms, 0.009763 ms, 0.12, cat, [[tensor(shape[0]), tensor(shape[5, 5])]], {'dim': 1} 6: 0.000739 ms, 0.006203 ms, 0.12, cat, [[tensor(shape[0, 5]), tensor(shape[5, 5])]], {'dim': 0} 7: 0.000647 ms, 0.009905 ms, 0.07, cat, [[tensor(shape[1]), tensor(shape[1])]], {} 8: 0.000753 ms, 0.007818 ms, 0.10, cat, [[tensor(shape[2, 2, 2, 2])], 1], {} 9: 0.003823 ms, 0.192723 ms, 0.02, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 10: 0.576564 ms, 0.733920 ms, 0.79, cat, "[[tensor(shape[3, 1, 2]), tensor(shape[3, 2, 2]), tensor(shape[3, 3, 2]), tensor(shape[3, 1, 2]), te...", {'dim': 1} 11: 0.462957 ms, 0.692799 ms, 0.67, cat, "[[tensor(shape[1, 3, 2]), tensor(shape[2, 3, 2]), tensor(shape[3, 3, 2]), tensor(shape[1, 3, 2]), te...", {'dim': 0} 12: 2.017181 ms, 0.968345 ms, 2.08, cat, [[tensor(shape[1000000, 3, 2]), tensor(shape[1000000, 3, 2])]], {'dim': 0} 13: 3.203508 ms, 0.986382 ms, 3.25, cat, [[tensor(shape[3, 1000000, 2]), tensor(shape[3, 1000000, 2])]], {'dim': 1} 14: 3.181249 ms, 1.007773 ms, 3.16, cat, [[tensor(shape[3, 2, 1000000]), tensor(shape[3, 2, 1000000])]], {'dim': 2} ``` Fixes #165350 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165373 Approved by: https://github.com/kulinseth, https://github.com/malfet	2025-10-17 00:03:04 +00:00
PyTorch MergeBot	d2c82bafb7	Revert "158232 Fix autocast cache incorrectly retaining no_grad state (#165068 )" This reverts commit 5daef30b26b794d237fbbc399c1d47ec0380200a. Reverted https://github.com/pytorch/pytorch/pull/165068 on behalf of https://github.com/jeffdaily due to This broke ROCm CI. test/test_transformers.py::TestTransformersCUDA::test_transformerencoder_fastpath_use_torchscript_False_enable_nested_tensor_True_use_autocast_True_d_model_256_cuda [GH job link](https://github.com/pytorch/pytorch/actions/runs/18572589089/job/52952074008) [HUD commit link](`5daef30b26`) ([comment](https://github.com/pytorch/pytorch/pull/165068#issuecomment-3413184445))	2025-10-16 23:08:27 +00:00
Colin L Reliability Rice	98a488c9aa	Start recording inductor provenance (#162669 ) Summary: This stores information on where fx graphs come from, which makes it significantly easier to debug. One outstanding question 1) I only stored the kernel stack traces, do we also want the node mappings? Test Plan: I wrote an explicit logging test which makes a module, fx traces it, compiles it, and makes sure the logging information shows up. ``` clr@devvm17763 ~/fbsource/fbcode/caffe2/test/dynamo % buck2 test @//mode/opt fbcode//caffe2/test/dynamo:test_dynamo -- test_utils File changed: fbsource//xplat/caffe2/test/dynamo/test_utils.py File changed: fbcode//caffe2/test/dynamo/test_utils.py Buck UI: https://www.internalfb.com/buck2/528dea32-2416-4a62-a1ec-39f3c0efdd2e Test UI: https://www.internalfb.com/intern/testinfra/testrun/13229324015574003 Network: Up: 0B Down: 0B Executing actions. Remaining 0/2 Command: test. Time elapsed: 17.3s Tests finished: Pass 16. Fail 0. Fatal 0. Skip 0. Build failure 0 ``` Rollback Plan: Differential Revision: D82037582 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162669 Approved by: https://github.com/yushangdi	2025-10-16 23:05:31 +00:00
Shivam Raikundalia	5b3ea75895	[Mem Snapshot] Add Metadata Field (#165490 ) Summary: The implementation adds the ability to: Set custom metadata strings that will be attached to all subsequent allocations Clear or change the metadata at any point View the metadata in memory snapshots via _dump_snapshot() Test Plan: Added test in test_cuda.py and check manually in snapshot to see that metadata was added. Differential Revision: D84654933 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165490 Approved by: https://github.com/yushangdi	2025-10-16 22:54:27 +00:00
Pian Pawakapan	556fc09a9f	[DebugMode][1/N] refactor logs into _DebugCalls (#165376 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165376 Approved by: https://github.com/SherlockNoMad	2025-10-16 22:43:52 +00:00
Nikita Shulga	ce109b3f79	Add `torch.backends.mkldnn.is_acl_available()` method (#165678 ) That tells whether or not PyTorch was compiled with Arm Compute Library Pull Request resolved: https://github.com/pytorch/pytorch/pull/165678 Approved by: https://github.com/Skylion007, https://github.com/atalman, https://github.com/albanD ghstack dependencies: #165583, #165584, #165676	2025-10-16 22:34:21 +00:00
Nikita Shulga	4d833f859b	[BE] [CI] Fix aarch64 arch checks (#165676 ) Instead of relying on `TEST_CONFIG` environment variable to contain `aarch64`, which is prone to errors, use output of `$(uname -m)` that is equal to `aarch64` on Linux ARM systems Pull Request resolved: https://github.com/pytorch/pytorch/pull/165676 Approved by: https://github.com/huydhn, https://github.com/atalman ghstack dependencies: #165583, #165584	2025-10-16 22:19:53 +00:00
Wei Wang	d7e275d4b4	[CI][CUDA] Add periodic b200 distributed job (#159323 ) 1. Run distributed job with B200 runner, periodically. 2. discovered generic distributed test issue that certain unit test hard-coded ranks, calling for require_exact_world_size(world_size) API instead of require_world_size(world_size). Pull Request resolved: https://github.com/pytorch/pytorch/pull/159323 Approved by: https://github.com/eqy Co-authored-by: Aidyn-A <aidyn.b.aitzhan@gmail.com>	2025-10-16 21:54:04 +00:00
Jithun Nair	d5db3aee0d	[CI] Use 1-GPU runners for rocm-mi355.yml (#165658 ) Should only need 1-GPU runners for rocm-mi355.yml since it runs `default` test config which only needs 1 GPU Pull Request resolved: https://github.com/pytorch/pytorch/pull/165658 Approved by: https://github.com/jeffdaily	2025-10-16 21:53:22 +00:00
Maggie Moss	5641de7b6b	Add suppressions for _inductor/codegen (#165659 ) Adds suppressions so that pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (6,884 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165659 Approved by: https://github.com/oulgen	2025-10-16 21:37:37 +00:00
Nicolas De Carli	cbc08c8993	Add NEON acceleration for `Vectorized<int[8\|16\|32\|64]>` (#165273 ) Summary: Adding NEON specializations of Vectorized<T> for int8, int16, int32 and int64. Correctness has been checked using test_ops.py and the comprehensive torch test operator_benchmark_test.py has been enhanced by adding cases of bitwise operations, boolean ops and integer ops. The benchmark, which uses the PyTorch API, shows significant enhancements in a wide variety of operations: Before: bitwise xor: 779.882us boolean any: 636.209us boolean all: 538.621us integer mul: 304.457us integer asr: 447.997us After: bitwise xor: 680.221us ---> 15% higher throughput boolean any: 391.468us ---> 63% higher throughput boolean all: 390.189us ---> 38% higher throughput integer mul: 193.532us ---> 57% higher throughput integer asr: 179.929us---> 149% higher throughput Test Plan: Correctness: buck2 test @mode/opt //caffe2/test:test_ops buck2 test @mode/opt //caffe2/test:torch buck2 test @mode/opt //caffe2/test/distributed/launcher/fb:fb_run_test Performance: buck2 run mode/opt //caffe2/benchmarks/operator_benchmark/fb:operator_benchmark_test Differential Revision: D84424638 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165273 Approved by: https://github.com/malfet	2025-10-16 21:35:13 +00:00
Yiming Zhou	1a54d3333d	[easy] Fix graph_capture in aot_joint_with_descriptors test (#165660 ) when `with_export=True`, `aot_export_joint_with_descriptors` should take the graph produced by `_dynamo_graph_capture_for_export` ``` python test/functorch/test_aot_joint_with_descriptors.py -k test_preserve_annotate_simple python test/functorch/test_aot_joint_with_descriptors.py -k test_preserve_annotate_flex_attention ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165660 Approved by: https://github.com/yushangdi	2025-10-16 21:10:11 +00:00
Aaron Orenstein	4c1c341fa0	FakeTensorMode shouldn't cache syms when tracing (#164718 ) Improve FakeTensor cache to handle SymNode and tracing properly. For now, when we're proxy tracing just don't bother caching operations that contain SymNodes in the output. The problem is that the proxy tracer relies on SymNode identity and our cache doesn't preserve that. It can be fixed (and I left some notes in _validate_symbolic_output_for_caching() how) but it's not worth it for now. If we aren't proxy tracing then caching is fine. Thus these changes: 1. Our cache key needs to include whether we were actively tracing or not - this way if we create a cache entry when we weren't tracing and then we try to use it when we ARE tracing it gets rerun. 2. If there's a SymNode in the output then bypass tracing. 3. Some general cleanup of the output validation - we were unnecessarily doing it as a two-step process when it could just be a single step (it's still two parts internally but only a single outer try/except). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164718 Approved by: https://github.com/bobrenjc93 ghstack dependencies: #165266, #164717	2025-10-16 20:57:07 +00:00
Aaron Orenstein	5f21cc786a	Teach ProxyTorchDispatchMode how to decompose sympy.Expr into known inputs (#164717 ) In a training library we hit a weird conflict between dtensor, dynamic shapes, and proxy tensor. The problem is occurring because in sharding_prop we use FakeTensors to compute an operation size (so we don't have to use the full "real" data). We turn off proxy tracing while we're doing that because we don't want the FakeTensor ops to end up in the graph. We then use that size when doing later operations. Normally this is no problem - but when those sizes are dynamic shapes then we have a problem - the proxy tracer wants to track the provenance of all shape operations (`s1*s2`) but since tracing is disabled it doesn't see the operation and when we then use the result shape later on the proxy tracer gets all confused (because the SymNode appeared out of nowhere). At first we were thinking to never disable shape tracing - but that caused a slew of other downstream problems (lots of code that actually needs the shape tracing to be disabled) so instead we enable having a "sym tracing override" and surgically when we disable proxy tracing we leave shape tracing enabled. After this change the dtensor embedding is "fixed" but then runs afoul of a FakeTensor cache bug - which is fixed in the next PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164717 Approved by: https://github.com/bobrenjc93, https://github.com/ezyang ghstack dependencies: #165266	2025-10-16 20:57:06 +00:00
Aaron Orenstein	e86942f422	minor proxy_tensor reorg (#165266 ) Moving some code around in proxy_tensor in preparation for the next PR. There were no actual changes (other than simple relabeling such as `self.tracer` -> `tracer`): - Move _compute_proxy() out of ProxyTorchDispatchMode. - Give `sympy_expr_tracker` a structured type instead of `object`. - Split SymNode registration out of ProxyTorchDispatchMode.__sym_dispatch__() so it can be reused. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165266 Approved by: https://github.com/ezyang, https://github.com/mlazos	2025-10-16 20:57:06 +00:00
Dzmitry Huba	2cd5fd1588	Enable local tensor mode on DTensor view ops test (#165596 ) While enabling this test discovered lack of support for sub meshes. Added limited support for sub meshes by properly computing rank coordinates for a given sub mesh. The implementation follows similar approach to collectives. We infer all sub meshes for the given dimensions and compute each rank's coordinates with respect to its sub mesh. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165596 Approved by: https://github.com/ezyang	2025-10-16 20:52:06 +00:00
Oguz Ulgen	7d0f872cb3	Use union syntax in torch/_inductor runtime and fx_passes (#165652 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165652 Approved by: https://github.com/aorenste	2025-10-16 20:51:59 +00:00
PyTorch MergeBot	fb06e49ce8	Revert "[inductor] print 0.0 as 0 for triton (#164291 )" This reverts commit 99b32a6750bfd0cfe2bc84a47823e1da34802b7b. Reverted https://github.com/pytorch/pytorch/pull/164291 on behalf of https://github.com/malfet due to Broke slow job, see `aba8c43594/1` ([comment](https://github.com/pytorch/pytorch/pull/164291#issuecomment-3412768915))	2025-10-16 20:44:29 +00:00
PyTorch MergeBot	27a98e6ae9	Revert "[DeviceMesh] Prefer using _layout over _mesh for all sorts of things (#165554 )" This reverts commit d61a9b88cf3be04a29c5a7d6e9622ae5e8d51de3. Reverted https://github.com/pytorch/pytorch/pull/165554 on behalf of https://github.com/malfet due to Looks like it broke serialization test, see `aba8c43594/1` ([comment](https://github.com/pytorch/pytorch/pull/165554#issuecomment-3412765681))	2025-10-16 20:41:37 +00:00
PyTorch MergeBot	b10f463b1a	Revert "[DeviceMesh] Introduce private constructor instead of _create_mesh_from_ranks (#165555 )" This reverts commit 99097b6d89c927c15180ff4683c38be01f9955f6. Reverted https://github.com/pytorch/pytorch/pull/165555 on behalf of https://github.com/malfet due to Looks like it broke serialization test, see `aba8c43594/1` ([comment](https://github.com/pytorch/pytorch/pull/165554#issuecomment-3412765681))	2025-10-16 20:41:37 +00:00
PyTorch MergeBot	431c13cf61	Revert "[DeviceMesh] Simplify unflatten method (#165556 )" This reverts commit 86fd4fc23e697e275d37c36e3cbe521f156434fd. Reverted https://github.com/pytorch/pytorch/pull/165556 on behalf of https://github.com/malfet due to Looks like it broke serialization test, see `aba8c43594/1` ([comment](https://github.com/pytorch/pytorch/pull/165554#issuecomment-3412765681))	2025-10-16 20:41:37 +00:00
Ketan Ambati	aead9270f5	12/n : Remove fbandroid_compiler_flags (#165558 ) Summary: Currently `get_c2_fbandroid_xplat_compiler_flags()` is reading the `caffe2.strip_glog` buckconfig which we want to get rid of. This diff removes the `fbandroid_compiler_flags` arg and merges it with compiler_flags with a nested select and the select version of the method The goal is to get rid of all the usages of `get_c2_fbandroid_xplat_compiler_flags()` so that we can get rid of the `caffe2.strip_glog` buckconfig Test Plan: CI Differential Revision: D84626885 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165558 Approved by: https://github.com/malfet	2025-10-16 20:41:24 +00:00
Janani Sriram	9bf5b38c14	[Inductor][Triton][FP8] Refactor scaled_mm template to accept scaling mode (#164318 ) Summary: Refactor `scaled_mm` Inductor template to support template choice based on scaling mode. This modification sets up the infrastructure for adding new templates based on new scaling modes, such as deepseek-style scaling (a follow-up diff), as new scaling modes (deepseek, block, group) scale before the accumulation (as opposed to per-tensor and per-row scaling, which apply scaling after accumulation). This modification also further enables Inductor to infer a scaling type based on the shape of the scaling tensors, which makes existing infrastructure more extensible to new scaling modes. Test Plan: ``` TORCHINDUCTOR_CACHE_DIR=~/personal/cache_dir_inductor CUDA_LAUNCH_BLOCKING=1 TORCH_USE_CUDA_DSA=1 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 buck2 run mode/{opt,inplace} pytorch/tritonbench:run -- --op fp8_gemm --only torch_fp8_gemm,pt2_fp8_gemm --metrics tflops,accuracy --m 256 --n 768 --k 512 --output="/home/jananisriram/personal/random_bench.csv" --scaling_rowwise --atol=20 --rtol=2 2>&1 \| tee ~/personal/random.log ``` Differential Revision: D83591083 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164318 Approved by: https://github.com/drisspg, https://github.com/slayton58	2025-10-16 20:40:45 +00:00
Tristan Trouwen	aba8c43594	Register var for MTIA (#165382 ) Summary: Registers variance kernel Reviewed By: srsuryadev Differential Revision: D84546250 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165382 Approved by: https://github.com/malfet	2025-10-16 20:35:15 +00:00
linhaifeng	37f3ba274a	[Fix] Use sys.executable instead of hardcoded python (#165633 ) Replace hardcoded "python" string with sys.executable to ensure correct Python interpreter is used. This fixes failures on systems with multiple Python runtimes or where "python" is not in PATH. Similar to pytorch/pytorch#155918 Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165633 Approved by: https://github.com/Skylion007	2025-10-16 20:26:10 +00:00
IvanKobzarev	585b9dbb5e	[async_tp] Support ag+mm with gather_dim lastdim of mat_A (#163068 ) Adding ag+mm support for the case, when gather_dim is last dim of matmul (reduction dim). When we decompose matmul by reduction dimension we result in partials that needs additional reduction, we allocate memory for accumulator. Decomposition should not produce small (thin) mms that can not efficiently load the GPU. Limiting for minimal size of the shard 1024 (found empirically by testing in torchtitan). scaled_mm is not supported yet for this case. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163068 Approved by: https://github.com/ngimel	2025-10-16 20:14:39 +00:00
Maggie Moss	d795fb225a	[RFC] Add pyrefly to lintrunner (#165179 ) This will add pyrefly to lint runner as a warning only - and allow us to collect feedback about the tool before switching to pyrefly as the main type checker. References the steps outlined here: : https://github.com/pytorch/pytorch/issues/163283: test plan: `lintrunner init` `lintrunner` confirm when pyrefly errors are present results look like: https://gist.github.com/maggiemoss/e6cb2d015dd1ded560ae1329098cf33f Pull Request resolved: https://github.com/pytorch/pytorch/pull/165179 Approved by: https://github.com/ezyang	2025-10-16 20:07:09 +00:00
tvukovic-amd	7df9aca529	[ROCm][Windows] Enable AOTriton runtime compile on Windows (#165538 ) AOTriton uses prebuilt runtime binaries if the user's ROCm version matches the ones used to generate the prebuilt runtime. However, since there's no prebuilt runtime available for Windows, this check needs to be bypassed for Windows. This PR enables it by changing condition to always build AOTriton runtime from source on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165538 Approved by: https://github.com/xinyazhang, https://github.com/jeffdaily	2025-10-16 19:51:43 +00:00
Shangdi Yu	d4a713cd9c	Change forkserver test to only run below 3.13.8 (#165667 ) A multiprocessing bug is fixed in 3.13.8, see [https://docs.python.org/3.13/whatsnew/changelog.html](https://l.workplace.com/l.php?u=https%3A%2F%2Fdocs.python.org%2F3.13%2Fwhatsnew%2Fchangelog.html&h=AT0qUhHJq5c2UJvQaq9_MrSo0mVhwn1VOfq1nDQl2C1UOhDI80RMbzVayhG7LSAT1uYHKtkftKnBDwiGMhbw0YRvQLe5vwE01qejpPFautHvU3LXeOE1KChPykqz3qnCRzk7czu_iNzQ05shR4F1N_qYOzR5YxejA52ZZQ), [gh-126631](https://github.com/python/cpython/issues/126631) So this test will fail when we update to python 3.13.8 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165667 Approved by: https://github.com/malfet	2025-10-16 19:34:10 +00:00
Sean McGovern	5daef30b26	158232 Fix autocast cache incorrectly retaining no_grad state (#165068 ) Fixes #158232 The autocast caching heuristic in `aten/src/ATen/autocast_mode.cpp:139` did not account for gradient mode state when deciding whether to cache. FSDP2 is not directly related. ~~This PR adds `GradMode::is_enabled()` check to caching condition. Caching is now disabled in `no_grad()` contexts to prevent storing tensors with incorrect gradient state. Ensures correctness at the cost of using cache.~~ This PR proposes separate caches for gradient-enabled and gradient-disabled modes. Adds tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165068 Approved by: https://github.com/ngimel, https://github.com/janeyx99	2025-10-16 19:32:01 +00:00
Huy Do	6dedd34c31	[CD] Skip 12.9 build on Windows (#165665 ) Per title Pull Request resolved: https://github.com/pytorch/pytorch/pull/165665 Approved by: https://github.com/Camyll, https://github.com/malfet	2025-10-16 19:11:27 +00:00
Shunting Zhang	a303d6dda9	[inductor] don't try to reorder loops for template (#165601 ) fix https://github.com/pytorch/pytorch/issues/165579 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165601 Approved by: https://github.com/yushangdi	2025-10-16 19:05:21 +00:00
Jagadish Krishnamoorthy	7669ac9402	[ROCm] Add scaled_mm v2 support. (#165528 ) Add mx fp4 support in Blas.cpp. Updated the scale_kernel_dispatch array and ScaledGemmImplementation enum to include MXFP4 support. Modify the tests under test_scaled_matmul_cuda accordingly. PYTORCH_TEST_WITH_ROCM=1 python test/test_scaled_matmul_cuda.py -v -k test_blockwise 115 test passed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165528 Approved by: https://github.com/jeffdaily	2025-10-16 18:36:41 +00:00
Luca Wehrstedt	86fd4fc23e	[DeviceMesh] Simplify unflatten method (#165556 ) By adding a few small helpers (e.g., a `splice` method to `_MeshLayout`, and making `_init_process_groups` static and thus stateless) we can substantially shorten the definition of the unflatten method, and help readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165556 Approved by: https://github.com/fduwjj ghstack dependencies: #165554, #165555	2025-10-16 18:36:16 +00:00
Luca Wehrstedt	99097b6d89	[DeviceMesh] Introduce private constructor instead of _create_mesh_from_ranks (#165555 ) The refactoring of DeviceMesh is heavily constrained by the signature of its constructor, which is a public API which contains some "legacy" concepts which we'd love to get rid of, such as an explicit/materialized `mesh` Tensor. In other languages the solution to this would be to add a private overload of the constructor. Python doesn't natively allow this, but in this PR I managed to build something that approximates it. This new private constructor basically only takes `_layout`, `_global_rank_permutation`, and `mesh_dim_names`. With such a constructor we can effectively simplify a lot of callsites and get rid of the `_create_mesh_from_ranks` helper method. That's a good thing because it was instantiating many DeviceMeshes in a for loop, which always felt unnecessary. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165555 Approved by: https://github.com/fduwjj, https://github.com/fegin ghstack dependencies: #165554	2025-10-16 18:36:16 +00:00
eqy	a214371008	[FP8] Add other Blackwell compute-capabilities to expected fail `test_honor_sm_carveout` (#165159 ) CUTLASS SM hint also isn't working for other Blackwells, need green context for carveout Pull Request resolved: https://github.com/pytorch/pytorch/pull/165159 Approved by: https://github.com/Skylion007	2025-10-16 18:35:06 +00:00
IvanKobzarev	7d87d7052e	[inductor][bucketing] Fx collectives bucketing of multiple dtypes (#162470 ) Bucketing of multiple dtypes to be processed in one bucketed collective. First target is to bucket bf16 and f32, but already can be used with other dtypes. For now multidtype bucketing is only supported with "custom_ops" mode. Non custom_ops needs additional work on inductor side. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162470 Approved by: https://github.com/eellison	2025-10-16 18:31:43 +00:00
arkadip-maitra	1a34ff4e04	Fixing get_local_rank() variable missing when compiled (#165432 ) Fixes #165215 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165432 Approved by: https://github.com/bdhirsh	2025-10-16 18:20:34 +00:00
Angel Li	fe5ccb1a74	bf16 support for per tensor backward (#165362 ) Adding bf16 for the backward pass of `torch._fake_quantize_learnable_per_tensor_affine()`. Note that for testing, we modified the seed to avoid increasing tolerance due to cases where difference in Python vs CPP downcasting causes tensor mismatches. (e.g. 27.87704 vs 27.8408 before downcasting, 27.7500 vs 27.8750 after downcasting for Python vs CPP op) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165362 Approved by: https://github.com/andrewor14	2025-10-16 17:47:01 +00:00
Thanh Ha	85586d7efc	Make c7i the default for _linux-build.yml (#164747 ) Use linux.c7i.2xlarge as the default runner for the _linux-build.yml workflow. In testing we found that switching from c5 to c7i yields 15-20% faster build times despite c7i costing 5% more. This should reduce costs of jobs using _linux-build.yml. Relates to pytorch/test-infra#7175. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164747 Approved by: https://github.com/atalman	2025-10-16 17:37:51 +00:00
PyTorch MergeBot	e1d71a6b35	Revert "12/n : Remove fbandroid_compiler_flags (#165558 )" This reverts commit d7ffa8b8a29ba6071c51499c1df3d702d0a26f72. Reverted https://github.com/pytorch/pytorch/pull/165558 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/165558#issuecomment-3411879769))	2025-10-16 17:18:56 +00:00
Luca Wehrstedt	d61a9b88cf	[DeviceMesh] Prefer using _layout over _mesh for all sorts of things (#165554 ) The goal of this PR is to avoid storing the explicit `mesh` Tensor inside each DeviceMesh, and instead compute it on-the-fly when the end user needs it, and try to replace all of its internal usages with `_layout` and the newly-introduced `_global_rank_permutation` Tensor. The name of this attribute is up for debate. The advantage of the `_global_rank_permutation` Tensor is that it is _the same_ Tensor for the root mesh and all its children, so it doesn't need to be copied/reallocated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165554 Approved by: https://github.com/fduwjj	2025-10-16 17:01:44 +00:00
Isuru Fernando	99b32a6750	[inductor] print 0.0 as 0 for triton (#164291 ) Fixes https://github.com/pytorch/pytorch/issues/164157 Fixes https://github.com/pytorch/pytorch/issues/164086 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164291 Approved by: https://github.com/bobrenjc93	2025-10-16 16:37:50 +00:00
Edward Yang	783da8b8e7	Repro for property related Dynamo graph break (#165609 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165609 Approved by: https://github.com/albanD, https://github.com/gchanan, https://github.com/malfet, https://github.com/anijain2305	2025-10-16 16:22:43 +00:00
Brian Hirsh	ed74dc054d	add the option to disable functionalization in AOTDispatcher (#164577 ) I'm cleaning this PR up as a proper way of disabling functionalization via config in AOTDispatcher. I removed the non-functionalization related changes from the original version: (1) preventing proxy mode (and functionalization) from incorrectly decomposing CIA ops (Ed has a PR for it here: https://github.com/pytorch/pytorch/pull/164939) (2) preventing python-dispatcher-based decomps above autograd from running. I'm not doing this for now, will likely do it in a followup Pull Request resolved: https://github.com/pytorch/pytorch/pull/164577 Approved by: https://github.com/ezyang ghstack dependencies: #165372	2025-10-16 15:44:11 +00:00
Brian Hirsh	f33c7e1a43	add and fix OpInfo tests for the default partitioner (#165372 ) I noticed the default partitioner was breaking in some dynamic shape tests, so prior to turning off functionalization I want to tweak it to pass all of our OpInfo tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/165372 Approved by: https://github.com/ezyang	2025-10-16 15:44:11 +00:00
Yu, Guangye	219fb6aafc	Refactor CUDAAllocatorConfig using ConfigTokenizer (#165281 ) * #165129 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165281 Approved by: https://github.com/albanD ghstack dependencies: #165129, #165131, #165135, #165136	2025-10-16 15:26:50 +00:00
Yu, Guangye	515b5ff539	Remove unused code in CUDAAllocatorConfig (#165136 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165136 Approved by: https://github.com/Skylion007 ghstack dependencies: #165129, #165131, #165135	2025-10-16 15:26:50 +00:00
Yu, Guangye	608a6d4a26	Reuse AcceleratorAllocatorConfig in CUDAAllocatorConfig (#165135 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165135 Approved by: https://github.com/Skylion007 ghstack dependencies: #165129, #165131	2025-10-16 15:26:40 +00:00
Yu, Guangye	03e5dbb26e	Register CUDAAllocatorConfig to AcceleratorAllocatorConfig (#165131 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165131 Approved by: https://github.com/Skylion007 ghstack dependencies: #165129	2025-10-16 15:26:28 +00:00
Yu, Guangye	7ee45f7503	Restore AcceleratorAllocatorConfig to avoid potential regression (#165129 ) # Motivation This PR aims to restore `AcceleratorAllocatorConfig` to avoid the potential regression mentioned in https://github.com/pytorch/pytorch/pull/160666#issue-3323270375 These code change would be reverted in the following PR https://github.com/pytorch/pytorch/pull/165304 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165129 Approved by: https://github.com/albanD	2025-10-16 15:26:17 +00:00
Lucas Kabela	e6d9d68598	[Bugfix][Dynamo] Fix Sparse tensors by graph break in Dynamo (#164873 ) Fixes #164823 by making lack of support for sparse tensors very explicit (in fake tensor, inductor, and lowering code) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164873 Approved by: https://github.com/williamwen42, https://github.com/eellison, https://github.com/mlazos	2025-10-16 15:06:20 +00:00
Nikita Shulga	1a5b7eca7b	[BE] Fold cond into `TORCH_CHECK(false,...)` (#165593 ) Replace `if (!foo) { TORCH_CHECK(false, "bar");}` with `TORCH_CHECK(foo,"bar");` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165593 Approved by: https://github.com/albanD ghstack dependencies: #165594	2025-10-16 15:00:30 +00:00
Isalia20	8573574b32	[MPS] sparse mask implementation (#165102 ) sparse mask implementation Pull Request resolved: https://github.com/pytorch/pytorch/pull/165102 Approved by: https://github.com/malfet	2025-10-16 14:31:00 +00:00
Nikita Shulga	e6033f6efb	[MPS] Improve `index_fill_` error handling (#165594 ) It should not throw "Cannot convert a float64 Tensor to MPS", but rather a sensible "Converting complex Scalar to non-complex type is not supported". Add TODO about the complex support, probably good reason to rip out MPSGraph from index_fill as well Pull Request resolved: https://github.com/pytorch/pytorch/pull/165594 Approved by: https://github.com/dcci, https://github.com/kulinseth	2025-10-16 14:18:39 +00:00
IvanKobzarev	9272437cde	Fx collectives bucketing: add bucket all_reduce (#165351 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165351 Approved by: https://github.com/eellison	2025-10-16 13:27:33 +00:00
lichuyang	f06e669f6c	refactor: replace runtime_error with TORCH_CHECK for better error handling (#163628 ) Fixes some parts of issue #148114 @pytorchbot label "topic: not user facing" @FFFrog PTAL Pull Request resolved: https://github.com/pytorch/pytorch/pull/163628 Approved by: https://github.com/albanD	2025-10-16 11:09:48 +00:00
PyTorch MergeBot	69b05913fb	Revert "Add mingw to docker (#165560 )" This reverts commit 5e480b8ecf870e4a466c165701ab0e9d055f2ceb. Reverted https://github.com/pytorch/pytorch/pull/165560 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165560#issuecomment-3409814274))	2025-10-16 08:42:11 +00:00
Isalia20	d73c283c3a	[CUDA] Large tensor maxpool crash fix (#165374 ) Fixes #165297 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165374 Approved by: https://github.com/eqy, https://github.com/malfet	2025-10-16 07:59:46 +00:00
Tiwari-Avanish	eaeaa08e3a	[PowerPC] Disable MKLDNN TF32 on PowerPC to fix build failure (#163454 ) The commits f4d8bc46c7706f872abcb4ec41f0b32207d5d826 added TF32 support for x86 CPUs, which causes build failures on PowerPC systems with mkldnn. This patch disables TF32 paths on PowerPC while keeping x86 TF32 support intact, allowing PyTorch to build successfully on PowerPC. I have run the mkldnn test case on PowerPC, and it passed successfully. `pytest test/test_mkldnn.py 87 passed, 2 skipped in 1709.02s (0:28:29` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163454 Approved by: https://github.com/jgong5, https://github.com/malfet	2025-10-16 06:13:59 +00:00
Yu, Guangye	d0c32971b4	Refine XPU allocator message when OOM (#165509 ) # Motivation Provide more information and align with other backends to enhance the user experience. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165509 Approved by: https://github.com/EikanWang ghstack dependencies: #165508	2025-10-16 05:47:49 +00:00
Ketan Ambati	d7ffa8b8a2	12/n : Remove fbandroid_compiler_flags (#165558 ) Summary: Currently `get_c2_fbandroid_xplat_compiler_flags()` is reading the `caffe2.strip_glog` buckconfig which we want to get rid of. This diff removes the `fbandroid_compiler_flags` arg and merges it with compiler_flags with a nested select and the select version of the method The goal is to get rid of all the usages of `get_c2_fbandroid_xplat_compiler_flags()` so that we can get rid of the `caffe2.strip_glog` buckconfig Test Plan: CI Differential Revision: D84626885 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165558 Approved by: https://github.com/malfet	2025-10-16 05:46:02 +00:00
Nan Zhang	00afa06800	Add cse for make_block_ptr in Triton codegen (#163399 ) Summary: per title Test Plan: added test cases Differential Revision: D82648215 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163399 Approved by: https://github.com/jansel, https://github.com/njriasan	2025-10-16 05:29:48 +00:00
Oguz Ulgen	5d0b22008d	Codemod inductor/fx_passes from Optional to union none (#165606 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165606 Approved by: https://github.com/aorenste ghstack dependencies: #165604, #165605	2025-10-16 04:59:47 +00:00
Oguz Ulgen	ab6014a903	Codemod inductor/runtime from Optional to union none (#165605 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165605 Approved by: https://github.com/aorenste ghstack dependencies: #165604	2025-10-16 04:59:47 +00:00
Oguz Ulgen	f6daffc54d	Codemod codecache.py from Optional to union none (#165604 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165604 Approved by: https://github.com/aorenste	2025-10-16 04:59:37 +00:00
Yu, Guangye	66b75693ae	Reuse kLargeBuffer in XPUCachingAllocator (#165508 ) # Motivation Reuse the shared code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165508 Approved by: https://github.com/EikanWang	2025-10-16 04:12:52 +00:00
Simon Fan	21697feff2	[hop] run local_map with interpreter to preserve fx_traceback annotations (#165336 ) We have an issue when using fx_traceback.annotate and HOPs that trace joint graphs. HOPs have bodies that have already been traced by Dynamo, and after Animesh's PR, does have the annotations. But when we lower that Dynamo HOP body to aten in either pre-dispatch or post-dispatch, we need to propagate the annotations to the aten nodes. AOTAutograd does this indirectly by piggybacking off the `PropagateUnbackedSymInts` fx.Interpreter. I'm not sure if all HOPs should be using it to trace their joints or not. This PR adds an interpreter to local_map's implementation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165336 Approved by: https://github.com/yushangdi	2025-10-16 02:53:17 +00:00
Xilun Wu	12fa4192c5	[ContextParallel] add process-time based Round-Robin load-balance to CP (#163617 ) Summary The load-balancing problem can be modeled as [identical-machines scheduling](https://en.wikipedia.org/wiki/Identical-machines_scheduling) problem. We already provided an easy-to-extend interface in #161062 for implementing load-balancing and in this PR we start with adding a Round-Robin solution as an example and also a verification. This can be easily adapted to other solutions like Shortest-processing-time-first/ Longest-processing-time-first with extra padding added for collectives. - Added a new type of `_LoadBalancer` implementation `_PTRRLoadBalancer` which is designed for `flex_attention()`. This load-balance strategy analyzes the `BlockMask` sparsity info and perform Round-Robin (unlike traditional Round-Robin doing it in circular order, we do in zig-zag order). - Make `_context_parallel_buffers` and `context_parallel_unshard` handle batched load-balance index (previously it can only handle non-batched load-balance index), like in `create_cp_block_mask`. Test `pytest test/distributed/tensor/test_attention.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163617 Approved by: https://github.com/fegin	2025-10-16 02:20:27 +00:00
Nikita Shulga	23fb7e9f4b	[CI] Add arch prefix in front of op benchmark results (#165584 ) To be able to run x86 and aarch64 benchmarks later on Pull Request resolved: https://github.com/pytorch/pytorch/pull/165584 Approved by: https://github.com/huydhn ghstack dependencies: #165583	2025-10-16 01:50:52 +00:00
Shangdi Yu	5e480b8ecf	Add mingw to docker (#165560 ) Add mingw to `pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11` docker image to support AOTI cross-compilation Pull Request resolved: https://github.com/pytorch/pytorch/pull/165560 Approved by: https://github.com/malfet ghstack dependencies: #165574	2025-10-16 01:31:50 +00:00
Shangdi Yu	19ba506ca3	Support libtorch and posix mingw flavor (#165574 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165574 Approved by: https://github.com/desertfire	2025-10-16 01:31:50 +00:00
jmaczan	003dd13073	[dynamo, guards] Better error messages when generated guard fails on the same frame (#165242 ) Not sure what exactly we want to have in the message, but that's easy to adjust. I tried to find a reliable test to reproduce this message (happens only when a guard fails right after it's created), but I ended up mocking a `guard_manager.check` function to return `False` to trigger this behavior. I think that's fine, because any other case that we pick (like datetime.now()), we want to patch one day anyway, so every time we make the next patch, will need to chase for another repro test @williamwen42 Fixes #164990 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165242 Approved by: https://github.com/williamwen42	2025-10-16 01:05:31 +00:00
Huy Do	c2bd41ac9f	Build vLLM nightly wheels for CUDA 13.0 (#163239 ) Now that https://github.com/vllm-project/vllm/pull/24599 has been merged Pull Request resolved: https://github.com/pytorch/pytorch/pull/163239 Approved by: https://github.com/malfet, https://github.com/atalman	2025-10-16 01:03:26 +00:00
Pearu Peterson	ca8bd5dbed	Move toString(ScalarType) and ScalarType ostream operator to headeronly (#164405 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164405 Approved by: https://github.com/Skylion007, https://github.com/janeyx99 ghstack dependencies: #164350, #164354	2025-10-16 00:55:43 +00:00
Pearu Peterson	26f3803433	Remove workaround to old CUDA bug (#164354 ) As in the title. A check for https://github.com/pytorch/pytorch/issues/164348 to see if the workaround can be removed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164354 Approved by: https://github.com/janeyx99, https://github.com/ngimel, https://github.com/malfet, https://github.com/jeffdaily ghstack dependencies: #164350	2025-10-16 00:55:43 +00:00
Pearu Peterson	48064acf37	Move AT_FORALL_... macros and ScalarTypeToCPPTypeT to headeronly (#164350 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164350 Approved by: https://github.com/janeyx99	2025-10-16 00:55:42 +00:00
xinan.lin	e5a9c247bc	[Fix XPU CI] [Inductor UT] Fix test cases broken by community. (#165406 ) Fixes #163159, Fixes #164098, Fixes #164097, Fixes #164099, Fixes #165025 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165406 Approved by: https://github.com/EikanWang, https://github.com/jansel	2025-10-16 00:53:32 +00:00
PaulZhang12	36371b8ec7	[ATen] Fix CUDA reduction warp shuffle order (#164790 ) Typical warp shuffle reduction has the following pattern: <img width="1138" height="501" alt="image" src="https://github.com/user-attachments/assets/3bd176dc-0ad2-4df6-90c7-06e467337166" /> which is exhibited in Triton generated by torch.compile: <img width="663" height="403" alt="image" src="https://github.com/user-attachments/assets/7f9f36cd-b9eb-44c1-879e-b469668a2ea8" /> Switch the warp shuffle order to make bitwise equivalence between the 2 easier. PTX difference between old and new, we see a few extra instructions: https://www.diffchecker.com/h6ly3INC/ Comparing the performance on different reduction operations, we see minimal differences. New represents the changes in this PR, old represents the past warp shuffle order: ``` Tensor Shape Operation New all dims (ms) New dim=0 (ms) New dim=1 (ms) Old all dims (ms) Old dim=0 (ms) Old dim=1 (ms) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.015817 0.016259 0.013642 0.015990 0.016258 0.013631 (1024, 1024) sum 0.015917 0.015906 0.013359 0.015707 0.016266 0.013226 (1024, 1024) min 0.016021 0.024625 0.015631 0.015761 0.024485 0.015317 (1024, 1024) max 0.016349 0.024971 0.015972 0.015771 0.025001 0.015314 (1024, 1024) argmin 0.018070 0.024448 0.015578 0.018135 0.025370 0.015322 (1024, 1024) argmax 0.018427 0.024859 0.015932 0.018164 0.024452 0.015639 (1024, 1024) var 0.020078 0.026413 0.020295 0.020199 0.026381 0.020214 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.023826 0.023726 0.022273 0.023236 0.023776 0.022248 (2048, 2048) sum 0.023840 0.023355 0.021974 0.023294 0.023354 0.021884 (2048, 2048) min 0.024519 0.041263 0.024620 
0.023292 0.041491 0.024358 (2048, 2048) max 0.024509 0.041670 0.024277 0.023334 0.041231 0.024395 (2048, 2048) argmin 0.026125 0.041282 0.024567 0.026772 0.041773 0.024296 (2048, 2048) argmax 0.026117 0.041487 0.024572 0.026412 0.041477 0.024273 (2048, 2048) var 0.026603 0.048581 0.031308 0.027587 0.048603 0.030860 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 0.053927 0.057070 0.054073 0.053028 0.057544 0.053935 (4096, 4096) sum 0.053604 0.057410 0.054451 0.053076 0.057033 0.054266 (4096, 4096) min 0.054293 0.109122 0.058363 0.053821 0.108689 0.058382 (4096, 4096) max 0.054258 0.108035 0.058703 0.053492 0.110552 0.058376 (4096, 4096) argmin 0.056805 0.111167 0.058301 0.056836 0.112325 0.058292 (4096, 4096) argmax 0.056488 0.110958 0.058636 0.056844 0.111000 0.057928 (4096, 4096) var 0.058936 0.141755 0.068693 0.059735 0.141284 0.068500 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.145552 0.148082 0.138647 0.145364 0.147818 0.138207 (8192, 8192) sum 0.145985 0.147900 0.138714 0.145755 0.148031 0.138616 (8192, 8192) min 0.146566 0.205359 0.192739 0.145611 0.205237 0.182335 (8192, 8192) max 0.146526 0.204844 0.193050 0.146073 0.205457 0.182697 (8192, 8192) argmin 0.150190 0.206605 0.192543 0.150654 0.206847 0.182007 (8192, 8192) argmax 0.150481 0.206368 0.192535 0.150845 0.206430 0.182022 (8192, 8192) var 0.150884 0.184546 0.203900 0.151594 0.184172 0.197983 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1, 1024, 128) mean 0.014293 0.008119 0.014533 0.013861 0.008022 0.014449 (1, 1024, 128) sum 0.014039 0.007877 0.014111 0.014219 
0.008227 0.014045 (1, 1024, 128) min 0.014159 0.011354 0.023493 0.014271 0.010862 0.023644 (1, 1024, 128) max 0.014154 0.011027 0.023368 0.014259 0.011234 0.023692 (1, 1024, 128) argmin 0.016403 0.005677 0.023328 0.016273 0.005683 0.024073 (1, 1024, 128) argmax 0.016734 0.005675 0.023437 0.016580 0.005318 0.023331 (1, 1024, 128) var 0.018338 0.009549 0.025538 0.018528 0.009391 0.024777 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (5, 1024, 128) mean 0.014873 0.010131 0.015546 0.015123 0.010131 0.015481 (5, 1024, 128) sum 0.015334 0.009673 0.015824 0.014736 0.009671 0.015438 (5, 1024, 128) min 0.015047 0.013252 0.024573 0.014803 0.013163 0.024551 (5, 1024, 128) max 0.015050 0.013339 0.024197 0.014810 0.013525 0.024230 (5, 1024, 128) argmin 0.017341 0.012737 0.024306 0.017471 0.012379 0.024991 (5, 1024, 128) argmax 0.017345 0.012411 0.024421 0.017422 0.012471 0.024237 (5, 1024, 128) var 0.019973 0.011453 0.026188 0.020050 0.011438 0.026282 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (10, 1024, 128) mean 0.016976 0.011575 0.016831 0.016722 0.011927 0.017173 (10, 1024, 128) sum 0.017039 0.011841 0.017159 0.016385 0.011860 0.016753 (10, 1024, 128) min 0.017036 0.015331 0.026770 0.016944 0.015205 0.027166 (10, 1024, 128) max 0.017369 0.015348 0.027077 0.016531 0.015716 0.026819 (10, 1024, 128) argmin 0.019203 0.014447 0.026813 0.018994 0.014497 0.027313 (10, 1024, 128) argmax 0.019563 0.014795 0.027140 0.019460 0.014912 0.026733 (10, 1024, 128) var 0.020529 0.014316 0.030405 0.020719 0.013960 0.029964 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (100, 1024, 128) mean 
0.045046 0.039168 0.046082 0.044839 0.039217 0.045782 (100, 1024, 128) sum 0.045094 0.039150 0.045777 0.044496 0.039542 0.046083 (100, 1024, 128) min 0.045768 0.054466 0.076244 0.044915 0.053943 0.076599 (100, 1024, 128) max 0.045748 0.054459 0.076188 0.044931 0.053949 0.076856 (100, 1024, 128) argmin 0.048275 0.054046 0.076647 0.048694 0.054105 0.077004 (100, 1024, 128) argmax 0.048267 0.054395 0.077401 0.048691 0.054131 0.076751 (100, 1024, 128) var 0.049710 0.043254 0.083077 0.050971 0.043251 0.082378 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000, 100) mean 0.202312 0.196723 0.197765 0.201774 0.196641 0.197459 (1000, 1000, 100) sum 0.202651 0.196682 0.197736 0.202175 0.196313 0.197523 (1000, 1000, 100) min 0.203022 0.264762 0.269200 0.202729 0.264129 0.268694 (1000, 1000, 100) max 0.202864 0.264396 0.269388 0.202486 0.263896 0.268720 (1000, 1000, 100) argmin 0.226727 0.263781 0.268651 0.226597 0.264676 0.268983 (1000, 1000, 100) argmax 0.226412 0.264469 0.269090 0.226570 0.264595 0.269178 (1000, 1000, 100) var 0.243223 0.204079 0.216096 0.241942 0.204079 0.215925 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (10000, 100) mean 0.016193 0.020277 0.014316 0.016152 0.020324 0.013712 (10000, 100) sum 0.016289 0.020237 0.014034 0.016168 0.020265 0.013708 (10000, 100) min 0.016046 0.030872 0.019609 0.016208 0.030867 0.018627 (10000, 100) max 0.016369 0.030835 0.019257 0.016218 0.030861 0.018209 (10000, 100) argmin 0.017957 0.031171 0.019517 0.018050 0.031556 0.018077 (10000, 100) argmax 0.017961 0.031658 0.019521 0.018060 0.031564 0.018087 (10000, 100) var 0.020393 0.035652 0.019339 0.020144 0.035987 0.019171 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (100000, 10) mean 0.015718 0.016576 0.016555 0.015999 0.016246 0.014869 (100000, 10) sum 0.015833 0.016247 0.016572 0.016007 0.016627 0.014872 (100000, 10) min 0.015888 0.020510 0.023920 0.015671 0.020821 0.021417 (100000, 10) max 0.015889 0.020479 0.023918 0.016077 0.020386 0.021421 (100000, 10) argmin 0.018233 0.020863 0.023647 0.017574 0.020864 0.021103 (100000, 10) argmax 0.017896 0.020527 0.023296 0.017569 0.020447 0.021098 (100000, 10) var 0.020005 0.024198 0.024372 0.020075 0.024167 0.022415 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1023, 1023, 1023) mean 1.874816 1.963506 1.903909 1.873279 1.963859 1.903230 (1023, 1023, 1023) sum 1.875030 1.965716 1.902458 1.873566 1.960730 1.901642 (1023, 1023, 1023) min 1.878563 2.473455 2.179092 1.875174 2.482086 2.183027 (1023, 1023, 1023) max 1.879128 2.474803 2.178895 1.874831 2.482253 2.183884 (1023, 1023, 1023) argmin 1.921800 2.476629 2.174831 1.923987 2.472641 2.170453 (1023, 1023, 1023) argmax 1.922605 2.476688 2.177927 1.923366 2.472808 2.172979 (1023, 1023, 1023) var 1.972606 3.088695 2.758797 1.978679 3.095658 2.762243 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1023, 1023, 255) mean 0.489984 0.500954 0.492957 0.489891 0.500654 0.491971 (1023, 1023, 255) sum 0.490228 0.500764 0.492289 0.489624 0.501089 0.492824 (1023, 1023, 255) min 0.491457 0.563560 0.553334 0.490355 0.564709 0.554754 (1023, 1023, 255) max 0.491396 0.563628 0.553345 0.490017 0.565004 0.554947 (1023, 1023, 255) argmin 0.503666 0.561512 0.551831 0.503845 0.560972 0.551017 (1023, 1023, 255) 
argmax 0.503602 0.561185 0.551407 0.504328 0.561267 0.551448 (1023, 1023, 255) var 0.510844 0.709452 0.701630 0.512693 0.710365 0.701965 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1023, 1023, 377) mean 0.707439 0.727646 0.712019 0.706769 0.727101 0.711632 (1023, 1023, 377) sum 0.707780 0.727453 0.711554 0.706807 0.726656 0.711729 (1023, 1023, 377) min 0.709423 0.819809 0.794379 0.707847 0.822086 0.796664 (1023, 1023, 377) max 0.709297 0.819780 0.794308 0.707566 0.821913 0.796690 (1023, 1023, 377) argmin 0.725028 0.817088 0.791695 0.726039 0.816445 0.790828 (1023, 1023, 377) argmax 0.725301 0.817011 0.791420 0.726040 0.816917 0.791143 (1023, 1023, 377) var 0.740859 1.034165 1.006712 0.743413 1.035506 1.007638 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164790 Approved by: https://github.com/ngimel, https://github.com/eqy ghstack dependencies: #165494	2025-10-15 23:54:51 +00:00
Nikita Shulga	7e6721fb0a	[BE] Remove confusing `opbenchmark-on-demand-build` (#165583 ) As it doesn't have a test shard, so what's the point of running the build? Was added in https://github.com/pytorch/pytorch/pull/143733 and looks like test shard never existed for it Moreover, allow one to specify benchmark size as argument, so one technically can do a workflow dispatch with different opbenchmark sizes Pull Request resolved: https://github.com/pytorch/pytorch/pull/165583 Approved by: https://github.com/huydhn	2025-10-15 23:48:28 +00:00
PaulZhang12	901bbcba12	Gate division bitwise numerics under a flag (#165566 ) https://github.com/pytorch/pytorch/pull/164144 ensures that division for compile is bitwise equivalent with eager. However, in https://github.com/pytorch/pytorch/issues/164301, the kernel performance is regressed. On B200: With standard triton `/`: 6511 GB/s With triton `div_rn`: 4692 GB/s Further investigation is required for the generated PTX to see why there is such a large slowdown. For now, enable bitwise equivalent results under `TORCHINDUCTOR_EMULATE_DIVISION_ROUNDING` similar to emulate_precision_cast Pull Request resolved: https://github.com/pytorch/pytorch/pull/165566 Approved by: https://github.com/ngimel, https://github.com/eellison	2025-10-15 23:41:01 +00:00
Nikhil Patel	febb603230	[Inductor][CuTeDSL] Move load_template up two directories (#165347 ) (#165576 ) Summary: Moves the function used to load CuTeDSL Jinja templates up one level out of the flex attention folder. This way it can be used for more general Inductor templates in the future. Test Plan: `INDUCTOR_TEST_DISABLE_FRESH_CACHE=1 TORCHINDUCTOR_CACHE_DIR=~/cutetest buck2 run mode/opt //caffe2/test/inductor:cutedsl_grouped_mm -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8` Reviewed By: drisspg Differential Revision: D84527470 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165576 Approved by: https://github.com/jananisriram	2025-10-15 23:37:55 +00:00
Xiao Fu	568d2f3ae7	[Dynamo][Logging] Add sources/types to LazyVariableTracker logging (#165402 ) Fixes #162860 This task adds the variable source attribution to LazyVariableTracker when outputting trace bytecode Test plan -- test/dynamo/test_error_messages.py ErrorMessagesTest.test_variable_tracker_source_attribution The output is as specified in the prior mentioned Github issue. <img width="961" height="59" alt="Screenshot 2025-10-13 at 10 19 44 PM" src="https://github.com/user-attachments/assets/fb27da3f-d00b-437b-bf2e-52e892572cd7" /> This is specifically for the log setup with ``TORCH_LOGS=trace_bytecode`` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165402 Approved by: https://github.com/Lucaskabela, https://github.com/williamwen42 Co-authored-by: William Wen <williamwen@meta.com>	2025-10-15 23:23:09 +00:00
James Wu	b54e466fd0	Megacache integration (#163533 ) This diff adds megacache integration for DynamoCache. Because DynamoCache requires lazy serialization, i.e. it can only be serialized once all relevant backends have been compiled and we're ready for a save, we actually do the DynamoCache saving only on a call to `torch.compiler.save_cache_artifacts`. Differential Revision: [D82735763](https://our.internmc.facebook.com/intern/diff/D82735763/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163533 Approved by: https://github.com/oulgen, https://github.com/zhxchen17	2025-10-15 22:49:15 +00:00
Glen Cao	53f9ae0e50	[ROCm] new implementation of upsample_bilinear2d_backward (#164572 ) Changed the implementation from an output-based approach to an input-based one to remove `atomicAdd` operations, and it appears to deliver at least a 20× speedup. The changes are from Yu-Yun <YuYun.Chang@amd.com>. # Summary: Refactor of the implementation of the `upsample_bilinear2d_backward` operation on MI300X/MI325X - The original "scatter-add" approach - Each thread, representing an output pixel, scattered gradient contributions to four input pixels, using costly atomic operations on MI300X/MI325X GPUs. - The new "gather-sum" approach - Each thread is responsible for a single input pixel and gathers all relevant gradient contributions from a small, calculated region of the output tensor (done by the `compute_output_range` device function). # Breakdown of the code changes - Inversion of the parallelization strategy of the kernel function `upsample_bilinear2d_backward_out_frame` - Originally, the main kernel loop was parallelized over the number of elements in the output gradient tensor (`const size_t o_numel = nc * width2 * height2;`). - Each thread processed one output pixel. - The new loop is parallelized over the number of elements in the input gradient tensor (`const size_t i_numel = nc * height1 * width1;`). - Each thread is responsible for calculating the final gradient for a single input pixel. - The kernel launch changes accordingly in the function `upsample_bilinear2d_backward_out_cuda_template`. - Added a device function for calculating the range of output pixels that could have possibly used the input pixel (`input_pos`) during the forward pass interpolation - This is essentially the mathematical inverse of the forward pass. - This function tries to prune a thread's search space so that it only needs to inspect a small, local window of the output tensor. 
- Gradient calculation approach switching from "scatter-add" to "gather-sum" - Scatter-add - For each output pixel, the thread calculated 4 gradient contributions and use `fastAtomicAdd` 4 times to add these values to 4 different (and potentially highly contended) memory locations in the input gradient tensor. - Gather-sum - A thread responsible for one input pixel calls `compute_output_range` to determine the small rectangular region of output pixels that influence the input's final gradient value. - The thread iterates through this region, and for each output pixel in the region, it re-calculates the interpolation weights to determine the exact contribution to its specific input pixel. - All these contributions are accumulated into a private, per-thread register variable (`accscalar_t grad_sum = 0;`). - W/o any global memory access, this accumulation is extremely fast. - When the loops are done, the thread performs a single, direct write (non-atomic) of the final summed gradient to its designated location in global memory (`idata[index] = static_cast<scalar_t>(grad_sum);`). # Why performance gets boosted - Analysis of the root cause of performance drop - Ref. (internal only) - https://amd.atlassian.net/wiki/spaces/~glencao2/pages/1140493327/PyTorch__upsample_bilinear2d_backward - First and foremost, elimination of the contention of atomic operations - Many parallel threads called `atomicAdd` frequently attempting to update the exact same memory location in the input gradient tensor at the same time. - The GPU's memory controller has to serialize these operations, effectively nullifying the benefit of parallel capability at those contention points. - MI300X/MI325X chiplet-based CDNA 3 architecture amplified the issue. - When contending threads reside on different XCDs, resolving the atomic operation requires high-latency coherence traffic across the Infinity Fabric interconnect. 
- The implementation change eliminates hardware-level serialization and cross-chiplet coherence traffic caused by many `atomicAdd`. - Improved memory access pattern and locality - Write coalescing - The regular sum writes `idata[index] = static_cast<scalar_t>(grad_sum);` can be perfectly coalesced by GPUs. - Read locality - Even though there are many (potentially repeated) reads from the output tensor (`static_cast<accscalar_t>(odata[output_idx])`), these are highly cache-friendly, meaning the data for one thread is likely to be in the L1 or L2 cache already due to an access from a neighboring thread. - Trade-off: computation for memory synchronization - The recalculation of interpolation weights fits well on high-computational-throughput modern GPUs like MI300X/MI325X. - Removal of atomic operations avoids expensive memory synchronization. --- Optimizations of `grid_sampler_2d_backward` will be addressed in a separate PR. Doc for reference: (internal only) https://amd.atlassian.net/wiki/spaces/~glencao2/pages/1162750701/PyTorch__grid_sampler_2d_backward Pull Request resolved: https://github.com/pytorch/pytorch/pull/164572 Approved by: https://github.com/jeffdaily	2025-10-15 22:35:43 +00:00
blorange-amd	b42fe389b9	ROCm unit tests enablement (#165366 ) Enables: test_cuda.py::TestCuda::test_streaming_backwards_multiple_streams test_cuda.py::TestCuda::test_graph_make_graphed_callables_with_amp_cache_disabled_allow_unused_input test_cuda.py::TestCuda::test_graph_make_graphed_callables_without_amp_allow_unused_input test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_1_10000_10000_10000_cuda_bfloat16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_1_10000_10000_10000_cuda_float16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_1_10000_10000_10000_cuda_float32 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_1_10000_1000_10000_cuda_bfloat16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_1_10000_1000_10000_cuda_float16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_1_10000_1000_10000_cuda_float32 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_2_1000_1000_1000_cuda_bfloat16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_2_1000_1000_1000_cuda_float16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_2_1000_1000_1000_cuda_float32 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_2_100_100_100_cuda_bfloat16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_2_100_100_100_cuda_float16 test_matmul_cuda.py::TestMatmulCudaCUDA::test_cublas_baddbmm_large_input_2_100_100_100_cuda_float32 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165366 Approved by: https://github.com/jeffdaily	2025-10-15 22:35:03 +00:00
Sarthak Tandon	66ea76ec44	[ROCm][tunableop] Improvements to tunableop Numerical Check (#163079 ) Modified the flag PYTORCH_TUNABLEOP_NUMERICAL_CHECK, so that it accepts the numerical tolerances in the format atol_rtol as compared to the previous 0 and 1. Retains previous functionality with default values as well. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163079 Approved by: https://github.com/naromero77amd, https://github.com/jeffdaily	2025-10-15 22:26:47 +00:00
Richard Zou	e787d532b6	tmp fix for compile internal logger issue (#165568 ) Summary: Catch runtime exception when garse and scrub uninteresting configs from inductor config Test Plan: tested locally Differential Revision: D84727788 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165568 Approved by: https://github.com/luccafong, https://github.com/oulgen	2025-10-15 22:03:16 +00:00
eellison	b3f6d49b69	Overlap scheduler improvements (#165318 ) Bucketing a number of smallish improvements: - Account for bucketing in overlap calculation: if an in-flight collective exists with the same bucket key, reduce new collectives estimated time by its latency time - Update compute domination so we are ordering based on compute idx, as opposed to compute depth, so we never reorder compute. this makes it a bit easier to reason about memory, and pre-fetching, although we can explore reordering in the future. - When we wait on a collective, force all collectives on the same process group as it that were enqueued prior to the collective to wait as well. Better Memory Handling: - Pre-fetch limiting - when scheduling collectives for overlap, only pre-fetch up to a certain distance, then schedule off-path collectives (which are typically memory reducing). - When we are above peak memory, schedule waits. TODO: - for each compute node, we know its original memory in the graph. we could limit pre-fetching that goes across peak memory - By scheduling off-path collectives for overlap, we reduce memory, but if there weren't enough compute for overlap, we need to proactively schedule them. not an issue yet on examples. - config some hard coded constants, clean up enablement (can do in subsequent pr) On small llama 2d backward : 578 of 618 potentially hideable collectives hidden original mem 14.4GB, rescheduled mem, 15.9GB on forward: 254/256 potentially hideable collectives hidden original mem 5.8 gb, rescheduled mem 5.8GB WIP: adding tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/165318 Approved by: https://github.com/ezyang, https://github.com/IvanKobzarev ghstack dependencies: #164738, #164783, #164944, #164945, #165059	2025-10-15 21:58:47 +00:00
Howard Huang	bc1f2108d7	[PP] Update backward_counter and fsdp util to schedule class (#165513 ) Fixed one issue with FSDP last reshard not being called. Rest is mostly refactoring, changing some variables to be class variables so they can be used in https://github.com/pytorch/torchtitan/pull/1721 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165513 Approved by: https://github.com/fegin	2025-10-15 21:58:16 +00:00
Boyuan Feng	f071f17911	[Graph Partition] fix partition x memory plan issue (#165514 ) For `test_graph_partition_with_memory_plan_reuse`, before this PR, when using graph partition, it would error ([P1992728479](https://www.internalfb.com/phabricator/paste/view/P1992728479)): ``` def partition_0(args): ... del buf0 return (buf3, buf4, buf5, buf2, primals_4, ) ... File "/tmp/torchinductor_boyuan/ww/cwwc7ukfqscg2vy6ankby2fizdb377tvgyx3fwdgddrxe3g47jg6.py", line 132, in partition_0 return (buf3, buf4, buf5, buf2, primals_4, ) ^^^^ NameError: name 'buf2' is not defined. Did you mean: 'buf0'? ``` When not using graph partition, it would work and give the following code ([P1992997521](https://www.internalfb.com/phabricator/paste/view/P1992997521)): ``` def call(self, args): ... buf2 = buf0; del buf0 # reuse ... ``` Note that the issue is buf0 is not reused for buf2 when using graph partition. Why? Because the codegen runs `run_wrapper_ir_passes` and `memory_plan_reuse`, which pops tailing `MemoryPlanningLine` unless it is in graph output by checking `V.graph.get_output_names()`. However, for graph partition, we should check the output of the current partition instead of the graph before partition. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165514 Approved by: https://github.com/ProExpertProg, https://github.com/eellison	2025-10-15 21:52:16 +00:00
Avik Chaudhuri	fa1539594b	consolidate fw and inference compile paths (#165457 ) By design, fw compile and inference compile stages should share a bunch of code; just consolidating the duplication here. Differential Revision: D84628978 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165457 Approved by: https://github.com/zhxchen17, https://github.com/tugsbayasgalan	2025-10-15 21:33:50 +00:00
zpcore	dfc8a1c5dd	Fix `_StridedShard` incorrect split (#165533 ) https://github.com/pytorch/pytorch/pull/164820 introduced a bug that `_StridedShard` will call parent class `Shard`'s `split_tensor` method, thus results in incorrect data locality. (I think @ezyang spotted this issue, but we have no test to capture this) Meanwhile, I notice another bug that when we normalize a `_StridedShard`'s placement, it will also trigger parent class `Shard`'s `split_tensor` method because it will create a Shard class [here](`0c14f55de6/torch/distributed/tensor/_api.py (L783)`). I think we never test `distribute_tensor` for `_StridedShard` before. So I added a test here to compare against ordered shard. Using classmethod because the _split_tensor logic is different between `Shard` and `_StridedShard`. Basically I want to shard on local tensors without initializing the Shard object: ``` local_tensor = _StridedShard._make_shard_tensor(dim, tensor, mesh, mesh_dim, split_factor=split_factor) local_tensor = Shard._make_shard_tensor(dim, tensor, mesh, mesh_dim) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165533 Approved by: https://github.com/XilunWu	2025-10-15 20:52:41 +00:00
Sarthak Tandon	7f9b745494	[ROCm][tunableop] Modified Online Tuning Mode to add Instant Logging (#163965 ) - Added instant logging in online tuning mode, so that each tuned GEMM is instantly written - Allows us to have saved tuning configs, in cases of crashes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163965 Approved by: https://github.com/naromero77amd, https://github.com/jeffdaily	2025-10-15 20:02:31 +00:00
Lucas Kabela	83f9baf413	[Bugfix][Precompile][vLLM] Support for pickling einops for aot_autograd serialization in vLLM (#165359 ) Fixes issue with compiling `Qwen2_5_vl` in https://github.com/vllm-project/vllm/pull/23207 (issue happens with `aot_autograd_cache`) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165359 Approved by: https://github.com/jamesjwu	2025-10-15 20:00:24 +00:00
Catherine Lee	ffc7552e01	See if we can handle uploading all test data (#165484 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165484 Approved by: https://github.com/izaitsevfb	2025-10-15 19:57:41 +00:00
Angel Li	78f5a1ec60	varlen api (#164502 ) Summary Today, the only way to have variable sequence length support in PyTorch attention is through nested tensors [here](https://docs.pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html#nestedtensor-and-dense-tensor-support). We also want to add an explicit lower-level API that provides variable sequence length support without padding/masking in SDPA. This PR builds out `varlen_attn`, the public API that users can call for the forward method, and `_varlen_attn`, the private API that calls into the Flash Attention/cuDNN backend. Benchmarking To benchmark, we compare runtime and TFLOPs against the current SDPA approach with padding. Settings: - 1 H100 machine - `batch_size=8`, `max_seq_len=2048`, `embed_dim=1024`, `num_heads=16` - dtype `torch.bfloat16` - `is_causal=False` - for variable length, we set sequences to be random multiples of 64 up to `max_seq_len` - 100 runs \| \| Variable Length API \| SDPA \| \|--------\|--------------------\|----------\| \| Runtime \| 0.21750560760498047 ms \| 0.43171775817871094 ms \| \| TFLOPs \| 231.812 \| 320.840 \| The sparsity is 0.453 which we can see matches the speedup we get from Varlen (approx 50%). TFLOPs remains around the same, with SDPA slightly larger due to potential higher overhead and total flops scaling with sequence length. Testing Run `python test/test_varlen_attention.py` for unit tests where we verify basic functionality and confirm numerical match between varlen outputs vs SDPA. Next steps Next steps from this PR (higher in the stack) include registering the private API `_varlen_attn` as a custom op, implementing backward support, and enabling cuDNN with correct numerics. (This stack builds on top of #162326) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164502 Approved by: https://github.com/v0i0, https://github.com/drisspg	2025-10-15 19:45:55 +00:00
eellison	2b71b62045	Add Memory Estimation Tracker (#165059 ) Add Memory Tracker utility, which will track live memory given alternate ordering of nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165059 Approved by: https://github.com/ezyang, https://github.com/IvanKobzarev ghstack dependencies: #164738, #164783, #164944, #164945	2025-10-15 19:44:29 +00:00
PyTorch MergeBot	8c4b528403	Revert "[Inductor][CuTeDSL] Move load_template up two directories (#165347 )" This reverts commit 815d6415996d5b32b569fd2a8206f1e57c75bfe3. Reverted https://github.com/pytorch/pytorch/pull/165347 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165347#issuecomment-3407958496))	2025-10-15 19:30:46 +00:00
Simon Layton	066f818eea	Refactor and unify v1/v2 _scaled_mm codes (#165436 ) Summary: * Refactor out some core routines (scaled_gemm, auto-tuned scaled_gemm) * Unify v1/v2 dispatch calls where possible * Simplify call pattern w.r.t. CUDA/ROCM for easier readability. Test Plan: ``` pytest -svv test/test_scaled_matmul_cuda.py ``` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165436 Approved by: https://github.com/drisspg	2025-10-15 19:07:05 +00:00
Luca Wehrstedt	14af1dc3da	[DeviceMesh] Fix layout calculation when flattening non-contiguous dims (#165542 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165542 Approved by: https://github.com/ezyang, https://github.com/fduwjj	2025-10-15 18:55:45 +00:00
Tugsbayasgalan Manlaibaatar	2395d7d7da	Relax equality check (#165460 ) When an object is inherited from multiple types, the previous check would fail. So we should relax it to respect eager semantic Differential Revision: [D84635322](https://our.internmc.facebook.com/intern/diff/D84635322) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165460 Approved by: https://github.com/avikchaudhuri	2025-10-15 18:32:01 +00:00
Catherine Lee	0aa7ebaf03	Fix periodic debug tests failing due to FakeProcessGroup things (#165479 ) These happen when building with CMAKE_BUILD_TYPE=RelWithAssert This should fix two types of failures that started with https://github.com/pytorch/pytorch/pull/163665 Disclaimer that I used a lot of AI since I don't know how pybind works or what refcounts and pointers are, so idk if this is a good solution, or even a solution at all (fwiw the tests pass now) The first one type is Truncated: ``` default_pg, _ = _new_process_group_helper( File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2096, in _new_process_group_helper backend_class = creator_fn(dist_backend_opts, backend_options) File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/distributed/fake_pg.py", line 25, in _create_fake_pg return FakeProcessGroup._create_internal( RuntimeError: new_refcount != 1 INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/c10/util/intrusive_ptr.h":319, please report a bug to PyTorch. intrusive_ptr: Cannot increase refcount after it reached zero. 
Exception raised from retain_ at /var/lib/jenkins/workspace/c10/util/intrusive_ptr.h:319 (most recent call first): C++ CapturedTraceback: #4 std::_Function_handler<std::shared_ptr<c10::LazyValue<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > const> (), c10::SetStackTraceFetcher(std::function<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) from Logging.cpp:0 #5 c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) from ??:0 #6 c10::detail::torchCheckFail(char const, char const, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) from ??:0 #7 c10::detail::torchInternalAssertFail(char const, char const, unsigned int, char const, char const) from ??:0 #8 void pybind11::class_<c10d::FakeProcessGroup, (anonymous namespace)::IntrusivePtrNoGilDestructor<c10d::FakeProcessGroup> >::init_instance<(anonymous namespace)::IntrusivePtrNoGilDestructor<c10d::FakeProcessGroup>, 0>(pybind11::detail::instance, void const) from init.cpp:0 #9 pybind11::detail::type_caster_generic::cast(void const, pybind11::return_value_policy, pybind11::handle, pybind11::detail::type_info const, void* ()(void const), void* ()(void const), void const) from :0 #10 pybind11::cpp_function::initialize<torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object)::{lambda(int, int, c10::intrusive_ptr<c10d::FakeProcessGroup::Options, c10::detail::intrusive_target_default_null_type<c10d::FakeProcessGroup::Options> >)#127}, c10::intrusive_ptr<c10d::FakeProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::FakeProcessGroup> >, int, int, c10::intrusive_ptr<c10d::FakeProcessGroup::Options, c10::detail::intrusive_target_default_null_type<c10d::FakeProcessGroup::Options> >, pybind11::name, pybind11::scope, pybind11::sibling, pybind11::arg, pybind11::arg, 
pybind11::arg_v>(torch::distributed::c10d::(anonymous namespace)::c10d_init(_object, _object)::{lambda(int, int, c10::intrusive_ptr<c10d::FakeProcessGroup::Options, c10::detail::intrusive_target_default_null_type<c10d::FakeProcessGroup::Options> >)#127}&&, c10::intrusive_ptr<c10d::FakeProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::FakeProcessGroup> > ()(int, int, c10::intrusive_ptr<c10d::FakeProcessGroup::Options, c10::detail::intrusive_target_default_null_type<c10d::FakeProcessGroup::Options> >), pybind11::name const&, pybind11::scope const&, pybind11::sibling const&, pybind11::arg const&, pybind11::arg const&, pybind11::arg_v const&)::{lambda(pybind11::detail::function_call&)#3}::_FUN(pybind11::detail::function_call&) from init.cpp:0 ``` and I fix it here by getting rid of `DontIncreaseRefcount` and using make_intrusive to do the ref count handling instead. However, I also had to move the constructor to be public, which I think is not good, based on the reasoning of the original PR The other one type is ``` Traceback (most recent call last): File "/var/lib/jenkins/workspace/test/test_testing.py", line 2415, in test_no_warning_on_import self.assertEqual(out, "") File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 4233, in assertEqual raise error_metas.pop()[0].to_error( # type: ignore[index] AssertionError: String comparison failed: "/opt/conda/envs/py_3.10/lib/python3.10/s[352 chars]):\n" != '' - /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/distributed/__init__.py:29: FutureWarning: pybind11-bound class 'torch._C._distributed_c10d.FakeProcessGroup' is using an old-style placement-new '__init__' which has been deprecated. See the upgrade guide in pybind11's docs. This message is only visible when compiled in debug mode. 
- if is_available() and not torch._C._c10d_init(): To execute this test, run the following from the base repo dir: python test/test_testing.py TestImports.test_no_warning_on_import ``` which I fix by getting rid of the `__init__` which I think is ok since it'll just error if you try to make one? Pull Request resolved: https://github.com/pytorch/pytorch/pull/165479 Approved by: https://github.com/ezyang	2025-10-15 18:16:08 +00:00
Jeff Daily	7a97832585	[ROCm] Add more timm models, forward fix #165381 (#165569 ) PR #165381 added timm models to cuda and cpu expected accuracy files. ROCm expected accuracy files were not updated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165569 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-15 18:11:21 +00:00
PyTorch MergeBot	84d141e910	Revert "[inductor] Expand use of generic benchmark function (#164938 )" This reverts commit 5c583e2573f29243742e00b9fa36b266c5c78bb3. Reverted https://github.com/pytorch/pytorch/pull/164938 on behalf of https://github.com/clee2000 due to I think this broke test/inductor/test_cuda_repro.py::CudaReproTests::test_epilogue_fusion_with_view? [GH job link](https://github.com/pytorch/pytorch/actions/runs/18529735968/job/52813191763) [HUD commit link](`f58f301313`) on both rocm and the slow grad check for linux. It did run successfully on cuda workflow on trunk, I wonder if this a gpu capability thing? no clue though ([comment](https://github.com/pytorch/pytorch/pull/164938#issuecomment-3407600224))	2025-10-15 17:48:38 +00:00
Simon Layton	7c6c5d04fe	Add scaled_grouped_mm_v2 and python API (#165154 ) Summary: * Add `torch._scaled_grouped_mm_v2` with more functionality and extensibility for future formats * Add `torch.nn.functional.scaled_grouped_mm` as public entrypoint * Test both original and v2 functionality Test Plan: ``` pytest -svv -k grouped test/test_scaled_matmul_cuda.py ``` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165154 Approved by: https://github.com/drisspg, https://github.com/danielvegamyhre	2025-10-15 17:47:23 +00:00
PyTorch MergeBot	b509fb9b5d	Revert "add and fix OpInfo tests for the default partitioner (#165372 )" This reverts commit bcfea48ab7fd489218289693b98c1a6a6582d079. Reverted https://github.com/pytorch/pytorch/pull/165372 on behalf of https://github.com/malfet due to Looks like it broke slow jobs, see `331b7cc054/1` ([comment](https://github.com/pytorch/pytorch/pull/165372#issuecomment-3407567748))	2025-10-15 17:38:52 +00:00
Scott Wolchok	331b7cc054	Fix double dispatch to Python for detach (#163671 ) This fixes #71725. Differential Revision: [D83857880](https://our.internmc.facebook.com/intern/diff/D83857880) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163671 Approved by: https://github.com/ezyang, https://github.com/albanD	2025-10-15 17:24:50 +00:00
Nikhil Patel	815d641599	[Inductor][CuTeDSL] Move load_template up two directories (#165347 ) Summary: Moves the function used to load CuTeDSL Jinja templates up one level out of the flex attention folder. This way it can be used for more general Inductor templates in the future. Test Plan: `INDUCTOR_TEST_DISABLE_FRESH_CACHE=1 TORCHINDUCTOR_CACHE_DIR=~/cutetest buck2 run mode/opt //caffe2/test/inductor:flex_flash -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8` Differential Revision: D84527470 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165347 Approved by: https://github.com/drisspg	2025-10-15 16:34:58 +00:00
Timm Ruland	ffe3cb226a	In pipeline parallelism: Use same dtype for receive and send tensor when initializing p2p communication. (#165539 ) When initializing the p2p communication for pipeline parallelism, currently different default dtypes are used for the send and receive tensor here: `5c583e2573/torch/distributed/pipelining/stage.py (L935-L936)` This caused hard to trace issues when training on multiple nodes. Multiple stages on one node seem to work for some reason which probably caused the unit tests not to catch this. Fixes #165143 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165539 Approved by: https://github.com/H-Huang	2025-10-15 15:05:55 +00:00
fduwjj	7ae123d72c	[DeviceMesh] Make _flatten_mapping an object attribute instead of a class attribute (#165521 ) The `_flatten_mapping` field was defined as a class attribute with a mutable default value {}: ``` _flatten_mapping: dict[str, "DeviceMesh"] = {} ``` This caused all DeviceMesh instances to share the same dictionary object. When multiple test instances tried to create flattened meshes with the same name (like "dp"), they would conflict because they were all using the same shared dictionary, resulting in the error: "Flatten mesh with mesh_dim_name dp has been created before, Please specify another valid mesh_dim_name." Pull Request resolved: https://github.com/pytorch/pytorch/pull/165521 Approved by: https://github.com/fegin, https://github.com/lw	2025-10-15 14:47:09 +00:00
Aidyn-A	7719cb75bf	[ATen][CMake] Fix duplicated CUTLASS path (#165424 ) Fixes #165110 The `PUBLIC` scope causes CUTLASS of the FBGEMM being included in for all PyTorch targets, including special matmuls (RowwiseScaledMM, ScaledGroupMM and GroupMM). Due to version mismatch between FBGEMM/CUTLASS and PyTorch/CUTLASS it is unacceptable to use FBGEMM/CUTLASS in PyTorch targets. This PR limits the scope of FBGEMM/CUTLASS to `fbgemm_genai` target only. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165424 Approved by: https://github.com/cthi, https://github.com/eqy, https://github.com/danielvegamyhre	2025-10-15 14:14:17 +00:00
PaulZhang12	712f54d453	[ATen] Remove explicit casting of complex nansum during accumulation (#165494 ) https://github.com/pytorch/pytorch/pull/164790 modifies aten to perform a different reduction order intra warp. However, this change exposed a large difference in a sum for complex32. Namely the case: ``` import torch a = torch.tensor([[ 4.82031250+7.34765625j, -3.37109375-1.9501953125j], [ 3.7832031250-2.43359375j, -6.07812500+5.32812500j]], dtype=torch.complex32, device='cuda:0') sum_out = torch.sum(a) nansum_out = torch.nansum(a) torch.testing.assert_close( sum_out, nansum_out, rtol=0, atol=0, ) ``` Here, the result of `sum` and `nansum` differed significantly by 1e-2. Further investigation showed that the explicit casting of b back to `arg_t` from `scalar_t` was the root cause. `arg_t` is the dtype of the accumulator, ComplexFloat, and `scalar_t` of the input dtype, ComplexHalf. When we cast in the reduction to the accumulator order, that means the input is still of ComplexHalf, which loses precision as it can store intermediate values. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165494 Approved by: https://github.com/ngimel	2025-10-15 13:49:25 +00:00
Samuel Park	f58f301313	Fixes bug with tolist calls to GradTrackingTensors (#165184 ) Fixes #161943 ## The Fix I implemented a recursive unwrapping helper function in the `tensor_to_list.cpp` file that looks for wrapped tensors and unwraps them. The recursive implementation was needed for multi-level gradTrackingTensors. Let me know if there are any more suggestions on fixing this issue! @guilhermeleobas @KimbingNg Pull Request resolved: https://github.com/pytorch/pytorch/pull/165184 Approved by: https://github.com/zou3519	2025-10-15 12:54:28 +00:00
Mwiza Kunda	5c583e2573	[inductor] Expand use of generic benchmark function (#164938 ) Use the more generic `Benchmarker.benchmark` function to allow benchmarking other devices that support the required functionality, for example prologue and epilogue fusion can be benchmarked for triton CPU. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164938 Approved by: https://github.com/nmacchioni, https://github.com/eellison	2025-10-15 09:18:24 +00:00
Bob Ren	0c14f55de6	[ez] fix typo (#165282 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165282 Approved by: https://github.com/ezyang, https://github.com/mlazos	2025-10-15 06:19:24 +00:00
Isalia20	8e510e1095	[MPS] fix empty dot op crash (#165237 ) reproducer ``` import torch # does not crash a = torch.rand((0), device="cpu") b = torch.rand((0), device="cpu") a.dot(b) # crashes due to internal assert a = torch.rand((0), device="mps") b = torch.rand((0), device="mps") a.dot(b) ``` Discovered when implementing an op for SparseMPS backend Pull Request resolved: https://github.com/pytorch/pytorch/pull/165237 Approved by: https://github.com/malfet	2025-10-15 04:49:29 +00:00
PyTorch UpdateBot	59d30d1b75	[vision hash update] update the pinned vision hash (#165496 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vision hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165496 Approved by: https://github.com/pytorchbot	2025-10-15 04:35:50 +00:00
PyTorch UpdateBot	3915898c22	[audio hash update] update the pinned audio hash (#165495 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165495 Approved by: https://github.com/pytorchbot	2025-10-15 04:32:49 +00:00
PyTorch MergeBot	3044e1a460	Revert "varlen api (#164502 )" This reverts commit 3681312ce03e425e280a110df2153db107616a15. Reverted https://github.com/pytorch/pytorch/pull/164502 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but the doctests failure is legit ([comment](https://github.com/pytorch/pytorch/pull/164502#issuecomment-3404419420))	2025-10-15 03:56:42 +00:00
Yuanyuan Chen	b11593c31b	[8/N] Apply ruff UP035 rule (#165214 ) This is follow-up of #164653 to continue applying `UP035` fixes. The purpose is to finally enable this rule. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165214 Approved by: https://github.com/ezyang	2025-10-15 03:18:57 +00:00
Yuanyuan Chen	36871622f1	[2/N] Mark unused parameters in C++ code (#165121 ) This is follow-up of #164912 to mark unused C++ parameters to improve code readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165121 Approved by: https://github.com/Skylion007	2025-10-15 03:04:39 +00:00
Michael Gathara	b4fd47179e	feat(dynamo): IS#160752 make F.one_hot work with jacfwd + torch.compile(dynamic=True) (#160837 ) Fixes #160752 # Background: `torch.func.jacfwd` is implemented as vmap over forward-mode JVP. With torch.compile(dynamic=True), FakeTensor + SymInt shape reasoning is used while tracing through the transform. The old vmap rule for one_hot decomposed into “zeros_symint + scatter,” which interacted poorly with the transform stack and dynamic shapes, leading to failures mid-trace. Using a functional equality construction makes one_hot composable with vmap/JVP and friendly to dynamic shape tracing. # Changes: - functorch vmap batching rule for `aten::one_hot` now uses a purely functional formulation: - Replace “zeros + scatter” with eq(self.unsqueeze(-1), arange(num_classes)).to(kLong) under FuncTorchBatched. - one_hot native path remains unchanged for regular eager; vmap transform no longer relies on scatter, which was fragile under dynamic shape tracing. The minimal repro from the issue is now fixed: ```python import torch import torch.nn.functional as F MAX, BATCH = 3, 37 def func(x, idxs): return x.square() * F.one_hot(idxs, MAX) def jacfunc(x, idxs): return torch.func.jacfwd(func, argnums=0)(x, idxs) idxs = torch.randint(MAX, (BATCH,), dtype=torch.int64) x = torch.rand((BATCH, MAX), dtype=torch.float64) # eager out_eager = jacfunc(x, idxs) # compiled dynamic jacfunc_c = torch.compile(jacfunc, dynamic=True) out_comp = jacfunc_c(x, idxs) torch.testing.assert_close(out_eager, out_comp) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160837 Approved by: https://github.com/guilhermeleobas, https://github.com/zou3519	2025-10-15 02:48:44 +00:00
Alex Sibiryakov	4f400ab520	Fix: nDims is mutated inside the loop in Shape.cu (#165446 ) Summary: The `nDims` variable is mutated inside the loop but never restored to its original value. This affects subsequent iterations of the outer loop. Each batch iteration may get incorrect `nDims` after the first batch. Test Plan: CI Reviewed By: ngimel Differential Revision: D84612194 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165446 Approved by: https://github.com/ngimel	2025-10-15 02:32:15 +00:00
Zhengxu Chen	839f6facdb	[precompile] Fix frame construction for wrapped model. (#165454 ) Summary: If a function is wrapped with functools, we should not look at the wrapped function signature but rather the wrapper, since we need to construct the frame for the top level function here. Test Plan: test_decorated_function_with_functools_wrap_aot Differential Revision: D84626752 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165454 Approved by: https://github.com/yiming0416	2025-10-15 02:01:46 +00:00
Howard Huang	ca65023b90	[PP] Fix edge case with FSDP when stages_per_rank > 3 (#165467 ) There is an edge case with FSDP + PP when we add UNSHARD + RESHARD, we at max have 3 stages unsharded, `3f83e8915e/torch/distributed/pipelining/schedules.py (L1029-L1031)` This change is needed to be able to unshard and reshard a stage multiple times. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165467 Approved by: https://github.com/wwwjn	2025-10-15 01:53:04 +00:00
Huy Do	132ae8e6dd	Don't link with libnvToolsExt when building for 12.9 (#165465 ) This is to bring back this logic from https://github.com/pytorch/pytorch/pull/161916/files#diff-bf46b4a09ca67e50622bf84fefc0d11b584ffcc24ee6cc5019cf0fc7565d81a8L170. Building libtorch on 12.9 is failing otherwise https://github.com/pytorch/pytorch/actions/runs/18458531395/job/52610761895: ``` cp: cannot stat '/usr/local/cuda/lib64/libnvToolsExt.so.1': No such file or directory ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165465 Approved by: https://github.com/atalman, https://github.com/malfet	2025-10-15 01:45:37 +00:00
Bernhard Manfred Gruber	a20afb6100	Allow at::native::offset_t to be offset using `operator+=` (#164570 ) This will be required by CCCL 3.1. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164570 Approved by: https://github.com/Skylion007, https://github.com/eqy	2025-10-15 01:40:54 +00:00
Yiming Zhou	47524dcc48	[benchmark] Add more timm models (#165381 ) Added following models to timm_models - [convnextv2_nano.fcmae_ft_in22k_in1k](https://huggingface.co/timm/convnextv2_nano.fcmae_ft_in22k_in1k) - [vit_base_patch14_dinov2.lvd142m](https://huggingface.co/timm/vit_base_patch14_dinov2.lvd142m) - [ViT-B-16-SigLIP-i18n-256](https://huggingface.co/timm/ViT-B-16-SigLIP-i18n-256) - [deit_tiny_patch16_224.fb_in1k](https://huggingface.co/timm/deit_tiny_patch16_224.fb_in1k) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165381 Approved by: https://github.com/BoyuanFeng	2025-10-15 01:19:10 +00:00
Amandeep Chhabra	9ffba8a2f9	fixing stress test failure (#164353 ) Summary: This diff fixes a stress test failure by adding a new binary echo4.py and modifying the existing echo1.py binary. The changes are made in both fbcode and xplat directories. The api_test.py file is updated to use the new echo4.py binary, and the BUCK file is updated to include the new binary. Test Plan: ``` buck test -j 18 'fbcode//mode/opt' fbcode//caffe2/test/distributed/elastic/multiprocessing:api_test -- --exact 'caffe2/test/distributed/elastic/multiprocessing:api_test - test_binary_redirect_and_tee (api_test.StartProcessesListAsBinaryTest)' --run-disabled --stress-runs 20 --record-results ``` ``` buck test -j 18 'fbcode//mode/opt' fbcode//caffe2/test/distributed/elastic/multiprocessing:api_test -- --exact 'caffe2/test/distributed/elastic/multiprocessing:api_test - test_binary (api_test.StartProcessesListAsBinaryTest)' --run-disabled --stress-runs 20 --record-results ``` https://www.internalfb.com/intern/testinfra/testrun/17732923648474906 https://www.internalfb.com/intern/testinfra/testrun/15481123834815653 Differential Revision: D83623694 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164353 Approved by: https://github.com/d4l3k	2025-10-15 01:18:50 +00:00
Angel Li	3681312ce0	varlen api (#164502 ) Summary Today, the only way to have variable sequence length support in PyTorch attention is through nested tensors [here](https://docs.pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html#nestedtensor-and-dense-tensor-support). We also want to add an explicit lower-level API that provides variable sequence length support without padding/masking in SDPA. This PR builds out `varlen_attn`, the public API that users can call for the forward method, and `_varlen_attn`, the private API that calls into the Flash Attention/cuDNN backend. Benchmarking To benchmark, we compare runtime and TFLOPs against the current SDPA approach with padding. Settings: - 1 H100 machine - `batch_size=8`, `max_seq_len=2048`, `embed_dim=1024`, `num_heads=16` - dtype `torch.bfloat16` - `is_causal=False` - for variable length, we set sequences to be random multiples of 64 up to `max_seq_len` - 100 runs \| \| Variable Length API \| SDPA \| \|--------\|--------------------\|----------\| \| Runtime \| 0.21750560760498047 ms \| 0.43171775817871094 ms \| \| TFLOPs \| 231.812 \| 320.840 \| The sparsity is 0.453 which we can see matches the speedup we get from Varlen (approx 50%). TFLOPs remains around the same, with SDPA slightly larger due to potential higher overhead and total flops scaling with sequence length. Testing Run `python test/test_varlen_attention.py` for unit tests where we verify basic functionality and confirm numerical match between varlen outputs vs SDPA. Next steps Next steps from this PR (higher in the stack) include registering the private API `_varlen_attn` as a custom op, implementing backward support, and enabling cuDNN with correct numerics. (This stack builds on top of #162326) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164502 Approved by: https://github.com/v0i0, https://github.com/drisspg	2025-10-15 00:45:06 +00:00
PyTorch MergeBot	7778a58e7c	Revert "[export] Handle kwargs better in aot_export_joint_with_descriptors (#165334 )" This reverts commit bbb902c8dd911e1587253f496c1e2fb178d4b6a1. Reverted https://github.com/pytorch/pytorch/pull/165334 on behalf of https://github.com/jeffdaily due to trunk CI passed here but failures on HUD after merge? test/functorch/test_aot_joint_with_descriptors.py::TestAOTJointWithDescriptors::test_module_with_kwargs [GH job link](https://github.com/pytorch/pytorch/actions/runs/18511729262/job/52755708742) [HUD commit link](`bbb902c8dd`) ([comment](https://github.com/pytorch/pytorch/pull/165334#issuecomment-3404071893))	2025-10-15 00:21:49 +00:00
Xu Han	e7091a47da	[AOTI] skip Windows XPU crashed UTs. (#165393 ) Skip some UTs, which crashed on Windows XPU. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165393 Approved by: https://github.com/jansel	2025-10-14 23:45:14 +00:00
Brian Hirsh	bcfea48ab7	add and fix OpInfo tests for the default partitioner (#165372 ) I noticed the default partitioner was breaking in some dynamic shape tests, so prior to turning off functionalization I want to tweak it to pass all of our OpInfo tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/165372 Approved by: https://github.com/ezyang ghstack dependencies: #165327	2025-10-14 23:34:34 +00:00
Brian Hirsh	d2e1dbc8f2	make aotdispatcher opinfo tests keep input mutations in graph (#165327 ) This stack is going to turn off functionalization and turn on the default partitioner, so I'm going to separate out a few changes before turning off functionalization in our OpInfo tests: (1) run our tests with input mutations allowed inside the graph (2) run our tests with the default partitioner (3) run with functionalization off (4) (later) make the tests properly test for bitwise equivalence Pull Request resolved: https://github.com/pytorch/pytorch/pull/165327 Approved by: https://github.com/ezyang	2025-10-14 23:34:33 +00:00
fduwjj	89298ada83	[device_mesh] Implement `_unflatten` on top of CuTe layout bookkeeping (#161224 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161224 Approved by: https://github.com/lw, https://github.com/fegin ghstack dependencies: #164510	2025-10-14 23:17:11 +00:00
sekyonda	c467e59cb0	dynamo configs to torch.compiler (#163517 ) Moving some dynamo configs to torch.compiler Pull Request resolved: https://github.com/pytorch/pytorch/pull/163517 Approved by: https://github.com/williamwen42, https://github.com/anijain2305 Co-authored-by: Svetlana Karslioglu <svekars@meta.com>	2025-10-14 22:44:53 +00:00
angelayi	bbb902c8dd	[export] Handle kwargs better in aot_export_joint_with_descriptors (#165334 ) fx.Interpreter doesn't handle kwargs... not sure how this code worked previously Pull Request resolved: https://github.com/pytorch/pytorch/pull/165334 Approved by: https://github.com/tugsbayasgalan, https://github.com/ezyang	2025-10-14 22:22:58 +00:00
Guilherme Leobas	e6f766c7d7	[Dynamo] Fixes for exceptions (#153966 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/153966 Approved by: https://github.com/Lucaskabela	2025-10-14 22:03:58 +00:00
Wei Feng	13b621d87c	[DTensor] add __repr__ for CommDebugMode(get_total_count()=) (#165006 ) I just want to print CommDebugMode and know if there is communication. implementing `__repr__` for `print(comm_mode)` ``` comm_mode = CommDebugMode() with comm_mode: out = torch.mm(inps, weight) print(comm_mode) # CommDebugMode(get_total_counts()=0) ``` Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/165006 Approved by: https://github.com/anshul-si ghstack dependencies: #165024	2025-10-14 21:31:23 +00:00
Dzmitry Huba	01738a3fea	Continue local tensor mode enablement for DTensor tests (#165451 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165451 Approved by: https://github.com/ezyang, https://github.com/albanD	2025-10-14 21:20:54 +00:00
PyTorch MergeBot	a2f34bdd7c	Revert "Patch the flex_attention._get_mod_type to not use inspect.signature when computing num_positional_args (an alternative fix for flex attention graph break on create_block_mask) (#164923 )" This reverts commit 3401665110dbfbfa4625646e4a18ebf8c99fa92f. Reverted https://github.com/pytorch/pytorch/pull/164923 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164923#issuecomment-3403654378))	2025-10-14 21:20:49 +00:00
karthickai	a63ab0b8cd	[Inductor] Fix out-of-bounds indices in repeat_interleave decomposition (#165368 ) When `repeat_interleave` is decomposed into: ```bash cumsum = repeat.cumsum(0) pos = torch.arange(output_size, device=repeat.device) indices = torch.searchsorted(cumsum, pos, right=True) ``` `searchsorted` op with `right=True` returns the insertion point after matching elements. When query values `pos` are `>= cumsum[-1]`, searchsorted returns `len(cumsum)`, which is out of bounds for indexing (valid range: `[0, len(cumsum)-1]`). These invalid indices trigger CUDA device-side assert errors in downstream indexing operations. This fix adds clamping to ensure all indices stay within the valid range [0, repeat.size(0)-1]. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165368 Approved by: https://github.com/mlazos	2025-10-14 21:16:36 +00:00
Yiming Zhou	102b7885ff	Add option to run AOT Precompile in benchmark (#164906 ) Use the existing benchmark infra to get some signals for AOT precompile pass rate on OSS models. Here we also measure and log the loading time. ``` python ./benchmarks/dynamo/huggingface.py --accuracy --inference --aot-precompile python ./benchmarks/dynamo/timm_models.py --accuracy --inference --aot-precompile python ./benchmarks/dynamo/torchbench.py --accuracy --inference --aot-precompile ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164906 Approved by: https://github.com/zhxchen17	2025-10-14 20:59:55 +00:00
Janani Sriram	382d04a51e	[Inductor][ATen][FP8] Add note for supported blockwise scaling strategy pairs (#165450 ) Summary: Add note mentioning which scaling type pairs are supported in Inductor ATen, since this was a source of confusion and also informs which scaling strategies we choose to support for other backends, like Triton. Test Plan: n/a Reviewed By: lw Differential Revision: D84522373 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165450 Approved by: https://github.com/NikhilAPatel	2025-10-14 20:43:58 +00:00
Jean Schmidt	1ec0755a7e	[ISSUES] Update ci:sev template to include a note about ci: disable-autorevert label (#165459 ) We noticed that disabling autorevert in any and all ci:sevs is too impactful, as ci: sevs are sometimes created just to communicate an action or an impactful change. But sometimes during a SEV we might not want to disable autorevert anyways, an example is a ci: sev impacting jobs we don't use as basis for autorevert. So, a note is added reminding the ci:sev author to optionally add this tag to disable auto-revert Note: using this opportunity to fix the ci: disable-autorevert issues. As it is best for the title to be simple and the displayed message in the GitHub interface to be decorated with emoji :) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165459 Approved by: https://github.com/malfet	2025-10-14 20:32:46 +00:00
Malay Bag	058782c6ab	[torch.export] Removing unused constants - add support for corner case (#165205 ) Summary: In some cases unused constant had only one level of child node, no second level of child node. Those constants should be removed too. The added test case covers this scenario. Test Plan: ``` buck test mode/opt caffe2/test:test_export -- 'test_unused_constant' ``` https://www.internalfb.com/intern/testinfra/testrun/15481123837456594 Differential Revision: D84398413 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165205 Approved by: https://github.com/angelayi	2025-10-14 20:26:28 +00:00
angelayi	2b4ef6b4d6	[opaque_obj_v2] PyObject custom op schema type (#165004 ) This is a cleaner implementation of opaque objects (https://github.com/pytorch/pytorch/pull/162660). Instead now we just need to do: Call `register_opaque_type` to register the type as being "opaque" and allowed by custom ops. You also need to pass a unique name that maps to the type. ```python class OpaqueQueue: def __init__(self, queue: list[torch.Tensor], init_tensor_: torch.Tensor) -> None: super().__init__() self.queue = queue self.init_tensor_ = init_tensor_ def push(self, tensor: torch.Tensor) -> None: self.queue.append(tensor) def pop(self) -> torch.Tensor: if len(self.queue) > 0: return self.queue.pop(0) return self.init_tensor_ def size(self) -> int: return len(self.queue) register_opaque_type(OpaqueQueue, "_TestOpaqueObject_OpaqueQueue") ``` When creating the custom op, the schema will then use the unique name: ```python self.lib = torch.library.Library("_TestOpaqueObject", "FRAGMENT") torch.library.define( "_TestOpaqueObject::queue_push", "(_TestOpaqueObject_OpaqueQueue a, Tensor b) -> ()", tags=torch.Tag.pt2_compliant_tag, lib=self.lib, ) @torch.library.impl( "_TestOpaqueObject::queue_push", "CompositeExplicitAutograd", lib=self.lib ) def push_impl(queue: OpaqueQueue, b: torch.Tensor) -> None: assert isinstance(queue, OpaqueQueue) queue.push(b) ``` Using the custom op: ```python queue = OpaqueQueue([], torch.zeros(3)) torch.ops._TestOpaqueObject.queue_push(queue, torch.ones(3)) self.assertTrue(queue.size(), 1) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165004 Approved by: https://github.com/albanD	2025-10-14 20:21:04 +00:00
q1l1	3f83e8915e	[inductor] fix issue for example value with unbacked strides (#163660 ) ## Issue During autotune, we're not applying size hints atomically for the example inputs used for benchmarking. If there is unbacked symint showing up in inputs' strides, this might lead to CUDA IMA, and this could be reproduced by the added unittest, with stride being `[128 * u0, 128, 1]` and unbacked fallback being 8192, after calling `benchmark_example_value`, we get back a tensor with stride as `[8192, 128, 1]` as opposed to `[128 * 8192, 128, 1]` ## Fix Using the atomic API when trying to apply size hints to input tensor' strides. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163660 Approved by: https://github.com/ColinPeppler	2025-10-14 20:07:51 +00:00
Jeff Daily	d7e3f493d9	[ROCm][CI] add mi355 to inductor perf test nightly (#165326 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165326 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-14 20:03:21 +00:00
Edward Yang	08f09d9543	Ensure rms_norm decomp generates add.Scalar for pattern match BC (#165437 ) Summary: Apparently if I just do `tensor + eps` this turns into add.Tensor, which is bad because the constant Tensor ends up getting hoisted into an input, which is a bozo thing to do. Just make sure it's exactly compatible. Test Plan: ``` buck run 'fbcode//mode/opt' fbcode//bolt/nn/executorch/backends/tests:qnn_test_ar1g1 bolt.nn.executorch.backends.tests.qnn_test_ar1g1.QnnTestAR1G1.test_RMSNorm ``` Reviewed By: tugsbayasgalan Differential Revision: D84613184 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165437 Approved by: https://github.com/tugsbayasgalan	2025-10-14 19:56:37 +00:00
Tugsbayasgalan (Tugsuu) Manlaibaatar	74acf92648	Forward fix inductor failure (#165363 ) (#165443 ) Summary: Title Test Plan: CI Differential Revision: D84615478 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165443 Approved by: https://github.com/angelayi	2025-10-14 19:31:58 +00:00
Nikita Shulga	cbf212e9c7	[CI] Fix doctest job if build without distributed (#165449 ) Guard test with `TORCH_DOCTEST_DISTRIBUTED` and set it to true in run_test.py to be able to pass doctest for PyTorch build without distributed support. This is a regression introduced by https://github.com/pytorch/pytorch/pull/164806 Fixes https://github.com/pytorch/pytorch/issues/165343 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165449 Approved by: https://github.com/seemethere	2025-10-14 19:19:03 +00:00
Guilherme Leobas	d18e068fd6	[dict] Implement `__eq__` for dict_items (#155154 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155154 Approved by: https://github.com/anijain2305	2025-10-14 18:56:51 +00:00
jmaczan	3401665110	Patch the flex_attention._get_mod_type to not use inspect.signature when computing num_positional_args (an alternative fix for flex attention graph break on create_block_mask) (#164923 ) The initial fix for inspect.signature uses not a right approach (https://github.com/pytorch/pytorch/pull/164349#pullrequestreview-3306614010). As @williamwen42 suggests (https://github.com/pytorch/pytorch/pull/164349#issuecomment-3379222885) we can just for now get rid of `inspect.signature` call in flex_attention to resolve this high priority issue (https://github.com/pytorch/pytorch/issues/164247#issuecomment-3378673179). In this PR I did exactly this - limited the scope of fix to just computing `num_positional_args` in `flex_attention._get_mod_type` based on properties returned by `NestedUserFunctionVariable.const_getattr` (some were missing so I added them) Fixes #164247 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164923 Approved by: https://github.com/williamwen42	2025-10-14 18:29:15 +00:00
Sean McGovern	8c60f4ae08	[Distributed] update table in docs (#165009 ) Fixes #162248 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165009 Approved by: https://github.com/ezyang	2025-10-14 18:17:22 +00:00
Rohit Singh Rathaur	c4565c3b94	[distributed] Replace 164 assert statements in fsdp directory (#165235 ) Replace assert statements with explicit if/raise patterns across 20 files: - _optim_utils.py (38 asserts) - _flat_param.py (25 asserts) - _fully_shard/_fsdp_param.py (23 asserts) - sharded_grad_scaler.py (12 asserts) - fully_sharded_data_parallel.py (11 asserts) - wrap.py (10 asserts) - _state_dict_utils.py (9 asserts) - _fully_shard/_fsdp_param_group.py (8 asserts) - _runtime_utils.py (6 asserts) - _init_utils.py (6 asserts) - 10 additional files (16 asserts) This prevents assertions from being disabled with Python -O flag. Fixes partially #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165235 Approved by: https://github.com/albanD	2025-10-14 18:04:57 +00:00
Wei Feng	6918f17114	[FSDP2] provide public API to share cuda streams across roots (#165024 ) for pipeline parallel, we can have multiple FSDP roots (chunks) ``` model = nn.Sequential([chunk0, chunk1]) fully_shard(model.chunk0) fully_shard(model.chunk1) ``` we can call `share_comm_ctx` to share all-gather, reduce-scatter, all-reduce cuda streams. this avoids inter-stream memory fragmentation ``` from torch.distributed.fsdp import share_comm_ctx share_comm_ctx([model.chunk0, model.chunk1]) ``` unit test: `pytest -s test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_share_comm_context` Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/165024 Approved by: https://github.com/mori360	2025-10-14 17:50:46 +00:00
Rohit Singh Rathaur	9b6be53326	[distributed] Replace 94 assert statements in tensor ops files (#165229 ) Replace assert statements with explicit if/raise patterns in: - _math_ops.py (43 asserts) - _matrix_ops.py (27 asserts) - _view_ops.py (24 asserts) This prevents assertions from being disabled with Python -O flag. Fixes partially #164878. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165229 Approved by: https://github.com/albanD	2025-10-14 17:28:06 +00:00
Kathryn-cat	7fee6bbf34	[Fix] Completely remove stride normalization on DLPack Tensor (#164161 ) A followup on PR #163282 Fixes #163274 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164161 Approved by: https://github.com/ngimel, https://github.com/eqy	2025-10-14 17:17:11 +00:00
ruisizhang123	6adaa328f4	[autobucketing] aten autobucketing fix to enable aot_eager pass (#165063 ) When the autobucketing pass is registered as aot_eager backend `fw_compiler` and `bw_compiler`, this pr ensures the tensors are all-gathers on "cpu/cuda" device instead of "meta" device. When we do `dist.all_gather_object`, it will create new bytestorage outside no_dispatch [here](`a2e2e1d8c0/torch/distributed/distributed_c10d.py (L3303)`), which is on meta device. Thus, I updated the code to use `unset_fake_temporarily`, which would gather RealTensor from other ranks. It is needed to unblock the aot_eager+autobucketing pass in this [PR](https://github.com/pytorch/torchtitan/pull/1813). Otherwise, I hit the error as follows: ```bash traceback : Traceback (most recent call last): File "/home/ruisizhang123/pytorch/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 358, in wrapper return f(args, kwargs) File "/home/ruisizhang123/torchtitan/torchtitan/train.py", line 607, in train self.train_step(data_iterator) ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^ File "/home/ruisizhang123/torchtitan/torchtitan/train.py", line 507, in train_step loss = self.forward_backward_step(input_dict, labels) File "/home/ruisizhang123/torchtitan/torchtitan/train.py", line 483, in forward_backward_step pred = model_parts[0](inputs, extra_inputs, extra_args) File "/home/ruisizhang123/pytorch/torch/_dynamo/eval_frame.py", line 418, in __call__ return super().__call__(args, *kwargs) ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ File "/home/ruisizhang123/pytorch/torch/nn/modules/module.py", line 1784, in _wrapped_call_impl return self._call_impl(args, *kwargs) ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ File "/home/ruisizhang123/pytorch/torch/nn/modules/module.py", line 1795, in _call_impl return forward_call(args, kwargs) File "/home/ruisizhang123/pytorch/torch/_dynamo/eval_frame.py", line 901, in compile_wrapper raise e.remove_dynamo_frames() from None # see TORCHDYNAMO_VERBOSE=1 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ruisizhang123/pytorch/torch/_dynamo/output_graph.py", line 2359, in _call_user_compiler raise BackendCompilerFailed( self.compiler_fn, e, inspect.currentframe() ).with_traceback(e.__traceback__) from None File "/home/ruisizhang123/pytorch/torch/_dynamo/output_graph.py", line 2334, in _call_user_compiler compiled_fn = compiler_fn(gm, example_inputs) File "/home/ruisizhang123/pytorch/torch/_dynamo/repro/after_dynamo.py", line 156, in __call__ compiled_gm = compiler_fn(gm, example_inputs) File "/home/ruisizhang123/pytorch/torch/__init__.py", line 2441, in __call__ return self.compiler_fn(model_, inputs_, self.kwargs) ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ruisizhang123/pytorch/torch/_dynamo/backends/common.py", line 117, in __call__ cg = aot_module_simplified(gm, example_inputs, *self.kwargs) File "/home/ruisizhang123/pytorch/torch/_functorch/aot_autograd.py", line 1100, in aot_module_simplified compiled_fn, _ = aot_stage2_compile( ~~~~~~~~~~~~~~~~~~^ aot_state, ^^^^^^^^^^ ...<4 lines>... 
inference_compiler, ^^^^^^^^^^^^^^^^^^^ ) ^ File "/home/ruisizhang123/pytorch/torch/_functorch/_aot_autograd/graph_compile.py", line 257, in aot_stage2_compile return aot_stage2_autograd(aot_state, aot_graph_capture) File "/home/ruisizhang123/pytorch/torch/_functorch/_aot_autograd/graph_compile.py", line 1696, in aot_stage2_autograd compiled_fw_func = aot_config.fw_compiler(fw_module, adjusted_flat_args) File "/home/ruisizhang123/torchtitan/torchtitan/experiments/simple_fsdp/backend.py", line 35, in aten_autobucketing_reordering_pass schedule_overlap_bucketing(gm) ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^ File "/home/ruisizhang123/pytorch/torch/_inductor/fx_passes/overlap_scheduling.py", line 755, in schedule_overlap_bucketing ).run() ~~~^^ File "/home/ruisizhang123/pytorch/torch/_inductor/fx_passes/overlap_scheduling.py", line 358, in run self._align_compute_nodes_runtime_estimations_across_all_distributed_ranks() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^ File "/home/ruisizhang123/pytorch/torch/_inductor/fx_passes/overlap_scheduling.py", line 337, in _align_compute_nodes_runtime_estimations_across_all_distributed_ranks dist.all_gather_object( ~~~~~~~~~~~~~~~~~~~~~~^ gathered_runtime_estimations, runtime_estimations, pg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "/home/ruisizhang123/pytorch/torch/distributed/c10d_logger.py", line 82, in wrapper return func(args, **kwargs) File "/home/ruisizhang123/pytorch/torch/distributed/distributed_c10d.py", line 3170, in all_gather_object input_tensor, local_size = _object_to_tensor(obj, current_device, group) ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ruisizhang123/pytorch/torch/distributed/distributed_c10d.py", line 3079, in _object_to_tensor byte_tensor = torch.ByteTensor(byte_storage).to(device) ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^ torch._dynamo.exc.BackendCompilerFailed: backend='compiler_fn' raised: RuntimeError: Attempted to set the storage of a tensor on device "cpu" 
to a storage on different device "meta". This is no longer allowed; the devices must match. Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165063 Approved by: https://github.com/eellison	2025-10-14 17:09:54 +00:00
Paul Zhang	4a7eed527f	Make truediv numerics change external only for now (#165328 ) Summary: For D84399286, failing ads ne deterministic tests now. These tests are especially brittle with subtle bitwise numerics changes. Will reenable for fbcode once e2e validation tests are performed Test Plan: N/A Differential Revision: D84514361 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165328 Approved by: https://github.com/izaitsevfb	2025-10-14 17:08:17 +00:00
PyTorch MergeBot	d2494cbb2b	Revert "[distributed] Replace assert statements with AssertionError exceptions (#165216 )" This reverts commit 74db92b21868b7e9e77cc966e5d57a8246723cbd. Reverted https://github.com/pytorch/pytorch/pull/165216 on behalf of https://github.com/clee2000 due to I think this broke distributed/test_pg_wrapper.py::ProcessGroupNCCLWrapperTest::test_debug_level_detail_no_gloo [GH job link](https://github.com/pytorch/pytorch/actions/runs/18492765290/job/52693842750) [HUD commit link](`74db92b218`), note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/165216#issuecomment-3402838765))	2025-10-14 17:05:16 +00:00
Shangdi Yu	5eddbb5e47	[annotate] Annotation should be mapped across submod (#165202 ) The match for backward nodes might be in a different submod, so we should check all submod for potential matches. In flex attention, this could happen if `mask_mod` has operations (such as index) that increase the seq_nr of the forward graph nodes. Then the backward flex_attention nodes cannot find a match in its own subgraph. ``` python test/functorch/test_aot_joint_with_descriptors.py -k preserve_annotate ``` Also tested on torchtitan joint_graph_runner branch. The flex_attention backward nodes are annotated now. ``` NGPU=8 CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" LOG_RANK=0 TRAIN_FILE="torchtitan.train" TORCHFT_LIGHTHOUSE="http://localhost:29510" PYTORCH_ALLOC_CONF="expandable_segments:True" torchrun --nproc_per_node=8 --rdzv_backend c10d --rdzv_endpoint="localhost:0" --local-ranks-filter 0 --role rank --tee 3 -m torchtitan.train --job.config_file ./torchtitan/models/llama3/train_configs/debug_model.toml --model.name joint_graph_runner.llama3 --compile.enable --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --model.flavor=debugmodel_flex_attn ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165202 Approved by: https://github.com/SherlockNoMad	2025-10-14 16:19:38 +00:00
Animesh Jain	c9b2a09530	[export] Turn on install_free_tensors flag (#164691 ) The final step in removing the discrepancy between torch.compile(fullgraph=True) and torch.export(strict=True). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164691 Approved by: https://github.com/avikchaudhuri	2025-10-14 15:33:50 +00:00
KarhouTam	bf5aeb3148	[torch/utils][Code Clean] Clean asserts in `hipify/`, `jit/`, `model_dump` and `tensorboard` of `torch/utils` (#165311 ) Including: - `torch/utils/hipify/` - `torch/utils/jit/` - `torch/utils/model_dump/` - `torch/utils/tensorboard/` Fixes part of #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165311 Approved by: https://github.com/albanD	2025-10-14 15:26:23 +00:00
Rohit Singh Rathaur	45b8c0f75c	[distributed] Replace 54 assert statements in tensor/_ops/_tensor_ops.py (#165226 ) Replace assert statements with explicit if/raise patterns to prevent assertions from being disabled with Python -O flag. Fixes partially #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165226 Approved by: https://github.com/albanD	2025-10-14 15:10:03 +00:00
Aleksei Nikiforov	c733072874	Fix IValue from SymBool on big-endian system (#163647 ) Skip test_compiled_autograd_attribution on s390x It fails both on s390x and x86_64 at least under some circumstances. Disable it on s390x for now until it works reliably. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163647 Approved by: https://github.com/malfet	2025-10-14 15:07:48 +00:00
Yuanyuan Chen	fbe0d20a17	[2/N] More ruff SIM fixes (#165031 ) This is a follow-up to #164695 to apply ruff SIM rules to more files. Most changes are about simplifying dict.get because None is already the default value. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165031 Approved by: https://github.com/mlazos	2025-10-14 14:22:54 +00:00
Lucas Kabela	1fa11f42b1	[Bugfix][vLLM] Explicitly do not support instead of crashing for named tuples in infer schema (#165191 ) Fixes https://github.com/vllm-project/vllm/issues/25270 by being explicit in erroring; previously we had a cryptic `__origin__ undefined` error, but now should give proper error message that we don't support NamedTuples in schema Test with ``` python test/test_custom_ops.py TestCustomOp.test_unsupported_param_types ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165191 Approved by: https://github.com/zou3519	2025-10-14 14:18:42 +00:00
FFFrog	6f713e25bb	[CodeClean] Replace std::runtime_error with TORCH_CHECK (#164130 ) As the title stated. Changes: - torch/csrc/inductor(Part 1) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164130 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-10-14 14:09:53 +00:00
Shangdi Yu	09a4187b8e	Update windows cuda build to use 12.8 (#165345 ) As title Motivation: The rest of the pytorch and inductor build is using 12.8 and we're deprecating cuda 12.6 builds soon per https://github.com/pytorch/pytorch/issues/165111 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165345 Approved by: https://github.com/atalman, https://github.com/malfet	2025-10-14 13:58:20 +00:00
Colin Peppler	306c55ba27	[atomically_apply_size_hint] Make unbacked replacements reconciles to a single expr (#164324 ) ## Problem Okay there's limitations with today's `atomically_apply_size_hint` though it works for most observed failures we've seen so far. However, it's easy to come up with an edge case. Suppose you encounter this setup. ``` a: [s0 + u0] b: [s1 + u1] c: [u2 + u3] d: [u100] ``` Today, we use a few heuristics to specify the LHS and RHS for replacements. `10d2734d9b/torch/_inductor/sizevars.py (L730-L759)` It's possible to end up with these replacement rules. Notice how there's no replacement for `s1 + u1` and `u2 + u3` :( That's because today picking the LHS and RHS matters a lot, and `s1 + u1` & `u2 + u3` happened to end up on the RHS. ``` s0 + u0 => s1 + u1 s0 + u0 => u2 + u3 # overrides previous replacement; each expr only gets one replacement s0 + u0 => u100 # overrides previous replacement; ditto ``` I believe what we really want is this: everybody gets a replacement! And they all should (eventually) settle at the same canonical expr (i.e. `u100`) when running the replacement several times. ``` s1 + u1 ==> s0 + u0 u2 + u3 ==> s0 + u0 s0 + u0 ==> u100 ``` We can just short-cut this by using the canonical expr as the replacement. ``` s1 + u1 ==> u100 u2 + u3 ==> u100 s0 + u0 ==> u100 ``` ## Implementation I offer one way to deal with this: 1. assure every expression has one canonical replacement (i.e. `u100`) 2. if two expressions are equal (inferred from `deferred_runtime_asserts`), then they must have the same canonical replacement We can implement the above with union find. * Whenever you see `Eq(lhs, rhs)` then do `union(lhs, rhs)`. * Whenever you want to find the canonical replacement for a given expr then do `find(expr)`. * When picking the canonical replacement we can use a few heuristics like (1) prefer a fully backed expr, (2) replacing with sub-expressions, and whatever we'd like. 
Differential Revision: [D84549260](https://our.internmc.facebook.com/intern/diff/D84549260) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164324 Approved by: https://github.com/laithsakka	2025-10-14 13:57:33 +00:00
Isalia20	56d6229ff9	[MPS] fix comment for normcdf (#165233 ) Just a small comment fix for normcdf Pull Request resolved: https://github.com/pytorch/pytorch/pull/165233 Approved by: https://github.com/malfet	2025-10-14 13:56:31 +00:00
Rohit Singh Rathaur	74db92b218	[distributed] Replace assert statements with AssertionError exceptions (#165216 ) Replaces 71 assert statements across 11 files in `torch.distributed` with explicit if-checks raising AssertionError to prevent assertions from being disabled with Python -O flag. Fixes #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165216 Approved by: https://github.com/albanD	2025-10-14 09:58:59 +00:00
Chien-Chin Huang	c48843e4c6	[CP][BE] Docstrings, comments polish and remove unused variables (#165039 ) No logic change, just polish the docstrings, comments and remove unused variables Pull Request resolved: https://github.com/pytorch/pytorch/pull/165039 Approved by: https://github.com/XilunWu ghstack dependencies: #162542, #164500, #163185	2025-10-14 09:35:32 +00:00
Cui, Yifeng	9e89b1c4c7	Update torch-xpu-ops commit pin (#165321 ) Update the torch-xpu-ops commit to [intel/torch-xpu-ops@ce9db1](`ce9db15136`), includes: - Fix test_barrier hang by using static global rank in ProcessGroupXCCL - Update install_xpu_headers only when content should change to speedup recompilation - Add global rank information to communication logging - Remove duplicate normalization from FFT methods Pull Request resolved: https://github.com/pytorch/pytorch/pull/165321 Approved by: https://github.com/EikanWang	2025-10-14 09:07:24 +00:00
PyTorch MergeBot	c5972ebdfb	Revert "Update windows cuda build to use 12.8 (#165345 )" This reverts commit ca96c675001fa87b9d9c648972415ab8b1591f11. Reverted https://github.com/pytorch/pytorch/pull/165345 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165345#issuecomment-3400344079))	2025-10-14 06:46:33 +00:00
Shunting Zhang	18b3658df9	[inductor][ez] properly print Pointwise (#165369 ) Previously when we print a ComputedBuffer for reduction, we get something like: ``` ComputedBuffer(name='buf0', layout=FixedLayout('cuda:0', torch.float32, size=[1, 768], stride=[768, 1]), data=Reduction( 'cuda', torch.float32, def inner_fn(index, rindex): _, i1 = index r0_0 = rindex tmp0 = ops.load(tangents_1, i1 + 768 * r0_0) tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16) tmp2 = ops.load(primals_1, i1 + 768 * r0_0) tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16) tmp4 = ops.load(rsqrt, r0_0) tmp5 = tmp3 * tmp4 tmp6 = tmp1 * tmp5 return tmp6 , ``` But if we print a ComputedBuffer for a pointwise, we get something like ``` ComputedBuffer(name='buf2', layout=FixedLayout('cuda:0', torch.bfloat16, size=[32768, 768], stride=[768, 1]), data=Pointwise(device=device(type='cuda', index=0), dtype=torch.bfloat16, inner_fn=<function make_pointwise.<locals>.inner.<locals>.inner_fn at 0x7f12922c5bc0>, ranges=[32768, 768])) ``` Note that the inner function str is not printed. 
With the change, we get the inner_fn string printed in this case: ``` ComputedBuffer(name='buf2', layout=FixedLayout('cuda:0', torch.bfloat16, size=[32768, 768], stride=[768, 1]), data=Pointwise( 14:42:46 [25/1988] 'cuda', torch.bfloat16, def inner_fn(index): i0, i1 = index tmp0 = ops.load(tangents_1, i1 + 768 * i0) tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16) tmp2 = ops.load(primals_2, i1) tmp3 = tmp1 * tmp2 tmp4 = ops.load(rsqrt, i0) tmp5 = tmp3 * tmp4 tmp6 = ops.load(buf1, i0) tmp7 = ops.constant(-0.5, torch.float32) tmp8 = tmp6 * tmp7 tmp9 = ops.load(rsqrt, i0) tmp10 = tmp9 * tmp9 tmp11 = tmp10 * tmp9 tmp12 = tmp8 * tmp11 tmp13 = ops.constant(0.0013020833333333333, torch.float32) tmp14 = tmp12 * tmp13 tmp15 = ops.load(primals_1, i1 + 768 * i0) tmp16 = ops.to_dtype(tmp15, torch.float32, src_dtype=torch.bfloat16) tmp17 = tmp14 * tmp16 tmp18 = tmp5 + tmp17 tmp19 = ops.load(buf1, i0) tmp20 = ops.constant(-0.5, torch.float32) tmp21 = tmp19 * tmp20 tmp22 = ops.load(rsqrt, i0) tmp23 = tmp22 * tmp22 tmp24 = tmp23 * tmp22 tmp25 = tmp21 * tmp24 tmp26 = ops.constant(0.0013020833333333333, torch.float32) tmp27 = tmp25 * tmp26 tmp28 = ops.load(primals_1, i1 + 768 * i0) tmp29 = ops.to_dtype(tmp28, torch.float32, src_dtype=torch.bfloat16) tmp30 = tmp27 * tmp29 tmp31 = tmp18 + tmp30 tmp32 = ops.to_dtype(tmp31, torch.bfloat16, src_dtype=torch.float32) return tmp32 , ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165369 Approved by: https://github.com/eellison	2025-10-14 06:08:12 +00:00
Dzmitry Huba	5fbf93b774	Introduce automatic wrapper to run DTensor tests under local tensor mode (#165383 ) The wrapper enables sharing the test body implementation while eliminating the need to write the test class by hand. As an example, this change converts the whole DTensorTest to use local tensor mode. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165383 Approved by: https://github.com/ezyang	2025-10-14 06:08:03 +00:00
Angel Li	a856a17799	bf16 support for per_channel bwd (#165325 ) Follow up to #165098 - adding bf16 support for the backward pass. To avoid BC breaking changes/losing precision, we upcast the parameters to fp32 after the op gets called, and downcast the gradients to bf16 before returning. For testing, we upcast to fp32 before calling the reference function. We increase the tolerance to 1e-2 for bf16 inputs because of a difference in casting calculations between python's `x.to(torch.bfloat16)` and cpp's `x.to(at::kBFloat16)` (after comparing intermediate tensors, we found that the numerics diverge after the final casting). We don't explicitly cast in the CPP op but rather let autograd/optimizer handle it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165325 Approved by: https://github.com/andrewor14	2025-10-14 05:47:32 +00:00
Michael Lazos	bc6e08954d	[user-cuda-streams] Add fork/join custom ops (#162900 ) Creates the fork/join stream ops. These ops are passthrough ops which mutate all of their args (without actually performing any computation on them) so that during functionalization, implicit dependencies are added on all of their args. This allows us to prevent reordering during our pre/post grad graph passes. Make custom ops inplace Pull Request resolved: https://github.com/pytorch/pytorch/pull/162900 Approved by: https://github.com/anijain2305 ghstack dependencies: #163027, #162899, #163028	2025-10-14 05:43:19 +00:00
Michael Lazos	45a96b2081	[user-streams] Handle aliasing properly (#163028 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163028 Approved by: https://github.com/williamwen42, https://github.com/anijain2305 ghstack dependencies: #163027, #162899	2025-10-14 05:43:19 +00:00
Michael Lazos	04e36611bb	[user-cuda-streams] Pass streams/events to the graph via lookup table (#162899 ) Stores streams in a global object lookup table that maps a dynamo selected index to objects. This index is generated during tracing, and at runtime, a helper function is called from the bytecode to populate this map. This differs from the previous implementation that simply mapped IDs to the associated objects. This required specialization on the IDs of the specific objects, while this new approach does not. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162899 Approved by: https://github.com/anijain2305 ghstack dependencies: #163027	2025-10-14 05:43:19 +00:00
Michael Lazos	f15c25d5c3	[user-streams] Move stream code to streams module (#163027 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163027 Approved by: https://github.com/StrongerXi, https://github.com/anijain2305	2025-10-14 05:43:19 +00:00
Kostas Tsiampouris	e93981c243	[PyTorch][aarch64] Cast to signed char to fix aarch64 build (#165021 ) Summary: Initial fix: D39198776 Reverted by clang-tidy bot: D83948172 Test Plan: Can now build on aarch64 {P1983767795} Reviewed By: bigning Differential Revision: D84203406 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165021 Approved by: https://github.com/cyyever, https://github.com/Skylion007	2025-10-14 05:37:34 +00:00
Lakshay Garg	496adf9f9c	Replace insert with std::rotate_copy for RingBuffer (#165348 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165348 Approved by: https://github.com/eqy, https://github.com/Skylion007	2025-10-14 05:11:28 +00:00
PyTorch MergeBot	33bfec27ff	Revert "use sym_numel, to allow fake tensors to work (#163831 )" This reverts commit e71c75680f2d6ce5f61ad4b2125f4934087762eb. Reverted https://github.com/pytorch/pytorch/pull/163831 on behalf of https://github.com/isuruf due to test failure on mps introduced ([comment](https://github.com/pytorch/pytorch/pull/163831#issuecomment-3400131730))	2025-10-14 05:10:56 +00:00
KarhouTam	f44935cc14	[torch/utils][Code Clean] Clean asserts in `torch/utils/_sympy` (#165279 ) Including: `torch/utils/_sympy/` Fixes part of #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165279 Approved by: https://github.com/albanD	2025-10-14 04:52:23 +00:00
KarhouTam	39116409a1	[torch/utils][Code Clean] Clean asserts in `benchmark/` and `data/` in `torch/utils/` (#165299 ) Including: - `torch/utils/benchmarks/` - `torch/utils/data/` Fixes part of #164878 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165299 Approved by: https://github.com/albanD	2025-10-14 04:50:39 +00:00
James Wu	515d1326c1	Add CLAUDE_CONTEXT directory to gitignore (#165358 ) Claude often adds a bunch of MD files or other stuff that is specific to a local session, add a folder for claude to put this stuff that doesn't get checked into the repo Pull Request resolved: https://github.com/pytorch/pytorch/pull/165358 Approved by: https://github.com/oulgen	2025-10-14 04:47:21 +00:00
nullplay	ac529df244	Native matmul (#157743 ) ### Implementation of #151705 This PR introduces the initial implementation of native `tl.dot` support in Inductor, with the goal of generating Triton matmul kernels directly—without relying on predefined templates. To avoid complexity and ease the review process, I plan to split this work into two phases as outlined in #151705: 1. Basic support (this PR) 2. Lazy broadcasting for optimal performance (future PR) ### Summary of This PR This PR implements the basic functionality. It does not include lazy broadcasting, so the generated kernels may involve explicit `tl.reshape` and `tl.trans` operations before calling `tl.dot`, which introduces some overhead. ### Notable Changes 1. Adds a new config flag: `config.triton.enable_native_matmul` 2. Introduces a new `ops.dot` IR node in Inductor and lowers `aten.mm` and `aten.bmm` to it when native matmul is enabled 3. Enforces tiling suitable for matmul when the native matmul flag is enabled 4. Implements code generation for `ops.dot` 5. Adds Triton autotuning heuristics: for now, I’ve copied the configuration from the existing matmul templates. However, this may not be optimal—it currently takes a long time to tune, and I think there must be a better way to tackle this. @eellison @jansel @PaulZhang12 @shunting314 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157743 Approved by: https://github.com/jansel	2025-10-14 04:22:30 +00:00
PyTorch MergeBot	fa3916f466	Revert "[export] Turn on install_free_tensors flag (#164691 )" This reverts commit 220a34118f40fab4f3f517556d6e1434139a1590. Reverted https://github.com/pytorch/pytorch/pull/164691 on behalf of https://github.com/seemethere due to Breaks some internal things, both me and author agreed that revert was the best course of action ([comment](https://github.com/pytorch/pytorch/pull/164691#issuecomment-3400013759))	2025-10-14 03:58:12 +00:00
PyTorch MergeBot	267348fe7f	Revert "Fix double dispatch to Python for detach (#163671 )" This reverts commit a3e3efe474bef63940ded803e78bb2a382681f1e. Reverted https://github.com/pytorch/pytorch/pull/163671 on behalf of https://github.com/seemethere due to We should've reverted this when we decided to revert https://github.com/pytorch/pytorch/pull/164691 since they were actually stacked ([comment](https://github.com/pytorch/pytorch/pull/163671#issuecomment-3400009953))	2025-10-14 03:55:36 +00:00
PyTorch MergeBot	1803d40c99	Reapply "[export] Turn on install_free_tensors flag (#164691 )" (#165353 ) This reverts commit 9166f6120f63e2d5d76e6ccdbfccb8d6e41cbb43. Reverted https://github.com/pytorch/pytorch/pull/165353 on behalf of https://github.com/seemethere due to This is causing merge conflicts since a dependent PR wasn't reverted ([comment](https://github.com/pytorch/pytorch/pull/165353#issuecomment-3400006587))	2025-10-14 03:52:50 +00:00
Tristan Trouwen	29c5368e0f	MTIA _cdist_forward registration (#165333 ) Summary: Added registration for _cdist_forward on MTIA Differential Revision: D84357997 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165333 Approved by: https://github.com/albanD	2025-10-14 03:51:31 +00:00
VINAY PRITHYANI	e71c75680f	use sym_numel, to allow fake tensors to work (#163831 ) Fixes #[163759](https://github.com/pytorch/pytorch/issues/163759) Replace `numel` with `sym_numel`. Tested with the example in the issue and it works now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163831 Approved by: https://github.com/bobrenjc93	2025-10-14 03:33:28 +00:00
Shangdi Yu	ca96c67500	Update windows cuda build to use 12.8 (#165345 ) As title Motivation: The rest of the pytorch and inductor build is using 12.8 and we're deprecating cuda 12.6 builds soon per https://github.com/pytorch/pytorch/issues/165111 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165345 Approved by: https://github.com/atalman	2025-10-14 02:33:44 +00:00
Nikita Shulga	770e6b910c	[DTensor] Extend conv ops to 3D (#165241 ) Current implementation hardcodes 4D input and output tensor shapes Change that by computing `output_conv_shape` for any number of input dims Replace `[.., .., .., slice]` with `[..., slice]` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165241 Approved by: https://github.com/ezyang	2025-10-14 02:30:46 +00:00
Colin Peppler	37d57ac9cb	Use sym_eq in _check_rms_norm_inputs_symint (#165112 ) Summary: ### Problem ArrayRef's `equals()` does elementwise equality using the `==` operator. This can cause a DDE for unbacked symints since the `==` operator calls `guard_bool`. ``` // SymInt.h bool operator==(const SymInt& o) const { return sym_eq(o).guard_bool(__FILE__, __LINE__); } ``` ### Solution Adds `sym_equals()` to do elementwise equality for `SymIntArrayRef`. Use this instead of `equals()` for `SymIntArrayRef`. Reviewed By: guangy10, pianpwk, muchulee8 Differential Revision: D84168401 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165112 Approved by: https://github.com/Skylion007	2025-10-14 00:06:24 +00:00
Animesh Jain	9166f6120f	Revert "[export] Turn on install_free_tensors flag (#164691 )" (#165353 ) This reverts commit 220a34118f40fab4f3f517556d6e1434139a1590. Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165353 Approved by: https://github.com/seemethere	2025-10-13 23:40:11 +00:00
Nicolas Macchioni	fb0291d14b	[pt2][caching] fix runtime error in context on cpu-only machine when compile for gpu (#165220 ) re https://github.com/pytorch/pytorch/pull/165186 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165220 Approved by: https://github.com/clee2000	2025-10-13 22:47:41 +00:00
Animesh Jain	f3683453ae	[compile] Regional inductor compilation with fx.annotate (#164776 ) This PR introduces a way to compile a region of FX graph using `fx.traceback.annotate`. ### UX 1) In the user code, mark the region that you want to be compiled with inductor using `with fx_traceback.annotate({"compile_with_inductor": 0})`. As of now, we just rely on the string `compile_with_inductor` and ignore the integer. As the needs arise, we can update the logic. Example ``` def fn(x, y): sin = torch.sin(x) with fx_traceback.annotate({"compile_with_inductor": 0}): mul = sin * y add = mul + 1 return torch.sin(add) ``` 2) You have to instruct the compiler to use the annotations with `compile_fx_annotated_nodes_with_inductor` transformation. This is somewhat controversial, and a user might expect that just setting annotation is enough. But for now to control the blast radius, we need to explicitly do this. One such example is ``` # Set the fw and bw compiler of aot_autograd to `compile_fx_annotated_nodes_with_inductor` def aot_eager_regional_inductor(): return aot_autograd( fw_compiler=compile_fx_annotated_nodes_with_inductor, bw_compiler=compile_fx_annotated_nodes_with_inductor, ) ``` 3) Fixable in short-term - You have to wrap the user code in `torch.fx.traceback.preserve_node_meta` to ensure that annotations are propagated to the compiler. This is fixable, just need to make CI happy. ### Implementation 1) Relies on `CapabilityBasedPartitioner` to "scoop" out regions based on annotations, and then create subgraphs in the main graph. 
2) Call `torch._inductor.standalone_compile` on these subgraphs, and jam the returned callable into the FX graph at the place of call_module Resulting graph looks something like this - search for `torch__inductor_standalone_compile_inner` Forward graph ``` class GraphModule(torch.nn.Module): def forward(self, primals_1: "f32[10]", primals_2: "f32[10]"): # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:64 in fn, code: sin = torch.sin(x) sin: "f32[10]" = torch.ops.aten.sin.default(primals_1) # No stacktrace found for following nodes inner = torch__inductor_standalone_compile_inner(sin, primals_2) # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:68 in fn, code: add = mul + 1 getitem: "f32[10]" = inner[0]; inner = None # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:70 in fn, code: return torch.sin(add) sin_1: "f32[10]" = torch.ops.aten.sin.default(getitem) return (sin_1, primals_1, primals_2, sin, getitem) ``` Backward graph ``` class GraphModule(torch.nn.Module): def forward(self, primals_1: "f32[10]", primals_2: "f32[10]", sin: "f32[10]", add: "f32[10]", tangents_1: "f32[10]"): # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:64 in fn, code: sin = torch.sin(x) cos_1: "f32[10]" = torch.ops.aten.cos.default(primals_1); primals_1 = None # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:70 in fn, code: return torch.sin(add) cos: "f32[10]" = torch.ops.aten.cos.default(add); add = None mul_1: "f32[10]" = torch.ops.aten.mul.Tensor(tangents_1, cos); tangents_1 = cos = None # No stacktrace found for following nodes inner = torch__inductor_standalone_compile_inner(mul_1, sin, primals_2); mul_1 = sin = primals_2 = None # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:67 in fn, code: mul = sin * y getitem: "f32[10]" = inner[0] getitem_1: "f32[10]" = inner[1]; inner = None # File: 
/data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:64 in fn, code: sin = torch.sin(x) mul_4: "f32[10]" = torch.ops.aten.mul.Tensor(getitem_1, cos_1); getitem_1 = cos_1 = None return (mul_4, getitem) ``` ### Some issue raised in the HOP meeting 1) CSE will not differentiate different meta custom nodes and do wrong thing. 2) SAC - The recomputed forward will be smaller than the forward. Will we compile a smaller region than? 3) What happens if you have a op in the middle which does not disturb the topology, is it still 1 subgraph? 4) What happens with the nesting of `fx_traceback.annotate`? Are there any ordering requirements? 5) What are we going to use the annotations for? a) compile flex b) streams c) nn.Module info to organize MoE components for pipelining d) PP stages e) Rename graph nodes for more debugging f) No nested regional compile Pull Request resolved: https://github.com/pytorch/pytorch/pull/164776 Approved by: https://github.com/SherlockNoMad ghstack dependencies: #165188	2025-10-13 22:22:20 +00:00
Animesh Jain	1191e51c44	[dynamo][annotate] Remove the need of external ctx mgr of preserve_node_meta (#165188 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165188 Approved by: https://github.com/yushangdi	2025-10-13 22:22:20 +00:00
zpcore	3edd94485f	[5/N][DTensor device order] Implement graph based redistribution algorithm (#164902 ) (Extract out the algorithm from https://github.com/pytorch/pytorch/pull/160266.) Build a graph to search for the path from source placement to destination placement (with device order). The current solution introduces too many all-gathers and misses the opportunity for all-to-all when redistributing, especially when we consider the device order. ### How to build the graph: When operator of Shard, think of collective op as operation on a stack of device axis: - I, J are tensor dimensions; - X, Y, Z, Y are ordered mesh dimensions. <img width="357" height="253" alt="image" src="https://github.com/user-attachments/assets/23bb3cc3-0506-4071-9053-3c525cf0e526" /> Detailed collective op transition is implemented in `DTensorRedistributePlanner.get_next_state`. ### How to find the min cost path: Assign weight to different type of collective ops and use Dijkstra to find the min cost path from the graph we build. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164902 Approved by: https://github.com/ezyang	2025-10-13 22:03:57 +00:00
Animesh Jain	a701c937bf	[dynamo][executorch] Return already added nn.Module during registration (#165338 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165338 Approved by: https://github.com/tugsbayasgalan	2025-10-13 21:24:07 +00:00
Yuanyuan Chen	ecb53078fa	Turn some const strings into constexpr in C++ code (#165203 ) This PR turns more const strings into constexpr. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165203 Approved by: https://github.com/Skylion007	2025-10-13 20:25:20 +00:00
Angel Li	fa95882093	[BE] document distributed apis (#165194 ) This PR documents some `torch.distributed.distributed_c10d` APIs. Below are some screenshots of the rendered docs. <img width="909" height="527" alt="Screenshot 2025-10-10 at 10 18 40 PM" src="https://github.com/user-attachments/assets/555ae886-bead-47f3-8c67-9bc91c14bd11" /> <img width="885" height="548" alt="Screenshot 2025-10-10 at 10 18 47 PM" src="https://github.com/user-attachments/assets/1d6f7af1-db28-40f9-927e-5c47668a1a88" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165194 Approved by: https://github.com/janeyx99	2025-10-13 20:13:59 +00:00
PyTorch MergeBot	a71ca4dcb9	Revert "[opaque_obj_v2] PyObject custom op schema type (#165004 )" This reverts commit 3faee200674c0c2bca3f395a063264cfd8a9a5b7. Reverted https://github.com/pytorch/pytorch/pull/165004 on behalf of https://github.com/seemethere due to This fails internal tests, see D84399300 ([comment](https://github.com/pytorch/pytorch/pull/165004#issuecomment-3398906856))	2025-10-13 20:08:38 +00:00
Aidyn-A	c44d638b15	[Easy][Test][Dynamo] Avoid direct string comparison in MiscTestsDevice::get_device_module (#165314 ) Fixes a small issue on string comparison, as the test fails with: ``` AssertionError: String comparison failed: 'cuda' != 'cuda:0' ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165314 Approved by: https://github.com/soulitzer	2025-10-13 19:58:59 +00:00
Mark Saroufim	7c015334a3	Remove FIXME comment about reset_max_memory_reserved (#165249 ) The function doesn't actually exist https://github.com/pytorch/pytorch/blob/main/torch/cuda/__init__.py#L1816 Fixes https://github.com/pytorch/pytorch/issues/27785 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165249 Approved by: https://github.com/svekars	2025-10-13 19:44:40 +00:00
Scott Wolchok	cad2d473bf	Force inlining into torch_function_mode_enabled (#164617 ) This function is relatively hot; inlining here reduces time reported by `python -m timeit --setup 'import torch; t = torch.tensor([1])' 't._cdata'` from about 125 nsec/loop to about 110 nsec/loop. (To be fair, variance is high, but I did confirm with perf that time in this path seems to have roughly halved during torchtitan training.) Note that locally I am getting bit by a GCC bug that I documented in a comment. Would be interested to hear if this does anything for clang. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164617 Approved by: https://github.com/ezyang	2025-10-13 19:25:51 +00:00
Ti-Tai Wang	cb328c0b20	[ONNX] TorchTensor supports tofile() (#165195 ) Fixes #165120 ref: `43ebf47bb5/src/onnx_ir/tensor_adapters.py (L171-L200)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165195 Approved by: https://github.com/justinchuby	2025-10-13 19:12:06 +00:00
Catherine Lee	64699b8042	[trymerge] Do not check for rules when reverting (#165342 ) Why do we need to check for merge rules when reverting? Pull Request resolved: https://github.com/pytorch/pytorch/pull/165342 Approved by: https://github.com/malfet	2025-10-13 19:07:00 +00:00
Nikita Shulga	dcce473352	[BE] Fix unused parameter warning (#165272 ) Fixes ``` [23/1155] Compiling /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal to EmbeddingBag_31.air /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:252:62: warning: unused parameter 'bag_size' [-Wunused-parameter] inline opmath_t<T> operator()(opmath_t<T> val, opmath_t<T> bag_size) { ^ 1 warning generated. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165272 Approved by: https://github.com/Skylion007	2025-10-13 18:52:51 +00:00
Aleksandar Samardžić	c41e52118d	Fix loop pipelining for 2d/2d case of Triton grouped MM (#165265 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165265 Approved by: https://github.com/ngimel	2025-10-13 18:45:39 +00:00
PyTorch MergeBot	955cd7060b	Revert "Update round size with 1 division behavior (#162203 )" This reverts commit 12d2ef557f6e127100267c31a31572d8ab5cc788. Reverted https://github.com/pytorch/pytorch/pull/162203 on behalf of https://github.com/izaitsevfb due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162203#issuecomment-3398622898))	2025-10-13 18:32:37 +00:00
can-gaa-hou	0ce945790e	[NJT] Fix schema validation error in jagged functions (#165307 ) Fixes #161812 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165307 Approved by: https://github.com/soulitzer	2025-10-13 17:59:18 +00:00
Angel Li	70ec464c16	[BE] document some quantization public apis (#165160 ) This PR documents some apis in `torch.ao.quantization.utils` <img width="885" height="296" alt="Screenshot 2025-10-10 at 4 38 10 PM" src="https://github.com/user-attachments/assets/4323a6f5-ac3a-4f2e-ba00-35f3b208bef4" /> <img width="876" height="319" alt="Screenshot 2025-10-10 at 4 38 14 PM" src="https://github.com/user-attachments/assets/164822c3-9740-46f9-953d-bb20c77bcf69" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165160 Approved by: https://github.com/janeyx99	2025-10-13 17:24:42 +00:00
can-gaa-hou	2c600bb665	[torchfuzz] fix some errors when walking through README.md (#165225 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165225 Approved by: https://github.com/soulitzer	2025-10-13 17:17:50 +00:00
Chien-Chin Huang	e93343cfab	[CP] Introduce flex_cp_forward custom op for FlexAttention CP (#163185 ) The custom op will fetch the required K and V. Currently, the forward pass is just an all-gather, and the backward pass is a reduce-scatter. While the logic is the same as all_gather_tensor_autograd, the custom op avoids the Autograd warning that wait_tensor() is registered to autograd. For the next step, we should explore how to interpolate the required communication based on the information from BlockMask. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163185 Approved by: https://github.com/XilunWu ghstack dependencies: #162542, #164500	2025-10-13 17:16:32 +00:00
Catherine Lee	c86a7c5f5e	Disable failing test_int8_woq_mm_concat_cuda on slow grad check (#165331 ) Same as https://github.com/pytorch/pytorch/pull/165147, I missed some Pull Request resolved: https://github.com/pytorch/pytorch/pull/165331 Approved by: https://github.com/bbeckca	2025-10-13 17:08:00 +00:00
Guilherme Leobas	4e420415e8	Avoids calling builtin `iter` if object is a generator (#162521 ) The `iter(gen)` call will return the given `gen` object. So, we just avoid this call and shave off a few ms of tracing time Pull Request resolved: https://github.com/pytorch/pytorch/pull/162521 Approved by: https://github.com/mlazos	2025-10-13 17:07:54 +00:00
Kurt Mohler	83cbba8759	[MPS] Support large tensors in `torch.cat` (#164416 ) Fixes #164415 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164416 Approved by: https://github.com/malfet	2025-10-13 16:56:56 +00:00
Catherine Lee	684df93975	[CI] Default keep-going true for tags of form ciflow/something/commitsha (#165180 ) Tags of the form `ciflow/something/commitsha` are usually created by running the workflow from HUD Pull Request resolved: https://github.com/pytorch/pytorch/pull/165180 Approved by: https://github.com/huydhn	2025-10-13 16:12:37 +00:00
Scott Wolchok	a3e3efe474	Fix double dispatch to Python for detach (#163671 ) This fixes #71725. Differential Revision: [D83857880](https://our.internmc.facebook.com/intern/diff/D83857880) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163671 Approved by: https://github.com/ezyang, https://github.com/albanD	2025-10-13 16:10:17 +00:00
Chien-Chin Huang	6bda3bb286	[PP] Fix split_args_kwargs_into_chunks issues (#165306 ) 1. https://github.com/pytorch/pytorch/pull/164111/ adds the support of splitting BlockMask. But BlockMask actually has B=1 case that the BlockMask will be broadcast. This PR adds the support of B=1 case. 2. The original split_args_kwargs_into_chunks doesn't initialize the default specs correctly. Since we now use tree_flatten and tree_unflatten to do split, we should also use tree_map to initialize the default spec. This will actually support the case when the values are not torch.Tensor, which were only supported if users explicitly provide the shard spec. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165306 Approved by: https://github.com/H-Huang	2025-10-13 15:52:39 +00:00
PyTorch MergeBot	8580112682	Revert "[dynamo][DebugMode] mask python keys in dispatch_key_set guard checks (#164992 )" This reverts commit 306b344a1847749f0baf085dcd92560f4e99cd1b. Reverted https://github.com/pytorch/pytorch/pull/164992 on behalf of https://github.com/jeffdaily due to broke ROCm CI test/inductor/test_inductor_scheduler.py::TestSchedulerCUDA::test_flop_counter_op_options0_cuda_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/18417066364/job/52485636942) [HUD commit link](`306b344a18`) ([comment](https://github.com/pytorch/pytorch/pull/164992#issuecomment-3397927142))	2025-10-13 15:14:34 +00:00
PyTorch UpdateBot	4874cce52f	[xla hash update] update the pinned xla hash (#165302 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165302 Approved by: https://github.com/pytorchbot	2025-10-13 12:36:29 +00:00
PyTorch UpdateBot	c509a78645	Update slow tests (#165301 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165301 Approved by: https://github.com/pytorchbot	2025-10-13 11:47:32 +00:00
Chien-Chin Huang	8461b63f2c	[CP] Replace context_parallel context manager with functional APIs (#164500 ) `context_parallel()` being a context manager has annoyed users. Now that we plan to redesign CP's UX to explicitly ask users to: 1. Wrap the attention op into an `nn.Module` 2. Lift any buffers that are not sequence agnostic to input We can replace `context_parallel()` with two functional APIs: `_context_parallel_shard` and `_enable_context_parallel_dispatcher`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164500 Approved by: https://github.com/XilunWu ghstack dependencies: #162542	2025-10-13 06:30:18 +00:00
PyTorch UpdateBot	957b0e9793	[vision hash update] update the pinned vision hash (#165017 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vision hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165017 Approved by: https://github.com/pytorchbot	2025-10-13 04:35:52 +00:00
PyTorch UpdateBot	b04def139e	[audio hash update] update the pinned audio hash (#165113 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165113 Approved by: https://github.com/pytorchbot	2025-10-13 04:35:36 +00:00
Ma, Jing1	59ad8f1ac6	[XPU] Enhance XPUGeneratorImpl functionality to support XPUGraph (#163332 ) As described in this [XPUGraph RFC](https://github.com/pytorch/pytorch/issues/162143), this PR enhances `XPUGeneratorImpl` to support XPUGraph. In this PR, we add `XPUGeneratorState` and `PhiloxXpuState`, which make XPUGraph update the philox state correctly during graph capture and replay. XPUGraph PR submission plan: - [ ] 1, Enhance XPUGenerator functionality. Add XPUGeneratorState and philoxState - [ ] 2, implement XPUGraph capture_begin/capture_end/instantiate functionality - [ ] 3, implement XPUGraph replay/debug_dump/reset functionality - [ ] 4, python APIs: is_current_stream_capturing/graph_pool_handle/graph - [ ] 5, python APIs: make_graphed_callables Pull Request resolved: https://github.com/pytorch/pytorch/pull/163332 Approved by: https://github.com/gujinghui, https://github.com/EikanWang, https://github.com/albanD	2025-10-13 02:10:41 +00:00
Yuanyuan Chen	8de85896e0	Enable ruff rule E721 (#165162 ) `E721` checks for object type comparisons using == and other comparison operators. This is useful because it is recommended to use `is` for type comparisons. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165162 Approved by: https://github.com/Skylion007	2025-10-13 01:48:55 +00:00
Colin Peppler	a33f85e791	Add tlparse artifact for autotune_at_compile_time (#164984 ) This is useful for inspecting autotuning code when `autotune_at_compile_time=True` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164984 Approved by: https://github.com/yushangdi, https://github.com/desertfire	2025-10-12 23:38:11 +00:00
Dzmitry Huba	5e58420dff	LocalTensor (#164537 ) A LocalTensor is a tensor subclass which simulates a tensor that is distributed across SPMD ranks. A LocalTensor might be size N, but in fact there are world_size shards/replicas of it stored internally. When you do a plain PyTorch operation on it, we apply the operation to each shard; when you do a collective, we do the mathematically equivalent operation on the local shards. A LocalTensor is associated with a list of ranks which specify which ranks it holds local tensors for. NB, this is NOT a DataParallel like abstraction where you can run operations on multiple different GPUs. It is intended purely for debugging purposes, the overhead is almost certainly too high to keep eight GPUs (even the C++ autograd needs multithreading to keep up!) (It might potentially be possible to trace through this with torch.compile and then compile it with CUDA graphs but this is currently a non-goal.) In order to handle MPMD, we provide a helper decorator that allows you to run a function with no side effects for each LocalTensor shard and combine results back into LocalTensor or LocalIntNode. Note: This PR converts all DTensor ops and some DTensor tests to illustrate intended usage and ensure correctness. In subsequent PRs more tests will be converted. During test conversion we aim to share as much as possible of test logic between multi-process / multi-threaded and local tensor tests. We would like developers to be able to run both flavors of the tests. Note: This work is based on the original proposal by @ezyang (WIP PR https://github.com/pytorch/pytorch/pull/162753). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164537 Approved by: https://github.com/ezyang	2025-10-12 20:06:41 +00:00
PyTorch UpdateBot	a2601630cd	[vllm hash update] update the pinned vllm hash (#164628 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164628 Approved by: https://github.com/pytorchbot Co-authored-by: Huy Do <huydhn@gmail.com>	2025-10-12 18:26:07 +00:00
Howard Huang	2beead7523	[PP] move FSDP reduce scatters to end of step (#165106 ) Move FSDP reduce scatters to the end of the PP step. The reduce scatter compute stream sync blocks the other stages from executing their backwards leading to bubbles. There should be a way to execute these RS earlier, but doing this for now as a quick fix. <img width="1056" height="463" alt="image" src="https://github.com/user-attachments/assets/b945dd55-8ab1-4acc-b862-c6e2e476b834" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165106 Approved by: https://github.com/weifengpy ghstack dependencies: #164976	2025-10-12 13:28:02 +00:00
Yu, Guangye	3a110c9bb2	Add a new API torch.xpu.is_tf32_supported for Intel GPU (#163141 ) # Motivation Aligned with other backends, this PR introduces a new API `torch.xpu.is_tf32_supported`, which should be used before `torch.backends.mkldnn.allow_tf32=True` or provide hardware capability information to the Triton # Additional Context On Intel Xe architecture and newer, TF32 operations can be accelerated through DPAS (Dot Product Accumulate Systolic) instructions. Therefore, TF32 support can be determined by checking whether the device supports subgroup matrix multiply-accumulate operations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163141 Approved by: https://github.com/EikanWang	2025-10-12 12:11:57 +00:00
William Wen	5dbca58bd0	[dynamo] fix potential 3.12+ THP_PyOpcode_Caches init error seen internally (#165200 ) Another attempt at merging https://github.com/pytorch/pytorch/pull/164597 due to CLA signing failure. Differential Revision: [D84397377](https://our.internmc.facebook.com/intern/diff/D84397377) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165200 Approved by: https://github.com/anijain2305, https://github.com/mlazos	2025-10-12 05:29:04 +00:00
Huy Do	5ad7611b52	Reland vision pinned commit hash update (#164492 ) Redo https://github.com/pytorch/pytorch/pull/154694 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164492 Approved by: https://github.com/yangw-dev	2025-10-12 04:53:27 +00:00
Simon Fan	992857e286	Fix pre-dispatch AC HOP calling convention (#165145 ) For AC HOP, dynamo traces it without kwargs. (kwargs are only inputs to the HOP, not to the body) `55f01a48af/torch/_dynamo/variables/higher_order_ops.py (L2594-L2609)` When we add non-strict support, we should match this calling convention too. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165145 Approved by: https://github.com/tugsbayasgalan ghstack dependencies: #164296, #164321, #164419, #164420, #164340, #163602, #164431, #164433, #164437	2025-10-12 02:28:21 +00:00
zhudada	058814794b	[Code Clean] Replace std::runtime_error with TORCH_CHECK (#163437 ) Replace the runtime_error of the vanilla C++ exceptions with TORCH_CHECK Including: - torch/csrc/export - torch/csrc/cuda Fixes #148114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163437 Approved by: https://github.com/Skylion007, https://github.com/cyyever	2025-10-12 01:23:02 +00:00
Shunting Zhang	bb0635d7dd	[inductor][eazy] change how torch.use_deterministic_algorithms affect inductor (#164905 ) Previously when torch.are_deterministic_algorithms_enabled() is True Inductor will - skip autotuning pointwise kernels - pick a fixed (and quite arbitrary) config for reduction This PR changes the behavior to - for pointwise kernels, we still do autotuning - for reduction kernels, we use the recently added heuristic to pick a config Pull Request resolved: https://github.com/pytorch/pytorch/pull/164905 Approved by: https://github.com/jansel, https://github.com/v0i0, https://github.com/mlazos ghstack dependencies: #164904	2025-10-12 00:03:43 +00:00
Shunting Zhang	5171f14064	[inductor] verify determinism with inductor benchmark script (#164904 ) Verify the deterministic mode with torch.compile benchmark scripts. Here is what my testing script does (pasted in the end): - run a model in default mode, save it's result - run the model again in default mode, but distort the benchmarking results. Compare it with the saved result. - Do the above again in deterministic mode. I tried to test a few modes - BertForMaskedLM and GoogleFnet: I can repro the numeric change by distorting the benchnmark result in the default mode. The non-determinism is gone in the deterministic mode - DistillGPT2: I can not repro the numeric change by distorting the benchmarking result in the default mode. It does not surprise me much. Reduction order change does not always cause numeric change. ``` model=GoogleFnet export TORCHINDUCTOR_WRITE_ARE_DETERMINISTIC_ALGORITHMS_ENABLED=0 export TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 # disable autotune cache export TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE=0 export TORCHINDUCTOR_FX_GRAPH_CACHE=0 export TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_shunting/ export TORCHINDUCTOR_BENCHMARK_KERNEL=1 export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 export INDUCTOR_TEST_DISABLE_FRESH_CACHE=1 # Non deterministic mode # --float32 rather than --amp to make it easier to repro non-deterministic echo "Save results for non-deterministic mode" python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --save-model-outputs-to=/tmp/saved-non-deterministic.pkl echo "Compare results with distorted benchmarking in non-deterministic mode" TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT=inverse python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --compare-model-outputs-with=/tmp/saved-non-deterministic.pkl echo "Save results for deterministic mode" TORCHINDUCTOR_DETERMINISTIC=1 python 
benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --save-model-outputs-to=/tmp/saved-deterministic.pkl echo "Compare results with distorted benchmarking in deterministic mode" TORCHINDUCTOR_DETERMINISTIC=1 TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT=inverse python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --compare-model-outputs-with=/tmp/saved-deterministic.pkl ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164904 Approved by: https://github.com/jansel, https://github.com/v0i0	2025-10-12 00:03:42 +00:00
Raman Kumar	df26c51478	error message for instantiating CUDA Stream if CUDA not available (#159868 ) Fixes #159744 Summary: ``` import torch # Generate input data input_tensor = torch.randn(3, 3) stream = torch.cuda.Stream() # Call the API input_tensor.record_stream(stream) ``` ⚠️ will now show an error message `torch.cuda.Stream requires CUDA support` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159868 Approved by: https://github.com/malfet, https://github.com/isuruf	2025-10-11 23:21:35 +00:00
PyTorch MergeBot	8d49cd5b26	Revert "[compile] Regional inductor compilation with fx.annotate (#164776 )" This reverts commit 1e4c7dffa31b3284a4cd4daa4424602827bd9d0f. Reverted https://github.com/pytorch/pytorch/pull/164776 on behalf of https://github.com/malfet due to Looks like this one broke everything, not the top of the stack ([comment](https://github.com/pytorch/pytorch/pull/164776#issuecomment-3393725466))	2025-10-11 23:14:23 +00:00
PyTorch MergeBot	a19123b37e	Revert "[dynamo][annotate] Remove the need of external ctx mgr of preserve_node_meta (#165188 )" This reverts commit f0325d07876b5a52d29a44ee02dcf7a7c21b258a. Reverted https://github.com/pytorch/pytorch/pull/165188 on behalf of https://github.com/malfet due to Looks like it broke bunch of tests, see `2d4654d208/1` ([comment](https://github.com/pytorch/pytorch/pull/165188#issuecomment-3393674273))	2025-10-11 21:38:45 +00:00
Laith Sakka	2d4654d208	do not overguard when comparing lists (#165091 ) If we are comparing two lists l1, l2 of different lengths for equality, we should exit early if len(l1) != len(l2) and avoid guarding/comparing inner elements. This avoids recompilations as in the unit test. Addresses https://github.com/pytorch/pytorch/issues/137515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165091 Approved by: https://github.com/aorenste, https://github.com/mlazos ghstack dependencies: #164884, #164885, #164886, #164887, #164888, #164889	2025-10-11 20:37:51 +00:00
Animesh Jain	f0325d0787	[dynamo][annotate] Remove the need of external ctx mgr of preserve_node_meta (#165188 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165188 Approved by: https://github.com/yushangdi ghstack dependencies: #164776	2025-10-11 15:49:42 +00:00
Animesh Jain	1e4c7dffa3	[compile] Regional inductor compilation with fx.annotate (#164776 ) This PR introduces a way to compile a region of FX graph using `fx.traceback.annotate`. ### UX 1) In the user code, mark the region that you want to be compiled with inductor using `with fx_traceback.annotate({"compile_with_inductor": 0})`. As of now, we just rely on the string `compile_with_inductor` and ignore the integer. As the needs arise, we can update the logic. Example ``` def fn(x, y): sin = torch.sin(x) with fx_traceback.annotate({"compile_with_inductor": 0}): mul = sin * y add = mul + 1 return torch.sin(add) ``` 2) You have to instruct the compiler to use the annotations with `compile_fx_annotated_nodes_with_inductor` transformation. This is somewhat controversial, and a user might expect that just setting annotation is enough. But for now to control the blast radius, we need to explicitly do this. One such example is ``` # Set the fw and bw compiler of aot_autograd to `compile_fx_annotated_nodes_with_inductor` def aot_eager_regional_inductor(): return aot_autograd( fw_compiler=compile_fx_annotated_nodes_with_inductor, bw_compiler=compile_fx_annotated_nodes_with_inductor, ) ``` 3) Fixable in short-term - You have to wrap the user code in `torch.fx.traceback.preserve_node_meta` to ensure that annotations are propagated to the compiler. This is fixable, just need to make CI happy. ### Implementation 1) Relies on `CapabilityBasedPartitioner` to "scoop" out regions based on annotations, and then create subgraphs in the main graph. 
2) Call `torch._inductor.standalone_compile` on these subgraphs, and jam the returned callable into the FX graph at the place of call_module Resulting graph looks something like this - search for `torch__inductor_standalone_compile_inner` Forward graph ``` class GraphModule(torch.nn.Module): def forward(self, primals_1: "f32[10]", primals_2: "f32[10]"): # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:64 in fn, code: sin = torch.sin(x) sin: "f32[10]" = torch.ops.aten.sin.default(primals_1) # No stacktrace found for following nodes inner = torch__inductor_standalone_compile_inner(sin, primals_2) # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:68 in fn, code: add = mul + 1 getitem: "f32[10]" = inner[0]; inner = None # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:70 in fn, code: return torch.sin(add) sin_1: "f32[10]" = torch.ops.aten.sin.default(getitem) return (sin_1, primals_1, primals_2, sin, getitem) ``` Backward graph ``` class GraphModule(torch.nn.Module): def forward(self, primals_1: "f32[10]", primals_2: "f32[10]", sin: "f32[10]", add: "f32[10]", tangents_1: "f32[10]"): # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:64 in fn, code: sin = torch.sin(x) cos_1: "f32[10]" = torch.ops.aten.cos.default(primals_1); primals_1 = None # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:70 in fn, code: return torch.sin(add) cos: "f32[10]" = torch.ops.aten.cos.default(add); add = None mul_1: "f32[10]" = torch.ops.aten.mul.Tensor(tangents_1, cos); tangents_1 = cos = None # No stacktrace found for following nodes inner = torch__inductor_standalone_compile_inner(mul_1, sin, primals_2); mul_1 = sin = primals_2 = None # File: /data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:67 in fn, code: mul = sin * y getitem: "f32[10]" = inner[0] getitem_1: "f32[10]" = inner[1]; inner = None # File: 
/data/users/anijain/pytorch2/test/dynamo/test_regional_inductor.py:64 in fn, code: sin = torch.sin(x) mul_4: "f32[10]" = torch.ops.aten.mul.Tensor(getitem_1, cos_1); getitem_1 = cos_1 = None return (mul_4, getitem) ``` ### Some issue raised in the HOP meeting 1) CSE will not differentiate different meta custom nodes and do wrong thing. 2) SAC - The recomputed forward will be smaller than the forward. Will we compile a smaller region than? 3) What happens if you have a op in the middle which does not disturb the topology, is it still 1 subgraph? 4) What happens with the nesting of `fx_traceback.annotate`? Are there any ordering requirements? 5) What are we going to use the annotations for? a) compile flex b) streams c) nn.Module info to organize MoE components for pipelining d) PP stages e) Rename graph nodes for more debugging f) No nested regional compile Pull Request resolved: https://github.com/pytorch/pytorch/pull/164776 Approved by: https://github.com/SherlockNoMad	2025-10-11 15:49:42 +00:00
Thanh Ha	79a33e2db2	Switch docs build from c5 to c7i (#165082 ) Switch docs build from c5 to c7i which should increase build performance by roughly 15-20% while reducing costs by 10-15%. Signed-off-by: Thanh Ha <thanh.ha@linuxfoundation.org>	2025-10-11 10:59:18 -04:00
PyTorch MergeBot	816fb7f48d	Revert "Enable ruff rule E721 (#165162 )" This reverts commit 9e7c19f72b6d0690915c307409c0c0a76b5a3bf0. Reverted https://github.com/pytorch/pytorch/pull/165162 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165162#issuecomment-3393328271))	2025-10-11 13:25:40 +00:00
zpcore	512dd79ff0	[4/N] [DTensor device order] Support debugmode to show dtensor distribution transform path (#164821 ) Enable the DebugMode to print out how `placements` and `shard_order` will update when we execute `transform_infos` to transform from source placement to target placement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164821 Approved by: https://github.com/SherlockNoMad, https://github.com/pianpwk ghstack dependencies: #164806, #164820	2025-10-11 09:44:54 +00:00
zpcore	2001b18541	[3/N] [DTensor device order] Make some placement type class method static (#164820 ) Some methods in `Placement` class can be exposed as static. Those methods should be useful w/o initializing the object. E.g., when we `distribute_tensor` from normal tensor, we may want: ``` local_tensor = Shard.shard_tensor(tensor_dim, local_tensor, device_mesh, mesh_dim,) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164820 Approved by: https://github.com/XilunWu, https://github.com/fduwjj, https://github.com/wanchaol ghstack dependencies: #164806	2025-10-11 09:42:13 +00:00
zpcore	9dac4e2540	[2/N] [DTensor device order] Add shard_order attribute in DTensorSpec (#164806 ) Add `shard_order` field in DTensorSpec. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164806 Approved by: https://github.com/XilunWu, https://github.com/wanchaol	2025-10-11 09:39:08 +00:00
Huy Do	4400c5d31e	Continue to build nightly CUDA 12.9 for internal (#163029 ) Revert part of https://github.com/pytorch/pytorch/pull/161916 to continue building CUDA 12.9 nightly Pull Request resolved: https://github.com/pytorch/pytorch/pull/163029 Approved by: https://github.com/malfet	2025-10-11 08:26:47 +00:00
Yuanyuan Chen	9e7c19f72b	Enable ruff rule E721 (#165162 ) `E721` checks for object type comparisons using == and other comparison operators. This is useful because it is recommended to use `is` for type comparisons. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165162 Approved by: https://github.com/Skylion007	2025-10-11 06:43:53 +00:00
Animesh Jain	220a34118f	[export] Turn on install_free_tensors flag (#164691 ) The final step in removing the discrepancy between torch.compile(fullgraph=True) and torch.export(strict=True). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164691 Approved by: https://github.com/avikchaudhuri	2025-10-11 04:26:09 +00:00
Edward Z. Yang	de8d81275a	Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939 ) This fixes AOTAutograd rms_norm not being bitwise equivalent to eager, because it avoids a decomposition. You can force the decomposition by having the decomposition in the dispatch table, but if eager mode wouldn't have decomposed (because it went to the fused one), we now default to preserving the fused call by default. This largely reverts https://github.com/pytorch/pytorch/pull/103275/ for view ops. This means that in inference mode we could hit the wrong C++ kernel; if this occurs we should just SymInt'ify the C++ kernel. Another neat side effect of this change is that Inductor's generated kernels for rms_norm now have rms_norm in their name. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164939 Approved by: https://github.com/bdhirsh	2025-10-11 01:03:55 +00:00
Animesh Jain	d73416642f	[test] Skip testing of source_fn_stack in light of export changes (#165176 ) This is in regards to https://github.com/pytorch/pytorch/pull/164691 where we are inlining into nn modules, and therefore it is causing this test to fail. The test here looks for node.name which is quite different with inlining. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165176 Approved by: https://github.com/andrewor14 ghstack dependencies: #165172	2025-10-11 00:16:59 +00:00
Yuanyuan Chen	ef50c9b557	Remove unnecessary "static" for definitions in anonymous namespace (#165035 ) This PR removes unnecessary "static" for C++ functions and variables in anonymous namespace as detected by clang-tidy. This enhances code readability. The related rules are planned to be enabled in follow-up PRs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165035 Approved by: https://github.com/Skylion007	2025-10-11 00:04:23 +00:00
Animesh Jain	2d9f3f57f1	[dynamo][executorch] Handle lowered module from executorch delegate specially (#165172 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165172 Approved by: https://github.com/tugsbayasgalan	2025-10-10 23:24:17 +00:00
PaulZhang12	c8c5187e85	Fix truediv numerics between eager and compile (#164144 ) Addresses numeric differences between eager and compile in https://github.com/pytorch/pytorch/issues/141753 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164144 Approved by: https://github.com/bobrenjc93	2025-10-10 22:18:11 +00:00
Chien-Chin Huang	ee0a8a5a50	[CP]Introduce ContextParallal plan for parallelize_module() (#162542 ) Motivation Since FlexAttention and SDPA are both functions, not modules, we have tried numerous mechanisms to dispatch FlexAttention and SDPA to customized call paths so that we can inject the CP logic. Unfortunately, all of these approaches have their own composability issues with different techniques. Candidate Approaches 1. Ask users to write a module to wrap FlexAttention/SDPA and use `parallelize_module` to install a forward hook. - Pros: This is similar to how we do TP. - Cons: 1) It is cumbersome for users as they need to create a new module. 2) We need two places to parallelize the CP, as a context_parallel context manager is still required for splitting the inputs. 2. Provide a function wrapper. - Pros: Users just need to replace their FlexAttention/SDPA calls with the wrapper. - Cons: It is not the same API, though we can maintain the API signatures to be the same as the core API. Summary ~~This PR implements approach 2 and refactor the code in such a way that most code can be used by option approach 1, which will be introduced in another PR.~~ We changed this PR to implement option 1 as people like option 1 due to the consistency with the existing parallelisms. But this PR can also serve the foundation to implement option 2, which was the early version of this PR. This PR also changes `create_cp_block_mask` logic since we now only focus on ModuleWrapper approach which doesn't require to hack the seq_len field in a BlockMask. This PR also removes TorchFunctionMode dispatcher mode as it doesn't work well with SAC. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162542 Approved by: https://github.com/XilunWu	2025-10-10 22:03:43 +00:00
fduwjj	50c338c2da	[DeviceMesh] Move global state into class method (#164510 ) This is PR trying to move bookkeeping state maps from MeshEnv to DeviceMesh class members. The reason is that in general global variables are thread local and cause potential issue. We will also need to do DTensor CPU overhead benchmark for this change. 3-5% CPU overhead in DTensor has been observed: before: <img width="1147" height="535" alt="image" src="https://github.com/user-attachments/assets/9e4ac018-ec0a-46a4-8f2c-64b4dbec465c" /> After: <img width="1114" height="576" alt="image" src="https://github.com/user-attachments/assets/eaf83660-652b-4c6b-8591-f6049ccdd14c" /> running the benchmark mentioned here: https://github.com/pytorch/pytorch/issues/159169 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164510 Approved by: https://github.com/lw, https://github.com/fegin	2025-10-10 21:37:17 +00:00
angelayi	3faee20067	[opaque_obj_v2] PyObject custom op schema type (#165004 ) This is a cleaner implementation of opaque objects (https://github.com/pytorch/pytorch/pull/162660). Instead now we just need to do: Call `register_opaque_type` to register the type as being "opaque" and allowed by custom ops. You also need to pass a unique name that maps to the type. ```python class OpaqueQueue: def __init__(self, queue: list[torch.Tensor], init_tensor_: torch.Tensor) -> None: super().__init__() self.queue = queue self.init_tensor_ = init_tensor_ def push(self, tensor: torch.Tensor) -> None: self.queue.append(tensor) def pop(self) -> torch.Tensor: if len(self.queue) > 0: return self.queue.pop(0) return self.init_tensor_ def size(self) -> int: return len(self.queue) register_opaque_type(OpaqueQueue, "_TestOpaqueObject_OpaqueQueue") ``` When creating the custom op, the schema will then use the unique name: ```python self.lib = torch.library.Library("_TestOpaqueObject", "FRAGMENT") torch.library.define( "_TestOpaqueObject::queue_push", "(_TestOpaqueObject_OpaqueQueue a, Tensor b) -> ()", tags=torch.Tag.pt2_compliant_tag, lib=self.lib, ) @torch.library.impl( "_TestOpaqueObject::queue_push", "CompositeExplicitAutograd", lib=self.lib ) def push_impl(queue: OpaqueQueue, b: torch.Tensor) -> None: assert isinstance(queue, OpaqueQueue) queue.push(b) ``` Using the custom op: ```python queue = OpaqueQueue([], torch.zeros(3)) torch.ops._TestOpaqueObject.queue_push(queue, torch.ones(3)) self.assertTrue(queue.size(), 1) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165004 Approved by: https://github.com/albanD	2025-10-10 21:31:56 +00:00
Yang Wang	cafca357fb	Fix h100 daily inductor running dispatch (#165185 ) Caused by merged PR `e7ed1a00eb`: the if condition should also be updated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165185 Approved by: https://github.com/malfet, https://github.com/huydhn	2025-10-10 21:28:58 +00:00
Dzmitry Huba	1e35b3c4e0	Augment DebugMode to support attributes reporting (#165109 ) DebugMode reports tensor type, it shapes and placements while active. This change augments reporting to tensor attributes from configured set. This feature is intended to be used to ease understanding debug string when dealing with larger outputs. For example, before running forward pass of a model we can annotate each of parameters and buffers with their fully qualified names, so that we can see which ops are being executed against specific tensors. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165109 Approved by: https://github.com/ezyang, https://github.com/pianpwk	2025-10-10 21:27:05 +00:00
Lucas Kabela	f363114852	[Bugfix][Inductor][Dynamo] Fix stride incorrectness issues for stride 0 tensor (#164897 ) Fixes #164814 - we update to include cases where we know symbolic expression is statically one. There are two errors here; first in graph capture, where a tensor with size 0 yet symbolic stride would attempt to keep the symbolic stride, resulting in a mismatch. The second is in inductor code gen, where we only checked in squeeze if size == 1, missing the case where a symbolic stride equals 1. Also fixes #164924 (@bobrenjc93 for fuzzer finding an issue affecting users : ) ### Test plan: ``` python test/dynamo/test_aot_autograd.py AotAutogradFallbackTests ``` Results in: ``` .. ---------------------------------------------------------------------- Ran 49 tests in 45.622s OK (expected failures=1) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164897 Approved by: https://github.com/laithsakka	2025-10-10 21:26:57 +00:00
Jithun Nair	0ec0120b19	Move aws OIDC credentials steps into setup-rocm.yml (#164769 ) The AWS ECR login step needs `id-token: write` permissions. We move the steps to get OIDC-based credentials from `_rocm-test.yml` to `setup-rocm.yml`. This lays the groundwork to enable access to AWS ECR in workflows in other repos such as torchtitan that use [linux_job_v2.yml](https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job_v2.yml), which also uses [setup-rocm.yml](`335f4f80a0/.github/workflows/linux_job_v2.yml (L168)`). Any caller workflows that eventually execute `setup-rocm` action will thus need to provide the `id-token: write` permission. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164769 Approved by: https://github.com/huydhn	2025-10-10 21:24:29 +00:00
Jeff Daily	8360f34c36	[ROCm] hotfix test scaled matmul cuda (#165104 ) Refactoring of scaled mm APIs and related tests caused previously passing tests on ROCm to start failing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165104 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-10 21:06:58 +00:00
Catherine Lee	370b1c12d2	[CI] Put the no gpu tests on machines that don't have gpus (#165183 ) I think this is just a copy paste error? NS: Introduced by https://github.com/pytorch/pytorch/pull/161013 Not sure where it got copied from though, the other set of no gpu tests for the other cuda version already have cpu runners Pull Request resolved: https://github.com/pytorch/pytorch/pull/165183 Approved by: https://github.com/malfet	2025-10-10 20:59:09 +00:00
Catherine Lee	6fd1ca28e1	[lint] Run full lint on ciflow/trunk (#165169 ) Add some naming stuff to differentiate between full + partial If we find that partial always == full, then we can get rid of it https://github.com/pytorch/pytorch/issues/165168 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165169 Approved by: https://github.com/Skylion007, https://github.com/malfet	2025-10-10 20:38:51 +00:00
Catherine Lee	0055f07997	Disable failing test_int8_woq_mm_cuda on slow grad check (#165147 ) Fixes #ISSUE_NUMBER Failing due to memory leak, ex https://github.com/pytorch/pytorch/actions/runs/18401518298/job/52434584458 ``` 2025-10-10T11:07:42.9485277Z _ TestSelectAlgorithmCudaCUDA.test_int8_woq_mm_cuda_batch_size_32_mid_dim_8_in_features_144_out_features_65_cuda_bfloat16 _ 2025-10-10T11:07:42.9485389Z Traceback (most recent call last): 2025-10-10T11:07:42.9485869Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3278, in wrapper 2025-10-10T11:07:42.9485966Z method(args, kwargs) 2025-10-10T11:07:42.9486365Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3278, in wrapper 2025-10-10T11:07:42.9486454Z method(args, **kwargs) 2025-10-10T11:07:42.9486849Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3277, in wrapper 2025-10-10T11:07:42.9486933Z with policy(): 2025-10-10T11:07:42.9487380Z File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 2654, in __exit__ 2025-10-10T11:07:42.9487473Z raise RuntimeError(msg) 2025-10-10T11:07:42.9488533Z RuntimeError: CUDA driver API confirmed a leak in __main__.TestSelectAlgorithmCudaCUDA.test_int8_woq_mm_cuda_batch_size_32_mid_dim_8_in_features_144_out_features_65_cuda_bfloat16! Caching allocator allocated memory was 19456 and is now reported as 29184 on device 0. CUDA driver allocated memory was 356712448 and is now 358809600. 
2025-10-10T11:07:42.9488543Z 2025-10-10T11:07:42.9488722Z To execute this test, run the following from the base repo dir: 2025-10-10T11:07:42.9489520Z PYTORCH_TEST_CUDA_MEM_LEAK_CHECK=1 PYTORCH_TEST_WITH_SLOW_GRADCHECK=1 python test/inductor/test_cuda_select_algorithm.py TestSelectAlgorithmCudaCUDA.test_int8_woq_mm_cuda_batch_size_32_mid_dim_8_in_features_144_out_features_65_cuda_bfloat16 2025-10-10T11:07:42.9489525Z 2025-10-10T11:07:42.9489748Z This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 ``` Got added in #161680 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165147 Approved by: https://github.com/bbeckca	2025-10-10 20:26:31 +00:00
Rahul Agrawal	4f8a986b8f	Make LOCK_TIMEOUT in codecache configurable (#165030 ) - Introduce file_lock_timeout in config (defaults to current value of 600) - Use the above config instead of hardcoded 600 config. This is useful when running stress tests. Differential Revision: D84109142 Privacy Context Container: L1297311 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165030 Approved by: https://github.com/hl475	2025-10-10 20:22:11 +00:00
PyTorch MergeBot	5c3fe9fb30	Revert "Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939 )" This reverts commit a6fa4f9c283971c0fb6f60a89674a1f35370ac79. Reverted https://github.com/pytorch/pytorch/pull/164939 on behalf of https://github.com/izaitsevfb due to introduces numeric issues internally, see [D84326613](https://www.internalfb.com/diff/D84326613) ([comment](https://github.com/pytorch/pytorch/pull/164939#issuecomment-3392203314))	2025-10-10 20:21:12 +00:00
Pian Pawakapan	306b344a18	[dynamo][DebugMode] mask python keys in dispatch_key_set guard checks (#164992 ) I found that running any compiled function under DebugMode more than once will trigger recompilations, e.g. with the really simple modified test case in `test_compile`: ``` [0/1] [__recompiles] Recompiling function f in /data/users/pianpwk/ptclone/pytorch/test/distributed/tensor/debug/test_debug_mode.py:268 [0/1] [__recompiles] triggered by the following guard failure(s): [0/1] [__recompiles] - 0/0: [0/2] [__recompiles] Recompiling function f in /data/users/pianpwk/ptclone/pytorch/test/distributed/tensor/debug/test_debug_mode.py:268 [0/2] [__recompiles] triggered by the following guard failure(s): [0/2] [__recompiles] - 0/1: [0/2] [__recompiles] - 0/0: ``` Digging deeper, the guard failures were due to TENSOR_MATCH guards failing on dispatch key set checks (seemingly on the Python dispatch key): `5a1fbf45ad/torch/csrc/dynamo/guards.cpp (L199-L203)` This seems to due to the `ignore_compile_internals=True` flag on custom dispatch modes being on, which causes these modes to "hide" themselves during compilation, making dynamo guard on the Python dispatch key being off. The (maybe imperfect) solution is to mask out the Python keys for guard comparisons. This might be fine because custom dispatch modes won't appear here during compilation - `ignore_compile_internals=True` hides them, and `ignore_compile_internals=False` disables compile entirely? Pull Request resolved: https://github.com/pytorch/pytorch/pull/164992 Approved by: https://github.com/williamwen42	2025-10-10 20:00:28 +00:00
Yinghai Lu	94e634942a	Fix int32 overflow in embedding_dense_backward (#165095 ) If `max_partial_segment` is large we can overflow `gid` and cause a bunch of IMA. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165095 Approved by: https://github.com/ngimel, https://github.com/eqy	2025-10-10 19:47:38 +00:00
Catherine Lee	a4925c0ce0	[testing] Print something for log classifier to better differentiate reruns vs real failures (#165163 ) The normal pytest/unittest failure patterns also match flaky tests (specifically I think tests that fail -> succeed on rerun in a new subprocess) So print something specifically for log classifier that it can match against Pull Request resolved: https://github.com/pytorch/pytorch/pull/165163 Approved by: https://github.com/izaitsevfb	2025-10-10 19:28:13 +00:00
PyTorch MergeBot	d16627f4d0	Revert "[dynamo][executorch] Do not trace into exeuctorch LoweredBackendModule (#165126 )" This reverts commit 41936f4cf6ff93b70d81f6a23811d43a0647f1e1. Reverted https://github.com/pytorch/pytorch/pull/165126 on behalf of https://github.com/anijain2305 due to https://github.com/pytorch/pytorch/pull/165172 is the right way ([comment](https://github.com/pytorch/pytorch/pull/165126#issuecomment-3391975498))	2025-10-10 19:21:41 +00:00
Janani Sriram	8f78999d77	[Inductor][ATen] Fix stride rounding on Blockwise128x128 to accommodate for small shapes (#164953 ) Summary: Fix rounding issue on `Blockwise128x128` to accommodate for small shapes. The original implementation rounded all strides to 4, which caused failures for `test_fp8.py` tests as well as `test_scaled_matmul_cuda.py::test_scaled_mm_vs_emulated_block_wise` tests ([GitHub PR](https://github.com/pytorch/pytorch/pull/164259)). Test Plan: `test_fp8.py` `test_scaled_matmul_cuda.py` Differential Revision: D84103213 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164953 Approved by: https://github.com/slayton58, https://github.com/eqy	2025-10-10 19:12:58 +00:00
Nikita Shulga	7cddda1234	Update asan in slow to linux.2xlarge.memory Followup after `f2ae7084eb`	2025-10-10 12:02:29 -07:00
bobrenjc93	98b53961b9	[torchfuzz] add more context to xfail test file (#165149 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165149 Approved by: https://github.com/PaulZhang12 ghstack dependencies: #165116	2025-10-10 18:51:51 +00:00
Howard Huang	a3eb275d3c	Add torch compile check for ZeroBubble (#162511 ) Fix https://github.com/pytorch/pytorch/issues/161904 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162511 Approved by: https://github.com/fegin	2025-10-10 18:49:45 +00:00
orangeH25	6f31406723	[Code Clean] Replace std::runtime_error with TORCH_CHECK (#163927 ) Fixes part of #148114 Including: - aten/src/ATen/InferSize.h - aten/src/ATen/functorch - aten/src/ATen/cudnn/Types.cpp Pull Request resolved: https://github.com/pytorch/pytorch/pull/163927 Approved by: https://github.com/FFFrog, https://github.com/albanD Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>	2025-10-10 18:23:27 +00:00
Nikita Shulga	f2ae7084eb	[BE] Use `linux.2xlarge.memory` for ASAN builds (#165164 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165164 Approved by: https://github.com/janeyx99	2025-10-10 18:13:42 +00:00
Nikita Shulga	12d7cc5cd3	[BE] Set commit hooks to 3.10	2025-10-10 11:09:13 -07:00
Sam Larsen	a2e2e1d8c0	Add pytorch_version and mast_application_packages to pt2 compile scuba logging (#165018 ) Summary: Two more fields requested for conda-on-mast jobs Differential Revision: D84214442 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165018 Approved by: https://github.com/c00w	2025-10-10 17:57:40 +00:00
PyTorch MergeBot	b67785d9eb	Revert "C++ API handle optimizer defaults (#161825 )" This reverts commit f33201729416ed17467228e80b04d01d4d02b5f3. Reverted https://github.com/pytorch/pytorch/pull/161825 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/161825#issuecomment-3391506427))	2025-10-10 17:56:11 +00:00
Malay Bag	4cd06dc82c	[PT2 Archive] Use tensor dtype while deduping/grouping weights (state_dict/constants) (#165090 ) Summary: While saving state_dict tensors, deduping is done to reduce the amount of tensor data. For this, the storage pointer is used. But when a tensor is empty, its storage pointer is 0, while the dtypes of such tensors could be different. The existing logic will consider all such tensors to be the same. This will fail the model later when a different dtype is expected. This change also includes the dtype while deduping. For non-empty tensors, this should have no effect, as the storage pointer will be unique. Test Plan: TBD Differential Revision: D84243094 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165090 Approved by: https://github.com/yiming0416	2025-10-10 17:51:43 +00:00
Animesh Jain	41936f4cf6	[dynamo][executorch] Do not trace into exeuctorch LoweredBackendModule (#165126 ) Required for https://github.com/pytorch/pytorch/pull/164691 .. comments inline Pull Request resolved: https://github.com/pytorch/pytorch/pull/165126 Approved by: https://github.com/tugsbayasgalan	2025-10-10 17:41:33 +00:00
Xiao Fu	dec9a59992	[dynamo][logging] Add most recent bytecode to graph break with torch._dynamo.graph_break() and verbose (#164422 ) https://github.com/pytorch/pytorch/issues/162858 The issue described the feature implemented. This adds to the existing graph break log with the latest 20 (or viable user frame) bytecode instructions. The scenario is when the graph_break happens without errors. It happens during the case when user calling torch._dynamo.graph_break(). Meanwhile, in the testing, one can find that the generated frame based on step() is not deterministic as sometimes it reached the maximum amount, sometimes it generated the less than that. The bytecode generation is python version dependent. Thus, the testing plan excludes the bytecode output but generated the total bytecode line count. This is a helpful process to understand bytecode transformation, symbolic convert, and convert frame. It is a helpful task to provide hands-on experience with dynamo workflow. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164422 Approved by: https://github.com/williamwen42, https://github.com/mlazos Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>	2025-10-10 17:33:06 +00:00
PyTorch MergeBot	f975bd58af	Revert "Warn if AccumulateGrad stream does not match producer node stream (#165065 )" This reverts commit a70ef954b919e990ebaba715b4072e76352867bf. Reverted https://github.com/pytorch/pytorch/pull/165065 on behalf of https://github.com/izaitsevfb due to breaks lint ([comment](https://github.com/pytorch/pytorch/pull/165065#issuecomment-3391387386))	2025-10-10 17:29:29 +00:00
can-gaa-hou	af42256db4	Fix missing brackets (#165138 ) As stated in the title. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165138 Approved by: https://github.com/Aidyn-A, https://github.com/Skylion007	2025-10-10 17:23:31 +00:00
can-gaa-hou	39161e73fc	[Fix] missing lambda in torch._check (#165043 ) Fixes more missing lambda in torch._check in the source code. Inspired by #164225. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165043 Approved by: https://github.com/FFFrog, https://github.com/Skylion007	2025-10-10 17:11:55 +00:00
Avik Chaudhuri	3ed90f5a09	outline various stages from aot stage2 compile (#164808 ) Splits the training and inference paths for aot stage2 compile. 1. Split `aot_stage2_autograd` into `_aot_stage2a_partition`, `_aot_stage2b_fw_compile` and `_aot_stage2b_bw_compile`, and rest. 2. Split `aot_stage2_inference` into `_aot_stage2b_inference_compile` and rest. I'm leaving these as functions with underscore names since the I/O interfaces and the exact boundaries of these splits are somewhat in the air. Differential Revision: D84028203 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164808 Approved by: https://github.com/SherlockNoMad	2025-10-10 17:04:36 +00:00
Aidyn-A	d41aa187ec	Add more B200 smoke test (#165133 ) A follow up to #159494. This PR adds additional `test_scaled_matmul_cuda` to smoke tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165133 Approved by: https://github.com/drisspg	2025-10-10 16:46:26 +00:00
Edward Yang	8b2137e74a	Don't use C++ CIA decomps if there's a Python one (#164970 ) Some more context at https://github.com/pytorch/pytorch/pull/164939 The basic point here is that Python decomps are guaranteed to be functional, whereas C++ ones are not. If we have a Python decomp, we should prefer it over the C++ one. This currently doesn't matter too much as CIA decomps will get functionalized, but it matters after the quoted PR because we now run these decompositions very late (to make it easy for things like aot_eager to get the fused versions of operators in proxy tensor). Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164970 Approved by: https://github.com/bdhirsh	2025-10-10 16:46:09 +00:00
soulitzer	a70ef954b9	Warn if AccumulateGrad stream does not match producer node stream (#165065 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165065 Approved by: https://github.com/ngimel ghstack dependencies: #162815	2025-10-10 16:46:01 +00:00
Gheorghe-Teodor Bercea	01a2812f48	[ROCm] Adjust grid size for non-unit stride backwards indexing (#165026 ) Adjust grid size for non-unit stride backwards indexing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165026 Approved by: https://github.com/jeffdaily	2025-10-10 16:36:38 +00:00
bobrenjc93	3f27100d3e	[torchfuzz] remove fixed xfail (#165116 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165116 Approved by: https://github.com/PaulZhang12	2025-10-10 16:31:27 +00:00
Angel Li	253fd765bd	bf16 support for fake_quantize_learnable_per_channel_affine (#165098 ) Adding bf16 support for `torch._fake_quantize_learnable_per_channel_affine()` op by relaxing the type check on scale TODO: need to add bf16 support to `per_tensor_affine_` as `torch._fake_quantize_learnable_per_tensor_affine_backward` gets called in the backward pass Test Modified unit test in `test_workflow_ops.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165098 Approved by: https://github.com/jerryzh168, https://github.com/andrewor14	2025-10-10 16:24:52 +00:00
PyTorch MergeBot	abb2f7179e	Revert "Fix truediv numerics between eager and compile (#164144 )" This reverts commit 68913d8f2a953bdbada4033101b04f6e8d49dabe. Reverted https://github.com/pytorch/pytorch/pull/164144 on behalf of https://github.com/malfet due to It breaks CI again, why was it landed for 3 times in a row without any changes? ([comment](https://github.com/pytorch/pytorch/pull/164144#issuecomment-3390973016))	2025-10-10 16:10:25 +00:00
Wu, Zhenyu	b57ab9a3f2	Fix #165125 : Type "str" is not assignable to return type "None" (#165128 ) Fixes #165125 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165128 Approved by: https://github.com/malfet	2025-10-10 16:05:07 +00:00
Yuanyuan Chen	fb64da0791	[2/N] Use "is" in python type comparison (#165142 ) This is a follow-up of #165037. It is generally recommended to use `is/is not` to compare types. Therefore this series of changes applies this suggestion in the code base, and it aims to finally enable the related linter checks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165142 Approved by: https://github.com/albanD	2025-10-10 15:36:44 +00:00
Thanh Ha	10a9fb641b	Switch build jobs from linux.4xlarge to c7i (#165057 ) Switch build jobs that use linux.4xlarge which uses c5 instance types to c7i variant. This should improve performance by ~15-20% while cutting costs by ~10-15%. Relates to pytorch/test-infra#7175 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165057 Approved by: https://github.com/huydhn	2025-10-10 15:13:40 +00:00
PyTorch MergeBot	9420944033	Revert "[AMP][Refactor] Simplify dtype support logic in autocast context manager (#163446 )" This reverts commit 960b0d5f0d0efb1f1962bddcf62e2a698e26edd2. Reverted https://github.com/pytorch/pytorch/pull/163446 on behalf of https://github.com/izaitsevfb due to breaks autocast tests on linux and mac ([comment](https://github.com/pytorch/pytorch/pull/163446#issuecomment-3390688642))	2025-10-10 15:12:46 +00:00
Chinmay Kuchinad	55f01a48af	[ROCm] Enable and fix several FSDP + Inductor distributed unit tests (#165011 ) This PR enables a number of distributed unit tests and applies necessary fixes to ensure they pass on ROCm platforms. The changes have been successfully tested on both MI200 and MI300 hardware. This work addresses the following issues: https://github.com/ROCm/frameworks-internal/issues/13586 https://github.com/ROCm/frameworks-internal/issues/13578 Enabled Tests The following tests have been enabled and are now passing: 1. test_compiled_autograd_ctx 2. test_simple_mlp_fullgraph_backend_aot_eager 3. test_simple_mlp_fullgraph_backend_aot_eager_decomp_partition 4. test_simple_mlp_fullgraph_backend_inductor 5. test_nested_fully_shard_backend_aot_eager 6. test_nested_fully_shard_backend_aot_eager_decomp_partition 7. test_nested_fully_shard_backend_inductor_fullgraph_True 8. test_nested_fully_shard_backend_inductor_fullgraph_True_graph_partition 9. test_transformer_backend_aot_eager 10. test_transformer_backend_aot_eager_decomp_partition 11. test_storage_resize_zero_gpu 12. test_storage_resize_nonzero_gpu 13. test_fake_distributed_inductor Tests skipped due to upstream issues: 1. test_nested_fully_shard_backend_inductor_fullgraph_False 2. test_transformer_backend_inductor_fullgraph_True 3. test_transformer_backend_inductor_fullgraph_True_graph_partition 4. test_transformer_backend_inductor_fullgraph_False Pull Request resolved: https://github.com/pytorch/pytorch/pull/165011 Approved by: https://github.com/jeffdaily	2025-10-10 14:10:54 +00:00
PaulZhang12	68913d8f2a	Fix truediv numerics between eager and compile (#164144 ) Addresses numeric differences between eager and compile in https://github.com/pytorch/pytorch/issues/141753 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164144 Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/ngimel	2025-10-10 14:00:46 +00:00
PyTorch MergeBot	b8be796a57	Revert "[2/N] More ruff SIM fixes (#165031 )" This reverts commit 38095fbd1323ee4a9541fbcbb9b28bd20f2cd956. Reverted https://github.com/pytorch/pytorch/pull/165031 on behalf of https://github.com/albanD due to One of the changed line started to fail on trunk ([comment](https://github.com/pytorch/pytorch/pull/165031#issuecomment-3390190870))	2025-10-10 13:42:14 +00:00
Howard Huang	238dd5517d	[PP] Move profiler record_function in schedule (#164976 ) Better engineering to move the `record_function` call to also encompass the custom callback, this line is the only change: https://github.com/pytorch/pytorch/pull/164976/files#diff-1d3d91f53db88fb886901fb178d69e47776e71b8103f85688fa9ca64cc55d068R2147, the rest is just formatting. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164976 Approved by: https://github.com/fegin ghstack dependencies: #162016, #164962	2025-10-10 13:09:23 +00:00
eellison	d272ed4b3e	Fix identity expansion (#165066 ) In some cases, we wrap indexing with `Identity` to prevent expansion from int32 -> int64 range. There are some checks in codegen which intend to check for constants, which did not handle Identity. Update these checks and update Identity so that it recursively prints inputs. Fix for https://github.com/pytorch/pytorch/issues/164700 Replaces https://github.com/pytorch/pytorch/pull/160190 cc @jgong5 @mingfeima @XiaobingSuper @sanchitintel @ashokei @jingxu10 @jerryzh168 @voznesenskym @penguinwu @EikanWang @Guobing-Chen @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @ipiszy @chenyang78 @kadeng @muchulee8 @amjames @chauhang @aakhundov @coconutruben @njriasan Pull Request resolved: https://github.com/pytorch/pytorch/pull/165066 Approved by: https://github.com/njriasan, https://github.com/shunting314, https://github.com/jansel	2025-10-10 13:07:15 +00:00
Yuanyuan Chen	70925bdf82	[1/N] Use "is" in python type comparison (#165037 ) It is generally recommended to use `is/is not` to compare types. Therefore this series of changes applies this suggestion in the code base, and it aims to finally enable the related linter checks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165037 Approved by: https://github.com/mlazos	2025-10-10 12:36:50 +00:00
KarhouTam	960b0d5f0d	[AMP][Refactor] Simplify dtype support logic in autocast context manager (#163446 ) ## Description: This PR refactors the autocast context manager in `autocast_mode.py` to simplify and centralize the logic for checking supported dtypes for each device. The previous implementation repeated similar checks for multiple device types. Now, a single mapping `device_supported_dtypes` is used to associate device types with their supported dtypes, and the validation logic is unified. In my view, this makes the code easier to maintain and extend for new devices. Please share any suggestions and comments with me. BTW, in the original `xla` branch, the `supported_dtype` are `[torch.float16, torch.bfloat16]`, `5d8a226e23/torch/amp/autocast_mode.py (L358-L363)` but the warning message has only `torch.bfloat16`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163446 Approved by: https://github.com/FFFrog, https://github.com/albanD	2025-10-10 12:30:06 +00:00
FFFrog	e0abcee3b5	[Code Clean] Remove support of python3.9 (#163846 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163846 Approved by: https://github.com/ezyang	2025-10-10 11:11:56 +00:00
Shangdi Yu	77bf23d85c	Add an option to put store large mmap weights on disk (#164526 ) As title In windows, we cannot modify the .dll to append weights at the end; the windows .dll loader will complain it's not a valid .dll file. So we store the weight blob as a separate file. 1. We add the following API which allows passing in a pointer to the weight blob and getting the size of the weight blob. ```cpp AOTI_API AOTIRuntimeError AOTInductorModelContainerGetConstantsBlobSize( AOTInductorModelContainerHandle container_handle, uint64_t* ret_size); // Load weights from a single blob in weight_blob_ptr AOTI_API AOTIRuntimeError AOTInductorModelUpdateConstantsFromBlob( AOTInductorModelContainerHandle container_handle, const uint8_t* weight_blob_ptr); ``` 2. We also add a method in ModelContainerRunner to load the weight: If the runner sees that there is a `.blob` file in the package, it will mmap the .blob file and use the content to load the constants. 3. We also add the `USE_MMAP_EXTERNAL` macro. When this macro is defined, the model expects to load the weights from external mmap'd weights. 
Test Plan: ``` buck run @mode/dev-nosan caffe2/test/inductor:test_aot_inductor -- -r test_large_mmaped_weights_on_disk ``` Also tested for windows-cross compilation with `6542566585/demo/main_voxtral.cpp` ``` Loaded model.dll audio_encoder loaded C:\Users\shangdiy\source\repos\torchnative\demo\token_embedding\data\aotinductor\model\model.wrapper.so Loaded model.dll token_embedding loaded C:\Users\shangdiy\source\repos\torchnative\demo\text_decoder\data\aotinductor\model\model.wrapper.so Loaded model.dll Loading weights from C:\Users\shangdiy\source\repos\torchnative\demo\text_decoder\data\aotinductor\model\model.wrapper_weights.blob text_decoder loaded Load latency (ms): audio_encoder: 1011.234 archive extraction: 0.000 .so loading: 1011.197 token_embedding: 525.773 archive extraction: 0.000 .so loading: 525.704 text_decoder: 3324.130 archive extraction: 0.000 .so loading: 3323.979 Run latency (ms): audio_encoder: 285.958 audio_encoder output: dtype=bfloat16, shape=[1, 1125, 3072], numel=3456000 token_embedding: 6.676 token_embedding output: dtype=bfloat16, shape=[1, 1138, 3072], numel=3495936 text_decoder: 576.519 text_decoder output: dtype=bfloat16, shape=[1, 1138, 131072], numel=149159936 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164526 Approved by: https://github.com/desertfire	2025-10-10 07:53:57 +00:00
PyTorch MergeBot	d2cb183344	Revert "[inductor] verify determinism with inductor benchmark script (#164904 )" This reverts commit a3c700656f9a666eb33074b60333a23eb7e99a15. Reverted https://github.com/pytorch/pytorch/pull/164904 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but there seems to be some failed vLLM failures coming out of this ([comment](https://github.com/pytorch/pytorch/pull/164904#issuecomment-3388443678))	2025-10-10 06:23:07 +00:00
Yuanyuan Chen	38095fbd13	[2/N] More ruff SIM fixes (#165031 ) This is follow-up of #164695 to apply ruff SIM rules to more files. Most changes are about simplifying dict.get because None is already the default value. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165031 Approved by: https://github.com/mlazos	2025-10-10 05:37:46 +00:00
Yuanyuan Chen	ffc9559d9f	[7/N] Apply ruff UP035 rule (#164653 ) This PR is follow-up of #164438 to continue applying `UP035` rule. All changes are about proper `Callable` importation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164653 Approved by: https://github.com/aorenste	2025-10-10 05:16:17 +00:00
Simon Layton	172d6ed8b8	Refactor _scaled_grouped_mm_cuda dispatch (#165060 ) Summary: * Clean & simplify different scaling recipe dispatch * Split out recipes into separate dispatch functions Test Plan: ``` pytest -svv -k grouped test/test_scaled_matmul_cuda.py ``` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/165060 Approved by: https://github.com/danielvegamyhre, https://github.com/ngimel	2025-10-10 04:44:25 +00:00
Nikita Shulga	9a3c4b917e	[CMake] Remove forcing of `-O2` from `torch_compile_options` (#164894 ) That was introduced by `75a65ffe0f` Hattip to @jathu for alerting me about the issue. As a result, all our PyTorch builds were shipped with `-O2` for almost all of its modern history Partially undo the damage introduced by https://github.com/pytorch/pytorch/pull/128406 that causes cross-ISA symbols to leak, to be properly followed up in https://github.com/pytorch/pytorch/issues/165123 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164894 Approved by: https://github.com/ezyang	2025-10-10 04:43:53 +00:00
PyTorch MergeBot	df514a6d5a	Revert "[inductor][eazy] change how torch.use_deterministic_algorithms affect inductor (#164905 )" This reverts commit 344e6365a0068c2d2847fcec0c55dd53291d475e. Reverted https://github.com/pytorch/pytorch/pull/164905 on behalf of https://github.com/huydhn due to Sorry for reverting your PR but there seems to be some failed vLLM failures coming out of this ([comment](https://github.com/pytorch/pytorch/pull/164905#issuecomment-3388258660))	2025-10-10 04:37:09 +00:00
Maggie Moss	48fe858fef	Fix error, remove file from pyrefly checking (#165094 ) Reported issue with formatting and parsing. Removing suppressions and avoiding this file in future type checking until we can get a more complete fix in. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165094 Approved by: https://github.com/albanD	2025-10-10 04:34:51 +00:00
PyTorch MergeBot	7ab00c7c17	Revert "Hotfix test scaled matmul cuda (#165104 )" This reverts commit 9aa92f246fa5fe5cfda17970d41d167b19a0612a. Reverted https://github.com/pytorch/pytorch/pull/165104 on behalf of https://github.com/malfet due to Looks like it broke cuda tests, isn't it, see `44b1ff54e9/1` ([comment](https://github.com/pytorch/pytorch/pull/165104#issuecomment-3388247886))	2025-10-10 04:32:18 +00:00
Nikita Shulga	44b1ff54e9	[CD] Do not propagate download.pytorch.org IP into container (#165075 ) Followup after https://github.com/pytorch/pytorch/pull/164969 Should fix binary build test failures Pull Request resolved: https://github.com/pytorch/pytorch/pull/165075 Approved by: https://github.com/seemethere, https://github.com/huydhn ghstack dependencies: #164968, #164969	2025-10-10 04:27:29 +00:00
PyTorch MergeBot	daea35df5c	Revert "[CD] Do not propagate download.pytorch.org IP into container (#165075 )" This reverts commit 6d27a8e5093ee2a21d44dceeeffcb272e6e0f655. Reverted https://github.com/pytorch/pytorch/pull/165075 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/165075#issuecomment-3388228013))	2025-10-10 04:20:51 +00:00
Laith Sakka	7f2a902ea2	more sizelike deprecation (#164889 ) remove expect_size c++ bindings and usages Pull Request resolved: https://github.com/pytorch/pytorch/pull/164889 Approved by: https://github.com/mlazos ghstack dependencies: #164884, #164885, #164886, #164887, #164888	2025-10-10 03:45:06 +00:00
Mikayla Gawarecki	9c057d9863	[BE] Refresh documentation for stable ABI / API (#163899 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163899 Approved by: https://github.com/janeyx99	2025-10-10 03:26:28 +00:00
Yiming Zhou	938869e7d3	[DTensor] Improve sharding propagation error msg in DTensor dispatch (#164623 ) Fixes #164543 This PR improves the `__str__` method of DTensor's `OpSchema` to provide better readable error message when dispatch fails as the error message prints `{op_info.schema}` example 1 `aten.embedding` ``` aten.embedding.default(Spec(f32[2048, 256](S(0))), Spec(i64[16, 2048](S(0)R))) on DeviceMesh((dp=2, tp=2), 'cuda', stride=(2, 1))) ``` example 2 `aten.mm` ``` aten.mm.default(Spec(f32[1024, 512](S(1))), Spec(f32[512, 256](S(0)))) on DeviceMesh((tp=4), 'cuda', stride=(1,))) ``` example 3 `aten._scaled_dot_product_flash_attention` ``` aten._scaled_dot_product_flash_attention.default(Spec(f16[8, 16, 128, 64](RS(1))), Spec(f16[8, 16, 128, 64](RS(1))), Spec(f16[8, 16, 128, 64](RS(1)))) on DeviceMesh((dp=2, tp=4), 'cuda', stride=(4, 1))) ``` Added test ``` python test/distributed/tensor/test_dtensor_ops.py -k test_embedding_error_msg ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164623 Approved by: https://github.com/zpcore	2025-10-10 03:16:04 +00:00
Yuanyuan Chen	ce6b589545	Enable B904 check of flake8 (#165047 ) The description of `B904` is `Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling. ` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165047 Approved by: https://github.com/Lucaskabela	2025-10-10 03:08:01 +00:00
Dzmitry Huba	ae25dd51fc	Simplifying computation of the final result for equals op on DTensor (#164999 ) Instead of collecting local results using all_gather_object followed by local reduction, with this change we switch to using a single all_reduce with MIN reduction operation to compute the final equals result. This change is needed to enable LocalTensor work (all_gather_object introduces challenges for DTensor and LocalTensor integration). topic: not user facing Pull Request resolved: https://github.com/pytorch/pytorch/pull/164999 Approved by: https://github.com/ezyang	2025-10-10 03:01:28 +00:00
Simon Fan	a61d0de9f9	[hop] support local_map filtered gradients (#164437 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164437 Approved by: https://github.com/ezyang ghstack dependencies: #164296, #164321, #164419, #164420, #164340, #163602, #164431, #164433	2025-10-10 02:34:27 +00:00
Simon Fan	3ad88924ad	[hop] support local_map None placements (#164433 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164433 Approved by: https://github.com/ezyang ghstack dependencies: #164296, #164321, #164419, #164420, #164340, #163602, #164431	2025-10-10 02:34:27 +00:00
Simon Fan	3241b9c15f	[hop] support local_map None gradients (#164431 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164431 Approved by: https://github.com/bdhirsh ghstack dependencies: #164296, #164321, #164419, #164420, #164340, #163602	2025-10-10 02:34:27 +00:00
Simon Fan	25d4d5107e	[dynamo] trace local_map with local shapes for AP (#163602 ) Context is in https://www.internalfb.com/excalidraw/EX519691 and https://docs.google.com/document/d/1qnuXLZk_GYt_PksHTwkn7L2ELRDnYlIRPkHAlXTyuhw/edit?tab=t.0. And the description of the previous PR: https://github.com/pytorch/pytorch/pull/164340. The previous PR adds the support on the HOP side for eager execution and AOTAutograd. Dynamo is still passing the HOP a subgraph with wrong shapes. This PR fixes that. This is similar to the HOP implementation, however we additionally need to manually keep the TensorVariable metadata in sync. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163602 Approved by: https://github.com/ydwu4 ghstack dependencies: #164296, #164321, #164419, #164420, #164340	2025-10-10 02:34:27 +00:00
Simon Fan	e4fe811be8	[hop] trace local_map with local shapes in fake key (#164340 ) Context is in https://www.internalfb.com/excalidraw/EX519691 and https://docs.google.com/document/d/1qnuXLZk_GYt_PksHTwkn7L2ELRDnYlIRPkHAlXTyuhw/edit?tab=t.0. So for Autoparallel initial trace, we want to trace the graph with global shapes initially. But, for the local_map region, we are forced to trace with the expected local tensors. To the tracers, this looks weird, because it's a plain tensor input (representing DTensor's full tensor .to_local()) that we need to "redistribute". After hacking a miserable version that had cross-key dependencies, @ydwu4 proposed this simpler approach to override the fake key. This means the shape conversion will be invisible to all dispatch keys above fake, this covers all current tracing mechanisms. This manifests as the joint graph for the HOP body being traced with local shapes: ```python # HOP forward, note local shapes (10, 80) class GraphModule(torch.nn.Module): def forward(self, primals_0: "f32[10, 80]"): # No stacktrace found for following nodes view: "f32[800]" = torch.ops.aten.view.default(primals_0, [-1]); primals_0 = None add: "f32[800]" = torch.ops.aten.add.Tensor(view, 10); view = None view_1: "f32[10, 80]" = torch.ops.aten.view.default(add, [10, 80]); add = None return (view_1,) # HOP backward, note local shapes (10, 80) class GraphModule(torch.nn.Module): def forward(self, tangents_0: "f32[10, 80]"): # No stacktrace found for following nodes clone: "f32[10, 80]" = torch.ops.aten.clone.default(tangents_0); tangents_0 = None return (clone,) ``` while the rest of the graph is still traced with global shapes: ```python # Parent graph joint, note global shapes (80, 80) class inner_f(torch.nn.Module): def forward(self, primals, tangents): primals_1: "f32[80, 80]"; tangents_1: "f32[80, 80]"; primals_1, tangents_1, = fx_pytree.tree_flatten_spec([primals, tangents], self._in_spec) # File: 
/home/xmfan/core/a/pytorch/test/higher_order_ops/test_local_map.py:597 in forward, code: return fn(x) call_local_map = torch._higher_order_ops.local_map.call_local_map(primals_1); primals_1 = None getitem: "f32[80, 80]" = call_local_map[0]; call_local_map = None call_local_map_1 = torch._higher_order_ops.local_map.call_local_map(tangents_1); tangents_1 = None getitem_1: "f32[80, 80]" = call_local_map_1[0]; call_local_map_1 = None return pytree.tree_unflatten([getitem, getitem_1], self._out_spec) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164340 Approved by: https://github.com/ydwu4 ghstack dependencies: #164296, #164321, #164419, #164420	2025-10-10 02:34:27 +00:00
Simon Fan	82c71af59a	[hop] local_map validate partitioned fw/bw wrt placements (#164420 ) Reviewed GPT-5 Summary: Summary / Goal Add validation that partitioned forward/backward graphs respect placements. Details - Validates placement alignment in local_map. - The HOP's autograd key gets called when we are tracing the joint, we need to validate: - the inputs to the HOP's fwd gm (typically this is the dynamo rewritten inputs) - the inputs to the HOP partitioned fwd/bwd gm - the outputs of the HOP partitioned fwd/bwd gm Motivation Catch mismatch errors earlier, improve debugging. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164420 Approved by: https://github.com/ezyang ghstack dependencies: #164296, #164321, #164419	2025-10-10 02:34:27 +00:00
Simon Fan	7bd704a346	[hop] local_map fix fw_gm/bw_gm naming (#164419 ) Reviewed GPT5 summary: Summary / Goal Fix inconsistent variable naming for forward/backward graphs. Details - Those methods are actually for both fw and bw graphs now that we reuse the same op for fw/bw Motivation Improves clarity, avoids confusion. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164419 Approved by: https://github.com/bdhirsh ghstack dependencies: #164296, #164321	2025-10-10 02:34:27 +00:00
Simon Fan	ae139b73e0	[dynamo] Better error message for local_map subgraph mismatches number of inputs/outputs with placement info (#164321 ) Reviewed GPT5 summary: Summary / Goal Improve error reporting when local_map subgraph input/output counts mismatch placement info. Details - Adds descriptive runtime error messages. Motivation Helps debug local_map misalignments. ```python AssertionError: Expecting 2 inputs to local_map function based on placements, but found 1. If the count matches for eager, Dynamo may have flattened inputs to the function or found additional tensors used via closures. Please adjust the input placements to match what the traced graph sees: class GraphModule(torch.nn.Module): def forward(self, l_args_0_: "f32[8, 8, 16]"): # File: /home/xmfan/core/a/pytorch/test/higher_order_ops/test_local_map.py:523 in mismatch_input, code: return x + scalar, scalar child: "f32[8, 8, 16]" = l_args_0_ + 10; l_args_0_ = None return (child,) . ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164321 Approved by: https://github.com/ezyang, https://github.com/mlazos ghstack dependencies: #164296	2025-10-10 02:34:27 +00:00
Simon Fan	cbaa07e438	[dtensor] add util to compute expected local sizes/strides for even sharding (#164296 ) Reviewed GPT5 summary: Summary / Goal Add a utility to compute expected local tensor sizes and strides under even sharding in dtensor. Details - New function in `torch/distributed/tensor/_utils.py`. - Computes local sizes/strides given global shape, mesh, and placements. - Enforces divisibility of global dimension by mesh size (strict even sharding). - Complements `compute_global_tensor_info`. Motivation Ensures correctness for stride/layout computations in distributed tensors. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164296 Approved by: https://github.com/ezyang	2025-10-10 02:34:27 +00:00
Yuanyuan Chen	bc0e2a0d2b	Fix a condition error in torch/_inductor/codegen/debug_utils.py (#165033 ) This PR fixes the condition ``` if arg_signatures is None and self.kernel_type == "cpp" or "extern" ``` which is interpreted as ``` if (arg_signatures is None and self.kernel_type == "cpp") or ("extern"): ``` and it is always evaluated to `True`. According to the context the intention was ``` if arg_signatures is None and (self.kernel_type == "cpp" or self.kernel_type == "extern"): ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165033 Approved by: https://github.com/Skylion007	2025-10-10 02:20:00 +00:00
drisspg	0747d95994	Add Loads from fixed inputs (#162031 ) ## TODO Check on multi indices ```Python @cute.jit def score_mod(tSrS_ssa, b_idx, h_idx, q_idx, kv_idx, buffers): in_ptr4 = buffers[0] tmp0 = tSrS_ssa tmp1 = b_idx tmp2 = h_idx tmp3 = cute.make_fragment(1, cutlass.Int32) tmp4 = tmp3.store(32tmp1 + tmp2) tmp5 = cute.make_fragment(1, cutlass.BFloat16) tmp6 = tmp3[0] tmp7 = tmp5[0] = (in_ptr4[tmp6]) tmp8 = (tmp5.load()).to(cutlass.Float32) tmp9 = (tmp0 + tmp8) tSrS_ssa = tmp9 return tSrS_ssa ``` I dont think that ``` tmp4 = tmp3.store(32tmp1 + tmp2) tmp5 = cute.make_fragment(1, cutlass.BFloat16) tmp6 = tmp3[0] tmp7 = tmp5[0] = (in_ptr4[tmp6] ``` is right since this tmp6 value will be larger than the actual index dim int his case its B -> see if its possible to 1d index Pull Request resolved: https://github.com/pytorch/pytorch/pull/162031 Approved by: https://github.com/v0i0 ghstack dependencies: #161118	2025-10-10 01:23:37 +00:00
drisspg	0a2cde2f06	Add Flash Attention support to FlexAttention (#161118 ) Relies on this PR in Flash Attention: https://github.com/Dao-AILab/flash-attention/pull/1840 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161118 Approved by: https://github.com/v0i0	2025-10-10 01:23:37 +00:00
Jithun Nair	c7b57d9349	Add gfx1100 to build target for ROCm docker builds (#165103 ) Fixes issue of gfx1100 test jobs timing out Pull Request resolved: https://github.com/pytorch/pytorch/pull/165103 Approved by: https://github.com/jeffdaily	2025-10-10 01:18:56 +00:00
PyTorch MergeBot	7614338b69	Revert "Add SVE128 ISA (#158932 )" This reverts commit 92284fb2ff44f09a9c7df0d8cf6cac9903e376a4. Reverted https://github.com/pytorch/pytorch/pull/158932 on behalf of https://github.com/malfet due to Hmm, but from OSS point of view, this is a no-op ([comment](https://github.com/pytorch/pytorch/pull/158932#issuecomment-3387961238))	2025-10-10 01:17:02 +00:00
Edward Z. Yang	a6fa4f9c28	Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939 ) This fixes AOTAutograd rms_norm not being bitwise equivalent to eager, because it avoids a decomposition. You can force the decomposition by having the decomposition in the dispatch table, but if eager mode wouldn't have decomposed (because it went to the fused one), we now default to preserving the fused call by default. This largely reverts https://github.com/pytorch/pytorch/pull/103275/ for view ops. This means that in inference mode we could hit the wrong C++ kernel; if this occurs we should just SymInt'ify the C++ kernel. Another neat side effect of this change is that Inductor's generated kernels for rms_norm now have rms_norm in their name. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164939 Approved by: https://github.com/bdhirsh	2025-10-10 00:15:00 +00:00
Shunting Zhang	344e6365a0	[inductor][eazy] change how torch.use_deterministic_algorithms affect inductor (#164905 ) Previously when torch.are_deterministic_algorithms_enabled() is True Inductor would - skip autotuning pointwise kernels - pick a fixed (and quite arbitrary) config for reduction This PR changes the behavior to - for pointwise kernels, we still do autotuning - for reduction kernels, we use the recently added heuristic to pick a config Pull Request resolved: https://github.com/pytorch/pytorch/pull/164905 Approved by: https://github.com/jansel, https://github.com/v0i0 ghstack dependencies: #164801, #164532, #164904	2025-10-10 00:00:58 +00:00
Shunting Zhang	a3c700656f	[inductor] verify determinism with inductor benchmark script (#164904 ) Verify the deterministic mode with torch.compile benchmark scripts. Here is what my testing script does (pasted in the end): - run a model in default mode, save it's result - run the model again in default mode, but distort the benchmarking results. Compare it with the saved result. - Do the above again in deterministic mode. I tried to test a few modes - BertForMaskedLM and GoogleFnet: I can repro the numeric change by distorting the benchnmark result in the default mode. The non-determinism is gone in the deterministic mode - DistillGPT2: I can not repro the numeric change by distorting the benchmarking result in the default mode. It does not surprise me much. Reduction order change does not always cause numeric change. ``` model=GoogleFnet export TORCHINDUCTOR_WRITE_ARE_DETERMINISTIC_ALGORITHMS_ENABLED=0 export TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 # disable autotune cache export TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE=0 export TORCHINDUCTOR_FX_GRAPH_CACHE=0 export TORCHINDUCTOR_CACHE_DIR=/tmp/torchinductor_shunting/ export TORCHINDUCTOR_BENCHMARK_KERNEL=1 export TORCHINDUCTOR_UNIQUE_KERNEL_NAMES=1 export INDUCTOR_TEST_DISABLE_FRESH_CACHE=1 # Non deterministic mode # --float32 rather than --amp to make it easier to repro non-deterministic echo "Save results for non-deterministic mode" python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --save-model-outputs-to=/tmp/saved-non-deterministic.pkl echo "Compare results with distorted benchmarking in non-deterministic mode" TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT=inverse python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --compare-model-outputs-with=/tmp/saved-non-deterministic.pkl echo "Save results for deterministic mode" TORCHINDUCTOR_DETERMINISTIC=1 python 
benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --save-model-outputs-to=/tmp/saved-deterministic.pkl echo "Compare results with distorted benchmarking in deterministic mode" TORCHINDUCTOR_DETERMINISTIC=1 TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT=inverse python benchmarks/dynamo/huggingface.py --backend inductor --float32 --accuracy --only $model --training --disable-cudagraphs --compare-model-outputs-with=/tmp/saved-deterministic.pkl ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164904 Approved by: https://github.com/jansel, https://github.com/v0i0 ghstack dependencies: #164801, #164532	2025-10-10 00:00:58 +00:00
Yidi Wu	600db525bd	[easy][while_loop] use copy_input instead of clone in _clone_aliased_inputs (#164955 ) Compared with clone, ExternKernel.copy_input additionally realizes the buffer, which matters because downstream code assumes the input buffers are realized. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164955 Approved by: https://github.com/BoyuanFeng	2025-10-09 23:39:00 +00:00
Animesh Jain	f6de195616	[dynamo][trace_rules] Add ao.quantization (#165069 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165069 Approved by: https://github.com/tugsbayasgalan, https://github.com/mlazos	2025-10-09 23:08:42 +00:00
angelayi	4a0df39f81	Symintify fused_scaled_matmul_reduce_scatter (#165086 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/165086 Approved by: https://github.com/zou3519, https://github.com/Skylion007	2025-10-09 23:07:40 +00:00
PyTorch MergeBot	34ac9b61cb	Revert "[export] Turn on install_free_tensors flag (#164691 )" This reverts commit 0e9b3a772ab96e998ab85591d5b2a9c1d41bacb0. Reverted https://github.com/pytorch/pytorch/pull/164691 on behalf of https://github.com/izaitsevfb due to breaks tests internally, author asked to revert, see [D84230990](https://www.internalfb.com/diff/D84230990) ([comment](https://github.com/pytorch/pytorch/pull/164691#issuecomment-3387718323))	2025-10-09 22:53:50 +00:00
Jeff Daily	9aa92f246f	Hotfix test scaled matmul cuda (#165104 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165104 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-09 22:51:30 +00:00
Tugsbayasgalan Manlaibaatar	a57a14868d	Better handling of restore_state_dict (#164401 ) After lean export, we might want to be able to restore the original fqn. This PR refactors one util function in export that sort of does this. Note that strict_export has some complicated logic of updating the graph signature as well which we don't want. I think we can gradually make this util more refined by handling constants, non persistent buffers etc and change how strict_export does it today. Differential Revision: [D83687844](https://www.internalfb.com/diff/D83687844) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164401 Approved by: https://github.com/avikchaudhuri	2025-10-09 22:39:11 +00:00
PyTorch MergeBot	47956196d9	Revert "Call internal log_compilation_event if it exists (#164855 )" This reverts commit 98a081a24c22072362dc536afd39a469e28939d4. Reverted https://github.com/pytorch/pytorch/pull/164855 on behalf of https://github.com/albanD due to We should not land this kind of code in core ([comment](https://github.com/pytorch/pytorch/pull/164855#issuecomment-3387692988))	2025-10-09 22:38:45 +00:00
Nikita Shulga	6d27a8e509	[CD] Do not propagate download.pytorch.org IP into container (#165075 ) Followup after https://github.com/pytorch/pytorch/pull/164969 Should fix binary build test failures Pull Request resolved: https://github.com/pytorch/pytorch/pull/165075 Approved by: https://github.com/seemethere, https://github.com/huydhn ghstack dependencies: #164968, #164969	2025-10-09 21:59:31 +00:00
Eddie Yan	cd62a73dcb	[cuDNN][SDPA] Handle noncontig nested tensors in cuDNN SDPA (#164958 ) Previously we hardcoded the assumption in cuDNN that the inputs would be dense which breaks when e.g., the user is chunking tensors yielding noncontig inputs New test added to check this when `TORCH_CUDNN_SDPA_NESTED_TENSOR_ENABLED=1` is set in `test/test_transformers.py` One issue I noticed was that the old gating of nested tensor in `sdp_utils.cpp` seems to be a no-op? All of the inputs are reported as "dense" by the time that function is called in the nested tensor tests in `test/test_nestedtensor.py -k sdpa` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164958 Approved by: https://github.com/Skylion007, https://github.com/drisspg	2025-10-09 21:58:54 +00:00
PyTorch MergeBot	4d7f9f3aed	Revert "[ATen] Fix CUDA reduction warp shuffle order (#164790 )" This reverts commit 8e1f409b8ccf64b2cf3933ece13587ad57e9d8a9. Reverted https://github.com/pytorch/pytorch/pull/164790 on behalf of https://github.com/jeffdaily due to broke cuda and rocm ci ([comment](https://github.com/pytorch/pytorch/pull/164790#issuecomment-3387558806))	2025-10-09 21:36:10 +00:00
William Wen	2b9ff99535	[flex attention] change "==" to "is" in inspect parameter comparison (#165003 ) Patch for https://github.com/pytorch/pytorch/issues/164760. This doesn't actually fix the underlying torch function issue though. Explanation: `is` is traced differently compared to `__eq__`, so we end up avoiding the issue where we attempt to evaluate `torch.eq(tensor, inspect._empty)` in the first place. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165003 Approved by: https://github.com/mlazos	2025-10-09 21:18:05 +00:00
Sam Larsen	98a081a24c	Call internal log_compilation_event if it exists (#164855 ) Summary: For internal conda on mast jobs, call the internal version of log_compilation_event if it exists. Test Plan: Ran a simple test job that just calls the API: https://fburl.com/scuba/dynamo_compile/dqx8d10g Pull Request resolved: https://github.com/pytorch/pytorch/pull/164855 Approved by: https://github.com/c00w	2025-10-09 21:15:11 +00:00
Lakshay Garg	6c0125dbc0	Mark functions const in CUDACachingAllocator (#165007 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165007 Approved by: https://github.com/eqy	2025-10-09 20:53:58 +00:00
Murray Steele	0fd976b65c	Enable mimalloc on non-Windows platforms and make default for AArch64 builds (#164741 ) This change removes the Windows requirement for mimalloc builds, and makes mimalloc the default c10 system allocator for AArch64 builds. This significantly improves the performance of AArch64 builds of PyTorch as large allocations are better cached by mimalloc than glibc. Updated Results Torchbench FP32 eager Inference, 16 threads: <img width="1510" height="733" alt="mimalloc-v2-fp32-diff" src="https://github.com/user-attachments/assets/7fe3ea0c-3b52-42e7-879b-612444479c90" /> Torchbench BF16 eager Inference, 16 threads: <img width="1510" height="733" alt="mimalloc-v2-bf16-diff" src="https://github.com/user-attachments/assets/56469a72-9e06-4d57-ae2a-aeb139ca79a3" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164741 Approved by: https://github.com/fadara01, https://github.com/aditew01, https://github.com/malfet	2025-10-09 20:49:46 +00:00
Maggie Moss	9944cac6e6	Add suppressions to torch/_inductor (#165062 ) Adds suppressions to pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Split this directory into two PRs to keep them from being too large. Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (6,884 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165062 Approved by: https://github.com/oulgen, https://github.com/mlazos	2025-10-09 20:34:20 +00:00
Nikita Shulga	e7fd296930	[CI] Add full debug build to trunk (#164974 ) But not test, just import torch, as regression test for https://github.com/pytorch/pytorch/issues/164297 Test plan: Re-apply #164974 on top of this change and observe the failure in the workflows: https://github.com/pytorch/pytorch/actions/runs/18383302153/job/52375282838 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164974 Approved by: https://github.com/seemethere, https://github.com/clee2000, https://github.com/atalman ghstack dependencies: #164968, #164969	2025-10-09 20:12:16 +00:00
Sam Larsen	fac85fcfb5	[inductor] custom_graph_pass.get_hash_for_files: don't hash paths (#165020 ) Summary: We have an internal user where caching broke because the paths that are unzipped are probably different per host. We can't think of a use case where a path change matters when the file content has not changed, so removing this part Pull Request resolved: https://github.com/pytorch/pytorch/pull/165020 Approved by: https://github.com/oulgen	2025-10-09 20:07:53 +00:00
Natalia Gimelshein	228973df7f	Fix channels-last dimension mapping in CUDA parallel_cat (#165023 ) Fixes #164849 `dimension` was updated in-place, so for more than one batch of channels-last tensors the concat `dimension` for the second kernel launch was wrong ## Testing - python -m compileall test/test_tensor_creation_ops.py ------ https://chatgpt.com/codex/tasks/task_e_68e708879b30832f89b10ae55faa68e8 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165023 Approved by: https://github.com/ezyang	2025-10-09 20:04:32 +00:00
PyTorch MergeBot	ed2d514ad8	Revert "Fix truediv numerics between eager and compile (#164144 )" This reverts commit 724463d5a2fba369cd14e89215b84d1b01435df7. Reverted https://github.com/pytorch/pytorch/pull/164144 on behalf of https://github.com/malfet due to Not sure if it's related, but looks it triggered fuzzer compiler test failure, see `a2f29bcd63/1` ([comment](https://github.com/pytorch/pytorch/pull/164144#issuecomment-3387288464))	2025-10-09 19:53:38 +00:00
Tianren Gao	a2f29bcd63	[inductor] Remove Repeated Code in Subgraph (#164892 ) Discovered some repeated code blocks in the subgraph.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/164892 Approved by: https://github.com/PaulZhang12	2025-10-09 19:16:02 +00:00
FFFrog	5390324984	[CodeClean] Replace std::runtime_error with TORCH_CHECK (#164129 ) As the title stated. Changes: - torch/csrc/Module.cpp - torch/csrc/utils.cpp - torch/csrc/stable - torch/lib/libshm Pull Request resolved: https://github.com/pytorch/pytorch/pull/164129 Approved by: https://github.com/albanD	2025-10-09 19:01:07 +00:00
Avik Chaudhuri	ae25ec569c	reorder wrappers in aot_stage2_inference to match forward compile in aot_stage2_autograd (#165016 ) In aot_stage2_autograd: Before calling fw_compiler, we run pre_compile for the following wrappers: * FakifiedOutWrapper * FunctionalizedRngRuntimeWrapper After, we run post_compile for the following wrappers: * EffectTokensWrapper * AOTDispatchSubclassWrapper * FunctionalizedRngRuntimeWrapper * FakifiedOutWrapper In aot_stage2_inference: Before calling inference compiler, we run pre_compile for the following wrappers (same as above): * FakifiedOutWrapper * FunctionalizedRngRuntimeWrapper After, we run post_compile for the following wrappers (different than above): * FunctionalizedRngRuntimeWrapper * FakifiedOutWrapper * EffectTokensWrapper * AOTDispatchSubclassWrapper This PR makes both do the post_compiles in the same order. Differential Revision: D84213657 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165016 Approved by: https://github.com/zhxchen17, https://github.com/bdhirsh	2025-10-09 18:36:04 +00:00
PaulZhang12	8e1f409b8c	[ATen] Fix CUDA reduction warp shuffle order (#164790 ) Typical warp shuffle reduction has the following pattern: <img width="1138" height="501" alt="image" src="https://github.com/user-attachments/assets/3bd176dc-0ad2-4df6-90c7-06e467337166" /> which is exhibited in Triton generated by torch.compile: <img width="663" height="403" alt="image" src="https://github.com/user-attachments/assets/7f9f36cd-b9eb-44c1-879e-b469668a2ea8" /> Switch the warp shuffle order to make bitwise equivalence between the 2 easier. PTX difference between old and new, we see a few extra instructions: https://www.diffchecker.com/h6ly3INC/ Comparing the performance on different reduction operations, we see minimal differences. New represents the changes in this PR, old represents the past warp shuffle order: ``` Tensor Shape Operation New all dims (ms) New dim=0 (ms) New dim=1 (ms) Old all dims (ms) Old dim=0 (ms) Old dim=1 (ms) ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1024, 1024) mean 0.015817 0.016259 0.013642 0.015990 0.016258 0.013631 (1024, 1024) sum 0.015917 0.015906 0.013359 0.015707 0.016266 0.013226 (1024, 1024) min 0.016021 0.024625 0.015631 0.015761 0.024485 0.015317 (1024, 1024) max 0.016349 0.024971 0.015972 0.015771 0.025001 0.015314 (1024, 1024) argmin 0.018070 0.024448 0.015578 0.018135 0.025370 0.015322 (1024, 1024) argmax 0.018427 0.024859 0.015932 0.018164 0.024452 0.015639 (1024, 1024) var 0.020078 0.026413 0.020295 0.020199 0.026381 0.020214 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (2048, 2048) mean 0.023826 0.023726 0.022273 0.023236 0.023776 0.022248 (2048, 2048) sum 0.023840 0.023355 0.021974 0.023294 0.023354 0.021884 (2048, 2048) min 0.024519 0.041263 0.024620 
0.023292 0.041491 0.024358 (2048, 2048) max 0.024509 0.041670 0.024277 0.023334 0.041231 0.024395 (2048, 2048) argmin 0.026125 0.041282 0.024567 0.026772 0.041773 0.024296 (2048, 2048) argmax 0.026117 0.041487 0.024572 0.026412 0.041477 0.024273 (2048, 2048) var 0.026603 0.048581 0.031308 0.027587 0.048603 0.030860 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (4096, 4096) mean 0.053927 0.057070 0.054073 0.053028 0.057544 0.053935 (4096, 4096) sum 0.053604 0.057410 0.054451 0.053076 0.057033 0.054266 (4096, 4096) min 0.054293 0.109122 0.058363 0.053821 0.108689 0.058382 (4096, 4096) max 0.054258 0.108035 0.058703 0.053492 0.110552 0.058376 (4096, 4096) argmin 0.056805 0.111167 0.058301 0.056836 0.112325 0.058292 (4096, 4096) argmax 0.056488 0.110958 0.058636 0.056844 0.111000 0.057928 (4096, 4096) var 0.058936 0.141755 0.068693 0.059735 0.141284 0.068500 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (8192, 8192) mean 0.145552 0.148082 0.138647 0.145364 0.147818 0.138207 (8192, 8192) sum 0.145985 0.147900 0.138714 0.145755 0.148031 0.138616 (8192, 8192) min 0.146566 0.205359 0.192739 0.145611 0.205237 0.182335 (8192, 8192) max 0.146526 0.204844 0.193050 0.146073 0.205457 0.182697 (8192, 8192) argmin 0.150190 0.206605 0.192543 0.150654 0.206847 0.182007 (8192, 8192) argmax 0.150481 0.206368 0.192535 0.150845 0.206430 0.182022 (8192, 8192) var 0.150884 0.184546 0.203900 0.151594 0.184172 0.197983 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1, 1024, 128) mean 0.014293 0.008119 0.014533 0.013861 0.008022 0.014449 (1, 1024, 128) sum 0.014039 0.007877 0.014111 0.014219 
0.008227 0.014045 (1, 1024, 128) min 0.014159 0.011354 0.023493 0.014271 0.010862 0.023644 (1, 1024, 128) max 0.014154 0.011027 0.023368 0.014259 0.011234 0.023692 (1, 1024, 128) argmin 0.016403 0.005677 0.023328 0.016273 0.005683 0.024073 (1, 1024, 128) argmax 0.016734 0.005675 0.023437 0.016580 0.005318 0.023331 (1, 1024, 128) var 0.018338 0.009549 0.025538 0.018528 0.009391 0.024777 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (5, 1024, 128) mean 0.014873 0.010131 0.015546 0.015123 0.010131 0.015481 (5, 1024, 128) sum 0.015334 0.009673 0.015824 0.014736 0.009671 0.015438 (5, 1024, 128) min 0.015047 0.013252 0.024573 0.014803 0.013163 0.024551 (5, 1024, 128) max 0.015050 0.013339 0.024197 0.014810 0.013525 0.024230 (5, 1024, 128) argmin 0.017341 0.012737 0.024306 0.017471 0.012379 0.024991 (5, 1024, 128) argmax 0.017345 0.012411 0.024421 0.017422 0.012471 0.024237 (5, 1024, 128) var 0.019973 0.011453 0.026188 0.020050 0.011438 0.026282 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (10, 1024, 128) mean 0.016976 0.011575 0.016831 0.016722 0.011927 0.017173 (10, 1024, 128) sum 0.017039 0.011841 0.017159 0.016385 0.011860 0.016753 (10, 1024, 128) min 0.017036 0.015331 0.026770 0.016944 0.015205 0.027166 (10, 1024, 128) max 0.017369 0.015348 0.027077 0.016531 0.015716 0.026819 (10, 1024, 128) argmin 0.019203 0.014447 0.026813 0.018994 0.014497 0.027313 (10, 1024, 128) argmax 0.019563 0.014795 0.027140 0.019460 0.014912 0.026733 (10, 1024, 128) var 0.020529 0.014316 0.030405 0.020719 0.013960 0.029964 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (100, 1024, 128) mean 
0.045046 0.039168 0.046082 0.044839 0.039217 0.045782 (100, 1024, 128) sum 0.045094 0.039150 0.045777 0.044496 0.039542 0.046083 (100, 1024, 128) min 0.045768 0.054466 0.076244 0.044915 0.053943 0.076599 (100, 1024, 128) max 0.045748 0.054459 0.076188 0.044931 0.053949 0.076856 (100, 1024, 128) argmin 0.048275 0.054046 0.076647 0.048694 0.054105 0.077004 (100, 1024, 128) argmax 0.048267 0.054395 0.077401 0.048691 0.054131 0.076751 (100, 1024, 128) var 0.049710 0.043254 0.083077 0.050971 0.043251 0.082378 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1000, 1000, 100) mean 0.202312 0.196723 0.197765 0.201774 0.196641 0.197459 (1000, 1000, 100) sum 0.202651 0.196682 0.197736 0.202175 0.196313 0.197523 (1000, 1000, 100) min 0.203022 0.264762 0.269200 0.202729 0.264129 0.268694 (1000, 1000, 100) max 0.202864 0.264396 0.269388 0.202486 0.263896 0.268720 (1000, 1000, 100) argmin 0.226727 0.263781 0.268651 0.226597 0.264676 0.268983 (1000, 1000, 100) argmax 0.226412 0.264469 0.269090 0.226570 0.264595 0.269178 (1000, 1000, 100) var 0.243223 0.204079 0.216096 0.241942 0.204079 0.215925 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (10000, 100) mean 0.016193 0.020277 0.014316 0.016152 0.020324 0.013712 (10000, 100) sum 0.016289 0.020237 0.014034 0.016168 0.020265 0.013708 (10000, 100) min 0.016046 0.030872 0.019609 0.016208 0.030867 0.018627 (10000, 100) max 0.016369 0.030835 0.019257 0.016218 0.030861 0.018209 (10000, 100) argmin 0.017957 0.031171 0.019517 0.018050 0.031556 0.018077 (10000, 100) argmax 0.017961 0.031658 0.019521 0.018060 0.031564 0.018087 (10000, 100) var 0.020393 0.035652 0.019339 0.020144 0.035987 0.019171 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (100000, 10) mean 0.015718 0.016576 0.016555 0.015999 0.016246 0.014869 (100000, 10) sum 0.015833 0.016247 0.016572 0.016007 0.016627 0.014872 (100000, 10) min 0.015888 0.020510 0.023920 0.015671 0.020821 0.021417 (100000, 10) max 0.015889 0.020479 0.023918 0.016077 0.020386 0.021421 (100000, 10) argmin 0.018233 0.020863 0.023647 0.017574 0.020864 0.021103 (100000, 10) argmax 0.017896 0.020527 0.023296 0.017569 0.020447 0.021098 (100000, 10) var 0.020005 0.024198 0.024372 0.020075 0.024167 0.022415 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1023, 1023, 1023) mean 1.874816 1.963506 1.903909 1.873279 1.963859 1.903230 (1023, 1023, 1023) sum 1.875030 1.965716 1.902458 1.873566 1.960730 1.901642 (1023, 1023, 1023) min 1.878563 2.473455 2.179092 1.875174 2.482086 2.183027 (1023, 1023, 1023) max 1.879128 2.474803 2.178895 1.874831 2.482253 2.183884 (1023, 1023, 1023) argmin 1.921800 2.476629 2.174831 1.923987 2.472641 2.170453 (1023, 1023, 1023) argmax 1.922605 2.476688 2.177927 1.923366 2.472808 2.172979 (1023, 1023, 1023) var 1.972606 3.088695 2.758797 1.978679 3.095658 2.762243 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1023, 1023, 255) mean 0.489984 0.500954 0.492957 0.489891 0.500654 0.491971 (1023, 1023, 255) sum 0.490228 0.500764 0.492289 0.489624 0.501089 0.492824 (1023, 1023, 255) min 0.491457 0.563560 0.553334 0.490355 0.564709 0.554754 (1023, 1023, 255) max 0.491396 0.563628 0.553345 0.490017 0.565004 0.554947 (1023, 1023, 255) argmin 0.503666 0.561512 0.551831 0.503845 0.560972 0.551017 (1023, 1023, 255) 
argmax 0.503602 0.561185 0.551407 0.504328 0.561267 0.551448 (1023, 1023, 255) var 0.510844 0.709452 0.701630 0.512693 0.710365 0.701965 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ (1023, 1023, 377) mean 0.707439 0.727646 0.712019 0.706769 0.727101 0.711632 (1023, 1023, 377) sum 0.707780 0.727453 0.711554 0.706807 0.726656 0.711729 (1023, 1023, 377) min 0.709423 0.819809 0.794379 0.707847 0.822086 0.796664 (1023, 1023, 377) max 0.709297 0.819780 0.794308 0.707566 0.821913 0.796690 (1023, 1023, 377) argmin 0.725028 0.817088 0.791695 0.726039 0.816445 0.790828 (1023, 1023, 377) argmax 0.725301 0.817011 0.791420 0.726040 0.816917 0.791143 (1023, 1023, 377) var 0.740859 1.034165 1.006712 0.743413 1.035506 1.007638 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164790 Approved by: https://github.com/ngimel, https://github.com/eqy	2025-10-09 18:08:30 +00:00
Jithun Nair	ee6a1ecb0a	[ROCm] Enable MI355 CI on PRs, and run full set of UTs on PRs (#160215 ) Useful to have PR testing for PRs such as https://github.com/pytorch/pytorch/pull/151360 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160215 Approved by: https://github.com/malfet, https://github.com/atalman Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-09 18:03:12 +00:00
Lakshay Garg	3c0577bd15	Remove shared_ptr from MHAGraphCache (#164895 ) This commit makes several cleanup changes to MHA.cpp, the main one of which is removal of shared_ptr from MHAGraphCache as the cache does not actually intend to share ownership. The changes are: 1. Remove shared_ptr from MHAGraphCache 2. Remove template arguments from MHAGraphCache 3. Remove unnecessary optional<shared_ptr<...>> vars 4. Change some functions with auto return type to the actual type Pull Request resolved: https://github.com/pytorch/pytorch/pull/164895 Approved by: https://github.com/eqy	2025-10-09 17:44:28 +00:00
PyTorch MergeBot	688efd9741	Revert "Enable mimalloc on non-Windows platforms and make default for AArch64 builds (#164741 )" This reverts commit 87eccf10e8484c9e59ef81ae7bdee68d3db4f605. Reverted https://github.com/pytorch/pytorch/pull/164741 on behalf of https://github.com/malfet due to But it breaks MacOS builds, see https://github.com/pytorch/pytorch/actions/runs/18382886648/job/52373781138 ([comment](https://github.com/pytorch/pytorch/pull/164741#issuecomment-3386859778))	2025-10-09 17:30:25 +00:00
PyTorch MergeBot	91040f4934	Revert "[Code Clean] Remove support of python3.9 (#163846 )" This reverts commit bc1690c7e859dee8c47a7f0bbd3c43cc27c6fd2a. Reverted https://github.com/pytorch/pytorch/pull/163846 on behalf of https://github.com/izaitsevfb due to breaks distributed tests ([comment](https://github.com/pytorch/pytorch/pull/163846#issuecomment-3386855437))	2025-10-09 17:27:08 +00:00
Murray Steele	87eccf10e8	Enable mimalloc on non-Windows platforms and make default for AArch64 builds (#164741 ) This change removes the Windows requirement for mimalloc builds, and makes mimalloc the default c10 system allocator for AArch64 builds. This significantly improves the performance of AArch64 builds of PyTorch as large allocations are better cached by mimalloc than glibc. Updated Results Torchbench FP32 eager Inference, 16 threads: <img width="1510" height="733" alt="mimalloc-v2-fp32-diff" src="https://github.com/user-attachments/assets/7fe3ea0c-3b52-42e7-879b-612444479c90" /> Torchbench BF16 eager Inference, 16 threads: <img width="1510" height="733" alt="mimalloc-v2-bf16-diff" src="https://github.com/user-attachments/assets/56469a72-9e06-4d57-ae2a-aeb139ca79a3" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164741 Approved by: https://github.com/fadara01, https://github.com/aditew01, https://github.com/malfet	2025-10-09 16:45:31 +00:00
Ryo Suzuki	5d459dd609	avoid bit cast for bfloat16_t (#159946 ) using bit_cast<bfloat16_t> triggers a static_assert, so replace it with intrinsics. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159946 Approved by: https://github.com/aditew01, https://github.com/malfet	2025-10-09 16:42:49 +00:00
albanD	24d69c57cb	Add view support for library custom Function (#164520 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164520 Approved by: https://github.com/soulitzer, https://github.com/ezyang	2025-10-09 16:17:48 +00:00
Catherine Lee	eaa02655ea	[CI] Run cpp tests on windows in one run_tests call (#164861 ) The windows cpp tests take ~1 hour according to logs. Each has run_test called on them individually, so I tried batching them together so it's just one run_test call for all of them. I believe it now takes 30min. I turned off TD since I don't think cpp tests are included in TD stuff. As always with batch, I'm not sure if the errorlevel/error surfacing stuff is correct. This code is written with a lot of help from ChatGPT and Copilot Pull Request resolved: https://github.com/pytorch/pytorch/pull/164861 Approved by: https://github.com/huydhn	2025-10-09 16:07:28 +00:00
Manuel Candales	aea57b3aa3	AOTI MPS Shim Implementation (#163865 ) ## MPS Shim API * Updated MPS shimification API with handles and function declarations: * `AOTIMetalShaderLibraryHandle` and `AOTIMetalKernelFunctionHandle` types * Library management: `aoti_torch_mps_create_shader_library`, `aoti_torch_mps_delete_shader_library`, `aoti_torch_mps_get_kernel_function` * Kernel execution: `aoti_torch_mps_run_command_block`, `aoti_torch_mps_start_encoding`, `aoti_torch_mps_dispatch` variants, etc ## MPS Shader Codegen * Modified to generate source constants instead of direct `DynamicMetalShaderLibrary` instantiation: * Before: `at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL(...)MTL");` * After: `const char* mps_lib_0_source = R"MTL(...)MTL";` * Updated kernel call generation to use shimified functions: * Generates calls to shimified API instead of direct libtorch calls ## Before vs After Comparison ### Section 1: Shader Library Before (Direct Library Object) ```cpp at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL( ... )MTL"); ``` After (Source String) ```cpp const char* mps_lib_0_source = (R"MTL( ... 
)MTL"); ``` ### Section 2: Getter Functions & RAII Management Before (Direct Library Access) ```cpp const std::shared_ptr<at::native::mps::MetalKernelFunction> get_mps_lib_0() { static const auto func = mps_lib_0.getKernelFunction("generated_kernel"); return func; } AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() { static const auto handle = AOTIMetalKernelFunctionHandle(get_mps_lib_0().get()); return handle; } ``` After (Shim API + RAII Wrapper) ```cpp AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() { static auto kernel_handle = []() { AOTIMetalShaderLibraryHandle lib_handle = nullptr; AOTIMetalKernelFunctionHandle kern_handle = nullptr; aoti_torch_mps_create_shader_library(mps_lib_0_source, &lib_handle); aoti_torch_mps_get_kernel_function(lib_handle, "generated_kernel", &kern_handle); // RAII wrapper with custom deleter auto lib_deleter = [](AOTIMetalShaderLibraryHandle h) {{ if (h) aoti_torch_mps_delete_shader_library(h); }}; using LibDeleter = decltype(lib_deleter); using LibPtr = std::unique_ptr<AOTIMetalShaderLibraryOpaque, LibDeleter>; // Return pair of kernel handle and library smart pointer for cleanup return std::make_pair(kern_handle, LibPtr(lib_handle, lib_deleter)); }(); return kernel_handle.first; } ``` ### Section 3: Runtime Execution Before (Direct Library Methods) ```cpp void AOTInductorModel::run_impl(...) { ... get_mps_lib_0()->runCommandBlock([&] { get_mps_lib_0()->startEncoding(); aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 0, buf0); aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 1, arg0_1); aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 2, arg1_1); get_mps_lib_0()->dispatch({static_cast<uint64_t>(10LL)}); }); ... } // AOTInductorModel::run_impl ``` After (Shim API with Lambda Pattern) ```cpp void AOTInductorModel::run_impl(...) { ... 
auto mps_lib_0_lambda_0 = [&](AOTIMetalKernelFunctionHandle handle) { aoti_torch_mps_start_encoding(handle); aoti_torch_mps_set_arg_tensor(handle, 0, buf0); aoti_torch_mps_set_arg_tensor(handle, 1, arg0_1); aoti_torch_mps_set_arg_tensor(handle, 2, arg1_1); aoti_torch_mps_dispatch_single(handle, static_cast<uint64_t>(10LL)); }; std::function<void(AOTIMetalKernelFunctionHandle)> mps_lib_0_func_wrapper_0 = mps_lib_0_lambda_0; aoti_torch_mps_run_command_block(get_mps_lib_0_handle(), aoti_torch_mps_shared_callback, &mps_lib_0_func_wrapper_0); ... } // AOTInductorModel::run_impl ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163865 Approved by: https://github.com/angelayi, https://github.com/desertfire	2025-10-09 16:06:36 +00:00
PyTorch MergeBot	3d1fa40ae1	Revert "[BC-Breaking] Remove long-deprecated casting functions from native_functions.yaml (#164641 )" This reverts commit 64108bdbed2f099d527060b4c9fdd5a11cad2afc. Reverted https://github.com/pytorch/pytorch/pull/164641 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164641#issuecomment-3386346474))	2025-10-09 15:42:51 +00:00
Markus Hoehnerbach	a7fa1a91e3	fix flex attention eager bwd: more rounding (#164317 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164317 Approved by: https://github.com/drisspg ghstack dependencies: #163986	2025-10-09 15:40:49 +00:00
Tugsbayasgalan Manlaibaatar	afeec56a5a	Fix replacement reconstruct (#164937 ) If we return a DTensor, the object is created via an fx graph call, so we never needed to reconstruct it. But if there is a side effect, we do need to reconstruct it. Differential Revision: [D84159000](https://our.internmc.facebook.com/intern/diff/D84159000) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164937 Approved by: https://github.com/StrongerXi	2025-10-09 15:31:23 +00:00
PaulZhang12	724463d5a2	Fix truediv numerics between eager and compile (#164144 ) Addresses numeric differences between eager and compile in https://github.com/pytorch/pytorch/issues/141753 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164144 Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/ngimel ghstack dependencies: #164997	2025-10-09 14:31:33 +00:00
PyTorch MergeBot	f79e212733	Revert "[CUDA][cuBLAS] addmm -- some refactoring for easier navigation between the Lt and non-Lt paths (#163955 )" This reverts commit ab94a0d544503b5c27e889b45e45ef8cf75c8183. Reverted https://github.com/pytorch/pytorch/pull/163955 on behalf of https://github.com/jeffdaily due to broke on cuda and rocm after landing though this PR had a clean signal initially ([comment](https://github.com/pytorch/pytorch/pull/163955#issuecomment-3386127145))	2025-10-09 14:24:56 +00:00
Thanh Ha	b28b24a9fc	Switch build jobs that use linux.12xlarge to c7i (#164941 ) This PR updates build jobs that currently use linux.12xlarge to the c7i variant, which should improve build times by 15% - 20% depending on the job and reduce costs of these jobs by 10% - 15%. Signed-off-by: Thanh Ha <thanh.ha@linuxfoundation.org>	2025-10-09 09:58:52 -04:00
Laith Sakka	17c7170ca6	Fix Avoid DDE in item numel check (#164934 ) address https://github.com/pytorch/pytorch/issues/164725 and https://github.com/pytorch/pytorch/issues/164704 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164934 Approved by: https://github.com/ezyang, https://github.com/aorenste, https://github.com/Skylion007	2025-10-09 13:09:06 +00:00
Simon Layton	6a7f5c0d21	Add scaled_mm python API, test (#164142 ) Summary: * Add `torch.nn.functional.scaled_mm` as an abstraction around the C++ methods * Wraps `torch._scaled_mm_v2` API by default, but user can force use of the older `torch._scaled_mm` interface. * Scaled MM tests now run on the new API Test Plan: `pytest test/test_scaled_matmul_cuda.py` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlaytonmeta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164142 Approved by: https://github.com/drisspg ghstack dependencies: #164141	2025-10-09 12:43:18 +00:00
Simon Layton	512b6b59f0	Add _scaled_mm_v2 API (#164141 ) Summary: * Add new scaled-MM API to future-proof / clean up existing code. * Scaling is explicitly described rather than inferred * Swizzling of scales must now be defined (vs. inferred) * Adds API support for multi-level scaling * Refactor dispatch logic to make it easier to add new implementations Test Plan: Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlaytonmeta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164141 Approved by: https://github.com/drisspg	2025-10-09 12:43:18 +00:00
FFFrog	bc1690c7e8	[Code Clean] Remove support of python3.9 (#163846 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163846 Approved by: https://github.com/ezyang	2025-10-09 11:54:10 +00:00
Cui, Yifeng	53f5af8c92	Update torch-xpu-ops commit pin (#164237 ) Update the torch-xpu-ops commit to [intel/torch-xpu-ops@f30173](`f301733b03`), includes: - Install xpu internal headers to PyTorch - Fix error handling for BatchLinearAlgebra Ops - Fix unnecessary double data type conversion - Fix overflow when calculating workgroups count - Fix segmentation fault and calculation error in AveragePool2dKernel Pull Request resolved: https://github.com/pytorch/pytorch/pull/164237 Approved by: https://github.com/EikanWang	2025-10-09 10:38:59 +00:00
PyTorch MergeBot	4412026949	Revert "AOTI MPS Shim Implementation (#163865 )" This reverts commit 874efa2d72d83b00894097130f18062ce331a265. Reverted https://github.com/pytorch/pytorch/pull/163865 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/163865#issuecomment-3385196387))	2025-10-09 10:26:01 +00:00
PyTorch MergeBot	06d86e58d0	Revert "Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939 )" This reverts commit d40a9bfb8da0dc1ac1e6e56b33a25979112874de. Reverted https://github.com/pytorch/pytorch/pull/164939 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164939#issuecomment-3385056722))	2025-10-09 09:50:59 +00:00
Manuel Candales	874efa2d72	AOTI MPS Shim Implementation (#163865 ) ## MPS Shim API * Updated MPS shimification API with handles and function declarations: * `AOTIMetalShaderLibraryHandle` and `AOTIMetalKernelFunctionHandle` types * Library management: `aoti_torch_mps_create_shader_library`, `aoti_torch_mps_delete_shader_library`, `aoti_torch_mps_get_kernel_function` * Kernel execution: `aoti_torch_mps_run_command_block`, `aoti_torch_mps_start_encoding`, `aoti_torch_mps_dispatch` variants, etc ## MPS Shader Codegen * Modified to generate source constants instead of direct `DynamicMetalShaderLibrary` instantiation: * Before: `at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL(...)MTL");` * After: `const char* mps_lib_0_source = R"MTL(...)MTL";` * Updated kernel call generation to use shimified functions: * Generates calls to shimified API instead of direct libtorch calls ## Before vs After Comparison ### Section 1: Shader Library Before (Direct Library Object) ```cpp at::native::mps::DynamicMetalShaderLibrary mps_lib_0(R"MTL( ... )MTL"); ``` After (Source String) ```cpp const char* mps_lib_0_source = (R"MTL( ... 
)MTL"); ``` ### Section 2: Getter Functions & RAII Management Before (Direct Library Access) ```cpp const std::shared_ptr<at::native::mps::MetalKernelFunction> get_mps_lib_0() { static const auto func = mps_lib_0.getKernelFunction("generated_kernel"); return func; } AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() { static const auto handle = AOTIMetalKernelFunctionHandle(get_mps_lib_0().get()); return handle; } ``` After (Shim API + RAII Wrapper) ```cpp AOTIMetalKernelFunctionHandle get_mps_lib_0_handle() { static auto kernel_handle = []() { AOTIMetalShaderLibraryHandle lib_handle = nullptr; AOTIMetalKernelFunctionHandle kern_handle = nullptr; aoti_torch_mps_create_shader_library(mps_lib_0_source, &lib_handle); aoti_torch_mps_get_kernel_function(lib_handle, "generated_kernel", &kern_handle); // RAII wrapper with custom deleter auto lib_deleter = [](AOTIMetalShaderLibraryHandle h) {{ if (h) aoti_torch_mps_delete_shader_library(h); }}; using LibDeleter = decltype(lib_deleter); using LibPtr = std::unique_ptr<AOTIMetalShaderLibraryOpaque, LibDeleter>; // Return pair of kernel handle and library smart pointer for cleanup return std::make_pair(kern_handle, LibPtr(lib_handle, lib_deleter)); }(); return kernel_handle.first; } ``` ### Section 3: Runtime Execution Before (Direct Library Methods) ```cpp void AOTInductorModel::run_impl(...) { ... get_mps_lib_0()->runCommandBlock([&] { get_mps_lib_0()->startEncoding(); aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 0, buf0); aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 1, arg0_1); aoti_torch_mps_set_arg_tensor(get_mps_lib_0_handle(), 2, arg1_1); get_mps_lib_0()->dispatch({static_cast<uint64_t>(10LL)}); }); ... } // AOTInductorModel::run_impl ``` After (Shim API with Lambda Pattern) ```cpp void AOTInductorModel::run_impl(...) { ... 
auto mps_lib_0_lambda_0 = [&](AOTIMetalKernelFunctionHandle handle) { aoti_torch_mps_start_encoding(handle); aoti_torch_mps_set_arg_tensor(handle, 0, buf0); aoti_torch_mps_set_arg_tensor(handle, 1, arg0_1); aoti_torch_mps_set_arg_tensor(handle, 2, arg1_1); aoti_torch_mps_dispatch_single(handle, static_cast<uint64_t>(10LL)); }; std::function<void(AOTIMetalKernelFunctionHandle)> mps_lib_0_func_wrapper_0 = mps_lib_0_lambda_0; aoti_torch_mps_run_command_block(get_mps_lib_0_handle(), aoti_torch_mps_shared_callback, &mps_lib_0_func_wrapper_0); ... } // AOTInductorModel::run_impl ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163865 Approved by: https://github.com/angelayi, https://github.com/desertfire	2025-10-09 09:28:10 +00:00
PyTorch MergeBot	e09fb44ef1	Revert "Fix truediv numerics between eager and compile (#164144 )" This reverts commit d386325ca9a142419f45b987391f4bb175dd7d0b. Reverted https://github.com/pytorch/pytorch/pull/164144 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164144#issuecomment-3384769092))	2025-10-09 08:40:52 +00:00
PyTorch MergeBot	5b8174bc28	Revert "[vllm hash update] update the pinned vllm hash (#164628 )" This reverts commit 7b691546d2949790ffc8f6bd3c674faa6a46ff7c. Reverted https://github.com/pytorch/pytorch/pull/164628 on behalf of https://github.com/huydhn due to There are some broken vLLM tests ([comment](https://github.com/pytorch/pytorch/pull/164628#issuecomment-3384560957))	2025-10-09 07:43:02 +00:00
PyTorch MergeBot	5209c8ce07	Revert "Fix Avoid DDE in item numel check (#164934 )" This reverts commit a9a9a3438a374f96a308b707a1718036aaec790d. Reverted https://github.com/pytorch/pytorch/pull/164934 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/164934#issuecomment-3384390621))	2025-10-09 06:57:03 +00:00
Yuanyuan Chen	f231be25c6	Mark unused parameters in C++ code (#164912 ) This PR adds unused parameter name comments in C++ declarations to improve code readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164912 Approved by: https://github.com/Skylion007	2025-10-09 06:23:25 +00:00
PyTorch MergeBot	a753ffa9af	Revert "Use runner with more memory for ASAN builds (#165000 )" This reverts commit f5fd18f7e24378bd9eb91404f697f1c81a8187d5. Reverted https://github.com/pytorch/pytorch/pull/165000 on behalf of https://github.com/izaitsevfb due to not sure how, but this broke lint ([comment](https://github.com/pytorch/pytorch/pull/165000#issuecomment-3384286412))	2025-10-09 06:22:28 +00:00
Laith Sakka	a9a9a3438a	Fix Avoid DDE in item numel check (#164934 ) address https://github.com/pytorch/pytorch/issues/164725 and https://github.com/pytorch/pytorch/issues/164704 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164934 Approved by: https://github.com/ezyang, https://github.com/aorenste, https://github.com/Skylion007	2025-10-09 06:06:25 +00:00
Seonmyeong Bak	263db92563	Add knobs in FR dump by watchdog (stacktrace and only active collectives) and trigger FR even on any exceptions (#164591 ) This PR includes a couple of changes to extend FlightRecorder dump by PyTorch watchdog - New knobs to control FR dump as suggested in the public documentation even for watchdog (TORCH_INCLUDE_STACK_TRACE, TORCH_INCLUDE_ONLY_ACTIVE) - Trigger the flight recorder dump on exceptions which could be triggered by any CUDA / host side error (TORCH_NCCL_EXTRA_DUMP_ON_EXEC) -> Can be used as a snapshot of the workload progress for post-mortem analysis Pull Request resolved: https://github.com/pytorch/pytorch/pull/164591 Approved by: https://github.com/fduwjj	2025-10-09 05:33:35 +00:00
Nicolas Macchioni	ed6156e3ea	non-fb impls + unit tests (#164722 ) Test Plan: ``` buck test fbcode//mode/opt caffe2/test/inductor:caching ``` Differential Revision: D83714692 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164722 Approved by: https://github.com/NikhilAPatel, https://github.com/adamomainz	2025-10-09 05:10:57 +00:00
Edward Z. Yang	d40a9bfb8d	Do not decompose in functionalization/proxy tensor if autograd wouldn't have decomposed (#164939 ) This fixes AOTAutograd rms_norm not being bitwise equivalent to eager, because it avoids a decomposition. You can force the decomposition by having the decomposition in the dispatch table, but if eager mode wouldn't have decomposed (because it went to the fused one), we now default to preserving the fused call by default. This largely reverts https://github.com/pytorch/pytorch/pull/103275/ for view ops. This means that in inference mode we could hit the wrong C++ kernel; if this occurs we should just SymInt'ify the C++ kernel. Another neat side effect of this change is that Inductor's generated kernels for rms_norm now have rms_norm in their name. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164939 Approved by: https://github.com/bdhirsh ghstack dependencies: #164573	2025-10-09 04:49:44 +00:00
Sherlock Huang	e532f62e0d	Introduce joint_custom_pass callback (#164981 ) ``` def joint_custom_pass(joint_gm: torch.fx.GraphModule, joint_inputs): # apply your pass for joint graph here return joint_gm class M(torch.nn.Module): def forward(self, x): return x.sin() x = torch.randn(10, requires_grad=False) compiled_fn = torch.compile(M(), backend="aot_eager") with torch._functorch.config.patch("joint_custom_pass", joint_custom_pass): out = compiled_fn(x) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164981 Approved by: https://github.com/ezyang, https://github.com/anijain2305	2025-10-09 04:40:54 +00:00
Pian Pawakapan	1f73b96668	[PGO] log missing sources in allowlist (#164881 ) Summary: - logs missing dynamic sources - emits MLHub insight only on size mismatch recompiles Test Plan: test_pgo Differential Revision: D84098898 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164881 Approved by: https://github.com/bobrenjc93	2025-10-09 04:39:09 +00:00
PyTorch UpdateBot	7b691546d2	[vllm hash update] update the pinned vllm hash (#164628 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164628 Approved by: https://github.com/pytorchbot	2025-10-09 04:35:36 +00:00
PaulZhang12	f05e23e1bc	Add less warps config to inner reductions (#162447 ) Add less warps to ensure proper vectorization + memory coalescing for inner reductions, prefer more work per thread <img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162447 Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314	2025-10-09 04:22:16 +00:00
PaulZhang12	d386325ca9	Fix truediv numerics between eager and compile (#164144 ) Addresses numeric differences between eager and compile in https://github.com/pytorch/pytorch/issues/141753 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164144 Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/ngimel ghstack dependencies: #164997	2025-10-09 04:22:03 +00:00
Maggie Moss	7457d139c5	Add pyrefly suppressions to torch/distributed (7/n) (#165002 ) Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 One more PR after this one. Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (6,884 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/165002 Approved by: https://github.com/oulgen	2025-10-09 04:08:25 +00:00
Nikita Vedeneev	ab94a0d544	[CUDA][cuBLAS] addmm -- some refactoring for easier navigation between the Lt and non-Lt paths (#163955 ) As per title. Additionally, some Lt selection conditions are revisited, and some redundancy removed (especially in the ROCm vs non-ROCm paths). Pull Request resolved: https://github.com/pytorch/pytorch/pull/163955 Approved by: https://github.com/ngimel, https://github.com/eqy	2025-10-09 04:07:45 +00:00
Animesh Jain	0e9b3a772a	[export] Turn on install_free_tensors flag (#164691 ) The final step in removing the discrepancy between torch.compile(fullgraph=True) and torch.export(strict=True). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164691 Approved by: https://github.com/avikchaudhuri ghstack dependencies: #164721	2025-10-09 03:25:15 +00:00
Animesh Jain	af7ca55ced	[export][dynamo] Fallback to slowpath for MultiHeadAttention for strict export (#164721 ) In https://github.com/pytorch/pytorch/pull/106824, export decided to slow-path for MultiHeadAttention module (look into the PR description as to why). But that PR eventually caused a divergence between Dynamo and export. Today, strict-export does not inline into builtin modules (like MultiHeadAttention), and therefore make_fx sees the original nn.Module and takes the slow path. But compile inlines into the nn module, and at this time the condition `_is_make_fx_tracing` is False. As a result, Dynamo takes a fast path, resulting in a different op being called. This divergence is undesirable. There are 2 ways to fix it 1) Make export take the fast path - As explained in the https://github.com/pytorch/pytorch/pull/106824 , this might be difficult. So, we go to (2) 2) Make compile as well take the slow path - This is easy to implement. The con here is that Pytorch eager and compile will use different operators, which can cause numerics issues etc. Since (2) is easy to do, we will follow this path. We are tracking the issue in https://github.com/pytorch/pytorch/issues/164062 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164721 Approved by: https://github.com/avikchaudhuri, https://github.com/tugsbayasgalan	2025-10-09 03:25:15 +00:00
Yuanyuan Chen	a029675f6f	More ruff SIM fixes (#164695 ) This PR applies ruff `SIM` rules to more files. Most changes are about simplifying `dict.get` because `None` is already the default value. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164695 Approved by: https://github.com/ezyang	2025-10-09 03:24:50 +00:00
PaulZhang12	54ae61c573	Change test_emulate_precision_casts_mean_ratio_chain from gelu to relu (#164997 ) gelu can be unstable on local builds due to libdevice differences, as we lower to libdevice.erf. That combined with the semantics in the test can lead to catastrophic cancellation. We switch this test from gelu to relu to fix this instability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164997 Approved by: https://github.com/eellison, https://github.com/jansel	2025-10-09 03:14:05 +00:00
Jeddie Ji	2fe37b5fde	[RecSys][Combo Kernel] skip combo kernel generation if partition group is empty (#164918 ) Summary: Noticed sometimes the combo kernel partition will contain an empty group. Skip kernel generation in this case to unblock head model launching. The change in this diff is safe, but it's better to root cause why an empty group is being created. Test Plan: Lowering passed after applying the diff Differential Revision: D84134471 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164918 Approved by: https://github.com/mlazos	2025-10-09 02:55:23 +00:00
ruisizhang123	96d91da792	[dynamo] allow placement subclass to be traceable (#164985 ) This PR is to unblock SimpleFSDP+`gradient_divide_factor` [here](https://github.com/pytorch/torchtitan/pull/1793). We will need to create a subclass for DTensor `Partial` placement. When tracing `SimpleFSDPPartial`, I hit the assertion error that `SimpleFSDPPartial` is not in `ok_types`. I'm updating the code to check placement dtype via `isinstance` instead of `type(val)`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164985 Approved by: https://github.com/ezyang, https://github.com/eellison	2025-10-09 01:44:21 +00:00
Ivan Zaitsev	f5fd18f7e2	Use runner with more memory for ASAN builds (#165000 ) An attempt to [address OOM here](`aed5ed1076/1`). Pull Request resolved: https://github.com/pytorch/pytorch/pull/165000 Approved by: https://github.com/seemethere, https://github.com/malfet, https://github.com/huydhn	2025-10-09 01:09:28 +00:00
fduwjj	8ca986ee60	[fr] Enable reset the FR recording for fault tolerance (#164988 ) We also want to have a python side API for users to reset FR recording for FR entries. We don't need to reset the PGNCCL's member counter since we are creating new PGNCCL anyway. FR is a global ring buffer, so we need to reset it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164988 Approved by: https://github.com/tushar00jain ghstack dependencies: #164752	2025-10-09 01:03:01 +00:00
atalman	81dbeb06f4	CUDA aarch64 12.6 and 12.8 builds fix triton constraints (#165013 ) Since we have introduced CUDA aarch64 builds for all cuda versions we need to remove this constraint. This was missed by https://github.com/pytorch/pytorch/pull/162364 Proper constraint on triton should be: ``` Requires-Dist: triton==3.5.0; platform_system == "Linux" ``` not: ``` Requires-Dist: triton==3.5.0; platform_system == "Linux" and platform_machine == "x86_64" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/165013 Approved by: https://github.com/Camyll, https://github.com/nWEIdia, https://github.com/tinglvv	2025-10-09 00:49:28 +00:00
fduwjj	7a1ead755f	[DeviceMesh] Add a warning for slicing flattened dim from root mesh and types for _get_slice_mesh_layout (#164993 ) As title, we want to add a deprecate warning for slicing flattened dim from root mesh. Also cosmetic changes for adding types for `_get_slice_mesh_layout`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164993 Approved by: https://github.com/fegin ghstack dependencies: #164750, #164954	2025-10-09 00:47:08 +00:00
Boyuan Feng	90b4e130d6	[Benchmark] cleanup torchbench models (#164816 ) Prune models from TorchInductor dashboard to reduce ci cost. This PR prunes torchbench models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), which removes timm and huggingface models from torchbench. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164816 Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet	2025-10-09 00:31:25 +00:00
Animesh Jain	4308b8a28f	[dynamo] Support torch.fx.traceback.annotate (#164678 ) Builds on top of https://github.com/pytorch/pytorch/pull/163673 and https://github.com/pytorch/pytorch/pull/164174. This will be used in the followup PRs to apply regional inductor compilation. The existing implementation let Dynamo trace into the `torch.fx.traceback.annotate`, but thats not what we want. We want Dynamo to essentially run the torch.fx.traceback.annotate function in eager, so that every Fx node created in Dynamo Fx graph has the custom meta node. What does not work? * We still have to set the context manager `torch.fx.traceback.preserve_node_meta()` in the user code because CI was unhappy. This can be fixed but with some perseverance. * This does not work with graph breaks yet. But we can solve that problem, if needed, in a separate PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164678 Approved by: https://github.com/SherlockNoMad, https://github.com/jansel, https://github.com/xmfan	2025-10-08 22:41:00 +00:00
Nikita Shulga	94b1ec8c7c	[BE] Use torch check the way its intended (#164987 ) Replace `if (!foo) TORCH_CHECK(false, "bar");` with `TORCH_CHECK(foo, "bar");` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164987 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-10-08 22:28:08 +00:00
eellison	054268c9eb	Consider collective inputs to be deallocated only when wait is completed (#164945 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164945 Approved by: https://github.com/IvanKobzarev ghstack dependencies: #164738, #164783, #164944	2025-10-08 22:19:25 +00:00
eellison	af40828bbb	Limit coll bucketing within node idxs (#164944 ) Respect max_coll_distance from overlap scheduler in bucketing, also, add an optimization in path searching. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164944 Approved by: https://github.com/IvanKobzarev ghstack dependencies: #164738, #164783	2025-10-08 22:18:53 +00:00
bobrenjc93	5a1fbf45ad	[ez] remove unnecessary wrapper (#164720 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164720 Approved by: https://github.com/ydwu4	2025-10-08 22:12:29 +00:00
eellison	aed5ed1076	Refactor memory estimator to use node storages, add test (#164783 ) - Update the Memory Estimator to use node storages for analysis, which simplifies book keeping, as opposed to manually looking at operator schema. This will also allow me to reuse this component elsewhere. - Factor out into separate class, so that this same logic can be used in scheduling (node allocations / aliasing / uses) - Adds Tests for correctness - right now only on fwd/bwd by itself, not with both. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164783 Approved by: https://github.com/ruisizhang123 ghstack dependencies: #164738	2025-10-08 22:07:43 +00:00
William Wen	af4c29fea8	[dynamo, nested graph breaks] fix nested step graph break related issues (#162737 ) Turns out codegen'ing a nested step graph break is significantly more complicated than first thought. The optimized function should actually do: - call graph/load values/do side effects etc. - call into the leaf's resume function, but skipped (this essentially step graph break function for just the leaf function) - call into all the other resume functions, traced. This PR also adds `torch._dynamo.step_unsupported()`, which can be used for internal testing purposes to better test step graph break handling. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162737 Approved by: https://github.com/Lucaskabela ghstack dependencies: #160601	2025-10-08 22:02:52 +00:00
William Wen	486b4d2414	[dynamo, nested graph breaks] move cell codegen before side effects codegen (#160601 ) This is needed because if we codegen cells for nested frames AFTER side effects, then reconstruction could get messed up. From below: >The added test case demonstrates the reconstruction failure if we kept cell codegen at the original place (only happens with nested graph breaks since we reconstruct nested frame cells from VariableTracker rather than directly using LOAD_CLOSURE). >At a high level, what happened before this change was that side_effects was pruning the cells (I don't recall exactly why this happens), and because cells were codegen'd after the side effects were applied, we were unable to properly reconstruct the cell. The error I was seeing was a list/tuple IndexError. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160601 Approved by: https://github.com/mlazos	2025-10-08 22:02:52 +00:00
Hari Krishna Sai Kodali	8f83b3e71c	add device generalization support for distributed checkpoint tests (#159242 ) ## MOTIVATION To generalize Distributed checkpoint test cases for non-CUDA devices ## CHANGES 18 test files with minimal device abstraction changes updated in test/distributed/checkpoint/ - Use device_type from DTensorTestBase wherever appropriate - Replaced hard coded device names with torch.accelerator.current_accelerator() - extend multi gpu decorator for other devices test/distributed/checkpoint/test_state_dict_stager.py has a large diff; that's because I changed the name cuda_obj to gpu_obj. The functional change is minimal. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159242 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-10-08 21:56:31 +00:00
Howard Huang	f0c9f3bddb	[PP] [BE] Remove runtime tests (#164962 ) BE cleaning up dead code since we migrated the Multi-stage schedules to use schedule execution runtime Pull Request resolved: https://github.com/pytorch/pytorch/pull/164962 Approved by: https://github.com/Skylion007 ghstack dependencies: #162016	2025-10-08 21:42:33 +00:00
Isalia20	1d182dd81c	[MPS] sparse norm (#164961 ) Norms for sparse mps tensors Pull Request resolved: https://github.com/pytorch/pytorch/pull/164961 Approved by: https://github.com/malfet	2025-10-08 21:41:42 +00:00
fduwjj	0b15f7ae05	[fr] Enable dynamic path write for FR dump when it comes to torchft (#164752 ) When it comes to FR dump, in the case of fault tolerance, users want to set the dump path to a different one when there is restart, so we just enable this case for users. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164752 Approved by: https://github.com/tushar00jain	2025-10-08 21:36:32 +00:00
Nikita Shulga	f1229b6db9	[BE] Remove manual IP address resolution (#164969 ) As https://github.com/pytorch/pytorch/issues/100400 has been closed a while back Pull Request resolved: https://github.com/pytorch/pytorch/pull/164969 Approved by: https://github.com/seemethere ghstack dependencies: #164968	2025-10-08 21:22:34 +00:00
Anshul Sinha	b1ac252f55	[Replicate][Test] tests that pp model grads are the same as single-device model grads (#164890 ) Summary: Created a test so that we can verify that a model that has been pipelined + replicated has the same gradients as a reference model. To do this, I mapped the layers and their parameters in each partial model to the original full model and then compared the gradients. Test Case 1. pytest test/distributed/_composable/test_composability/test_pp_composability.py -k test_replicate_pp_grads Pull Request resolved: https://github.com/pytorch/pytorch/pull/164890 Approved by: https://github.com/H-Huang	2025-10-08 21:07:05 +00:00
fduwjj	5ba11df4f8	[DeviceMesh] Make all members of DeviceMesh private and add public access API (#164954 ) This is mostly mechanical change which make device mesh members all private and use a public property API instead. This is not a BC breaking change since the new API still guarantee BC. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164954 Approved by: https://github.com/fegin ghstack dependencies: #164750	2025-10-08 21:04:07 +00:00
Nikita Shulga	15800888b6	[CI] Print GPU info during setup linux (#164968 ) I.e. run `nvidia-smi` if present Helps detecting what driver version this runner is on, which would have helped debugging some of the issues recently Pull Request resolved: https://github.com/pytorch/pytorch/pull/164968 Approved by: https://github.com/ngimel	2025-10-08 20:58:33 +00:00
Catherine Lee	e7ed1a00eb	Run inductor-perf-test-nightly-h100 once per day (#164967 ) To reduce inductor costs, though I'm not sure how much this one matters specifically since h100s are reserved Pull Request resolved: https://github.com/pytorch/pytorch/pull/164967 Approved by: https://github.com/BoyuanFeng	2025-10-08 20:58:19 +00:00
Shunting Zhang	2982406721	[inductor] ban benchmarking by default in deterministic mode (#164532 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164532 Approved by: https://github.com/eellison ghstack dependencies: #164801	2025-10-08 20:55:15 +00:00
Howard Huang	005c3d449e	Support custom callback functions in schedule (#162016 ) This is going to be used in https://github.com/pytorch/torchtitan/issues/1682 Add a `register_custom_function` to the `_PipelineScheduleRuntime` which allows users to implement any custom function to replace the runtime operation dynamically. The signature of the callback should look like: ```python class _CustomFunctionProtocol(Protocol): def __call__(self, action: _Action, ctx: _PipelineContext) -> None: ... ``` `_PipelineContext` contains a reference to the schedule which is executing the operations. ### Testing Added a test which adds custom methods for `FORWARD` and `OVERLAP_F_B` which are just the same implementations as those used in the default schedule runtime. Check that the schedule can still run, numerics are correct, and the callbacks are executed the correct number of times. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162016 Approved by: https://github.com/fegin	2025-10-08 20:43:26 +00:00
fduwjj	b2b3947565	[DeviceMesh] Remove private _set_mesh_dim_group_options API (#164750 ) We allow passing in PG option via https://github.com/pytorch/pytorch/pull/159371 and we did a clean up of Meta internal usage of `_set_mesh_dim_group_options`. Since this is a private API, we don't have any bc guarantee, so we want to directly remove it so that people use the new behavior from now on. Also, since we now allow passing pg in both the DeviceMesh constructor and the flatten API, we also want to get rid of the global pg option override variable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164750 Approved by: https://github.com/lw, https://github.com/fegin	2025-10-08 20:38:17 +00:00
Shunting Zhang	81994b08a0	[inductor] don't tune xblock for reduction (#164801 ) It turns out that tuning XBLOCK for a reduction can also change numerics ( https://github.com/pytorch/pytorch/pull/164525#pullrequestreview-3306235454 ). This PR skips tuning XBLOCK for a reduction. If we have multiple configs left with different XBLOCKs, the heuristic will pick the configs with second-largest XBLOCK. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164801 Approved by: https://github.com/jansel, https://github.com/mlazos, https://github.com/v0i0	2025-10-08 20:31:39 +00:00
soulitzer	71aefd5595	[reland] Allow setting grad_dtype on leaf tensors (#164751 ) ghstack-source-id: e44b3941530be83a630ec93f1478eec741ffca2e Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/162815 Fixes #ISSUE_NUMBER Relanding due to internal weirdness. Separate PR to codev w/o ghstack. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164751 Approved by: https://github.com/albanD	2025-10-08 20:23:13 +00:00
eellison	001e1d2637	Add memory estimator (#164738 ) Original work by @ShatianWang, with lints applied. I am going to make a few changes and add tests in subsequent PRs but I want to preserve original commit first. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164738 Approved by: https://github.com/IvanKobzarev	2025-10-08 20:04:33 +00:00
Aleksandar Samardžić	e0cb1848d0	Use TMA loads always for Triton grouped MM kernel (#164256 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164256 Approved by: https://github.com/ngimel	2025-10-08 19:40:06 +00:00
Lakshay Garg	a4110fedcf	Use insert_or_assign instead of erase+emplace (#164868 ) insert_or_assign does effectively the same thing as erase+emplace but more efficiently since the search does not need to be repeated Pull Request resolved: https://github.com/pytorch/pytorch/pull/164868 Approved by: https://github.com/eqy	2025-10-08 19:13:49 +00:00
Natalia Gimelshein	37c6087334	Add split-K control to cuBLAS reduced-precision settings (#164766 ) ## Summary - add a CuBLASReductionOption enum so the CUDA context can track reduced-precision and split-K options - extend the Python bindings, backend helpers, and docs to accept an optional allow_splitk argument for fp16/bf16 matmul controls - update cuBLAS/cuBLASLt call sites plus dynamo guards and tests to respect the new combinations ## Testing - python test/test_cuda.py TestCuda.test_cublas_allow_fp16_reduced_precision_reduction_get_set -v (fails: ModuleNotFoundError: No module named 'psutil') ------ https://chatgpt.com/codex/tasks/task_e_68e404623178832f8a3e1d34e1e175da Pull Request resolved: https://github.com/pytorch/pytorch/pull/164766 Approved by: https://github.com/malfet, https://github.com/albanD	2025-10-08 18:48:45 +00:00
Laith Sakka	0b85236477	Fix refine_ranges corner case (#164075 ) (#164846 ) Summary: address https://github.com/pytorch/pytorch/issues/161360 u0>0 should update the range of u0 to start from [1, ..]; this fixes it. It was not doing that before. Test Plan: contbuild & OSS CI, see `27234792ad` D84038721 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164846 Approved by: https://github.com/izaitsevfb, https://github.com/ezyang	2025-10-08 18:42:37 +00:00
Janani Sriram	4c0fec3e4d	[Max Autotune][B200] Skip carveout tests (#164435 ) Summary: Skip sm `carveout` tests on B200, as carveout is currently unsupported. Test Plan: ``` buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:max_autotune -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.re_gpu_tests=False -- test_honor_sm_carveout_with_triton_tma ``` Differential Revision: D83395610 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164435 Approved by: https://github.com/eellison	2025-10-08 18:39:43 +00:00
cyy	fdc622b513	[CMake] Remove LLVM link code (#134940 ) This handling is not needed with recent LLVM APIs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/134940 Approved by: https://github.com/ezyang, https://github.com/malfet	2025-10-08 18:39:16 +00:00
bobrenjc93	91b9484264	[ez] fix small doc error (#164915 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164915 Approved by: https://github.com/svekars	2025-10-08 18:27:44 +00:00
Ke Wen	5c827a4133	[SymmMem] Multi-root tile reduction (#164757 ) Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.12.0) (oldest at bottom): Perform multiple tile reductions concurrently, with each tile reduced to a separate root. - The number of concurrent reductions can be smaller than world size, i.e. roots can be a subset of all ranks. But all ranks are still required to call into this API. - Currently supports NVLink SHARP scope only. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164757 Approved by: https://github.com/weifengpy, https://github.com/fegin ghstack dependencies: #162243	2025-10-08 17:28:00 +00:00
Boyuan Feng	83458197d1	[Benchmark] remove old timm models from benchmark (#164805 ) Prune models from TorchInductor dashboard to reduce ci cost. This PR prunes for timm models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), which reduces from 60 to 14 models. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164805 Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet	2025-10-08 17:14:58 +00:00
Gheorghe-Teodor Bercea	0b01ff4de0	[ROCm] Improve non stride-one backwards indexing for small index sets (#164409 ) This patch fixes a performance problem which occurs when a small set of indices is used and there are practically no duplicates. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164409 Approved by: https://github.com/jerrymannil, https://github.com/jeffdaily	2025-10-08 17:04:52 +00:00
Nikita Shulga	01f3a43462	[MPS] Update OS version in error message (#164946 ) Followup after https://github.com/pytorch/pytorch/pull/159912 Fixes https://github.com/pytorch/pytorch/issues/164943 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164946 Approved by: https://github.com/Camyll	2025-10-08 16:43:50 +00:00
Sean McGovern	f332017294	C++ API handle optimizer defaults (#161825 ) Fixes #141884 This fixes the issue for all optimizers and parameter options. A member function `overwrite_from` is added to the optimizer base class. Each optimizer then implements this function for comparing their accepted parameters to defaults. A SFINAE approach to handle the different optimizer parameters generically (in optimizer.h only) was evaluated, but I think this is easier to review and maintain. This mirrors the Python API up to one edge case. An example of the edge case is provided below. Python can distinguish between 1) Key not present in dict = "not specified" and 2) Key present in dict = "explicitly set". The C++ implementation cannot. The issue hinges on whether or not to track if a particular parameter was set by the user explicitly or not (discrepancy in the case when the constructor default is explicitly passed in). To track this seems like it will take more intervention than would be worth it (modify TORCH_ARG to keep track, use std::optional for the parameter types, use bitset tracking) and was not pursued in the current PR. I'm happy to alter the design if appropriate. ### Example of edge case hinging on CONSTRUCTOR DEFAULTS vs OPTIMIZER DEFAULTS 1. CONSTRUCTOR DEFAULTS: These are the values you get when calling AdamOptions() AdamOptions().lr() = 0.001 AdamOptions().weight_decay() = 0 AdamOptions().eps() = 1e-08 2. OPTIMIZER DEFAULTS: These are the values the user chose when creating the optimizer User's optimizer defaults: optimizer.lr() = 0.005 optimizer.weight_decay() = 0.1 optimizer.eps() = 1e-07 3. THE PROBLEM SCENARIO: User wants to add a parameter group with explicit weight_decay=0.0 User sets: weight_decay(0) 4. THE CONFUSION: Constructor default weight_decay: 0 User's explicit weight_decay: 0 Are they equal? YES Since they're equal, our overwrite_from() logic thinks: "User didn't set weight_decay explicitly, use optimizer default" 5. 
CURRENT BEHAVIOR: Final weight_decay: 0.1 User expected: 0 Match? ❌ NO === KEY INSIGHT === Constructor defaults are built into the C++ class definition. Optimizer defaults are chosen by the user at runtime. We want to respect the user intention. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161825 Approved by: https://github.com/janeyx99	2025-10-08 16:40:45 +00:00
mingyuan.wang	0a3e4e894c	[PP]: Optimize memory by early releasing stage inputs' gradients (#164329 ) Seems that we can release input activations' gradients early in `stage_backward()` in PP, which helps to reduce the peak memory. I tested this using `1F1B` and `Interleaved1F1B` PP strategy (for simplicity, I use 4 decoder layers of llama3, set PP size to 2 and set num_microbatches to 128) based on torchtitan run command using torchtitan: ```bash CUDA_VISIBLE_DEVICES=4,5 LOG_RANK=0,1 NGPU=2 CONFIG_FILE=./torchtitan/models/llama3/train_configs/llama3_8b.toml ./run_train.sh --metrics.log_freq 1 --training.seq_len 8192 --training.steps 10 --parallelism.data_parallel_shard_degree 1 --activation_checkpoint.mode full --model.tokenizer_path /workspace/torchtitan-v0.1.0/torchtitan/torchtitan/datasets/tokenizer/original/tokenizer.model --training.dataset wikipedia --parallelism.pipeline_parallel_degree 2 --training.local_batch_size 128 --parallelism.pipeline_parallel_microbatch_size 1 --training.dataset_path /workspace/wikipedia_subset --training.seed 42 --parallelism.pipeline_parallel_schedule 1F1B ``` ## 1F1B torchtitan train results ### before fix <img width="1526" height="606" alt="b8e281cce1dac15e827c216e7d83f402" src="https://github.com/user-attachments/assets/545c0a80-6276-40c0-893f-fd2df0a53b8d" /> ### after fix <img width="1526" height="594" alt="70d5ceba311a8398d041189bf8897cfc" src="https://github.com/user-attachments/assets/0d606e08-238a-4115-a1c0-b40df101d867" /> after the fix, the memory usage on rank1 (i.e., the non-first stages) is 6.9GB lower than before the fix. 
the memory usage on rank0 remains unchanged (rank0 represents stage0) ## Interleaved1F1B torchtitan train results ### before fix <img width="1514" height="601" alt="a28b7f9704b9234870619c43194e8a72" src="https://github.com/user-attachments/assets/2c28565f-ffff-4747-a8f5-722b5c65dc7e" /> ### after fix <img width="1526" height="621" alt="2d8d6d956b72885186f8c7059146c41a" src="https://github.com/user-attachments/assets/8c4a4ff2-336b-4e0b-8ac4-014ae22c2ed1" /> after fix, the memory usage on rank1 saving 14.57GB (rank1 holds layer1 and layer3) and rank0 saving 7.5GB (rank0 holds layer0 and layer2) ## Memory snapshot results also, I have dumped the memory snapshot to observe the memory under the 1F1B PP strategy. ### before fix <img width="1906" height="918" alt="6fd4e4ba82b8bacf9ca6edee4f3d5581" src="https://github.com/user-attachments/assets/d1b9245c-b09f-43c5-87ce-87ba48533a70" /> we can see the memory is increasing as pp step_microbatches running. (the lifetime of input activation's gradient, i.e., the output of `FusedRMSNormBackward` lasts too long) ### after fix <img width="1903" height="918" alt="2e415f25af6750d06e5e647683b212b9" src="https://github.com/user-attachments/assets/b657c8f6-5a56-46bd-8743-f3b8375c81b0" /> after fix, we got more steady memory usage during training. (the input activation's gradient will be released or return allocator soon) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164329 Approved by: https://github.com/H-Huang	2025-10-08 16:12:00 +00:00
Adnan Akhundov	73adac05d1	Triton 3.5.x pin update to 7416ffc (#164587 ) Updates triton pin to latest: https://github.com/triton-lang/triton/commits/release/3.5.x/ This updates contains 1 cherry-pick to fix flex_attention_fwd regression on B200: - https://github.com/triton-lang/triton/pull/8366 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164587 Approved by: https://github.com/atalman	2025-10-08 16:07:18 +00:00
eqy	0d39ecb2ce	[cuDNN][RNN] cuDNN RNN supports BFloat16 inputs since 9.13 (#164411 ) seems to work Pull Request resolved: https://github.com/pytorch/pytorch/pull/164411 Approved by: https://github.com/Skylion007	2025-10-08 15:26:50 +00:00
Nikita Shulga	90c0825e2d	[GHF] Allow reverts from pytorch-auto-revert app (#164911 ) This is a bit weird, but author_login is not a unique field, but author_url is. Explicitly allow https://github.com/apps/pytorch-auto-revert to issue revert commands Update mocks by running ``` sed -i -e s/8e262b0495bd934d39dda198d4c09144311c5ddd6cca6a227194bd48dbfe7201/47860a8f57a214a426d1150c29893cbc2aa49507f12b731483b1a1254bca3428/ gql_mocks.json ``` Test plan: Run ```python from trymerge import GitHubPR pr=GitHubPR("pytorch", "pytorch", 164660) print(pr.get_last_comment().author_url, pr.get_comment_by_id(3375785595).author_url) ``` that should produce ``` https://github.com/pytorch-auto-revert https://github.com/apps/pytorch-auto-revert ``` Plus added a regression test that checks two particular comments for revert validity `pytorch-auto-revert` user is my alter ego :) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164911 Approved by: https://github.com/jeanschmidt	2025-10-08 15:15:45 +00:00
PyTorch MergeBot	fd4bde430a	Revert "list_stored_sd_metadata API. (#160610 )" This reverts commit da903b6a8be422529d47649e89c0d50bb95c37ca. Reverted https://github.com/pytorch/pytorch/pull/160610 on behalf of https://github.com/jeffdaily due to broke ROCm CI, but flaky also on CUDA CI https://hud.pytorch.org/failure?name=periodic%20%2F%20linux-jammy-rocm-py3.10%20%2F%20test%20(distributed%2C%202%2C%203%2C%20linux.rocm.gpu.mi250.4%2C%20module%3Arocm%2C%20oncall%3Adistributed)&jobName=undefined&failureCaptures=distributed%2Fcheckpoint%2Ftest_list_stored_state_dict.py%3A%3ATestListStateDict%3A%3Atest_list_stored_sd_metadata ([comment](https://github.com/pytorch/pytorch/pull/160610#issuecomment-3382023022))	2025-10-08 15:10:38 +00:00
PyTorch MergeBot	b5e93ffdcf	Revert "Limit path search within range (#164581 )" This reverts commit 415e641572473479fc9d9eaea12762e1a223a9e0. Reverted https://github.com/pytorch/pytorch/pull/164581 on behalf of https://github.com/eellison due to merge sets makes this trickier ([comment](https://github.com/pytorch/pytorch/pull/164581#issuecomment-3381955240))	2025-10-08 14:56:21 +00:00
PyTorch MergeBot	f8d0d65ddc	Revert "Add memory estimator (#164738 )" This reverts commit ab01a0d7d352e7fd07989b8d6bf035bf82aea74e. Reverted https://github.com/pytorch/pytorch/pull/164738 on behalf of https://github.com/eellison due to merge sets makes this trickier ([comment](https://github.com/pytorch/pytorch/pull/164581#issuecomment-3381955240))	2025-10-08 14:56:21 +00:00
Jeff Daily	f46ddb1e65	[ROCm][CI] add gfx1150 gfx1151 to docker images for binary builds (#164854 ) Fixes #164346. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164854 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-08 14:34:22 +00:00
PyTorch MergeBot	20082d7136	Revert "fix flex attention eager bwd: more rounding (#164317 )" This reverts commit 41808b2ba9a61ab2f4c7af394c1668d09a4a0331. Reverted https://github.com/pytorch/pytorch/pull/164317 on behalf of https://github.com/jeffdaily due to inductor/test_flex_attention.py::TestFlexAttentionCUDA::test_builtin_score_mods_seqlen_lt_custom_sparse_block_size_score_mod4_cuda_float16 [GH job link](https://github.com/pytorch/pytorch/actions/runs/18330774537/job/52207370954) [HUD commit link](`41808b2ba9`) ([comment](https://github.com/pytorch/pytorch/pull/164317#issuecomment-3381812090))	2025-10-08 14:29:10 +00:00
Laith Sakka	7158aa22e8	remove more (#164753 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164753 Approved by: https://github.com/aorenste, https://github.com/mlazos ghstack dependencies: #164664, #164665, #164667, #164668	2025-10-08 14:23:38 +00:00
Laith Sakka	2035f6b2e6	use check_size instead of check_is_size in ops.py (#164668 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164668 Approved by: https://github.com/angelayi ghstack dependencies: #164664, #164665, #164667	2025-10-08 14:23:38 +00:00
Mwiza Kunda	2b58adc3bd	[inductor][templates] Distinguish between kernel input nodes and codegen input nodes (#163752 ) If there is a single autotuner choice, the wrong type of input node is used to instantiate `TritonTemplateBuffer` through `TritonTemplateCaller.output_node`. This PR distinguishes the input nodes used in `AlgorithmSelectorCache.__call__` between the actual inputs passed to the kernel at runtime, vs the possibly viewed inputs that influence scheduling behaviour (e.g. `MemoryDeps`) and codegen. See the added unit test for more detail. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163752 Approved by: https://github.com/eellison	2025-10-08 14:12:14 +00:00
angelayi	322091d8d8	[opaque_obj] Add make_fx tracing support (#163278 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163278 Approved by: https://github.com/zou3519 ghstack dependencies: #163279, #163277	2025-10-08 09:09:16 +00:00
angelayi	2bb4e6876c	[opaque obj] Error for torch.library.custom_op infer_schema (#163277 ) Unsure how we can get infer_schema to infer the scriptObject type from just the type annotation, so for now will just error clearly and ask users to specify a schema. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163277 Approved by: https://github.com/zou3519 ghstack dependencies: #163279	2025-10-08 09:09:16 +00:00
angelayi	56ef7743fc	[opaque_obj] Add __eq__ and __deepcopy__ (#163279 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163279 Approved by: https://github.com/zou3519	2025-10-08 09:09:16 +00:00
Yuanyuan Chen	64108bdbed	[BC-Breaking] Remove long-deprecated casting functions from native_functions.yaml (#164641 ) This PR removes `torch._cast_XXX` from generated OPs. They were deprecated in PyTorch 1 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164641 Approved by: https://github.com/albanD, https://github.com/justinchuby	2025-10-08 08:27:58 +00:00
Maggie Moss	c855f8632e	Pyrefly suppressions 7/n (#164913 ) Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Almost there! Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (6,884 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164913 Approved by: https://github.com/oulgen	2025-10-08 07:27:17 +00:00
morrison-turnansky	12d2ef557f	Update round size with 1 division behavior (#162203 ) Have round size return the nearest power of 2 greater than or equal to size with 1 division. Fixes #161139 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162203 Approved by: https://github.com/ezyang	2025-10-08 06:41:46 +00:00
Edward Yang	65aa62d50d	Use codegen for the boxed interpreters (#164573 ) Authored with claude code. The arg parsing is kind of horrible, open to more suggestions. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164573 Approved by: https://github.com/albanD, https://github.com/jansel	2025-10-08 06:27:44 +00:00
Jane Xu	6a09f9306c	Fix #164742 , all header-impl'd userfacing functions should be inline (#164871 ) It is as @mxmpl pointed out; we are missing an inline. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164871 Approved by: https://github.com/mikaylagawarecki	2025-10-08 05:57:19 +00:00
Ke Wen	19bf67be32	multimem reduce (#164517 ) Modified `multimem_one_shot_all_reduce_out` function to accept a `root` argument, making it a `multimem_reduce` op. The original `multimem_one_shot_all_reduce` op becomes a caller of the `multimem_reduce`, with each rank providing its own rank id as root. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164517 Approved by: https://github.com/ngimel	2025-10-08 05:25:16 +00:00
PyTorch MergeBot	1927783aa3	Revert "Reland vision pinned commit hash update (#164492 )" This reverts commit 6861a270624b44954826688f8dad668eb0154452. Reverted https://github.com/pytorch/pytorch/pull/164492 on behalf of https://github.com/izaitsevfb due to see autorevert msg above, inductor breakage is legit ([comment](https://github.com/pytorch/pytorch/pull/164492#issuecomment-3379537888))	2025-10-08 04:38:26 +00:00
Nicolas Macchioni	184817c7a8	locks + unit tests (#164636 ) Test Plan: ``` buck test fbcode//mode/opt caffe2/test/inductor:caching ``` Reviewed By: aorenste D83714690 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164636 Approved by: https://github.com/aorenste	2025-10-08 04:34:22 +00:00
Pradeep Fernando	da903b6a8b	list_stored_sd_metadata API. (#160610 ) Summary: 1\ Certain checkpoint load use cases are not aware of the properties of the data/tensors they want to load. 2\ These use cases include data loader checkpoints, reading data for post processing (when the original model definition is not available). 3\ There, we have to use saved checkpoint (metadata) as our source of truth. 4\ This RFC proposal exposes the checkpoint metadata using a public API. In this proposal we expose the stored state-dict metadata (minus associated storage/chunk metadata). Chunk/storage details should not be exposed to the users and are an impl detail of the storage writer/reader. Test Plan: UT. Rollback Plan: Differential Revision: D80231457 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160610 Approved by: https://github.com/saumishr	2025-10-08 04:33:51 +00:00
Boyuan Feng	f76fdcaaf8	[Benchmark] cleanup huggingface models (#164815 ) Prune models from TorchInductor dashboard to reduce ci cost. This PR prunes for hugging face models according to the [doc](https://docs.google.com/document/d/1nLPNNAU-_M9Clx9FMrJ1ycdPxe-xRA54olPnsFzdpoU/edit?tab=t.0), which reduces from 46 to 27 models. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164815 Approved by: https://github.com/anijain2305, https://github.com/seemethere, https://github.com/huydhn, https://github.com/malfet	2025-10-08 03:21:04 +00:00
Sam Larsen	608792153f	[inductor][codecache] Print bytes in codecache debug output (#164898 ) Summary: We have an internal request to help understand why the hash of `post_grad_custom_post_pass` is changing between attempts. We don't get useful info from the debug output, because we just print "<bytes>". Instead, attempt to print at least _some_ of the value in case it contains readable characters. Test Plan: Registered a dummy post_grad_custom_pass and printed codecache debug output `TORCH_LOGS=+torch._inductor.codecache python ~/foo.py` Yields something like: ``` V1007 16:41:19.024000 3546009 /data/users/slarsen/pytorch-3.10_4/torch/_inductor/codecache.py:989] [0/0] [law2ujt2wzjb5tyiu6jh64r2lxpvl62yvxcsmdouhg3qyelhhdv] post_grad_custom_post_pass: HelloWorld!��... ``` Differential Revision: [D84108770](https://our.internmc.facebook.com/intern/diff/D84108770) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164898 Approved by: https://github.com/oulgen	2025-10-08 02:45:20 +00:00
Maggie Moss	086dec3235	Pyrefly suppressions 6/n (#164877 ) Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Almost there! Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the project-excludes field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: INFO 0 errors (5,064 ignored) Only four directories left to enable Pull Request resolved: https://github.com/pytorch/pytorch/pull/164877 Approved by: https://github.com/oulgen	2025-10-08 02:30:57 +00:00
Aaron Orenstein	ad7b2bebc6	Use tuples to have a deterministic ordering. (#164851 ) When debugging I noticed some non-deterministic behavior and tracked it down to this literal set. Changed to be a tuple for determinism. Changed two other small literal sets also because using a set for a small lookup like that is slow. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164851 Approved by: https://github.com/bobrenjc93, https://github.com/bdhirsh	2025-10-08 02:12:03 +00:00
Ke Wen	d444384003	[SymmMem] Tiled reduce (#162243 ) Added op: `tile_reduce(Tensor input, Tensor(a!) out, int root, str group_name)` For now supports only: - NVSHMEM backed symmetric tensor; - 2D tensor and tile; - torch.float. Testing on right-bottom quadrant: ``` rank 0: tensor([[0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 0., 0., 0., 0.], [0., 0., 0., 0., 1., 1., 1., 1.], [0., 0., 0., 0., 1., 1., 1., 1.], [0., 0., 0., 0., 1., 1., 1., 1.], [0., 0., 0., 0., 1., 1., 1., 1.]], device='cuda:0') PASSED ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162243 Approved by: https://github.com/ngimel	2025-10-08 02:03:04 +00:00
PyTorch MergeBot	3040a5d294	Revert "[dynamo] Support torch.fx.traceback.annotate (#164678 )" This reverts commit 801e282f39e9ef4424dfd3ecfd2b550a44595229. Reverted https://github.com/pytorch/pytorch/pull/164678 on behalf of https://github.com/izaitsevfb due to breaks executorch internally, see [D84068062](https://www.internalfb.com/diff/D84068062?entry_point=16) ([comment](https://github.com/pytorch/pytorch/pull/164678#issuecomment-3379281844))	2025-10-08 01:49:34 +00:00
PyTorch MergeBot	97463d4cf3	Revert "Fix double dispatch to Python for detach (#163671 )" This reverts commit c32118dc3e50505fd285e6e448a90883fce11535. Reverted https://github.com/pytorch/pytorch/pull/163671 on behalf of https://github.com/izaitsevfb due to breaks export tests ([comment](https://github.com/pytorch/pytorch/pull/163671#issuecomment-3379281422))	2025-10-08 01:46:45 +00:00
Howard Huang	c813617c53	[PP] Migrate other schedules to use PipelineScheduleRuntime (#164777 ) Second fix for https://github.com/pytorch/pytorch/issues/164756 This has been a TODO to make all the schedules execute using the same runtime. Now after this change, schedules will use the same logic for `_PipelineScheduleRuntime` where it adds `UNSHARD` and `RESHARD` operations to the schedules which fixes the issue mentioned above. <img width="920" height="406" alt="image" src="https://github.com/user-attachments/assets/a4d5bcd0-7dac-43cd-96f9-8ca33cfd8b91" /> A test is failing after the conversion: - Fixed a gradient scaling issue for dWeight Pull Request resolved: https://github.com/pytorch/pytorch/pull/164777 Approved by: https://github.com/fegin ghstack dependencies: #164775	2025-10-08 01:45:57 +00:00
Howard Huang	e659661ffa	[PP] Fix FSDP unshard/reshard (#164775 ) First fix for https://github.com/pytorch/pytorch/issues/164756 In the pipeline IR we call `UNSHARD` and `RESHARD`, but there is a bug because when we call `module.unshard()` these do not recursively call the FSDP modules, which sometimes leads to allgather being called before the module forward. Since we want the pipeline IR to explicitly handle this, we can call `group.unshard` instead which ensures that all the modules are unsharded. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164775 Approved by: https://github.com/weifengpy	2025-10-08 01:45:57 +00:00
Markus Hoehnerbach	41808b2ba9	fix flex attention eager bwd: more rounding (#164317 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164317 Approved by: https://github.com/drisspg ghstack dependencies: #163986	2025-10-08 01:17:45 +00:00
Xilun Wu	c0510dc447	[ContextParallel] add `_LoadBalancer` classes, and load-balance interface to Context Parallel APIs (#161062 ) Summary This PR provides an interface for users to specify how to load-balance the attention input. The load-balance is essentially a rearrangement of the input tensor(s) over the seq_dim before sharding and can be specified via an index tensor `rearrange` such that Q[rearrange] is the balanced Q users want (i.e. `rearrange[i] == j` where `i` is the new index of `Q[j]` in the balanced Q). An example is the `_generate_round_robin_indices()` added in https://github.com/pytorch/pytorch/pull/155442. New `_LoadBalancer` classes New `_LoadBalancer` class (defined in `torch/distributed/tensor/experimental/_load_balancer.py`) provides one interface for defining load-balance behavior: `_generate_indices(self, restore: bool = False)`. When `restore == False`, this method should output an index Tensor (namely `rearrange_idx`) such that QKV will be transformed into Q' K' V' in a way that `Q'[i] == Q[rearrange_idx[i]]` (same applies to K and V). When `restore == True`, this method outputs an index Tensor (namely `restore_idx` such that `Q'[restore_idx] == Q` (same applies to K and V). Impact 2 public CP APIs and 1 private CP API is modified. This PR should be backward-compatible by: - For uses w/ SDPA, existing users must be using the `context_parallel()` API which does not take in the extra `load_balancer` argument and solely determines from the global var `_cp_options.enable_load_balance`. - For new users including who want to try `flex_attention()`, we require to use the new API `_context_parallel_buffers` to explicitly shard the QKV input instead of using `context_parallel()` because we no longer rely on TorchDispatchMode nor TorchFunctionMode for op replacement. And we also require users to explicitly pass in a `load_balancer` argument if load-balancing is demanded. 
Load-Balance Behavior `context_parallel_unshard()`, and `create_cp_block_mask()` APIs now take an extra optional argument `load_balancer`. This argument is optional because of backward compatibility but we require new users to explicitly pass in a `load_balancer` if load-balancing is demanded: - if `load_balancer == None` and `_cp_options.enable_load_balance == False`, CP performs no load-balancing on input Tensors. - if `load_balancer == None` and `_cp_options.enable_load_balance == True`, CP performs head-tail load-balancing (e.g. split a Tensor into 2N chunks and first N are called head and the rest are called tail. Place the first head chunk and the last tail chunk on rank 0, and the second head along with the second last tail chunk on rank 1, and so on). `_context_parallel_buffers()` also takes the extra optional argument `load_balancer`, but the behavior is slightly different from the other 2 APIs -- it doesn't branch on `_cp_options.enable_load_balance` : - if `load_balancer == None`, no load-balancing will be performed - otherwise, apply load-balancing using `load_balancer._generate_indices()` before sharding. Changes* This PR moves the index Tensor generation logic into a set of LoadBalancer classes and makes LoadBalancer the common interface for Context Parallel APIs that leverage load-balancing: * _context_parallel_buffers * context_parallel_unshard * create_cp_block_mask The `_LoadBalancer` classes added are: - `_LoadBalancer`: the abstract base class that provides the `_generate_indices` interface for index Tensor generation. - `_HeadTailLoadBalancer`: Implements head-tail balancing logic. - `_PerDocumentHeadTailLoadBalancer`: Supports per-document head-tail balancing for batched sequences. Test `pytest test/distributed/tensor/test_attention.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161062 Approved by: https://github.com/fegin	2025-10-08 01:09:14 +00:00
Nicolas Macchioni	9ec10dc26a	utils + unit tests (#164551 ) Test Plan: ``` buck test fbcode//mode/opt caffe2/test/inductor:caching ``` Reviewed By: aorenste Differential Revision: D83714691 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164551 Approved by: https://github.com/aorenste	2025-10-08 01:05:45 +00:00
Yuanyuan Chen	43fc859625	Don't return values in void functions (#164809 ) This PR fixes returning values in void C++ functions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164809 Approved by: https://github.com/janeyx99	2025-10-08 01:04:14 +00:00
PyTorch MergeBot	f713abab16	Revert "Enable all flake8-logging-format rules (#164655 )" This reverts commit e98c4e835b1db22092fc93b49d2cddd7b3537d1f. Reverted https://github.com/pytorch/pytorch/pull/164655 on behalf of https://github.com/malfet due to Looks like it broke lint in trunk, see `bd3b98a8a5/1` ([comment](https://github.com/pytorch/pytorch/pull/164655#issuecomment-3379209309))	2025-10-08 00:55:17 +00:00
Pian Pawakapan	bd3b98a8a5	[dynamic shapes] make backed_size_oblivious behavior consistent b/w symbolic_shapes/inductor (#164796 ) Summary: call guard_or_ directly to enable backed_size_obl in inductor calls to guard_or Test Plan: CI and unit test added. Differential Revision: D84009392 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164796 Approved by: https://github.com/laithsakka	2025-10-08 00:19:06 +00:00
Yuanyuan Chen	e98c4e835b	Enable all flake8-logging-format rules (#164655 ) These rules are enabled by removing existing suppressions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164655 Approved by: https://github.com/janeyx99	2025-10-08 00:16:13 +00:00
Yiming Zhou	7b15534434	[export] Fix weight sharing when there is no complete tensor (#164857 ) Summary: As titled. Test Plan: CI Differential Revision: D84079625 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164857 Approved by: https://github.com/yushangdi	2025-10-07 23:40:13 +00:00
Scott Wolchok	c32118dc3e	Fix double dispatch to Python for detach (#163671 ) This fixes #71725. Differential Revision: [D83857880](https://our.internmc.facebook.com/intern/diff/D83857880) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163671 Approved by: https://github.com/ezyang, https://github.com/albanD	2025-10-07 23:34:37 +00:00
Chien-Chin Huang	e3ae80fc03	[PP] Let PP split BlockMask into micro-BlockMask (#164111 ) BlockMask has batch dimension information. So PP has to split it as well just like all other tensors. All the tensors in BlockMask have the batch dimension, so we can just split it without too many issues. However, `mask_mod` requires the batch index as the input, and that value is going to change after the split. So we have to wrap it inside a closure to modify the batch index. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164111 Approved by: https://github.com/H-Huang	2025-10-07 23:25:34 +00:00
atalman	483f4e0db9	CUDA 13.0 builds fix on Amazon Linux 2023 (#164870 ) During 2.9 rc testing I am seeing an issue on Amazon Linux 2023 with CUDA 13.0 builds This is related to: https://github.com/pytorch/pytorch/issues/152756 Workflow: https://github.com/pytorch/test-infra/actions/runs/18324074610/job/52184079262 Error: ``` WARNING: There was an error checking the latest version of pip. + python3.11 .ci/pytorch/smoke_test/smoke_test.py --package torchonly Traceback (most recent call last): File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 333, in _load_global_deps ctypes.CDLL(global_deps_lib_path, mode=ctypes.RTLD_GLOBAL) File "/usr/lib64/python3.11/ctypes/__init__.py", line 376, in __init__ self._handle = _dlopen(self._name, mode) ^^^^^^^^^^^^^^^^^^^^^^^^^ OSError: libcudart.so.13: cannot open shared object file: No such file or directory During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/pytorch/pytorch/.ci/pytorch/smoke_test/smoke_test.py", line 12, in <module> import torch File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 425, in <module> _load_global_deps() File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 383, in _load_global_deps _preload_cuda_deps(lib_folder, lib_name) File "/usr/local/lib64/python3.11/site-packages/torch/__init__.py", line 317, in _preload_cuda_deps raise ValueError(f"{lib_name} not found in the system path {sys.path}") Traceback (most recent call last): ValueError: libnvToolsExt.so.*[0-9] not found in the system path ['/pytorch/pytorch/.ci/pytorch/smoke_test', '/usr/lib64/python311.zip', '/usr/lib64/python3.11', '/usr/lib64/python3.11/lib-dynload', '/usr/local/lib64/python3.11/site-packages', '/usr/local/lib/python3.11/site-packages', '/usr/lib64/python3.11/site-packages', '/usr/lib/python3.11/site-packages'] File 
"/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 102, in <module> main() File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 98, in main run_cmd_or_die(f"docker exec -t {container_name} /exec") File "/home/ec2-user/actions-runner/_work/test-infra/test-infra/test-infra/.github/scripts/run_with_env_secrets.py", line 39, in run_cmd_or_die raise RuntimeError(f"Command {cmd} failed with exit code {exit_code}") RuntimeError: Command docker exec -t 7d9c5bd403cac9a9ee824d63a1d6f6057ecce89a7daa94a81617dbf8eff0ff2e /exec failed with exit code 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164870 Approved by: https://github.com/Camyll Co-authored-by: Eli Uriegas <1700823+seemethere@users.noreply.github.com>	2025-10-07 22:52:53 +00:00
Aaron Gokaslan	d1a62c8036	[BE][Ez]: Enable RUF007 Prefer itertools.pairwise over zip slicing (#164856 ) Now that our min version is 3.10 we can support this rule. This is more concise, readable, and efficient than the previous zip slicing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164856 Approved by: https://github.com/williamwen42	2025-10-07 22:51:17 +00:00
Huy Do	6861a27062	Reland vision pinned commit hash update (#164492 ) Redo https://github.com/pytorch/pytorch/pull/154694 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164492 Approved by: https://github.com/yangw-dev	2025-10-07 22:45:05 +00:00
amdfaa	955f21dc2c	[ROCm][CI] Add support for gfx1100 in rocm workflow + test skips (#148355 ) This PR adds infrastructure support for gfx1100 in the rocm workflow. Nodes have been allocated for this effort. @dnikolaev-amd contributed all the test skips. Pull Request resolved: https://github.com/pytorch/pytorch/pull/148355 Approved by: https://github.com/jeffdaily Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-07 22:36:25 +00:00
Pian Pawakapan	9f5e1beaf3	[multi-kernel] base tensor sizes for shape cache key (#164499 ) to match shape key in `3ca09d65f1/torch/_inductor/select_algorithm.py (L3571)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164499 Approved by: https://github.com/ColinPeppler	2025-10-07 21:27:40 +00:00
Mwiza Kunda	2e027e8742	[inductor] Improve bound on the number of dims to match for the block (#163755 ) - Removes redundant broadcast code when `len(kernel.range_tree_nodes)` is much larger than `len(range_tree.nodes)`. For example: ```python # before, the broadcast is to [1, 1, XBLOCK, R0_BLOCK] tmp0 = tl.reshape(tl.broadcast_to(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last')[:, None, :, :], [(511 + XBLOCK) // 512, ((1) * ((1) <= ((511 + XBLOCK) // 512)) + ((511 + XBLOCK) // 512) * (((511 + XBLOCK) // 512) < (1))), ((512) * ((512) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (512))), R0_BLOCK]), [XBLOCK, R0_BLOCK]) # after tmp0 = tl.reshape(tl.load(block_ptr0, boundary_check=[2], padding_option='zero', eviction_policy='evict_last'), [XBLOCK, R0_BLOCK]) ``` - Fix: also save range_tree_nodes per subgraph Pull Request resolved: https://github.com/pytorch/pytorch/pull/163755 Approved by: https://github.com/eellison, https://github.com/blaine-rister	2025-10-07 21:02:37 +00:00
PyTorch MergeBot	1e42fde45e	Revert "[CUDA] Add experimental green context support for SM carveout (#159104 )" This reverts commit 746fe78ecd52f3e9cfddda41f0ac82dada7bdd0b. Reverted https://github.com/pytorch/pytorch/pull/159104 on behalf of https://github.com/malfet due to Breaks Windows CD build ([comment](https://github.com/pytorch/pytorch/pull/159104#issuecomment-3378675515))	2025-10-07 20:51:22 +00:00
PyTorch MergeBot	f505caa71b	Revert "multimem reduce (#164517 )" This reverts commit d1cbb74fb16406488a174832e1b58b7c242f418d. Reverted https://github.com/pytorch/pytorch/pull/164517 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164517#issuecomment-3378529654))	2025-10-07 20:12:38 +00:00
Howard Huang	65f10becdf	Support OVERLAP_F_B in schedule (#161072 ) Previously, we converted the overlap_f_b into separate forward and backward operations in the plan. This is a small change that includes it in the plan and handles it in the runtime Pull Request resolved: https://github.com/pytorch/pytorch/pull/161072 Approved by: https://github.com/fegin, https://github.com/wconstab	2025-10-07 19:55:10 +00:00
PyTorch MergeBot	df640df68a	Revert "Reapply "C++-accessible Placements via pybind11 (#163030 )" (#164519 )" This reverts commit 8c0bc879b97bc580aaa0777b2d266bdd068cb528. Reverted https://github.com/pytorch/pytorch/pull/164519 on behalf of https://github.com/malfet due to Still breaks internal workflows ([comment](https://github.com/pytorch/pytorch/pull/164519#issuecomment-3378469432))	2025-10-07 19:46:17 +00:00
zhxchen17	4c3c0ef2f1	[precompile] Load source cache for AOT compile as well. (#164773 ) Adding source_get_cache also to AOT compile case. Since the guard manager loader code can be shared between AOT and caching, we added a new function load_guard_manager to avoid code duplication between two workflows, for loading guards. Test Plan: test_guard_serialization.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/164773 Approved by: https://github.com/yiming0416, https://github.com/dolpm	2025-10-07 18:47:09 +00:00
Parshant Sharma	bc33b10202	fix copy_ for scalar in inductor (#164167 ) Fixes #158437 ### Summary - TorchInductor was not properly handling scalar copy operations `(tensor.copy_(scalar_value))` - Ensured scalar sources are converted to appropriate tensor representations with correct dtype and device ### Impact - Enables compilation of models using ` tensor.copy_(scalar) `patterns - module: inductor Pull Request resolved: https://github.com/pytorch/pytorch/pull/164167 Approved by: https://github.com/shunting314	2025-10-07 18:31:37 +00:00
Colin Peppler	2855a045b3	Use sym_eq and sym_and on symbolic shapes in common_meta_baddbmm_bmm (#164781 ) Differential Revision: D84005053 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164781 Approved by: https://github.com/Skylion007	2025-10-07 18:25:00 +00:00
Lakshay Garg	9ecd092bd9	Add python bindings for NCCL CTA policies (#164309 ) NCCLConfig can now be constructed with non-default [cta policies][1] ```python import torch from torch.distributed import ProcessGroupNCCL as nccl config = nccl.NCCLConfig() config.cta_policy = nccl.NCCL_CTA_POLICY_ZERO # NCCL version >= 2.28 ``` [1]: https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2283/user-guide/docs/api/flags.html#nccl-communicator-cta-policy-flags Pull Request resolved: https://github.com/pytorch/pytorch/pull/164309 Approved by: https://github.com/eqy	2025-10-07 18:16:20 +00:00
Avik Chaudhuri	078d475d3b	move partition and compiler fns from stage 1 to stage 2 (#164765 ) Differential Revision: D83995689 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164765 Approved by: https://github.com/zhxchen17	2025-10-07 18:02:03 +00:00
Mikayla Gawarecki	f37a6523ef	Move version.h to torch/headeronly (#164381 ) Differential Revision: [D83685392](https://our.internmc.facebook.com/intern/diff/D83685392) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164381 Approved by: https://github.com/janeyx99	2025-10-07 17:47:30 +00:00
Maggie Moss	b13cd141b3	Add pyrefly suppressions (#164748 ) Adds suppressions so that pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: delete lines in the pyrefly.toml file from the `project-excludes` field step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/4b3bf2037014e116bc00706a16aef199 after: 0 errors (4,263 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164748 Approved by: https://github.com/oulgen	2025-10-07 17:31:18 +00:00
Lakshay Garg	5e47b4dd60	Remove device_id param from DeviceCachingAllocator::malloc (#164798 ) The `malloc` call in DeviceCachingAllocator accepts a DeviceIndex param which can be confusing because the allocator can only allocate memory for the device that it corresponds to. This associated device is fixed at construction time and the runtime param can be misleading. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164798 Approved by: https://github.com/ngimel, https://github.com/cyyever, https://github.com/eqy	2025-10-07 16:42:04 +00:00
Yuanyuan Chen	ee5389d520	Enable batch samples in sparse tests (#164677 ) The test cases are enabled because the issue was fixed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164677 Approved by: https://github.com/albanD	2025-10-07 15:58:37 +00:00
eellison	ab01a0d7d3	Add memory estimator (#164738 ) Original work by @ShatianWang, with lints applied. I am going to make a few changes and add tests in subsequent PRs but I want to preserve original commit first. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164738 Approved by: https://github.com/IvanKobzarev ghstack dependencies: #164568, #164569, #164581	2025-10-07 15:32:27 +00:00
Animesh Jain	801e282f39	[dynamo] Support torch.fx.traceback.annotate (#164678 ) Builds on top of https://github.com/pytorch/pytorch/pull/163673 and https://github.com/pytorch/pytorch/pull/164174. This will be used in the followup PRs to apply regional inductor compilation. The existing implementation lets Dynamo trace into the `torch.fx.traceback.annotate`, but that's not what we want. We want Dynamo to essentially run the torch.fx.traceback.annotate function in eager, so that every Fx node created in Dynamo Fx graph has the custom meta node. This does not work with graph breaks yet. But we can solve that problem, if needed, in a separate PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164678 Approved by: https://github.com/SherlockNoMad, https://github.com/jansel, https://github.com/xmfan	2025-10-07 14:54:26 +00:00
Aleksei Nikiforov	87c9fbda22	Follow up to PR 163980 for s390x (#164464 ) Now with same updates propagated to s390x it works on s390x runners too. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164464 Approved by: https://github.com/atalman	2025-10-07 12:02:29 +00:00
YyWangCS	3cc8af2d67	torch.topk: refactor global histogram/cumsum into a dedicated kernel to eliminate redundant memory access (#164459 ) # TLDR This PR removes the regression in torch.topk introduced from torch 2.7.0 and delivers much better performance for large inputs. The table below reports execution times on H20 for various input sizes with float32 data, extracting the top-100 values. Results indicate that this PR restores and improves performance, especially on large inputs. \| Input Shape \| torch2.6.0 (ms) \| torch2.8.0 (ms) \| 2.8.0+this PR (ms) \| \| -------------- \| --------------- \| --------------- \| ------------------ \| \| (1, 1B) \| 36.6 \| 1564.1 \| 25.6 \| \| (1, 100M) \| 3.56 \| 17.4 \| 2.54 \| \| (1, 1000,000) \| 0.135 \| 0.145 \| 0.098 \| \| (512, 128000) \| 1.33 \| 1.33 \| 1.32 \| \| (8192, 128000) \| 19.6 \| 19.6 \| 19.4 \| # Background After upgrading PyTorch from 2.6.0 to 2.7.0, we observed a significant GPU performance regression in `torch.topk` on NVIDIA GPUs. For instance, extracting the top-1000 largest values from one billion floats on an NVIDIA H20 increased from 36 ms to 1.6 s. Profiling with Nsight Compute indicates that the slowdown is caused by redundant memory accesses introduced in [PR #145536](https://github.com/pytorch/pytorch/pull/145536). # Analysis `torch.topk` relies on RadixSelect to find the target values. Each radix pass requires computing a histogram of the input values. For large inputs, histogram computation is split into two stages: 1. Local histogram: Each CUDA block processes a subset of the input and writes its local histogram to global memory. 2. Global reduction: A single CUDA block reads all local histograms from global memory and reduces them into the final global histogram. Before [PR #145536](https://github.com/pytorch/pytorch/pull/145536), both stages ran inside a single kernel (`radixFindKthValues`), using a semaphore to ensure that all local histograms were completed before reduction. 
In PR #145536, the global histogram computation was merged with subsequent top-k calculations into a single kernel (`computeBlockwiseKthCounts`) to avoid the semaphore. While this simplifies synchronization, it introduces redundant memory reads: - `computeBlockwiseKthCounts` launches `numInputSlices * blocks_per_slice` blocks. - For each row (slice), `blocks_per_slice` CUDA blocks redundantly reload the same local histograms from global memory. # This PR To address this inefficiency, we introduce the following optimizations: 1. Dedicated kernel: Refactor global histogram and cumsum computation into a separate GPU kernel, `computeDigitCumSum`. 2. Loop unrolling: Apply loop unrolling in `computeDigitCumSum` to speed up local histogram reads. # Performance We benchmarked torch.topk on NVIDIA H20 with float32 inputs, extracting the top-100 values across different input sizes. The results in the table below demonstrate that this PR effectively eliminates the performance regression introduced in 2.7.0 and delivers substantial improvements on large inputs. \| Input Shape \| torch2.6.0 (ms) \| torch2.8.0 (ms) \| 2.8.0+this PR (ms) \| \| -------------- \| --------------- \| --------------- \| ------------------ \| \| (1, 1B) \| 36.6 \| 1564.1 \| 25.6 \| \| (1, 100M) \| 3.56 \| 17.4 \| 2.54 \| \| (1, 1000,000) \| 0.135 \| 0.145 \| 0.098 \| \| (512, 128000) \| 1.33 \| 1.33 \| 1.32 \| \| (8192, 128000) \| 19.6 \| 19.6 \| 19.4 \| Besides, I have verified the correctness of this PR with different inputs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164459 Approved by: https://github.com/ngimel, https://github.com/Skylion007	2025-10-07 11:04:03 +00:00
Nicolas Macchioni	1fb072ac2a	exceptions + unit tests (#164550 ) Test Plan: ``` buck test fbcode//mode/opt caffe2/test/inductor:caching ``` Reviewed By: aorenste Differential Revision: D83714688 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164550 Approved by: https://github.com/aorenste	2025-10-07 10:04:58 +00:00
Animesh Jain	cac5e13e13	[dynamo] Inline nn module calls using __call__ methods (#164817 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164817 Approved by: https://github.com/SherlockNoMad, https://github.com/mlazos	2025-10-07 08:57:20 +00:00
Ivan Zaitsev	68350660ee	Increase timeout for nightly macOS performance tests to 300 minutes (#164793 ) the Test step time recently went slightly up. hopefully this fixes https://github.com/pytorch/alerting-infra/issues/263 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164793 Approved by: https://github.com/seemethere	2025-10-07 08:44:07 +00:00
Laith Sakka	ef7e2ca77e	remove check_is_size from test_misc.py (#164667 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164667 Approved by: https://github.com/angelayi ghstack dependencies: #164664, #164665	2025-10-07 07:33:50 +00:00
Laith Sakka	cdaaf3e4a3	remove size-like based size-oblivious special max simplifications (#164665 ) As we removed guard_size_oblivious this simplification is no longer relevant, this is part of the process of deprecation for guard_size_oblivious and its dependencies. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164665 Approved by: https://github.com/aorenste ghstack dependencies: #164664	2025-10-07 07:33:50 +00:00
Laith Sakka	0ea59c3c55	do not suggest torch._check_is_size() (#164664 ) size like concept for data dependency is not relevant anymore as we removed all guard_size_oblivious calls. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164664 Approved by: https://github.com/angelayi, https://github.com/mlazos	2025-10-07 07:33:50 +00:00
Nicolas Macchioni	8f705d019a	context + unit tests (#164549 ) Summary: the context module provides configurable context selection + isolation key hashing; context selection is broken into runtime and compile context. runtime context is decided at call time (inductor configs, precision configs, etc.) and compile context is decided at compile time (hardware type, software hashes). callees will be given access to SelectedRuntimeContext and SelectedCompileContext, which they can use to determine and select what context is necessary with regards to the function which is being cached. these selected contexts are wrapped in an IsolationSchema, which denotes what context should be taken into consideration when producing an isolation key. The isolation key is essentially a salt of the function signature key, which says that some function signature key result is valid under a given context (isolation schema) Test Plan: ``` buck test fbcode//mode/opt caffe2/test/inductor:caching ``` Reviewed By: aorenste D83714689 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164549 Approved by: https://github.com/aorenste	2025-10-07 06:02:10 +00:00
bobrenjc93	4bcc05777e	[torchfuzz] synthesize inputs for data dependent ops (#164716 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164716 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693, #164694, #164715	2025-10-07 05:40:32 +00:00
bobrenjc93	2a6cdba6e5	[torchfuzz] various edge case fixes (#164715 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164715 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693, #164694	2025-10-07 05:30:46 +00:00
bobrenjc93	53f6cc7529	[torchfuzz] make ops_fuzzer deterministic (#164694 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164694 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688, #164693	2025-10-07 05:30:46 +00:00
bobrenjc93	ac901bf79a	[torchfuzz] consolidate on a base implementation of args_codegen (#164693 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164693 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687, #164688	2025-10-07 05:20:28 +00:00
bobrenjc93	c965d6dbb2	[torchfuzz] move into experimental dir (#164688 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164688 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649, #164687	2025-10-07 05:09:08 +00:00
bobrenjc93	ac08556f67	[torchfuzz] support more unbacked functions (#164687 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164687 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647, #164649	2025-10-07 05:00:03 +00:00
bobrenjc93	5fe7f29b9e	[torchfuzz] add support for operator weights (#164649 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164649 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646, #164647	2025-10-07 05:00:03 +00:00
bobrenjc93	ded099ecbf	[torchfuzz] don't use the first gpu in multi process fuzzer (#164647 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164647 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514, #164646	2025-10-07 04:59:56 +00:00
bobrenjc93	63fcc3e6c4	[torchfuzz] update README.md (#164646 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164646 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434, #164514	2025-10-07 04:59:50 +00:00
Xiao Fu	fd3e15c14f	Fix typo in class definition of bytecodedispatchtable (#164762 ) ghstack-source-id: 84f0d7bb7e3780ca75473782abfae530010be56e Pull Request resolved: https://github.com/pytorch/pytorch/pull/164761 Fixes the typo in the naming of bytecodedispatchtable Pull Request resolved: https://github.com/pytorch/pytorch/pull/164762 Approved by: https://github.com/StrongerXi, https://github.com/williamwen42	2025-10-07 04:36:09 +00:00
Yuanyuan Chen	ff5faa744a	Remove unused THPXXX macros (#164660 ) These macros are not used in OSS. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164660 Approved by: https://github.com/albanD	2025-10-07 04:04:21 +00:00
Tugsbayasgalan Manlaibaatar	4725871a81	Return fake mode from export graph capture API (#164730 ) This PR is to temporarily unblock various experiments to re-use dynamo create fake mode. Note that this is still not what we want as the end state. The end state should look something like: ``` out = fullgraph_capture(mod, inputs) fake_mode = out.backend_inputs.fake_mode gm = out.module() ``` This doesn't work today because export requires us to wrap the original module to set up a flat module to trace for easier handling of pytree. As a result, we would need to carry export specific flag in fullgraph_capture which seems not ideal. Regardless, the end state is that we need to give downstream user a graph module and a fake mode in some form, so I think _dynamo_graph_capture_for_export returning a fake mode within graph module itself via gm.meta is acceptable for now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164730 Approved by: https://github.com/avikchaudhuri	2025-10-07 03:42:46 +00:00
Animesh Jain	bcd96cc6ff	[annotate] Copy fwd to bwd metadata for subgraphs as well (#164795 ) The test is in the next PR. My older PR on dynamo annotate - https://github.com/pytorch/pytorch/pull/164678 is getting reverted due to unknown reasons, so difficult to add a test right now in this PR. When I reland, I can add a test for this as well. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164795 Approved by: https://github.com/yushangdi ghstack dependencies: #164772	2025-10-07 02:42:47 +00:00
Yuanyuan Chen	50e077beaa	Fix outdated info in requirements-ci.txt (#164441 ) Fixes installation instructions and descriptions for `numba` and `scikit-image` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164441 Approved by: https://github.com/albanD	2025-10-07 02:10:41 +00:00
albanD	56d66ac0d7	Make custom op alias check consistent (#164576 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164576 Approved by: https://github.com/soulitzer	2025-10-07 02:05:09 +00:00
rraminen	49f7d8d19d	[ROCm] Fix test_cuda_synchronize failure on ROCm (#164735 ) This PR skips the hipify step of torch/csrc/jit/ir/ir.h to avoid a build-time error for the JIT cuda namespace. This fixes two skipped tests in test/jit/test_cuda.py. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164735 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-07 01:14:24 +00:00
PyTorch MergeBot	afee8062d5	Revert "Fix mesh.get_local_rank when it is > 1d (#164473 )" This reverts commit 83d71dfb2fd993a6242372b8123549acaa85ffdb. Reverted https://github.com/pytorch/pytorch/pull/164473 on behalf of https://github.com/izaitsevfb due to appears to be causing vision_maskrcnn regression ([comment](https://github.com/pytorch/pytorch/pull/164473#issuecomment-3374738997))	2025-10-07 00:37:41 +00:00
Chris Leonard	e89d12bf5d	Numpy zerotensor handling (#164487 ) Fixes #89034 Updated tensor_to_numpy() function in tensor_numpy.cpp to handle ZeroTensors by throwing an error if force=False and returning an array full of zeros if force=True. @ngimel, I just saw that you mentioned PyTorch is not too concerned with this issue but I had already worked on it so I figured I would push it anyways and see what you thought. Feel free to close the PR if you think it is not worth merging. @albanD Pull Request resolved: https://github.com/pytorch/pytorch/pull/164487 Approved by: https://github.com/izaitsevfb	2025-10-07 00:34:14 +00:00
Yedidya Feldblum	d4752bc7f6	[caffe2] tweak Unpickler::readInstruction handling TUPLE (#164764 ) Summary: Creating the vector was a bit awkward. Use the natural iterator-pair constructor with move-iterators. Test Plan: CI. Reviewed By: dolpm Differential Revision: D83995108 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164764 Approved by: https://github.com/drisspg	2025-10-07 00:18:10 +00:00
Jeff Daily	44a5d41993	[ROCm] add gfx1150 gfx1151 to supported gemm lists (#164744 ) This is one of a few PRs needed to address https://github.com/pytorch/pytorch/pull/164744 fully. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164744 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-07 00:02:23 +00:00
Animesh Jain	361c5d362c	[fx][traceback] Actually disable preservation of node metadata when enable=False (#164772 ) This will come in handy when we run graph passes that add new nodes, and create_proxy can add seq_nr meta. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164772 Approved by: https://github.com/SherlockNoMad	2025-10-06 23:39:12 +00:00
PyTorch MergeBot	1fc71d1b57	Revert "Numpy zerotensor handling (#164487 )" This reverts commit f7ad6dbad67161333a1473d1e0b478b7475a0ec1. Reverted https://github.com/pytorch/pytorch/pull/164487 on behalf of https://github.com/malfet due to Did it break torchbench?, see `8c728e129d/1` ([comment](https://github.com/pytorch/pytorch/pull/164487#issuecomment-3374635051))	2025-10-06 23:32:12 +00:00
Jeff Daily	8f54e27e5d	[ROCm][CI] rebuild magma binary for gfx1150 gfx1151 (#164782 ) After #164763 added gfx1150 gfx1151 to list of targets, this PR will trigger rebuild of magma binary for ROCm 7 with the new targets. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164782 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-06 23:29:21 +00:00
Scott Wolchok	8c0bc879b9	Reapply "C++-accessible Placements via pybind11 (#163030 )" (#164519 ) This makes Placement data representation available in C++ via pybind11. Reapply with fix for internal errors. D83788896 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164519 Approved by: https://github.com/Skylion007, https://github.com/ezyang	2025-10-06 23:19:14 +00:00
Eddie Yan	746fe78ecd	[CUDA] Add experimental green context support for SM carveout (#159104 ) Low-level PyTorch APIs should be usable/stable enough at this point but we might move the underlying driver API usage a bit from here... Built on top of @drisspg 's branch Pull Request resolved: https://github.com/pytorch/pytorch/pull/159104 Approved by: https://github.com/ngimel Co-authored-by: drisspg <drisspguessous@gmail.com>	2025-10-06 23:11:23 +00:00
Yuanyuan Chen	b63bbe1661	Remove old ROCm version check in tests (#164245 ) This PR removes ROCm<6 version checks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164245 Approved by: https://github.com/jeffdaily	2025-10-06 22:42:01 +00:00
PyTorch MergeBot	3912ba3e94	Revert "Fix refine_ranges corner case (#164075 )" This reverts commit 27234792add2ee9bedd84ca02dbf34f8f244bc5c. Reverted https://github.com/pytorch/pytorch/pull/164075 on behalf of https://github.com/izaitsevfb due to fails executorch builds, see [D83938444](https://www.internalfb.com/diff/D83938444) ([comment](https://github.com/pytorch/pytorch/pull/164075#issuecomment-3374430964))	2025-10-06 22:09:39 +00:00
PyTorch MergeBot	cfc5cc17dc	Revert "[dynamo] Support torch.fx.traceback.annotate (#164678 )" This reverts commit 2883b5ab773daf5861d43ff0b65be49a441ab3f9. Reverted https://github.com/pytorch/pytorch/pull/164678 on behalf of https://github.com/izaitsevfb due to fails inductor:max_autotune tests internally, see D83948169 ([comment](https://github.com/pytorch/pytorch/pull/164678#issuecomment-3374407009))	2025-10-06 22:03:42 +00:00
zeshengzong	fdc8ccc5bc	Make `Adam`, `AdamW` work with nonzero-dim Tensor betas (#149939 ) Fixes #147921 ## Changes - Convert tensor `betas` using `_to_scalar` - Change annotation of `betas` param - Change param type in docs ## Test Result ```bash pytest -s test/test_optim.py -k test_tensor_lr -vv ``` ![image](https://github.com/user-attachments/assets/312ee045-1e8b-4789-aa6e-ba63e6df7e81) ![image](https://github.com/user-attachments/assets/7e6ec274-645b-46b9-b1a6-2b340a685203) Pull Request resolved: https://github.com/pytorch/pytorch/pull/149939 Approved by: https://github.com/janeyx99 Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>	2025-10-06 22:03:25 +00:00
Yuanyuan Chen	48b54b45d6	Replace pynvml with nvidia-ml-py in win-test.sh (#164681 ) pynvml was deprecated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164681 Approved by: https://github.com/Aidyn-A, https://github.com/eqy	2025-10-06 21:57:26 +00:00
Eddie Yan	6861fa43e5	[CUDA] Cleanup persistent cuBLASLt workspaces before compile-regions test (#163299 ) Fixes some tests that seemed to start flaking out as reported in #163202, due to cuBLASLt workspaces becoming persistent following that change. It's relatively obvious why the workspaces/allocations corresponding to them should be cleaned up for `test_memory_snapshot_script` but less obvious for `test_memory_plots_free_segment_stack`? Why does not cleaning up workspace prevent `empty_cache` from showing up? Pull Request resolved: https://github.com/pytorch/pytorch/pull/163299 Approved by: https://github.com/albanD	2025-10-06 21:13:03 +00:00
atalman	c1f40d33c8	Fix docker build issue after 164575 (#164774 ) Looks like https://github.com/pytorch/pytorch/pull/164575 introduced an issue. The command is wrong: ``` conda install -c "whl/nightly" -y python=3.11 conda=25.7.0 ``` Should be just using default conda channel: ``` conda install -y python=3.11 conda=25.7.0 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164774 Approved by: https://github.com/Camyll	2025-10-06 20:28:20 +00:00
Jeff Daily	7e7ac2039d	[ROCm][CI] add gfx1150 gfx1151 to almalinux image (#164763 ) First PR necessary to address missing gfx1151 reported in https://github.com/pytorch/pytorch/issues/164346. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164763 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-06 20:19:43 +00:00
Zhengxu Chen	23ab6a45e5	[precompile][ez] Add instrumentation for guard loading/building. (#164602 ) Summary: as title. Test Plan: CI Differential Revision: D83868533 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164602 Approved by: https://github.com/dolpm	2025-10-06 20:16:09 +00:00
Rohit Singh Rathaur	b558c986e8	Add regression test for get_root_mesh with multiple independent meshes (#164731 ) Fixes #163330 I tried to reproduce the bug with my 4-GPU setup (the original issue used 8 GPUs). I created several different test scenarios, trying to trigger the bug by: - creating two different device meshes - slicing them in various ways - checking if get_root_mesh() would get confused but the bug didn't show up! Everything worked correctly in `2.10`. I found that there was a massive refactoring of the `DeviceMesh` code (PR #163213) that landed on October 2nd. That PR completely rewrote how `DeviceMesh` tracks relationships between parent meshes and submeshes. It seems like this refactoring fixed the bug! But I added a regression test to make sure it doesn't come back. The test (`test_get_root_mesh_multiple_independent_meshes`) does exactly what the bug report described: - creates two independent meshes - slices them both - verifies that each submesh correctly points back to its real parent - makes sure submeshes from mesh1 don't incorrectly claim mesh2 as their parent Pull Request resolved: https://github.com/pytorch/pytorch/pull/164731 Approved by: https://github.com/fduwjj	2025-10-06 18:52:25 +00:00
eellison	415e641572	Limit path search within range (#164581 ) When we are looking if two nodes are dependent, limit path search within the bounds of their node idxs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164581 Approved by: https://github.com/ezyang ghstack dependencies: #164568, #164569	2025-10-06 18:29:27 +00:00
Scott Wolchok	11f5f65686	Use PyObject_GetOptionalAttrString in PyObject_FastGetAttrString when available (#164624 ) Python 3.13 added PyObject_GetOptionalAttrString. I'm not 100% certain that it is strictly better than the old approach in all cases, but based on documentation/comments it seems to be meant for this type of use, and it's faster when I profile torchtitan training (which gets to the "check for the `__torch_function__` attr on some object" part of maybe_has_torch_function frequently enough to notice, but wastes a bunch of time generating exceptions that we then suppressed here). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164624 Approved by: https://github.com/Skylion007	2025-10-06 18:26:09 +00:00
albanD	af32d16a71	Add pure view support in autograd Function (#164736 ) This is the same as https://github.com/pytorch/pytorch/pull/164467 But it needs to be co-deved due to internal insanity. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164736 Approved by: https://github.com/soulitzer	2025-10-06 18:21:05 +00:00
Colin L Reliability Rice	ba480d6bf7	torch.compile: Increase subprocess parent death check interval to lower cpu (#164594 ) Summary: This check is a good idea (we could potentially do it with prctl). However we're seeing elevated rates of cpu usage in idle worker threads. This causes issues on production jobs, causing a large amount of spikiness in qps. Test Plan: Tested on a prod job with caches force disabled via TORCH_COMPILE_FORCE_DISABLE_CACHES=1 Baseline <img width="454" height="403" alt="image" src="https://github.com/user-attachments/assets/b88583a1-5b99-48cb-b03d-cd9b69546579" /> With this diff - <img width="426" height="403" alt="image" src="https://github.com/user-attachments/assets/431217f1-0ed0-4f6e-9d81-6428bf34e0e3" /> Differential Revision: D83803302 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164594 Approved by: https://github.com/masnesral	2025-10-06 18:15:21 +00:00
Jeff Daily	4a6abba0d9	[ROCm][CI] test_convolution.py uses miopen immediate mode (#164598 ) This should help stabilize some flaky test behavior where miopen would pick different solutions for different parts of the same test and the test expects bitwise identical results. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164598 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-06 17:48:50 +00:00
Henry Tsang	96181d6f76	[BE][cutlass backend] BE changes post cutlass_cppgen name change (#164589 ) Differential Revision: D83809105 Handle reviews from https://github.com/pytorch/pytorch/pull/164159 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164589 Approved by: https://github.com/Skylion007	2025-10-06 17:22:08 +00:00
Yiming Zhou	2164b66121	[export] Better state_dict and constant dedup in torch.export.save (#164196 ) Summary: Previously, weight deduplication was done by simply grouping tensors with their untyped storage and saving the first tensor in the group. A more rigorous approach would be to find a complete tensor that covers the storage and store that tensor. This is particularly important for GPU weights because when saving to raw bytes, we move the weight to CPU first, and if the weight being saved is not a complete one, it will lose the storage information during the copy to CPU. In this diff, we reuse code in `_package_weights.py` for better weights and constants deduplication in `torch.export.save`. Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_weight_sharing_gpu Differential Revision: D83523690 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164196 Approved by: https://github.com/angelayi	2025-10-06 17:03:15 +00:00
Janani Sriram	bde18c445d	[Max Autotune][B200] Relax absolute tolerance for MM+MM test (#164022 ) Summary: Relax absolute tolerance from 1e-2 to 1e-1 for `test_non_contiguous_input_mm_plus_mm` in `test_max_autotune.py`. Test Plan: `test_max_autotune.py` Differential Revision: D83391942 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164022 Approved by: https://github.com/eellison	2025-10-06 16:29:07 +00:00
Janani Sriram	f3e43ff2d7	[Max Autotune][B200] Fix decompose_k test failure (#164021 ) Summary: Fix decompose_k test failure (`test_max_autotune_decompose_k `) in `test_max_autotune.py` on B200s by setting `torch._inductor.config` patches for variables `comprehensive_padding` and `shape_padding`. Initial failure was `AssertionError: False is not true : Could not find a split in {3, 9, 2187, 81, 243, 729, 27} in # AOT ID: ['6_forward']`. Refactor decompose_k test to follow patch semantics when setting all environment variables within a test. Test Plan: `test_max_autotune.py`: ``` buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:max_autotune -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.re_gpu_tests=False -- test_max_autotune_decompose_k ``` Differential Revision: D83390563 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164021 Approved by: https://github.com/njriasan, https://github.com/mlazos, https://github.com/eellison	2025-10-06 16:28:23 +00:00
bobrenjc93	39d0c06ed0	[torchfuzz] check in some more xfail repros (#164619 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164619 Approved by: https://github.com/ezyang	2025-10-06 16:20:44 +00:00
Maggie Moss	4ab847bbc7	Pyrefly suppressions 4/n (#164615 ) Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: uncomment lines in the pyrefly.toml file step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/356645cf8cfe33123d9a27f23b30f7b1 after: 0 errors (2,753 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164615 Approved by: https://github.com/oulgen	2025-10-06 16:14:36 +00:00
Zhengxu Chen	4bd1505f84	[precompile][ez] Inline type definition for dynamo cache entry. (#164580 ) Summary: as title. DynamoCaptureOutput in package.py is not actively used in other files. Inline it to reduce confusion. Test Plan: CI Differential Revision: D83846957 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164580 Approved by: https://github.com/dolpm	2025-10-06 16:00:59 +00:00
amdfaa	1f9614cef8	[ROCm][CI] Change rocm periodic workflow label to linux.rocm.gpu.mi250.4 (#164616 ) Testing done on this PR: https://github.com/pytorch/pytorch/pull/156491 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164616 Approved by: https://github.com/jeffdaily, https://github.com/huydhn	2025-10-06 15:51:07 +00:00
eellison	35f66b83f8	respect aten planned overlap in inductor (#164569 ) Now that we have a hop to add implicit deps - use those deps for comm/compute overlap. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164569 Approved by: https://github.com/ezyang, https://github.com/IvanKobzarev ghstack dependencies: #164568	2025-10-06 15:47:55 +00:00
eellison	4a39820e5e	Add hop for additional control dependencies (#164568 ) Adds [control_deps](https://en.wikipedia.org/wiki/Control_dependency) higher-order operator to enforce explicit scheduling dependencies in FX graphs. This prevents unwanted operation reordering/fusion by giving nodes additional dependencies, which we also respect in inductor by adding weakdeps on the additional dependencies. This can be generally useful (such as for ordering collectives) but in this case I am using it so that fusions do not interfere with aten planned comm-compute overlap. There's definitely some similarity with the `with_effects` hop. Talked with @angelayi - when @zou3519 is back we will figure out how we want to consolidate. The implementation needs to be a subgraph (as opposed to `with_effects`) because inductor relies on `V.graph.current_node`. Changing the signature of the node with `with_effects` breaks this, and additionally, also breaks striding constraints on the wrapped node - see this [TODO](`aed66248a0/torch/fx/experimental/proxy_tensor.py (L1246-L1249)`). By maintaining the node with its original calling structure in subgraph this all works. Example transformation: Before: ``` %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%arg0_1, 1), kwargs = {}) %mm : [num_users=1] = call_function[target=torch.ops.aten.mm.default](args = (%arg1_1, %arg1_1), kwargs = {}) %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add, 2), kwargs = {}) ``` After: ``` add: "f32[256, 256]" = torch.ops.aten.add.Tensor(arg0_1, 1) mm: "f32[256, 256]" = torch.ops.higher_order.control_deps((add,), subgraph_mm, arg1_1, arg1_1) mul: "f32[256, 256]" = torch.ops.higher_order.control_deps((mm,), subgraph_mul, add) ``` The mm operation now explicitly depends on add completing first, and mul depends on mm, with original operations preserved in subgraphs. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164568 Approved by: https://github.com/ezyang, https://github.com/IvanKobzarev	2025-10-06 15:47:55 +00:00
PaulZhang12	600267ea56	Add num_store to inductor_meta and use it to scale persistent reduction x block (#162446 ) Scale up XBLOCK for contiguous persistent reductions based on rnumel and number of loads + stores <img width="928" height="656" alt="Screenshot 2025-09-18 at 5 02 57 PM" src="https://github.com/user-attachments/assets/ec3c561f-2a3f-4459-9e14-653715898da3" /> Differential Revision: [](https://our.internmc.facebook.com/intern/diff/) Differential Revision: [](https://our.internmc.facebook.com/intern/diff/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162446 Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314 ghstack dependencies: #162296	2025-10-06 14:29:07 +00:00
PyTorch UpdateBot	f11ac803d7	Update slow tests (#164726 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164726 Approved by: https://github.com/pytorchbot	2025-10-06 12:57:29 +00:00
PyTorch UpdateBot	ea42517e45	[xla hash update] update the pinned xla hash (#164727 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164727 Approved by: https://github.com/pytorchbot	2025-10-06 11:54:10 +00:00
Tugsbayasgalan Manlaibaatar	91c211fb8c	AC should work with pre-dispatch IR (#164505 ) Previously we had to rely on turning off export verifier because the AC body was torch IR instead of aten IR. This PR makes it so that we create an IR that is export compatible. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164505 Approved by: https://github.com/ydwu4, https://github.com/xmfan	2025-10-06 11:05:22 +00:00
Wei Feng	660e369a68	[FSDP2] check storage equal and consider data_ptr() == 0 (#164595 ) resolve https://github.com/pytorch/pytorch/issues/164554 unit test * `pytest -s test/distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_cached_state_dict` * `pytest -s test/distributed/_composable/fsdp/test_fully_shard_init.py -k test_meta_device_1d_init` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164595 Approved by: https://github.com/fegin	2025-10-06 08:44:38 +00:00
Animesh Jain	2883b5ab77	[dynamo] Support torch.fx.traceback.annotate (#164678 ) Builds on top of https://github.com/pytorch/pytorch/pull/163673 and https://github.com/pytorch/pytorch/pull/164174. This will be used in the followup PRs to apply regional inductor compilation. The existing implementation let Dynamo trace into the `torch.fx.traceback.annotate`, but that's not what we want. We want Dynamo to essentially run the torch.fx.traceback.annotate function in eager, so that every Fx node created in Dynamo Fx graph has the custom meta node. This does not work with graph breaks yet. But we can solve that problem, if needed, in a separate PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164678 Approved by: https://github.com/SherlockNoMad, https://github.com/jansel, https://github.com/xmfan	2025-10-06 02:59:24 +00:00
Yuanyuan Chen	9fff8155c3	[2/N] Fix clang-tidy readability checks (#164652 ) This PR applies clang-tidy readability checks to jit sources and all headers in the code base. `readability-redundant-inline-specifier` is suppressed because it incurs too many changes. `readability-redundant-inline-specifier` is used to detect redundant inline specifiers on function and variable declarations. There are many in-class method definitions that are marked inline. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164652 Approved by: https://github.com/Skylion007	2025-10-06 01:06:01 +00:00
PyTorch MergeBot	331191ce4b	Revert "[BE] Make PyObjectSlot use a global PyInterpreter (#162659 )" This reverts commit 29cbcbac4215e0d9070a1b7a07ddaec9a36bbd08. Reverted https://github.com/pytorch/pytorch/pull/162659 on behalf of https://github.com/izaitsevfb due to reverted internally, see [D83214133](https://www.internalfb.com/diff/D83214133) ([comment](https://github.com/pytorch/pytorch/pull/162659#issuecomment-3369348172))	2025-10-05 21:39:57 +00:00
PyTorch MergeBot	2c5ed6e7c0	Revert "[2/N] Fix clang-tidy readability checks (#164652 )" This reverts commit 3c5ca685d6f5b6f3971c0cd20a054aa355610419. Reverted https://github.com/pytorch/pytorch/pull/164652 on behalf of https://github.com/izaitsevfb due to need to revert due to a conflict with revert of https://github.com/pytorch/pytorch/pull/162659 ([comment](https://github.com/pytorch/pytorch/pull/164652#issuecomment-3369346707))	2025-10-05 21:36:57 +00:00
PyTorch MergeBot	5d7360bb03	Revert "Enable all SIM rules except disabled ones (#164645 )" This reverts commit 321e6026925f6b6e8a36e3a8b7c0295cd7541911. Reverted https://github.com/pytorch/pytorch/pull/164645 on behalf of https://github.com/izaitsevfb due to causes lint failures ([comment](https://github.com/pytorch/pytorch/pull/164645#issuecomment-3369274351))	2025-10-05 19:32:21 +00:00
Yuanyuan Chen	321e602692	Enable all SIM rules except disabled ones (#164645 ) `SIM` rules are useful for simplifying boolean expressions and enhances code readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164645 Approved by: https://github.com/ezyang	2025-10-05 07:38:25 +00:00
Yuanyuan Chen	3c5ca685d6	[2/N] Fix clang-tidy readability checks (#164652 ) This PR applies clang-tidy readability checks to jit sources and all headers in the code base. `readability-redundant-inline-specifier` is suppressed because it incurs too many changes. `readability-redundant-inline-specifier` is used to detect redundant inline specifiers on function and variable declarations. There are many in-class method definitions that are marked inline. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164652 Approved by: https://github.com/Skylion007	2025-10-05 07:05:11 +00:00
yewentao256	5178d0a480	[Compile] Fix Compile Warning for Capture Id (#163898 ) ```bash DEBUG /data/vllm-community-homes/vllm-user-6/pytorch/aten/src/ATen/cuda/CUDAGraph.h(59): warning #68-D: integer conversion resulted in a change of sign DEBUG CaptureId_t capture_id_ = -1; DEBUG ^ DEBUG DEBUG Remark: The warnings can be suppressed with "-diag-suppress <warning-number>" DEBUG DEBUG /data/vllm-community-homes/vllm-user-6/pytorch/aten/src/ATen/cuda/CUDAGraph.h(59): warning #68-D: integer conversion resulted in a change of sign DEBUG CaptureId_t capture_id_ = -1; DEBUG ^ DEBUG DEBUG Remark: The warnings can be suppressed with "-diag-suppress <warning-number>" DEBUG DEBUG /data/vllm-community-homes/vllm-user-6/pytorch/aten/src/ATen/cuda/CUDAGraph.h(59): warning #68-D: integer conversion resulted in a change of sign DEBUG CaptureId_t capture_id_ = -1; DEBUG ^ ``` Cuda won't use 0 as a capture id, so it is safe to initialize with 0, which also matches the initialization in `pytorch/aten/src/ATen/native/cudnn/RNN.cpp:2362` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163898 Approved by: https://github.com/houseroad	2025-10-05 06:51:33 +00:00
Yuanyuan Chen	cf0a00d4f3	Enable ruff FURB161 rule (#164654 ) This PR enables FURB161 in ruff. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164654 Approved by: https://github.com/Skylion007	2025-10-04 23:26:28 +00:00
Laith Sakka	5ed4270440	remove more no longer needed torch._check_is_size calls 1 (#164630 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164630 Approved by: https://github.com/Skylion007 ghstack dependencies: #164627	2025-10-04 22:06:04 +00:00
Laith Sakka	8c728e129d	remove no longer needed torch._check_is_size calls from test_dynamic_shapes (#164627 ) No longer needed in those tests to prevent DDE Pull Request resolved: https://github.com/pytorch/pytorch/pull/164627 Approved by: https://github.com/ezyang	2025-10-04 22:06:04 +00:00
Laith Sakka	9fc2c6446d	remove guard_size_oblivious from is_contiguous python eager eval path. (#164622 ) Summary: this should not be needed anymore we shall have explicit is_contiguous_or_false calls where appropriate already ! Test Plan: run existing tests. Differential Revision: D83884977 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164622 Approved by: https://github.com/bobrenjc93	2025-10-04 21:02:39 +00:00
William Wen	409aece3f9	[dynamo, 3.14] prevent StackRef compilation in 3.14 Windows (#164400 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164400 Approved by: https://github.com/Camyll, https://github.com/atalman	2025-10-04 18:38:08 +00:00
Edward Z. Yang	b116c51330	torch.cond on DTensor triggers an internal assert, add xfail for this. (#164389 ) Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164389 Approved by: https://github.com/albanD	2025-10-04 18:12:06 +00:00
PyTorch MergeBot	2e1742dd63	Revert "Add device argument to torch.random.get_rng_state (#163034 )" This reverts commit 9580539e2f73d68e89544c713ff460bea3038701. Reverted https://github.com/pytorch/pytorch/pull/163034 on behalf of https://github.com/cyyever due to It caused partially initialised torch module ([comment](https://github.com/pytorch/pytorch/pull/163034#issuecomment-3368349209))	2025-10-04 15:25:45 +00:00
Chris Leonard	f7ad6dbad6	Numpy zerotensor handling (#164487 ) Fixes #89034 Updated tensor_to_numpy() function in tensor_numpy.cpp to handle ZeroTensors by throwing an error if force=False and returning an array full of zeros if force=True. @ngimel, I just saw that you mentioned PyTorch is not too concerned with this issue but I had already worked on it so I figured I would push it anyways and see what you thought. Feel free to close the PR if you think it is not worth merging. @albanD Pull Request resolved: https://github.com/pytorch/pytorch/pull/164487 Approved by: https://github.com/ngimel, https://github.com/albanD	2025-10-04 12:03:48 +00:00
PyTorch MergeBot	f46bb04dcc	Revert "Add pure view support in autograd Function (#164467 )" This reverts commit 10335ffb2cce26c99958d055f415a16c1d14bc35. Reverted https://github.com/pytorch/pytorch/pull/164467 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164467#issuecomment-3368152304))	2025-10-04 11:42:46 +00:00
PyTorch MergeBot	6f6a919366	Revert "Make custom op alias check consistent (#164576 )" This reverts commit e438db254602cf39ba536aed0590b4144c019ee8. Reverted https://github.com/pytorch/pytorch/pull/164576 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164467#issuecomment-3368152304))	2025-10-04 11:42:45 +00:00
Francisco Massa	83d71dfb2f	Fix mesh.get_local_rank when it is > 1d (#164473 ) Previously, we would not take the arguments passed by get_local_rank into account. This means that we wouldn't be able to trace this call if we had a device_mesh > 1d Pull Request resolved: https://github.com/pytorch/pytorch/pull/164473 Approved by: https://github.com/xmfan, https://github.com/Skylion007	2025-10-04 11:27:55 +00:00
Yuanyuan Chen	5103ecc5d8	[1/N] Fix clang-tidy readability checks (#164561 ) Check all `.cpp` files except `jit` files for readability thoroughly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164561 Approved by: https://github.com/Skylion007	2025-10-04 09:40:38 +00:00
Evan Conway	9580539e2f	Add device argument to torch.random.get_rng_state (#163034 ) Fixes #162812 Adds support for either passing a device directly into get_rng_state, or passing in a string or int (which is then wrapped into a device inside, as in torch.cuda.get_rng_state). I wasn't exactly sure where tests for this should go, please let me know. I used this script for testing: ```python import torch # note: when running with CUDA GPU, first three tests will give the same result, # as will the last two # test with no device specified print(torch.get_rng_state()) # test with CPU cpu_device = torch.device("cpu") print(torch.get_rng_state(cpu_device)) # test with direct name print(torch.get_rng_state("cpu")) # test with CUDA cuda_device = torch.device("cuda:0") print(torch.get_rng_state(cuda_device)) # test with integer print(torch.get_rng_state(0)) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163034 Approved by: https://github.com/ezyang, https://github.com/cyyever	2025-10-04 06:48:39 +00:00
Yuanyuan Chen	a11a66ef32	Remove CUDA 11 branches for sparse code (#164531 ) This PR removes outdated CUDA version checks from sparse code in aten/src/ATen/cuda. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164531 Approved by: https://github.com/eqy	2025-10-04 06:07:49 +00:00
Shangdi Yu	6b768e1890	Support propagating custom meta field to backward graph nodes (#164174 ) # Propagate custom meta data to backward Support propagating the user annotation tags to backward graph, by extending the `copy_fwd_metadata_to_bw_nodes` utils (recommended by @xmfan , thanks!). Example annotation API (added in https://github.com/pytorch/pytorch/pull/163673): ``` class M(torch.nn.Module): def forward(self, x): with fx_traceback.annotate({"pp_stage": 0}): with fx_traceback.annotate({"fdsp_bucket": 0}): x = x + 1 x = x - 2 with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}): x = x * 2 x = x / 3 return x ``` Assumptions (some inherited from https://github.com/pytorch/pytorch/pull/126573): - I am trusting the seq_nr mapping introduced to aot_autograd nodes in https://github.com/pytorch/pytorch/pull/103129 - I am also trusting that the forward is single threaded, since seq_nr is thread local. If this isn't always true, we'll need to also plumb thread_id through the same machinery which is populating seq_nr. - (This is changed in this PR!) I assume all backward graph nodes has "is_backward" for 'partitioner_tag', and all other nodes are forward graph nodes. If we don't run export before `aot_export_join_with_descriptors`, then none of the nodes has "nn_module_stack" in node meta. If we do run export first, then we don't need this change. - I copy "custom" node meta from forward to backward graph nodes. Question: - Is it a good idea to copy all "custom" node meta? Or should we create a dedicated key in custom node meta to be copied? @SherlockNoMad - Do we expect people to run export before using `aot_export_join_with_descriptors`? - Can we assume the following for graph produced by `aot_export_join_with_descriptors`? "all backward graph nodes has "is_backward" for 'partitioner_tag', and all other nodes are forward graph nodes". 
Maybe this is a question for @ezyang ``` python test/functorch/test_aot_joint_with_descriptors.py -k test_preserve_ python test/export/test_export.py -k preserve_anno python test/distributed/tensor/test_dtensor_export.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164174 Approved by: https://github.com/xmfan, https://github.com/SherlockNoMad	2025-10-04 05:03:32 +00:00
Yuanyuan Chen	35c4130fd1	[2/N] Fix ruff warnings (#164460 ) Apply ruff `SIM` rules. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164460 Approved by: https://github.com/ezyang	2025-10-04 03:40:32 +00:00
Kirthi Shankar Sivamani	34042a9145	Change intra-graph offset dtype to `uint64_t` (#164515 ) Even though `offset_intragraph_` only tracks RNG consumption within a single graph replay, we have observed that the 32bit storage for these offsets is easy to overshoot, especially for cases with big CUDA graph captures including kernels that are generating a large amount of random numbers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164515 Approved by: https://github.com/eee4017, https://github.com/eqy	2025-10-04 03:39:09 +00:00
Ken	9d1ab4f4bb	[CI] Limit Numba CUDA-13 patch to CUDA environments only (#164607 ) The patch introduced in https://github.com/pytorch/pytorch/pull/163111 caused issues in ROCm environments. This change guards the patching logic to CUDA environments only, thus ameliorating test failures in ROCm environments. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164607 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-04 02:39:07 +00:00
Tugsbayasgalan Manlaibaatar	3e0826c9d7	Update disabling fast-path for strict-export inside MultiheadAttention (#164544 ) For some reason, executorch needs the slow path. But the original flag doesn't work for new export because we inline torch modules even before getting into make_fx. We still have to keep the old flag because a lot of code assumes this exists.... grr Differential Revision: [D83810733](https://our.internmc.facebook.com/intern/diff/D83810733) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164544 Approved by: https://github.com/anijain2305, https://github.com/mikaylagawarecki	2025-10-04 02:20:55 +00:00
fduwjj	86c789849e	[fr] Re-order mismatch check in fr analysis script (#164606 ) In reality we found the current mismatch order does not match the actual error distribution, so we reorder it a bit as follows: 1. We do collective type check first 2. Then size check (excluding all2all) 3. dtype check 4. state check Pull Request resolved: https://github.com/pytorch/pytorch/pull/164606 Approved by: https://github.com/VieEeEw	2025-10-04 01:16:15 +00:00
Yuanyuan Chen	f3afbcf340	[ONNX] Bump tested onnxruntime to 1.23.0 and onnxscript to 0.5.2 (#164440 ) Performs tests on the latest ONNX environment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164440 Approved by: https://github.com/justinchuby, https://github.com/albanD	2025-10-04 01:10:47 +00:00
Shunting Zhang	40b25578e4	[Inductor] deterministic mode (#163589 ) Add a deterministic mode to skip the on device benchmarking that we know should affect numerics. This includes - pad-mm - dynamic rblock scaling - template autotuning - coordinate descent tuning for reduction - reduction config autotuning in CachingAutotuner. For reduction both RBLOCK, num_warps should affect numerics. XBLOCK does not. We can still autotune XBLOCK for reductions. - benchmarking for computation communication reordering pass The mode definitely has a perf hit. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163589 Approved by: https://github.com/v0i0	2025-10-04 01:05:08 +00:00
Jeff Daily	412c6d28ec	[ROCm][CI] additional dynamo benchmarks for inductor-periodic (#164279 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164279 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-04 00:55:17 +00:00
soulitzer	7d570129e0	Fix custom autograd Function memory leak when saving mutated view (#164407 ) Fixes https://github.com/pytorch/pytorch/issues/160317 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164407 Approved by: https://github.com/albanD	2025-10-04 00:47:12 +00:00
Avik Chaudhuri	97ca21106d	move fw\|bw compiler args in aot joint with descriptors (#164584 ) Summary: Minor refactor where we push some args in the aot joint with descriptors workflow that are not used in export stage to the compile stage where they are actually used. Test Plan: existing tests should pass Differential Revision: D83850316 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164584 Approved by: https://github.com/tugsbayasgalan	2025-10-04 00:24:46 +00:00
Laith Sakka	27234792ad	Fix refine_ranges corner case (#164075 ) Addresses https://github.com/pytorch/pytorch/issues/161360 u0>0 should update the range of u0 to start from [1, ..]; this fixes it. It was not doing that. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164075 Approved by: https://github.com/ColinPeppler	2025-10-03 23:30:46 +00:00
Pierre Moulon	b6b7a44dec	Fix common typos and misspellings (#164413 ) Summary: This commit fixes numerous typos and misspellings found throughout the codebase. The fixes improve code readability and documentation consistency across C++, Python, CUDA, and documentation files. ## Typos Fixed \| Before \| After \| Occurrences \| \|--------\|-------\|-------------\| \| occured \| occurred \| 14 \| \| accross \| across \| 9 \| \| lenght/lenghts \| length/lengths \| 8 \| \| unneccessary \| unnecessary \| 5 \| \| Peform \| Perform \| 4 \| \| furture \| future \| 3 \| \| paritioned \| partitioned \| 2 \| \| desireable \| desirable \| 2 \| \| registerations \| registrations \| 2 \| \| seperated \| separated \| 2 \| \| intialized \| initialized \| 2 \| \| capatibility \| compatibility \| 2 \| \| peformed \| performed \| 2 \| \| Exmple \| Example \| 2 \| \| comma_seperated \| comma_separated \| 2 \| \| cumsuming \| consuming \| 2 \| \| neccessary \| necessary \| 1 \| \| ParamterMetadataTable \| ParameterMetadataTable \| 1 \| \| matached \| matched \| 1 \| \| conaitner \| container \| 1 \| \| reivew \| review \| 1 \| \| prioriry \| priority \| 1 \| \| Alocated \| Allocated \| 1 \| \| opportunixtically \| opportunistically \| 1 \| \| peformance \| performance \| 1 \| \| equavalent \| equivalent \| 1 \| \| asssumed \| assumed \| 1 \| \| valdiation \| validation \| 1 \| \| apprear \| appear \| 1 \| \| consectuve \| consecutive \| 1 \| \| dependending \| depending \| 1 \| \| copnversion \| conversion \| 1 \| \| weigted \| weighted \| 1 \| \| repreesenting \| representing \| 1 \| \| finialize \| finalize \| 1 \| \| unintialized \| uninitialized \| 1 \| \| conbined \| combined \| 1 \| \| tesnor \| tensor \| 1 \| \| desugared \| discarded \| 1 \| \| behaviour \| behavior \| 1 \| \| paramerizaitons \| parametrizations \| 1 \| \| compute_output_lenghths_kernel \| compute_output_lengths_kernel \| 1 \| Test Plan: N/A - mostly comments - waiting on CI Differential Revision: D83695665 Pull 
Request resolved: https://github.com/pytorch/pytorch/pull/164413 Approved by: https://github.com/eqy, https://github.com/larryliu0820	2025-10-03 23:19:41 +00:00
PyTorch MergeBot	3ddf2018d0	Revert "Support setting grad_dtype on leaf tensors (#162815 )" This reverts commit dca73982c53e9f99f96246b5d9ed9bab83c7423f. Reverted https://github.com/pytorch/pytorch/pull/162815 on behalf of https://github.com/yangw-dev due to break internal test D83850533, see more details below ([comment](https://github.com/pytorch/pytorch/pull/162815#issuecomment-3367498501))	2025-10-03 23:14:28 +00:00
Catherine Lee	fac6f20ae3	[CI] Add another win shard (#164605 ) Since it's timing out `0b4f2b46d9/1` the first shard is disproportionately long because of cpp tests, I'm trying to figure that out but for now we can do this or increase the timeout Pull Request resolved: https://github.com/pytorch/pytorch/pull/164605 Approved by: https://github.com/seemethere, https://github.com/malfet	2025-10-03 22:51:09 +00:00
Pradeep Fernando	1894082000	UT/Examples for resharding checkpoint save/loads for distributed tensors with uneven shards. (#160533 ) 1\ DTensor abstraction on its own, does not support arbitrary length shards in its distributed tensors representation. It supports a single uneven shard, but it has to be the last shard in the sharding dimension. 2\ However, DCP supports an API called checkpointable. This API allows you to define your custom shardable tensor structure. I have given a UT example ( look for CheckpointableDistTensor). Therefore, one option is to use CheckpointableDistTensor to save/load uneven shards. 3\ While exploring this path, I also noticed that torch.rec module also encountered a similar problem while working with DTensor. They work around it by implementing Checkpointable API in DTensor and introducing an auxiliary structure called LocalShardsWrapper. This is the second option we can use to unblock data loader resharding work. In summary: Use LocalShardsWrapper + DTensor as the first option to unblock. Second preference is to use new implementation of Checkpointable API. ( similar to CheckpointableDistTensor I have introduced in this example). Differential Revision: D80182564 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160533 Approved by: https://github.com/saumishr	2025-10-03 22:15:02 +00:00
William Wen	5a66ff4915	[dynamo, 3.14] fix _detect_and_normalize_assert_statement for 3.14 (#164005 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164005 Approved by: https://github.com/anijain2305, https://github.com/atalman	2025-10-03 22:07:54 +00:00
Pian Pawakapan	abadea70f3	[inductor] thread hint_override in more kernel args (#164494 ) ensure hint_override is threaded in benchmarking args Pull Request resolved: https://github.com/pytorch/pytorch/pull/164494 Approved by: https://github.com/bobrenjc93	2025-10-03 22:07:12 +00:00
Maggie Moss	f414aa8e0d	Add pyrefly suppressions (3/n) (#164588 ) Adds suppressions so pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check step 1: uncomment lines in the pyrefly.toml file step 2: run pyrefly check step 3: add suppressions, clean up unused suppressions before: https://gist.github.com/maggiemoss/bb31574ac8a59893c9cf52189e67bb2d after: 0 errors (1,970 ignored) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164588 Approved by: https://github.com/oulgen	2025-10-03 22:03:03 +00:00
albanD	e438db2546	Make custom op alias check consistent (#164576 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164576 Approved by: https://github.com/soulitzer ghstack dependencies: #164467	2025-10-03 21:42:11 +00:00
albanD	10335ffb2c	Add pure view support in autograd Function (#164467 ) Fix https://github.com/pytorch/pytorch/issues/73604 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164467 Approved by: https://github.com/ezyang, https://github.com/soulitzer	2025-10-03 21:42:11 +00:00
Lakshay Garg	f006aee601	Speed up FP precision lookup (#164044 ) This commit simplifies the precision lookup and setting logic by reducing the number of branches and using a custom hash function. Fixes #161822. The issue described in #163709 still persists. This is meant as a short term fix. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164044 Approved by: https://github.com/ngimel, https://github.com/eqy	2025-10-03 21:35:20 +00:00
Eli Uriegas	8d53d788fe	lint: add .pyi to changed files on .pyi.in changes (#164603 ) We were observing issues where the lint on trunk vs. PRs would be different due to missing .pyi files. This change adds the .pyi files to the changed files list when .pyi.in files are changed. Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164603 Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/Skylion007	2025-10-03 21:30:54 +00:00
PyTorch MergeBot	0b4f2b46d9	Revert "[inductor] require shape in TritonCSEVariable (#162275 )" This reverts commit f465ea6752c91498de63eb57439a74f4836e568a. Reverted https://github.com/pytorch/pytorch/pull/162275 on behalf of https://github.com/yangw-dev due to break internal test, see more details in next comment ([comment](https://github.com/pytorch/pytorch/pull/162275#issuecomment-3367213941))	2025-10-03 21:07:00 +00:00
Cynthia Yang	960c4b9937	[inductor] Enable triton kernels with unbacked inputs (#164509 ) Summary: We need to pass in fallback value to avoid converting symbols to int original failure log in onefeed Slimper MB - P1973406565 `raise TypeError("Cannot convert symbols to int")` Test Plan: if not passing in fallback value - https://www.internalfb.com/intern/everpaste/?handle=GGeAoh_M11kEGOECAFELOaq8ooRCbswMAAAz `raise TypeError("Cannot convert symbols to int")` ``` buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:unbacked_symints -- test_triton_kernel_with_unbacked_symint_fallback --print-passing-details --env TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --env TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(u0, 0)" ``` Buck UI: https://www.internalfb.com/buck2/4d27cd49-770b-40de-8c65-9ee04c5dd687 Test UI: https://www.internalfb.com/intern/testinfra/testrun/9570149324695031 Network: Up: 0B Down: 16MiB (reSessionID-8e8b07a2-e31c-402d-bf6a-ebb92253e654) Executing actions. Remaining 0/6 5.0s exec time total Command: test. Finished 2 cache (100% hit) 5.0s exec time cached (100%) Time elapsed: 33.8s Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0 Differential Revision: D83684260 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164509 Approved by: https://github.com/ColinPeppler	2025-10-03 21:05:18 +00:00
Yuanyuan Chen	1f8ee5da11	[TorchGen] Remove unused variables and function imports (#164538 ) This PR removes unused code in torchgen. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164538 Approved by: https://github.com/Skylion007, https://github.com/albanD	2025-10-03 20:49:36 +00:00
rraminen	da49a57d34	[ROCm] Enabled JIT UTs on ROCm (#164582 ) This PR is to enable the following tests rocm. test/test_jit.py::TestBackends::test_save_load test/test_jit.py::TestBackends::test_execution test/test_jit.py::TestBackends::test_errors test/test_jit.py::TestCUDA::test_current_stream Verified that the tests pass on AMD gfx90a and gfx942 arch. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164582 Approved by: https://github.com/jeffdaily	2025-10-03 20:16:41 +00:00
PyTorch MergeBot	8ec8c14ace	Revert "[CUDA] Add experimental green context support for SM carveout (#159104 )" This reverts commit 3c59351c6ea2fc29d346903e28e95c5f4d0ccdbb. Reverted https://github.com/pytorch/pytorch/pull/159104 on behalf of https://github.com/clee2000 due to failed lint, pyfmt not caught pyi file, I think they need special handling since theyre not in the changed files list? ([comment](https://github.com/pytorch/pytorch/pull/159104#issuecomment-3367077208))	2025-10-03 20:15:56 +00:00
Yuanyuan Chen	2d50678dcc	Fix -Wno-duplicate-decl-specifier is valid for C/ObjC but not for C++ (#164552 ) Fixes #99715 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164552 Approved by: https://github.com/Skylion007	2025-10-03 20:12:49 +00:00
Prachi	3ca09d65f1	[ROCm] Enable several distributed UTs (#164390 ) Increase the tolerance for the following UTs as there was a slight mismatch seen on MI200. - test_data_parallel.py:test_strided_grad_layout - test_c10d_nccl.py:test_grad_layout_1devicemodule_1replicaperprocess Skip for MI200: - test_fully_shard_training.py:test_2d_mlp_with_nd_mesh - test_2d_composability.py:test_train_parity_2d_mlp - test_fully_shard_overlap.py:test_fully_shard_training_overlap Fixes #159489 Fixes #159488 Fixes #152700 Fixes #125555 Fixes #134139 Working as is on both MI200 and MI300: Fixes #125991 Fixes #125918 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164390 Approved by: https://github.com/jeffdaily	2025-10-03 19:52:51 +00:00
Nikita Shulga	1bb68271b7	Stop building nativert in OSS (#164463 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164463 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-10-03 19:41:15 +00:00
Aaron Gokaslan	9eb89a4ad5	Add missing TypeIs to torch/_inductor/ir.py (#164489 ) This should be a TypeIs here Pull Request resolved: https://github.com/pytorch/pytorch/pull/164489 Approved by: https://github.com/mlazos	2025-10-03 19:34:20 +00:00
blorange-amd	15d726005d	Enable several unit tests on ROCm (#163087 ) Code change enables: test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_cuda_float16 test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_cuda_float32 test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_cuda_float64 test_nn::TestNNDeviceTypeCUDA::test_transformerencoderlayer_gelu_cuda_float16 test_linalg::TestLinalgCUDA::test_eigh_svd_illcondition_matrix_input_should_not_crash_cuda_float32 test_linalg::TestLinalgCUDA::test_eigh_svd_illcondition_matrix_input_should_not_crash_cuda_float64 test_ops::TestCommonCUDA::test_complex_half_reference_testing_as_strided_scatter_cuda_complex32 Fixes https://github.com/pytorch/pytorch/issues/134687 Fixes https://github.com/pytorch/pytorch/issues/78205 Closing github issues: inductor/test_gpu_cpp_wrapper unit tests: Fixes https://github.com/pytorch/pytorch/issues/157084 test_nn unit tests: Fixes https://github.com/pytorch/pytorch/issues/157167 Fixes https://github.com/pytorch/pytorch/issues/157119 Fixes https://github.com/pytorch/pytorch/issues/157118 Fixes https://github.com/pytorch/pytorch/issues/157115 Fixes https://github.com/pytorch/pytorch/issues/157081 Fixes https://github.com/pytorch/pytorch/issues/155216 Fixes https://github.com/pytorch/pytorch/issues/157259 Fixes https://github.com/pytorch/pytorch/issues/157166 Fixes https://github.com/pytorch/pytorch/issues/157165 Fixes https://github.com/pytorch/pytorch/issues/157164 Fixes https://github.com/pytorch/pytorch/issues/157117 Fixes https://github.com/pytorch/pytorch/issues/157116 Fixes https://github.com/pytorch/pytorch/issues/157114 Fixes https://github.com/pytorch/pytorch/issues/157113 Fixes https://github.com/pytorch/pytorch/issues/157082 Fixes https://github.com/pytorch/pytorch/issues/157080 Fixes https://github.com/pytorch/pytorch/issues/157079 Fixes https://github.com/pytorch/pytorch/issues/157078 test_linalg unit tests: Fixes https://github.com/pytorch/pytorch/issues/157427 
Fixes https://github.com/pytorch/pytorch/issues/157414 Fixes https://github.com/pytorch/pytorch/issues/157369 Fixes https://github.com/pytorch/pytorch/issues/157349 Fixes https://github.com/pytorch/pytorch/issues/157348 Fixes https://github.com/pytorch/pytorch/issues/157337 Fixes https://github.com/pytorch/pytorch/issues/157336 Fixes https://github.com/pytorch/pytorch/issues/157297 Fixes https://github.com/pytorch/pytorch/issues/157281 Fixes https://github.com/pytorch/pytorch/issues/157260 Fixes https://github.com/pytorch/pytorch/issues/157171 Fixes https://github.com/pytorch/pytorch/issues/157169 Fixes https://github.com/pytorch/pytorch/issues/157168 Fixes https://github.com/pytorch/pytorch/issues/157125 Fixes https://github.com/pytorch/pytorch/issues/157124 Fixes https://github.com/pytorch/pytorch/issues/157123 Fixes https://github.com/pytorch/pytorch/issues/157089 Fixes https://github.com/pytorch/pytorch/issues/157088 Fixes https://github.com/pytorch/pytorch/issues/157087 Fixes https://github.com/pytorch/pytorch/issues/157068 Fixes https://github.com/pytorch/pytorch/issues/157067 Fixes https://github.com/pytorch/pytorch/issues/157066 Fixes https://github.com/pytorch/pytorch/issues/157047 Fixes https://github.com/pytorch/pytorch/issues/157046 Fixes https://github.com/pytorch/pytorch/issues/157045 Fixes https://github.com/pytorch/pytorch/issues/157044 Fixes https://github.com/pytorch/pytorch/issues/156997 Fixes https://github.com/pytorch/pytorch/issues/156996 Fixes https://github.com/pytorch/pytorch/issues/156995 Fixes https://github.com/pytorch/pytorch/issues/156994 Fixes https://github.com/pytorch/pytorch/issues/156993 Fixes https://github.com/pytorch/pytorch/issues/156991 Fixes https://github.com/pytorch/pytorch/issues/156990 Fixes https://github.com/pytorch/pytorch/issues/156989 Fixes https://github.com/pytorch/pytorch/issues/105118 Fixes https://github.com/pytorch/pytorch/issues/157415 Fixes https://github.com/pytorch/pytorch/issues/157282 Fixes 
https://github.com/pytorch/pytorch/issues/157261 Fixes https://github.com/pytorch/pytorch/issues/157170 Fixes https://github.com/pytorch/pytorch/issues/157126 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163087 Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony	2025-10-03 19:30:59 +00:00
Zhengxu Chen	16f9bef642	[precompile] Fix guard serialization loading bugs. (#164490 ) Summary: Added a set of fixes triggered by fm training job. Overall the theme here is that we should get rid of saved objects as much as possible when they are not used in guard reconstruction. Sometimes for objects that cannot be saved (like local functions) we still try our best to save their closures. Test Plan: test_guard_serialization.py test_lazy_awatiable.py Differential Revision: D83766926 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164490 Approved by: https://github.com/jamesjwu	2025-10-03 19:20:07 +00:00
Eddie Yan	3c59351c6e	[CUDA] Add experimental green context support for SM carveout (#159104 ) Low-level PyTorch APIs should be usable/stable enough at this point but we might move the underlying driver API usage a bit from here... Built on top of @drisspg 's branch Pull Request resolved: https://github.com/pytorch/pytorch/pull/159104 Approved by: https://github.com/ngimel Co-authored-by: drisspg <drisspguessous@gmail.com>	2025-10-03 18:59:12 +00:00
Eli Uriegas	7eb1eb4313	ci: Removing ROCm tests from trunk. (#164585 ) Had a conversation with the AMD team today and I think we are all in agreement that the current state of queueing for AMD is beyond where we'd like to be for there to be blocking CI for ROCm. Moving the representative testing jobs for this into the ciflow/rocm workflow. I'd love for these to be back in trunk if we can get to a state where our queueing metrics are below an hour for ROCm infrastructure. Dashboards: * ROCm Queueing (>60mins) ([link](https://hud.pytorch.org/queue_time_analysis?dateRange=30&startDate=2025-09-03T16%3A06%3A45.025Z&endDate=2025-10-03T16%3A06%3A45.025Z&granularity=week&chartType=bar&repos=pytorch%2Fpytorch&category=machine_type&machineTypes=linux.rocm.gpu.2&machineTypes=linux.rocm.gpu.4&machineTypes=linux.rocm.gpu.mi250&machineTypes=linux.rocm.gpu.gfx942.1&machineTypes=linux.rocm.gpu.mi250.4&machineTypes=linux.rocm.gpu.gfx942.4&machineTypes=linux.rocm.gpu.mi355.2&machineTypes=linux.rocm.gpu.gfx942.4.test&machineTypes=linux.rocm.gpu.mi250.1&machineTypes=linux.rocm.gpu.gfx942.1.test&machineTypes=linux.rocm.gpu.gfx90a.1&machineTypes=linux.rocm.gpu.gfx90a.4&items=linux.rocm.gpu.2&items=linux.rocm.gpu.4&items=linux.rocm.gpu.mi250&items=linux.rocm.gpu.gfx942.1&items=linux.rocm.gpu.mi250.4&items=linux.rocm.gpu.gfx942.4&items=linux.rocm.gpu.mi355.2&items=linux.rocm.gpu.gfx942.4.test&items=linux.rocm.gpu.mi250.1&items=linux.rocm.gpu.gfx942.1.test&items=linux.rocm.gpu.gfx90a.1&items=linux.rocm.gpu.gfx90a.4)) * NVIDIA queueing (<5mins) 
([link](https://hud.pytorch.org/queue_time_analysis?dateRange=30&startDate=2025-09-03T16%3A05%3A08.000Z&endDate=2025-10-03T16%3A05%3A08.000Z&granularity=week&chartType=bar&repos=pytorch%2Fpytorch&category=machine_type&machineTypes=lf.linux.g4dn.4xlarge.nvidia.gpu&machineTypes=linux.g4dn.12xlarge.nvidia.gpu&machineTypes=linux.g4dn.metal.nvidia.gpu&machineTypes=linux.g5.4xlarge.nvidia.gpu&machineTypes=lf.linux.g4dn.12xlarge.nvidia.gpu&machineTypes=lf.linux.g5.12xlarge.nvidia.gpu&machineTypes=lf.linux.g5.4xlarge.nvidia.gpu&machineTypes=lf.linux.g6.4xlarge.experimental.nvidia.gpu&machineTypes=linux.g6.4xlarge.experimental.nvidia.gpu&machineTypes=linux.4xlarge.nvidia.gpu&machineTypes=linux.g5.12xlarge.nvidia.gpu&machineTypes=linux.g4dn.4xlarge.nvidia.gpu&machineTypes=lf.linux.4xlarge.nvidia.gpu&machineTypes=linux.g6.12xlarge.nvidia.gpu&items=lf.linux.g4dn.4xlarge.nvidia.gpu&items=linux.g4dn.12xlarge.nvidia.gpu&items=linux.g4dn.metal.nvidia.gpu&items=linux.g5.4xlarge.nvidia.gpu&items=lf.linux.g4dn.12xlarge.nvidia.gpu&items=lf.linux.g5.12xlarge.nvidia.gpu&items=lf.linux.g5.4xlarge.nvidia.gpu&items=lf.linux.g6.4xlarge.experimental.nvidia.gpu&items=linux.g6.4xlarge.experimental.nvidia.gpu&items=linux.4xlarge.nvidia.gpu&items=linux.g5.12xlarge.nvidia.gpu&items=linux.g4dn.4xlarge.nvidia.gpu&items=lf.linux.4xlarge.nvidia.gpu&items=linux.g6.12xlarge.nvidia.gpu)) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164585 Approved by: https://github.com/malfet, https://github.com/yangw-dev, https://github.com/atalman, https://github.com/jeffdaily	2025-10-03 18:19:24 +00:00
Banit Agrawal	f39789cdab	[PyTorch Pinned Allocator] Add support of reserved pinned memory segment to avoid slow paths (#164501 ) Summary: This diff adds the feature of allocating a large pinned memory segment upfront based on the provided config. This large segment is then used to serve all the small pinned memory requests to avoid expensive device level APIs (slow paths). Example: PYTORCH_CUDA_ALLOC_CONF=pinned_reserve_segment_size_mb:2048 This reserves a 2GB pinned memory segment for the process and then all incoming small requests are just served from this segment and no cudaHostAlloc/cudaHostRegister apis are being called. Differential Revision: D83779074 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164501 Approved by: https://github.com/yangw-dev	2025-10-03 18:11:27 +00:00
Yuanyuan Chen	3d9d41c801	Remove old workaround in launch_logcumsumexp_cuda_kernel (#164567 ) Remove workaround for CUDA 11.4 . Pull Request resolved: https://github.com/pytorch/pytorch/pull/164567 Approved by: https://github.com/Aidyn-A, https://github.com/Skylion007	2025-10-03 18:07:02 +00:00
Pian Pawakapan	5b0b4cda4a	[dtensor] avoid shape recompilations on DTensorSpec (#163820 ) skips DTensorSpec.sizes/strides in metadata guard checks Pull Request resolved: https://github.com/pytorch/pytorch/pull/163820 Approved by: https://github.com/azahed98	2025-10-03 17:18:18 +00:00
Tugsbayasgalan Manlaibaatar	2a11ce2c78	Support calling torch.compile inside non-strict export (#164171 ) So this fixes at least two issues: 1) When we are invoking inductor backend, we apply pre-grad passes which try to find correct fake mode to use. In the nested case, we will run into clash when there is closure variable in the inductor region because non-strict would have fakified this variable before hand and inner torch.compile would have created a new fresh fake mode. This is not a problem in regular torch.compile because inner torch.compile gets ignored. I don't know if we are supposed to inherit fake mode from parent context in this case. But we can avoid this problem if we just default to eager backend which is fine in this case because the point of export is to capture aten operators. Going to inductor would mean we will lose inner torch.compile ops. 2) There is custom torch function modes in export that track number of torch fns executed and inner compile itself doesn't work because of guard failure as this mode state gets changed. I noticed torch.cond fixes this problem by carefully stashing the torch function mode and defer it in the backend. So the correct thing to do here is just re-use torch.cond implementation unconditionally. So the things i did for fixing above were: 1) Always default to eager backend when compile is invoked inside export. I needed to make how torch.cond sets up the fresh tracing env into an util that can be shared. 2) The previous eager backend for torch.cond was wrong because the context managers didn't actually persist until the backend is invoked. 3) torch.cond used only disable TorchFunctionMetadata tf mode and stash it for later, but in fact, we should do both TorchFunctionMetadata and PreDispatchTorchFunctionMode. With above fixes, we are able to export flex attention in export. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164171 Approved by: https://github.com/ydwu4	2025-10-03 16:31:07 +00:00
drisspg	3288fbf374	Change default device to current accelerator (#164399 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164399 Approved by: https://github.com/albanD	2025-10-03 16:15:09 +00:00
James Wu	fa5306b4f5	Support partial _DynamoCacheEntries when not all backends available (#163521 ) Differential Revision: [D82735769](https://our.internmc.facebook.com/intern/diff/D82735769/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163521 Approved by: https://github.com/zhxchen17	2025-10-03 16:14:32 +00:00
Jeff Daily	5656d45c8f	forward fix #164481 (#164578 ) PR #164481 added unit test test_scaled_mm_preserves_strides in test/inductor/test_fp8.py. It was missing the adjustment for ROCm's F8 types on MI300. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164578 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-03 15:44:34 +00:00
atalman	e40fe634b1	Pin conda version for Docker builds (#164575 ) Mitigates https://github.com/pytorch/pytorch/issues/164574 Remove unused CUDA_CHANNEL var - this was used before when we had pytorch install via conda. Please note: CUDA 13.0 failures are expected since the CI tries to build against prod and CUDA 13.0 is not available in prod yet. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164575 Approved by: https://github.com/malfet, https://github.com/Camyll	2025-10-03 15:01:35 +00:00
bobrenjc93	3db2164341	[torchfuzz] add norm operators (#164514 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164514 Approved by: https://github.com/pianpwk ghstack dependencies: #164432, #164434	2025-10-03 14:44:19 +00:00
bobrenjc93	5bb8f04d3e	[torchfuzz] add nn functional ops (#164434 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164434 Approved by: https://github.com/pianpwk ghstack dependencies: #164432	2025-10-03 14:44:19 +00:00
Yuanyuan Chen	5743d731c1	Use torch.testing.test_close instead of torch.testing.test_allclose (#164539 ) Because torch.testing.test_allclose is deprecated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164539 Approved by: https://github.com/mlazos	2025-10-03 14:39:10 +00:00
PyTorch UpdateBot	aed66248a0	[vllm hash update] update the pinned vllm hash (#164319 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164319 Approved by: https://github.com/pytorchbot Co-authored-by: Huy Do <huydhn@gmail.com>	2025-10-03 12:30:33 +00:00
Nicolas Macchioni	6c3c9414eb	config for dcache + unit tests (#164512 ) Test Plan: ``` buck test fbcode//mode/opt caffe2/test/inductor:caching ``` Reviewed By: aorenste Differential Revision: D83714687 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164512 Approved by: https://github.com/jananisriram	2025-10-03 10:52:59 +00:00
CWOA	eccf561326	Move call to output generated code in inductor (#161615 ) This PR moves the call to copy the generated code from `/tmp/...` so that it is still called if attempting to compile the generated code fails. In both cases now, the generated code will be copied across to `torch_compile_debug/run_.../torchinductor/output_code.py` which makes debugging bad generated code easier. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161615 Approved by: https://github.com/eellison	2025-10-03 10:23:22 +00:00
jainapurva	ddf8de28c2	Add Rocm to Operator Microbenchmark CI (#164173 ) This pull request adds support for running operator microbenchmarks on ROCm (AMD GPU) environments in the CI workflow. The main changes involve introducing new build and test jobs for ROCm in the `.github/workflows/operator_microbenchmark.yml` file. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164173 Approved by: https://github.com/huydhn	2025-10-03 07:35:32 +00:00
bobrenjc93	7617b113ad	[torchfuzz] Support EagerVsFullGraphDynamicCompileWithNumericsCheck (#164432 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164432 Approved by: https://github.com/pianpwk	2025-10-03 06:42:20 +00:00
fduwjj	2a760dc51e	[DeviceMesh] Simplifying internal bookkeeping with CuTe layout (#163213 ) We want to refactor the internal bookkeeping of DeviceMesh so that: Simplify the bookkeeping logic and make it generic enough so that it is easy to support new transformations like flatten noncontiguous dim, reshape and unflatten. (We leveraged the CuTe layout). This new layout also makes non-contiguous slicing, flatten, and transpose possible. Concretely, in this PR, we do the following: 1. Use the `_MeshLayout` to handle all index operations rather than using a map to record mesh dims. 2. Removed `flatten_name_to_root_dims`, because now we can directly get layout from a flattened device mesh. 3. Replaced `_get_slice_mesh_dims` with `_get_slice_mesh_layout`. 4. Use the newly added function `check_overlap` to check layout overlap. 5. Use a new function `to_remapping_tensor` to use layout ranks as indices when the mesh tensor is not representable as CuTe. The reason is that layout acts as a backend of mesh tensor bookkeeping (indexing indices), it needs to be used as indices for remap back to the mesh tensor for new DeviceMesh generation and backend init. For example, in the case of 2K to 4K, the underlying layout is (2K, 1) but the actual value of the mesh tensor is [2K, 2K+1, ....,]. While flattening, slicing, we need to remap the layout back to the new mesh tensor so it maps the actual device allocation. For example, in the 2K to 4K case, if the shape is (1K, 1K) with dim_names ("dp", "tp"). Then when slicing "tp", the mesh tensor should be (2K, 2K+1, ..., 3K-1) or (3K, 3K+1, ... 4K-1). not the global ranks generated from the layout. (1K, 1). Verified that loss curve is very close for DeepSeekV3 on torchtitan, note that exact same match is challenging because even if we run the baseline twice, the loss curve does not exactly match. 
<img width="1113" height="490" alt="image" src="https://github.com/user-attachments/assets/7877b5a4-337e-4ad8-b878-2378f4f0f38d" /> The PR looks big indeed but we don't change any existing behavior of DeviceMesh, so it is a pure refactor. With this refactoring we also enabled the slicing and flatten of non-contiguous dims of a device mesh which is hard to implement without cute layout. This is a continue of https://github.com/pytorch/pytorch/pull/161106 (original one got messed with EasyCLA) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163213 Approved by: https://github.com/lw, https://github.com/fegin	2025-10-03 05:51:28 +00:00
Henry Tsang	6c209bfc5c	[cutlass-4][take 2] upgrade to cutlass 4.2.1 (#164159 ) Test Plan: Sandcastle Differential Revision: D83492704 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164159 Approved by: https://github.com/Skylion007, https://github.com/mlazos	2025-10-03 03:47:59 +00:00
Maggie Moss	1051c1de5c	Add pyrefly suppressions 2/n (#164513 ) Adds suppressions to pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Test plan: dmypy restart && python3 scripts/lintrunner.py -a pyrefly check --- step 1: uncomment lines in the `pyrefly.toml` file before: https://gist.github.com/maggiemoss/911b4d0bc88bf8cf3ab91f67184e9d46 after: ``` INFO Checking project configured at `/Users/maggiemoss/python_projects/pytorch/pyrefly.toml` INFO 0 errors (1,152 ignored) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164513 Approved by: https://github.com/oulgen	2025-10-03 02:46:13 +00:00
Ke Wen	d1cbb74fb1	multimem reduce (#164517 ) Modified `multimem_one_shot_all_reduce_out` function to accept a `root` argument, making it a `multimem_reduce` op. The original `multimem_one_shot_all_reduce` op becomes a caller of the `multimem_reduce`, with each rank providing its own rank id as root. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164517 Approved by: https://github.com/ngimel	2025-10-03 02:41:10 +00:00
Markus Hoehnerbach	91c4db76cb	fix flex attention eager: dont round down scores to low-precision (closes #163588 ) (#163986 ) Fixes: https://github.com/pytorch/pytorch/issues/163588 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163986 Approved by: https://github.com/drisspg, https://github.com/mlazos	2025-10-03 01:09:59 +00:00
eellison	4691fe6070	remove unnecessary registration (#164481 ) scaled_mm already had `needs_exact_strides` in its op registration. also added a test showing these strides are being respected. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164481 Approved by: https://github.com/drisspg, https://github.com/mlazos	2025-10-03 01:03:12 +00:00
Kurt Mohler	ef50c6e3e3	[MPS] Add backward pass for `embedding_bag` (#163931 ) Fixes #162270 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163931 Approved by: https://github.com/malfet	2025-10-03 00:48:38 +00:00
eellison	86474ce996	Update mask dtype (#164472 ) Differential Revision: [D83781684](https://our.internmc.facebook.com/intern/diff/D83781684) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164472 Approved by: https://github.com/bdhirsh	2025-10-03 00:19:36 +00:00
Yuanyuan Chen	18e18488e8	[6/N] Apply ruff UP035 rule (#164438 ) Continued code migration to enable ruff UP035. Most changes are about moving `Callable` from typing to from collections.abc. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164438 Approved by: https://github.com/ezyang	2025-10-03 00:15:32 +00:00
Eddie Yan	f7082e92b3	[cuBLAS] update cuBLAS determinism docs, remove workspace requirement checks (#161749 ) Since CUDA 11.x (need to update the docs for this, current PR is saying 12.2 which is incorrect) we've been allocating cuBLAS workspaces explicitly per handle/stream combination https://github.com/pytorch/pytorch/pull/85447 According to the cuBLAS documentation, this appears to be sufficient for determinism without any explicit workspace requirements to e.g., `:4096:8` or `:16:8` as was previously expressed in PyTorch docs https://docs.nvidia.com/cuda/cublas/#results-reproducibility Planning to add an explicit determinism test as well... Pull Request resolved: https://github.com/pytorch/pytorch/pull/161749 Approved by: https://github.com/ngimel	2025-10-03 00:09:47 +00:00
Yang Wang	95a053284c	Fix vllm build issue (#164361 ) Fixes #ISSUE_NUMBER unstable https://github.com/pytorch/pytorch/issues/164362 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164361 Approved by: https://github.com/huydhn Co-authored-by: Huy Do <huydhn@gmail.com>	2025-10-02 23:34:21 +00:00
Jagadish Krishnamoorthy	c7e30ae4dd	MX: Remove redundant PLATFORM_SUPPORTS_MX_GEMM constant (#164320 ) Deleted duplicate definition of PLATFORM_SUPPORTS_MX_GEMM, was introduced in https://github.com/pytorch/pytorch/pull/162209 Also, adjusted BLOCK_SIZE and fp4_scaling_dtype in test_matmul_cuda.py to enable test_blockwise_nvfp4_compile on ROCm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164320 Approved by: https://github.com/jeffdaily	2025-10-02 23:30:56 +00:00
soulitzer	dca73982c5	Support setting grad_dtype on leaf tensors (#162815 ) `grad_dtype` is a new attribute on Tensor to control gradient dtype: - Access/setting is leaf-only. - grad_dtype is respected when (1) when assigning to .grad, and (2) in the engine after the previous node produces incoming gradients for AccumulateGrad. (See table below for details) - Not setting grad_dtype preserves the current behavior. Accessing it returns `t.dtype` - `grad_dtype` cannot be set when there is already a `.grad` present and the dtypes conflict. \| `grad_dtype` setting \| Setting `.grad` manually \| Incoming gradient from autograd engine \| \|-----------------------\|--------------------------\|-----------------------------------------\| \| Default (tensor’s dtype) \| `.grad` must match tensor’s dtype \| Engine casts incoming grad to tensor’s dtype \| \| Set to specific dtype \| `.grad` must match that dtype \| Engine casts incoming grad to the specified dtype \| \| Set to `None` \| `.grad` may be any dtype \| Engine does not cast; accepts incoming grad dtype as-is \| Pull Request resolved: https://github.com/pytorch/pytorch/pull/162815 Approved by: https://github.com/albanD	2025-10-02 23:09:07 +00:00
Nan Zhang	43848b71d9	Improved support for autotuning in wrapper_fxir (#164132 ) Summary: - correct dtype propagation - allow more options to be passed to compiler Test Plan: in follow up change Differential Revision: D83367909 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164132 Approved by: https://github.com/jansel	2025-10-02 22:54:22 +00:00
Laith Sakka	15c8bdcc5e	Fix FloorDiv should not generate non integer rationals (due to sympy bug) (#164398 ) FloorDiv eval has this optimization ``` # Expands (x + y) // b into x // b + y // b. # This only works if floor is an identity, i.e. x / b is an integer. ``` Before this PR this optimization would generate a result in an expression like this. Due to a bug in sympy. ``` Mul(Rational(1, 22), Add(Mul(Integer(24), Symbol('s37', integer=True, positive=True)), Integer(672)), FloorDiv(Mul(Symbol('s14', integer=True, positive=True), Symbol('s46', integer=True, positive=True)), Integer(2016))) ``` This is because in sympy an expression can have .is_integer =True yet have 1/22 in it! This PR ensures we do not generate that by simply opting out of this optimization if we end up with a quotient that has such a rational. Fix https://github.com/pytorch/pytorch/issues/164385, https://github.com/pytorch/pytorch/issues/154996 https://github.com/pytorch/pytorch/issues/153375 https://github.com/pytorch/pytorch/issues/164063 and internal user issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164398 Approved by: https://github.com/jansel, https://github.com/isuruf	2025-10-02 22:51:03 +00:00
PyTorch MergeBot	22e219d996	Revert "[DeviceMesh] Simplifying internal bookkeeping with CuTe layout (#163213 )" This reverts commit b0985144b59db8fb20964829b5e0a9d2f9a3f0d6. Reverted https://github.com/pytorch/pytorch/pull/163213 on behalf of https://github.com/yangw-dev due to caused internal test failure ([comment](https://github.com/pytorch/pytorch/pull/163213#issuecomment-3363414435))	2025-10-02 22:22:26 +00:00
Anthony Barbier	bdc0a421d7	Stop parsing command line arguments every time common_utils is imported. (#156703 ) Last PR in the series to re-submit https://github.com/pytorch/pytorch/pull/134592 as smaller PRs: https://github.com/pytorch/pytorch/pull/154612 https://github.com/pytorch/pytorch/pull/154628 https://github.com/pytorch/pytorch/pull/154715 https://github.com/pytorch/pytorch/pull/154716 https://github.com/pytorch/pytorch/pull/154725 https://github.com/pytorch/pytorch/pull/154728 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156703 Approved by: https://github.com/clee2000	2025-10-02 22:22:04 +00:00
ankushwahaRH	ece5e0f01b	Fake process group Direct construction error (#163665 ) Fixes #162129. Added validation in _rank_not_in_group() to check if ```FakeProcessGroup``` is properly initialized before use, raising a clear error message if ```torch.distributed.init_process_group(backend='fake')``` hasn't been called first. This prevents silent failures and ensures proper dispatch system integration for all distributed operations. Added test case test_fake_process_group_direct_usage_error() that validates the error is raised for ```all_reduce``` and ```all_to_all_single``` operations. Please let me know if additional distributed operators should be tested or if any other updates are needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163665 Approved by: https://github.com/ezyang	2025-10-02 22:19:26 +00:00
PyTorch MergeBot	a34797e031	Revert "Add provenance to inductor IR nodes created after graph.run (#164255 )" This reverts commit b9e73e639e36f3aa628752161711e68878231b30. Reverted https://github.com/pytorch/pytorch/pull/164255 on behalf of https://github.com/jeffdaily due to broke rocm; inductor/test_provenance_tracing.py::TestProvenanceTracingStackTraces::test_deferred_triton_kernels [GH job link](https://github.com/pytorch/pytorch/actions/runs/18200790301/job/51821738132) [HUD commit link](`b9e73e639e`) ([comment](https://github.com/pytorch/pytorch/pull/164255#issuecomment-3363360088))	2025-10-02 22:01:41 +00:00
Isuru Fernando	f465ea6752	[inductor] require shape in TritonCSEVariable (#162275 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275 Approved by: https://github.com/mlazos ghstack dependencies: #164158	2025-10-02 21:52:09 +00:00
Isuru Fernando	a8edccfbf4	[inductor] fix TestTemplateRender in select_algorithm (#164158 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164158 Approved by: https://github.com/mlazos	2025-10-02 21:52:09 +00:00
Rohit Singh Rathaur	6389658ec6	Fix type hints in PrepareModuleInput and PrepareModuleInputOutput (#164482 ) Fixes #161646 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164482 Approved by: https://github.com/Skylion007	2025-10-02 21:40:43 +00:00
Xilun Wu	cc71ab86a6	[DTensor] raise error if the local_tensor argument passed to DTensor.from_local is a DTensor (#164496 ) Summary Raise error when the `local_tensor` argument passed to `DTensor.from_local` is a DTensor, this prevents users from accidentally calling `from_local` over a DTensor object. The error message is organized in this way: ``` the local_tensor argument only accepts torch.Tensor but got <class 'torch.distributed.tensor.DTensor'> value. ``` Test `pytest test/distributed/tensor/test_dtensor.py -k test_from_local` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164496 Approved by: https://github.com/ezyang	2025-10-02 21:25:01 +00:00
PyTorch MergeBot	2a7c486750	Revert "Speed up FP precision lookup (#164044 )" This reverts commit 723ba213932bb1eca90109e003250ebb0da45eb1. Reverted https://github.com/pytorch/pytorch/pull/164044 on behalf of https://github.com/yangw-dev due to broke internal build In file included from xplat/caffe2/aten/src/ATen/DeviceAccelerator.cpp:1: xplat/caffe2/aten/src/ATen/Context.h:502:38: error: shift count >= width of type [-Werror,-Wshift-count-overflow] 502 \| return std::hash<size_t>{}((k1 << 32) \| k2); ([comment](https://github.com/pytorch/pytorch/pull/164044#issuecomment-3363016702))	2025-10-02 21:00:44 +00:00
Maggie Moss	5f18f240de	Add initial suppressions for pyrefly (#164177 ) Adds suppressions to pyrefly will typecheck clean: https://github.com/pytorch/pytorch/issues/163283 Test plan: `python3 scripts/lintrunner.py` `pyrefly check` --- Pyrefly check before: https://gist.github.com/maggiemoss/3a0aa0b6cdda0e449cd5743d5fce2c60 After: ``` INFO Checking project configured at `/Users/maggiemoss/python_projects/pytorch/pyrefly.toml` INFO 0 errors (1,063 ignored) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164177 Approved by: https://github.com/Lucaskabela	2025-10-02 20:57:41 +00:00
Jeff Daily	6b7970192f	[ROCm][CI] fix test_cudnn_convolution_relu_cuda (#164466 ) Fixes #162816. Test was comparing output of conv vs fused conv but inputs were different memory formats. Also fix test_cudnn_convolution_add_relu. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164466 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-02 20:36:54 +00:00
Yuanyuan Chen	115af42e9d	Fix readibility checks in TIDY and apply them (#164475 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/164475 Approved by: https://github.com/albanD, https://github.com/Skylion007 Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>	2025-10-02 20:34:49 +00:00
Yu, Guangye	5f775bdfb7	Fix THP_PyObject_VirtualFree return type (#163763 ) # Motivation `void THP_PyObject_VirtualFree` should have no return value; otherwise, it would raise a build warning ```bash C:\Users\guangyey\pytorch\torch\csrc\dynamo\cpython_defs.c(264): warning C4098: 'THP_PyObject_VirtualFree': 'void' function returning a value ``` # Additional Context Refer to `c4f21d7c7c/Include/cpython/objimpl.h (L59-L68)` PyObjectArenaAllocator::free is defined with `void` return type. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163763 Approved by: https://github.com/albanD, https://github.com/williamwen42	2025-10-02 20:21:53 +00:00
bobrenjc93	8c54101933	add tensor subclass printing support in fx/graph.py (#164403 ) it was previously quite misleading since it looks like the inputs to the dynamo graph are plain tensors when in reality they are tensor subclasses before ``` class GraphModule(torch.nn.Module): def forward(self, L_input_batch_inputs_: "i64[2, 512][512, 1]cuda:0", L_self_parameters_weight_: "f32[202048, 256][256, 1]cuda:0"): ``` after ``` class GraphModule(torch.nn.Module): def forward(self, L_input_batch_inputs_: "DTensor(i64[2, 512][512, 1]cuda:0)", L_self_parameters_weight_: "DTensor(f32[202048, 256][256, 1]cuda:0)"): ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164403 Approved by: https://github.com/ezyang	2025-10-02 20:06:12 +00:00
RajeshvShiyal	c45d56dd00	typo corrected in ivalue.cpp's comment (#164485 ) Fixes #164483 typo corrected in ivalue.cpp's comment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164485 Approved by: https://github.com/Skylion007	2025-10-02 20:01:17 +00:00
Yuanyuan Chen	33b17bc619	Remove old CUDA version checks (#164199 ) Remove some version check code for CUDA <12. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164199 Approved by: https://github.com/ezyang	2025-10-02 19:55:47 +00:00
Jian Wen	22b1710252	Use posix_fallocate() to reserve disk space for shared memory (#161910 ) Shared memory is allocated by creating a file in /dev/shm (by default) that can run out of space. Pytorch reserves the file size by calling ftruncate() that creates a sparse file, so it succeeds even if sufficient disk space is not available. This could lead to a situation when a shared memory region is successfully created but a subsequent access to a shared memory page results in SIGBUS due to the disk being full. Using posix_fallocate() instead of ftruncate() eliminates this problem because the former syscall always allocates space and it returns an error if the disk is full. Related to https://github.com/pytorch/pytorch/issues/5040 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161910 Approved by: https://github.com/mikaylagawarecki	2025-10-02 19:12:57 +00:00
Tugsbayasgalan Manlaibaatar	4661200125	[RELAND v2] Close some sources of fake tensors (#164372 ) Changelog: 1. When we run into an operation we didn't proxy, we end up emitting fake constants. We error under a config and we disable the config for some internal users. The reason we want to error is this signals a coverage problem we need to address but at the same time, we don't want to be disruptive to already working flows. 2. Previous attribute mutation detection logic in non-strict didn't account for nested module structure. This fixes silent incorrectness issue of exporting esm and qwen in non-strict and some torchbench models like levit_128 and demucs. 3. Previous logic didn't work on the cases where we mutate a container attribute as the previous approach used to pytree over old and new attributes resulting in length mismatch. We gracefully handle this now. Differential Revision: [D83673054](https://our.internmc.facebook.com/intern/diff/D83673054) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164372 Approved by: https://github.com/avikchaudhuri	2025-10-02 18:58:52 +00:00
adabeyta	6a31f42da4	Fix NestedTensor max/min operations for integer dtypes. (#162273 ) Fixes: https://github.com/pytorch/pytorch/issues/162049 ### Summary max_dim and min_dim functions incorrectly used torch.finfo() for all dtypes, causing TypeError for integer tensors. ### Changes - Use torch.iinfo() for integer dtypes instead of torch.finfo(). - Add CPU test: `test_jagged_max_min_dtypes` covering `int8, int16, int32, int64, uint8, float16, bfloat16, float32 and float64` ### Testing Before Fix: `python -m pytest test/test_nestedtensor.py -k "test_jagged_max_min_dtypes" -v` Output: ``` FAILED [0.0006s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_bfloat16 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0006s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_float16 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0006s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_float32 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0006s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_float64 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0006s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int16 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0005s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int32 - TypeError: torch.finfo() requires a floating point input type. 
Use torch.iinfo to handle 'torch.finfo' FAILED [0.0005s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int64 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0004s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int8 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' FAILED [0.0004s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_uint8 - TypeError: torch.finfo() requires a floating point input type. Use torch.iinfo to handle 'torch.finfo' ``` After Fix: `python -m pytest test/test_nestedtensor.py -k "test_jagged_max_min_dtypes" -v` Output: ``` Running 9 items in this shard test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_bfloat16 PASSED [0.0086s] [ 11%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_float16 PASSED [0.0011s] [ 22%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_float32 PASSED [0.0011s] [ 33%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_float64 PASSED [0.0011s] [ 44%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int16 PASSED [0.0009s] [ 55%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int32 PASSED [0.0010s] [ 66%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int64 PASSED [0.0010s] [ 77%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_int8 PASSED [0.0010s] [ 88%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_jagged_max_min_dtypes_cpu_uint8 PASSED [0.0011s] [100%] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162273 Approved by: 
https://github.com/Skylion007, https://github.com/jbschlosser	2025-10-02 18:46:27 +00:00
Aidyn-A	c6a6c80a73	Add Aidyn-A to CUDA codeowners (#164436 ) Adding myself to "CUDA and CUDA math libraries" section. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164436 Approved by: https://github.com/mikaylagawarecki, https://github.com/eqy	2025-10-02 18:34:10 +00:00
Shangdi Yu	bf717ce346	[AOTI win] Add ABI stable method for updating constant buffer (#163819 ) Add `struct AOTInductorConstantMapEntry` to represent the constant map in AOTI Model. We cannot use `std::unordered_map` for cross-compilation, because it is not ABI stable. it will be tested when we test `update_user_managed_constant_buffer` for windows cross-compilation Example usage: ``` // Load constants. Create random constants here. auto* fc1_w = new slim::SlimTensor(slim::empty({16, 10}, c10::kFloat, c10::Device(c10::kCUDA, 0))); fc1_w->fill_(1.0); ..... // Build pairs std::vector<AOTInductorConstantPair> constants{ {"fc1_weight", fc1_w}, {"fc1_bias", fc1_b}, {"fc2_weight", fc2_w}, {"fc2_bias", fc2_b}, }; // Call runtime (pass raw pointer + size) update_user_managed_constant_buffer_abi( container_handle, constants.data(), constants.size(), /use_inactive=/false, /validate_full_update=/true); ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163819 Approved by: https://github.com/desertfire	2025-10-02 18:31:00 +00:00
PyTorch MergeBot	f6f7676756	Revert "C++-accessible Placements via pybind11 (#163030 )" This reverts commit 3e03deab6f3c268c85c8efd9546e28cdda0fa4cc. Reverted https://github.com/pytorch/pytorch/pull/163030 on behalf of https://github.com/swolchok due to doesn't pass pyre ([comment](https://github.com/pytorch/pytorch/pull/163030#issuecomment-3362450379))	2025-10-02 18:25:24 +00:00
Parthava Adabala	e6d4b26776	Update torch.rst (#164408 ) Corrected grammatical mistake Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/164408 Approved by: https://github.com/mikaylagawarecki	2025-10-02 18:12:47 +00:00
PyTorch MergeBot	6bb021c125	Revert "Use TMA loads always for Triton grouped MM kernel (#164256 )" This reverts commit b1033789fea2bc82901eafed498a5252985b80e9. Reverted https://github.com/pytorch/pytorch/pull/164256 on behalf of https://github.com/yangw-dev due to failed internal test: (pytorch.tritonbench.test.test_gpu.main.TestTritonbenchGpu) Error Details: torch._inductor.exc.InductorError: LoweringException: NoValidChoicesError: No choices to select. Provided reason: All choices failed to compile for backend. please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. ([comment](https://github.com/pytorch/pytorch/pull/164256#issuecomment-3362359624))	2025-10-02 17:55:37 +00:00
Shangdi Yu	b9e73e639e	Add provenance to inductor IR nodes created after graph.run (#164255 ) Summary: as title - Some IR nodes are created during `finalize_multi_template_buffers()` in Scheduler. This PR adds provenance (`origin_node` and `origins`) for those nodes. - Extract `assign_origin_node` function Differential Revision: D82871244 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164255 Approved by: https://github.com/mlazos	2025-10-02 17:32:46 +00:00
PyTorch MergeBot	0319556a35	Revert "[vision hash update] update the pinned vision hash (#154694 )" This reverts commit bcafea5c92ca2ee1b0dc8f6d8b62ecabb6f40228. Reverted https://github.com/pytorch/pytorch/pull/154694 on behalf of https://github.com/yangw-dev due to break the unittest for inductor with improved, update benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv, see failure example https://github.com/pytorch/pytorch/actions/runs/18185852421/job/51776537817 ([comment](https://github.com/pytorch/pytorch/pull/154694#issuecomment-3362285901))	2025-10-02 17:32:04 +00:00
atalman	f4cf75688f	Add CUDA release architecture matrix (#164471 ) We should surface the CUDA architecture matrix to make things more transparent. I believe this can later become its own page where we will publish supported matrix for each release. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164471 Approved by: https://github.com/Camyll	2025-10-02 16:59:48 +00:00
PyTorch MergeBot	39189592fd	Revert "Stop parsing command line arguments every time common_utils is imported. (#156703 )" This reverts commit ac7b4e7fe4d233dcd7f6343d42b4fa3d64bce548. Reverted https://github.com/pytorch/pytorch/pull/156703 on behalf of https://github.com/clee2000 due to failing internally D80206253, see above comment for details ([comment](https://github.com/pytorch/pytorch/pull/156703#issuecomment-3362156908))	2025-10-02 16:54:22 +00:00
Andrey Talman	235b995ce1	Make sure Windows CUDA 12.8 build follow same arches as Linux builds (#164470 ) I believe ``set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0`` is the one thats actually used. Hence remove 6.1 to align the support with Linux support. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164470 Approved by: https://github.com/tinglvv, https://github.com/nWEIdia, https://github.com/Camyll	2025-10-02 16:34:42 +00:00
Anthony Barbier	ac7b4e7fe4	Stop parsing command line arguments every time common_utils is imported. (#156703 ) Last PR in the series to re-submit https://github.com/pytorch/pytorch/pull/134592 as smaller PRs: https://github.com/pytorch/pytorch/pull/154612 https://github.com/pytorch/pytorch/pull/154628 https://github.com/pytorch/pytorch/pull/154715 https://github.com/pytorch/pytorch/pull/154716 https://github.com/pytorch/pytorch/pull/154725 https://github.com/pytorch/pytorch/pull/154728 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156703 Approved by: https://github.com/clee2000	2025-10-02 15:48:47 +00:00
PyTorch MergeBot	c6329524d8	Revert "Add magic TORCH_MAKE_PYBIND_ENUM_FASTER macro (#163527 )" This reverts commit 50c0550f5a5b1e35885d892081a7d5115d8b4489. Reverted https://github.com/pytorch/pytorch/pull/163527 on behalf of https://github.com/swolchok due to breaking import torch in debug builds, see #164297 ([comment](https://github.com/pytorch/pytorch/pull/163527#issuecomment-3361919142))	2025-10-02 15:42:42 +00:00
fduwjj	b0985144b5	[DeviceMesh] Simplifying internal bookkeeping with CuTe layout (#163213 ) We want to refactor the internal bookkeeping of DeviceMesh so that: Simply the bookkeeping logics and make it generic enough so that it is easy to support new transformations like flatten noncontiguous dim, reshape and unflatten. (We leveraged the CuTe layout). This new layout also let us handle non-contiguous slicing, flatten, transpose possible. Concretely, in this PR, we do the following: 1. Use the `_MeshLayout` to handle all index operations rather use a map to record mesh dims. 2. Removed `flatten_name_to_root_dims`, because now we can directly get layout from a flattened device mesh. 3. Replaced `_get_slice_mesh_dims` with `_get_slice_mesh_layout`. 4. Use the newly added function `check_overlap` to check layout overlap. 5. Use a new function `to_remapping_tensor` to use layout ranks as indices when the mesh tensor is not representable as CuTe. The reason is that layout acts as a backend of mesh tensor bookkeeping (indexing indices), it needs to be used as indices for remap back to the mesh tensor for new DeviceMesh generation and backend init. For example, in the case of 2K to 4K, the underlying layout is (2K, 1) but the actual value of the mesh tensor is [2K, 2K+1, ....,]. While flattening, slicing, we need to remap the layout back to the new mesh tensor so it maps the actual device allocation. For example, in the 2K to 4K case, if the shape is (1K, 1K) with dim_names ("dp", "tp"). Then when slicing "tp", the mesh tensor should be (2K, 2K+1, ..., 3K-1) or (3K, 3K+1, ... 4K-1). not the global ranks generated from the layout. (1K, 1). Verified that loss curve is very close for DeepSeekV3 on torchtitan, note that exact same match is challenging because even if we run the baseline twice, the loss curve does not exactly match. 
<img width="1113" height="490" alt="image" src="https://github.com/user-attachments/assets/7877b5a4-337e-4ad8-b878-2378f4f0f38d" /> The PR looks big indeed but we don't change any existing behavior of DeviceMesh, so it is a pure refactor. With this refactoring we also enabled the slicing and flatten of non-contiguous dims of a device mesh which is hard to implement without cute layout. This is a continue of https://github.com/pytorch/pytorch/pull/161106 (original one got messed with EasyCLA) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163213 Approved by: https://github.com/lw, https://github.com/fegin	2025-10-02 15:42:03 +00:00
PyTorch MergeBot	7cfecd76b2	Revert "Improve repeat op to a single copy (#163842 )" This reverts commit 590224f83c8d575b52c6bc40a984132fa593256e. Reverted https://github.com/pytorch/pytorch/pull/163842 on behalf of https://github.com/yangw-dev due to internal test failed: RuntimeError: false INTERNAL ASSERT FAILED at aten/src/ATen/quantized/Quantizer.cpp:441, . cannot call qscheme on UnknownQuantizer please reach out to folks who have internal access for further debugging. ([comment](https://github.com/pytorch/pytorch/pull/163842#issuecomment-3361746041))	2025-10-02 15:22:19 +00:00
soulitzer	bac0f289a3	Add methods to access data and unpack_hook on SavedVariable (#164358 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164358 Approved by: https://github.com/albanD	2025-10-02 13:05:16 +00:00
Edward Z. Yang	39c340ec9e	Add failing bitwise equivalence UT for aot_eager on rms_norm (#164280 ) Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164280 Approved by: https://github.com/albanD	2025-10-02 09:05:28 +00:00
drisspg	cfd46d13e6	Fix SAC + Flex issue (#164421 ) # Summary This happens when flex_attention is not tagged with the ` CheckpointPolicy.MUST_SAVE` policy. This causes the lse to be unrealized. I think in general this is probably not the best policy but we shouldn't error Pull Request resolved: https://github.com/pytorch/pytorch/pull/164421 Approved by: https://github.com/Skylion007	2025-10-02 09:02:17 +00:00
Animesh Jain	0e5773b7fa	[dynamo][export] Do not graph break on torch.autograd._profiler_enabled for export (#164418 ) Actually we would like to not graph break even in the case of Dynamo. But there is a weird-unsolved bug with Kineto + Dynamo when there are distributed jobs that lead to NCCL timeouts. This bug is a rare edge case, but we have not been able to root cause it yet. But for export, we do not anticipate JIT tracing in distributed job training and therefore this PR is safe for export. Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/164418 Approved by: https://github.com/StrongerXi, https://github.com/williamwen42	2025-10-02 09:00:00 +00:00
angelayi	2c2e1268b7	[inductor] Handle patterns where input/output nodes are the same (#163994 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163994 Approved by: https://github.com/jansel, https://github.com/mlazos	2025-10-02 08:37:55 +00:00
bobrenjc93	00f0365b95	[torchfuzz] add test suite of fuzzer repros that we xfail (#164430 ) i'll add the rest of the repros once in a follow up PR once we agree on a good test harness Pull Request resolved: https://github.com/pytorch/pytorch/pull/164430 Approved by: https://github.com/ezyang	2025-10-02 08:05:11 +00:00
Banit Agrawal	6bb586eafd	[PyTorch / Sigrid GPU] Fixes in pinned stats collection and add new ODS pinned memory stats (#164412 ) We do some fixes in pinned memory allocation stats collection and better differentiate between active vs allocated bytes. Reviewed By: bbus, sayitmemory Differential Revision: D83162346 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164412 Approved by: https://github.com/mradmila	2025-10-02 08:04:05 +00:00
Xuehai Pan	9697a7ce9e	Better path handling for nightly setup tool (#164215 ) Resolves https://github.com/pytorch/pytorch/issues/164010#issuecomment-3349283789, cc @filipviz Previously, the `checkout` subcommand would reuse the `venv`, while the `pull` subcommand would remove and recreate a fresh new `venv` (without prompting before deleting). This PR: - Keep and reuse the existing `venv` by default (both `pull` and `checkout`). - Add a new `--fresh` option to delete and recreate a fresh new `venv`. - Prompt the user for confirmation (add a new `--yes` option) before deleting the existing prefix path. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164215 Approved by: https://github.com/ezyang, https://github.com/malfet ghstack dependencies: #162324, #164214	2025-10-02 07:59:17 +00:00
Sherlock Huang	27eb36debb	DebugMode add ignore_compile_internals (#164205 ) Fixes #164143 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164205 Approved by: https://github.com/albanD	2025-10-02 07:39:54 +00:00
Yuanyuan Chen	a43c4c3972	[5/N] Apply ruff UP035 rule (#164423 ) Continued code migration to enable ruff `UP035`. Most changes are about moving `Callable` from `typing` to `from collections.abc`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164423 Approved by: https://github.com/ezyang	2025-10-02 07:31:11 +00:00
PyTorch UpdateBot	bcafea5c92	[vision hash update] update the pinned vision hash (#154694 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vision hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/154694 Approved by: https://github.com/pytorchbot Co-authored-by: Huy Do <huydhn@gmail.com>	2025-10-02 07:02:40 +00:00
Laith Sakka	3924f784ba	unbacked reshape_copy (#164336 ) address https://github.com/pytorch/pytorch/issues/162110 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164336 Approved by: https://github.com/ColinPeppler	2025-10-02 06:50:48 +00:00
Avik Chaudhuri	93e833de0f	[inductor] separate preamble from main work in compile_fx (#164169 ) A couple minor things to clean up the structure of `compile_fx` before we hit pre grad passes: 1. After patching config and recursively calling `compile_fx`, we don't need the patches any more. We make the subsequent logic call a `_maybe_wrap_and_compile_fx_main` (both when cpp wrapper exists and doesn't). 2. There's some recursive wrapping that happens on inputs and outputs before hitting pre grad passes, which are now also separated out before calling a `_compile_fx_main`, where actual work finally happens. These also happen to fix a couple of TODOs in the old code. Differential Revision: D83500704 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164169 Approved by: https://github.com/zhxchen17	2025-10-02 05:44:31 +00:00
Avik Chaudhuri	14791ea947	[inductor] teach bisector to look at pre_grad passes (#164250 ) Bisector was not aware of pre-grad passes. Now that pre-grad passes use their own graph transformer observer subsystem, it is possible to disable these passes in the bisector. Differential Revision: D83573614 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164250 Approved by: https://github.com/eellison, https://github.com/mlazos	2025-10-02 05:42:18 +00:00
Pat Vignola	702f6e703b	[MTIA] Enable deserialization for FP8 checkpoint loading (#163559 ) Summary: It looks like loading FP8 checkpoints goes through that path which wasn't enabled for MTIA beforehand, whereas loading BF16 checkpoints didn't. Differential Revision: D82997140 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163559 Approved by: https://github.com/mikaylagawarecki	2025-10-02 04:18:46 +00:00
bobrenjc93	39b31a6bfd	[torchfuzz] keep track of operator stats (#164334 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164334 Approved by: https://github.com/pianpwk ghstack dependencies: #164034, #164209, #164211, #164210, #164397, #164284	2025-10-02 03:48:07 +00:00
bobrenjc93	0fbe3f19c7	[torchfuzz] add matmuls (#164284 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164284 Approved by: https://github.com/pianpwk ghstack dependencies: #164034, #164209, #164211, #164210, #164397	2025-10-02 03:33:10 +00:00
bobrenjc93	144378615a	[torchfuzz] make fuzzer deterministic (#164397 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164397 Approved by: https://github.com/pianpwk ghstack dependencies: #164034, #164209, #164211, #164210	2025-10-02 03:10:30 +00:00
Colin Peppler	5dbae1eae2	Fix unbacked replacement where LHS is purely backed expr and RHS is unbacked expr (#164013 ) ## Scenario - If there's a `torch._check(backed_expr == unbacked_symbol)` - then we should replace unbacked_symbol for backed_expr - currently, we don't do that when generating inputs for autotune_at_compile_time ## Error traceback ``` $ python test/inductor/test_aot_inductor.py -k test_size_with_unbacked_add_expr_transitive ... File "/data/users/colinpeppler/pytorch/torch/_inductor/compile_fx.py", line 1696, in fx_codegen_and_compile return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) File "/data/users/colinpeppler/pytorch/torch/_inductor/compile_fx.py", line 1187, in codegen_and_compile dynamo_utils.preserve_rng_state(), File "/home/colinpeppler/.conda/envs/pytorch/lib/python3.12/contextlib.py", line 158, in __exit__ self.gen.throw(value) File "/data/users/colinpeppler/pytorch/torch/_dynamo/utils.py", line 2236, in preserve_rng_state torch.cuda.set_rng_state(cuda_rng_state) # type: ignore[possibly-undefined] File "/data/users/colinpeppler/pytorch/torch/cuda/random.py", line 79, in set_rng_state _lazy_call(cb) File "/data/users/colinpeppler/pytorch/torch/cuda/__init__.py", line 341, in _lazy_call callable() File "/data/users/colinpeppler/pytorch/torch/cuda/random.py", line 77, in cb default_generator.set_state(new_state) torch.AcceleratorError: CUDA error: an illegal memory access was encountered ``` ## Bad autotuning input generation ``` # assume unbacked_symint_fallback = 16 # we generate too small of an input (16) buf11 = generate_example_value((16, 256), (256, 1), 'cuda:0', torch.float32, 0, (16, 256)) triton_poi_fused_ones_1.run(buf11, 4096, stream=stream0) stream0 = get_raw_stream(0) buf12 = generate_example_value((16, 256), (256, 1), 'cuda:0', torch.float32, 0, (16, 256)) buf13 = generate_example_value((16, 256), (256, 1), 'cuda:0', torch.float32, 0, (16, 256)) add_kernel_1.run(buf11, buf12, buf13, 4096, 16, 
1, 1, stream=stream0) del buf11, buf12 stream0 = get_raw_stream(0) buf15 = generate_example_value((10500, 256), (256, 1), 'cuda:0', torch.float32, 0, (10500, 256)) triton_poi_fused_add_mul_2.run(buf2, buf13, buf15, 2688000, stream=stream0) ``` ## Good autotuning input generation ``` # notice we generate with the proper size now (10500) buf11 = generate_example_value((10500, 256), (256, 1), 'cuda:0', torch.float32, 0, (10500, 256)) triton_poi_fused_ones_1.run(buf11, 2688000, stream=stream0) stream0 = get_raw_stream(0) buf12 = generate_example_value((10500, 256), (256, 1), 'cuda:0', torch.float32, 0, (10500, 256)) buf13 = generate_example_value((10500, 256), (256, 1), 'cuda:0', torch.float32, 0, (10500, 256)) add_kernel_1.run(buf11, buf12, buf13, 2688000, 10500, 1, 1, stream=stream0) del buf11, buf12 stream0 = get_raw_stream(0) buf15 = generate_example_value((10500, 256), (256, 1), 'cuda:0', torch.float32, 0, (10500, 256)) triton_poi_fused_add_mul_2.run(buf2, buf13, buf15, 2688000, stream=stream0) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164013 Approved by: https://github.com/cp2923, https://github.com/laithsakka	2025-10-02 02:40:54 +00:00
Scott Wolchok	3e03deab6f	C++-accessible Placements via pybind11 (#163030 ) This makes Placement data representation available in C++ via pybind11. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163030 Approved by: https://github.com/ezyang	2025-10-02 02:38:23 +00:00
henrylhtsang	349e9e922d	[cutlass backend] remove cutlass presets (#164380 ) Differential Revision: [D83674898](https://our.internmc.facebook.com/intern/diff/D83674898/) Changes made by claude code (need to remove test too) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164380 Approved by: https://github.com/Skylion007, https://github.com/mlazos	2025-10-02 01:26:00 +00:00
Aidyn-A	8b29c59844	[CI][CUDA] Fix distributed tests for b200 (#164345 ) This PR fixes the tests that were encountered in #159323. Namely it fixes #162746 and #162745. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164345 Approved by: https://github.com/eqy	2025-10-02 01:13:49 +00:00
lichuyang	53860ef4e1	Better error handling in torch/csrc/jit/codegen/* (#163948 ) Refactor error handling by using TORCH_CHECK for improved clarity in constants and scope management in torch/csrc/jit/codegen/* Fixes some parts of ISSUE #148114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163948 Approved by: https://github.com/cyyever, https://github.com/FFFrog, https://github.com/albanD	2025-10-02 01:10:09 +00:00
Lakshay Garg	723ba21393	Speed up FP precision lookup (#164044 ) This commit simplifies the precision lookup and setting logic by reducing the number of branches and using a custom hash function. Fixes #161822. The issue described in #163709 still persists. This is meant as a short term fix. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164044 Approved by: https://github.com/ngimel, https://github.com/eqy	2025-10-02 00:59:19 +00:00
PyTorch MergeBot	a10207e61b	Revert "[DCP] Decrease checkpoint background process Gloo pg init timeout (#162760 )" This reverts commit 0925c644edafbb6a8ff42fef5f3bd48b6042fad3. Reverted https://github.com/pytorch/pytorch/pull/162760 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162760#issuecomment-3358630631))	2025-10-02 00:44:44 +00:00
Shunting Zhang	ffda8e5ddf	[inductor] log kernel autotuning result to a csv (#164191 ) Example output: https://gist.github.com/shunting314/2d646c6b6cd9a79fff7a35ffee82baed ``` for each model: for each triton kernel: for each triton config: the csv contains a line for the latency and pointer to find the kernel module in the file system ``` Would use this to try to come up with heuristics to pick a single config. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164191 Approved by: https://github.com/jansel, https://github.com/mlazos	2025-10-02 00:25:34 +00:00
jainapurva	1a5d023a5b	Add B200 to Operator Microbenchmark CI (#164288 ) Add B200 to operator microbenchmarks nightly run Pull Request resolved: https://github.com/pytorch/pytorch/pull/164288 Approved by: https://github.com/huydhn	2025-10-01 23:56:34 +00:00
atalman	566ea4e86a	Work Around exposing statically linked libstdc++ CXX11 ABI strong symbols (#163980 ) Work Around for: https://github.com/pytorch/pytorch/issues/133437 Test plan: 1. Build whl in CI 2. Download 3. Run ``nm -D libtorch_cpu.so \| grep "recursive_directory_iterator"`` Test with check_binary_symbols.py: Success: ``` num_cxx11_symbols: 2326 num_pre_cxx11_symbols: 0 lib: /home/ec2-user/github/variant-repack/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so num_statically_linked_symbols (T): 0 ``` Fail when using "W" instead of "T" as type calling ``cxx11_statically_linked_symbols = grep_symbols( lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="W" )`` : ``` num_cxx11_symbols: 2326 num_pre_cxx11_symbols: 0 lib: /home/ec2-user/github/variant-repack/.venv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so num_statically_linked_symbols (T): 20 Traceback (most recent call last): File "/home/ec2-user/github/variant-repack/test/pytorch/.ci/pytorch/smoke_test/check_binary_symbolsc.py", line 130, in <module> main() File "/home/ec2-user/github/variant-repack/test/pytorch/.ci/pytorch/smoke_test/check_binary_symbolsc.py", line 126, in main check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) File "/home/ec2-user/github/variant-repack/test/pytorch/.ci/pytorch/smoke_test/check_binary_symbolsc.py", line 95, in check_lib_statically_linked_libstdc_cxx_abi_symbols raise RuntimeError( RuntimeError: Found statically linked libstdc++ symbols (recursive_directory_iterator), but there shouldn't be any, see: ['std::filesystem::__cxx11::recursive_directory_iterator::recursion_pending() const', 'std::filesystem::__cxx11::recursive_directory_iterator::depth() const', 'std::filesystem::__cxx11::recursive_directory_iterator::options() const', 'std::filesystem::__cxx11::recursive_directory_iterator::operator() const', 'std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>::operator bool() 
const', 'std::filesystem::__cxx11::recursive_directory_iterator::disable_recursion_pending()', 'std::filesystem::__cxx11::recursive_directory_iterator::pop(std::error_code&)', 'std::filesystem::__cxx11::recursive_directory_iterator::pop()', 'std::filesystem::__cxx11::recursive_directory_iterator::increment(std::error_code&)', 'std::filesystem::__cxx11::recursive_directory_iterator::recursive_directory_iterator(std::filesystem::__cxx11::path const&, std::filesystem::directory_options, std::error_code)', 'std::filesystem::__cxx11::recursive_directory_iterator::recursive_directory_iterator(std::filesystem::__cxx11::path const&, std::filesystem::directory_options, std::error_code*)', 'std::filesystem::__cxx11::recursive_directory_iterator::~recursive_directory_iterator()', 'std::filesystem::__cxx11::recursive_directory_iterator::~recursive_directory_iterator()', 'std::filesystem::__cxx11::recursive_directory_iterator::operator=(std::filesystem::__cxx11::recursive_directory_iterator&&)', 'std::filesystem::__cxx11::recursive_directory_iterator::operator=(std::filesystem::__cxx11::recursive_directory_iterator const&)', 'std::filesystem::__cxx11::recursive_directory_iterator::operator++()', 'std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>::__shared_ptr(std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>&&)', 'std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>::__shared_ptr()', 'std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>::__shared_ptr(std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>&&)', 'std::__shared_ptr<std::filesystem::__cxx11::recursive_directory_iterator::_Dir_stack, (__gnu_cxx::_Lock_policy)2>::__shared_ptr()'] ``` Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/163980 Approved by: https://github.com/isuruf, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-10-01 23:17:30 +00:00
Edward Z. Yang	9065364995	Add xfailing test case for inplace mutation of local DTensor (#164355 ) Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164355 Approved by: https://github.com/albanD	2025-10-01 23:16:26 +00:00
Yiming Zhou	6eb8d9671b	Enable torch.nn.functional.batch_norm in test_export_opinfo (#164261 ) Summary: There are actually 2 `nn.functional.batch_norm` in op_db. See https://github.com/pytorch/pytorch/blob/main/torch/testing/_internal/common_methods_invocations.py#L16797-L16831 So previously the test failed at `assert len(ops)==1` Test Plan: python test/export/test_export_opinfo.py TestExportOnFakeCudaCUDA Differential Revision: D83581427 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164261 Approved by: https://github.com/SherlockNoMad	2025-10-01 21:56:08 +00:00
Han Qi	b5c4f46bb9	Add functions to setup PrivateUse1 as a python backend device. (#157859 ) Fixes #156052 and #156444. This PR sets up the privateuseone key in Python to be used as a python backend for pytorch. Meaning that, after calling `setup_privateuseone_for_python_backend('npy')`, one can use a subclass with that device to hold arbitrary python data as "device data" and use `torch.library` to register ops that take that Tensor. Changes done in this PR: 1. Register a vanilla Device Guard: I extended NoOpDeviceGuard to allow a device index of 0 and to not raise errors when event related functions are accessed. If I don't do those, when calling backward I would get errors. (CPU backend uses NoOpDeviceGuard just fine, although there seems to be special treatment of CPU in the autograd engine.) 2. Tensor subclass allows not having `__torch_dispatch__` if the device is not CUDA or CPU. The comment of the check suggests it was to avoid segfault when calling into ops that expect a storage. Here we have a different device so will not call into those ops. 3. A python function that invokes the other incantations to set up the privateusekey backend. This took inspiration from https://github.com/bdhirsh/pytorch_open_registration_example and https://github.com/tinygrad/tinygrad/blob/master/extra/torch_backend/wrapped_tensor.cpp; great thanks to @bdhirsh and @geohot. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157859 Approved by: https://github.com/albanD	2025-10-01 21:32:59 +00:00
Wei Wang	773c6762b8	[CD][CUDA13][NCCL] Fix nccl version typo for cu13 (#164383 ) https://pypi.org/project/nvidia-nccl-cu13/#history does not have 2.27.5 but 2.27.7+. Companion PR: https://github.com/pytorch/pytorch/pull/164352 Fixes a potential binary breakage due to non-existence of referenced NCCL cu13 version. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164383 Approved by: https://github.com/tinglvv, https://github.com/Skylion007, https://github.com/atalman	2025-10-01 21:32:25 +00:00
Shangdi Yu	7320f44cdc	Skip windows unittest in fbcode (#164363 ) Summary: as title Test Plan: ``` buck run fbcode//caffe2/test/inductor:aot_inductor_windows ``` Differential Revision: D83664801 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164363 Approved by: https://github.com/angelayi	2025-10-01 20:18:19 +00:00
Catherine Lee	e5c0e6b5e3	[testing] Better short job name during upload additional stats (#164287 ) I think we usually leave the ` / test` in for clarity Pull Request resolved: https://github.com/pytorch/pytorch/pull/164287 Approved by: https://github.com/atalman, https://github.com/malfet	2025-10-01 19:56:20 +00:00
Jeff Daily	7304b9e7d2	[ROCm] fix carveout feature (#164303 ) Fixes #164271. Carveout had been applied with an opposite bitmask. Besides being incorrect, this led to flaky unit test behavior due to carveout being too high. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164303 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-01 19:25:41 +00:00
Yuanyuan Chen	315ffdc1e4	[4/N] Apply ruff UP035 rule to python code (#164206 ) Follows #164104 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164206 Approved by: https://github.com/albanD	2025-10-01 19:05:53 +00:00
Isuru Fernando	8c590cab9d	[inductor] add a runtime assert for triton shapes (#164242 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164242 Approved by: https://github.com/eellison, https://github.com/mlazos ghstack dependencies: #164241	2025-10-01 18:55:33 +00:00
Isuru Fernando	9357c31b53	[inductor] Fix constant shape for float constants (#164241 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164241 Approved by: https://github.com/mlazos	2025-10-01 18:55:33 +00:00
Nikita Shulga	f63d16c6a9	Make viable/strict updatable again (#164374 ) To allow viable/strict to move forward, after https://github.com/pytorch/pytorch/pull/164260 was landed Pull Request resolved: https://github.com/pytorch/pytorch/pull/164374 Approved by: https://github.com/seemethere	2025-10-01 18:09:07 +00:00
Animesh Jain	8dfc8efffd	[export] Preserve nn_module_stack for aliased nn modules (#164311 ) Preparing for install_free_tensors flag. Thanks to @tugsbayasgalan for coming up with the change. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164311 Approved by: https://github.com/tugsbayasgalan	2025-10-01 18:04:33 +00:00
Anshul Sinha	3ffaab3bc8	[Replicate][Pipeline Parallelism] integration of new replicate function with pipeline parallelism (#164031 ) Summary: In order to test numerics for replicate + pp, stage.py needs to be able to call replicate's backward manually as pipeline parallelism doesn't have this feature. Test Case 1. pytest test/distributed/_composable/test_composability/test_pp_composability.py -k test_replicate_pp Pull Request resolved: https://github.com/pytorch/pytorch/pull/164031 Approved by: https://github.com/weifengpy, https://github.com/H-Huang ghstack dependencies: #163897	2025-10-01 18:01:16 +00:00
Ke Wen	ebd0707578	[SymmMem] Add get_nbi the nonblocking version (#163540 ) ```Py @triton.jit def foo(dest, src): nvshmem.get_nbi(dest, src, 100, 0) # Some independent computation which overlaps with the get operation ... # Wait for completion of the get operation nvshmem.quiet() ``` Allows us to overlap comm and compute in the same kernel, instead of two kernels + signals. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163540 Approved by: https://github.com/ngimel, https://github.com/fegin	2025-10-01 17:50:24 +00:00
Edward Yang	76ddbc2bbb	Add option to FakeProcessGroup to raise error if comms are invoked. (#162841 ) The current behavior is to do "nothing", which means you will corrupt data. If you're doing something similar to LocalTensor, where you're overriding the behavior of collectives to do something numerically, this can be unwelcome behavior. If you can error when this happens it can help prevent silent numerical incorrectness. Authored with claude code. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162841 Approved by: https://github.com/dcci	2025-10-01 17:48:19 +00:00
PyTorch MergeBot	69c5c08a01	Revert "[dynamo, 3.14] fix _detect_and_normalize_assert_statement for 3.14 (#164005 )" This reverts commit 5ed4672477c71492a2f41ac0395dd0630446d6a5. Reverted https://github.com/pytorch/pytorch/pull/164005 on behalf of https://github.com/williamwen42 due to broke some tests e.g. https://github.com/meta-pytorch/autoparallel/actions/runs/18167350261/job/51719783636?pr=179 ([comment](https://github.com/pytorch/pytorch/pull/164005#issuecomment-3357433475))	2025-10-01 17:47:22 +00:00
Anshul Sinha	3dab36bdb4	[FSDP][Replicate] created ReplicateModule and changed replicate to use it instead of FSDPModule (#163897 ) Summary: In order to minimize the code copied from FSDP to make replicate work, I made all replicated modules FSDPModule. While this was sufficient originally, there are changes to codebase like below that require us to differentiate between a FSDPModule and a ReplicateModule so that we can access replicate_state or fsdp_state: https://www.internalfb.com/code/fbsource/[a9a8e5102052]/fbcode/caffe2/torch/distributed/pipelining/stage.py?lines=629-666. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163897 Approved by: https://github.com/weifengpy	2025-10-01 17:30:10 +00:00
Ivan Zaitsev	1288c6d8bb	Enable keep-going for trunk tags (#164307 ) Tags like `trunk/{sha}` are used to re-run signals by [autorevert project](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md). We need to have `keep-going` enabled for those reruns, so that they surface all test failures, not just the first one. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164307 Approved by: https://github.com/clee2000	2025-10-01 17:21:43 +00:00
Colin Peppler	80ed522910	[export] support unbacked stack (#163867 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163867 Approved by: https://github.com/laithsakka	2025-10-01 16:48:46 +00:00
Yuanyuan Chen	f7ab8a2710	[1/N] Fix ruff warnings (#164333 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/164333 Approved by: https://github.com/albanD	2025-10-01 16:48:32 +00:00
Ke Wen	e419dc6d08	[PP] Customize pipeline's submod name (#164037 ) Changing PP submodules' name from `submod_i` to `submod_pp_i` to distinguish from the submodule created by HOP. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164037 Approved by: https://github.com/H-Huang ghstack dependencies: #164045, #164035	2025-10-01 16:29:19 +00:00
Ke Wen	5f868ca110	[fx] Allow customization of submod name in split graph (#164035 ) Fixes #164030: HOP and pipelining both name things submod_i by adding an optional argument `partition_affix` to `split_module` API. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164035 Approved by: https://github.com/ezyang ghstack dependencies: #164045	2025-10-01 16:26:14 +00:00
PyTorch MergeBot	20edc5b26a	Revert "Add num_store to inductor_meta and use it to scale persistent reduction x block (#162446 )" This reverts commit 22c5e8c17c7551c9dd2855589ae774c1e147343a. Reverted https://github.com/pytorch/pytorch/pull/162446 on behalf of https://github.com/PaulZhang12 due to perf regression in https://github.com/pytorch/pytorch/issues/164301#issuecomment-3354028620 ([comment](https://github.com/pytorch/pytorch/pull/162446#issuecomment-3357164274))	2025-10-01 16:23:03 +00:00
PyTorch MergeBot	59a86cb137	Revert "[fx] Allow customization of submod name in split graph (#164035 )" This reverts commit 615da7b95ef22ec0fa07f296dcb103d7d5aeda34. Reverted https://github.com/pytorch/pytorch/pull/164035 on behalf of https://github.com/yangw-dev due to internal build failed Buck build failed for this target, and is likely caused by your changes. ([comment](https://github.com/pytorch/pytorch/pull/164035#issuecomment-3357113348))	2025-10-01 16:09:50 +00:00
PyTorch MergeBot	36a37b81cd	Revert "[PP] Customize pipeline's submod name (#164037 )" This reverts commit 704cd771f6a63abf9498934aeb7f3079ab9e2232. Reverted https://github.com/pytorch/pytorch/pull/164037 on behalf of https://github.com/yangw-dev due to internal build failed Buck build failed for this target, and is likely caused by your changes. ([comment](https://github.com/pytorch/pytorch/pull/164035#issuecomment-3357113348))	2025-10-01 16:09:50 +00:00
albanD	2610746375	Revert nccl upgrade back to 2.27.5 (#164352 ) Revert https://github.com/pytorch/pytorch/pull/162351 as it breaks H100 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164352 Approved by: https://github.com/atalman, https://github.com/malfet	2025-10-01 15:27:40 +00:00
Aleksandar Samardžić	b1033789fe	Use TMA loads always for Triton grouped MM kernel (#164256 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164256 Approved by: https://github.com/ngimel ghstack dependencies: #163895	2025-10-01 15:24:51 +00:00
PyTorch MergeBot	07d896fa48	Revert "CUDACachingHostAllocatorImpl skip event query during capture (#164001 )" This reverts commit 4cf29004749714670fee9e7e3776778faf5ced25. Reverted https://github.com/pytorch/pytorch/pull/164001 on behalf of https://github.com/yangw-dev due to failed internal error with multiple errors found: Not equal to tolerance rtol=0.1, atol=0.1.. ([comment](https://github.com/pytorch/pytorch/pull/164001#issuecomment-3356894787))	2025-10-01 15:11:21 +00:00
Nicolas De Carli	31681bcacc	[PyTorch] Pull ARM's box-cox (#164152 ) Summary: ARM has provided us with an SVE128 box-cox implementation. It uses the same underlying algorithm as the previous version, but it has better log and exp implementations. These supplied mathematical functions have switches to adjust the precision/speed trade-off. We've noted a slight precision improvement, while also about a 5% performance increase. Before: ZeroLambda1 61.66ns 16.22M NonZeroLambda1 125.73ns 7.95M NonZeroLambdaManyColumns 1.84ms 542.11 NonZeroLambdaEigenColumnar 262.31us 3.81K NonZeroLambdaEigenRowMajor 275.17us 3.63K NonZeroLambdaWithPyTorchColumnar 97.43us 10.26K NonZeroLambdaWithPyTorchRowMajor 90.82us 11.01K NonZeroLambdaWithPyTorchRowMajorFullBatch 96.96us 10.31K NonZeroLambdaBatch 151.84us 6.59K After: ZeroLambda1 57.85ns 17.29M NonZeroLambda1 118.85ns 8.41M NonZeroLambdaManyColumns 1.82ms 548.16 NonZeroLambdaEigenColumnar 261.67us 3.82K NonZeroLambdaEigenRowMajor 274.53us 3.64K NonZeroLambdaWithPyTorchColumnar 89.12us 11.22K NonZeroLambdaWithPyTorchRowMajor 83.49us 11.98K NonZeroLambdaWithPyTorchRowMajorFullBatch 88.79us 11.26K NonZeroLambdaBatch 144.74us 6.91K Test Plan: Correctness: buck2 test @//mode/opt //koski/functions_contrib/df4ai/tests:batch_box_cox_test Performance: buck2 run @//mode/opt //koski/functions_contrib/df4ai/benchmark:boxcox_benchmark Differential Revision: D83485704 Privacy Context Container: L1196524 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164152 Approved by: https://github.com/ezyang	2025-10-01 15:00:03 +00:00
Edward Yang	e901866dd7	Add a RECORD_FUNCTION for Python fallback so it shows in profile (#160573 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160573 Approved by: https://github.com/bdhirsh, https://github.com/albanD	2025-10-01 14:10:44 +00:00
Aleksandar Samardžić	70d1043bdf	Fix non-TMA loads in grouped MM Triton kernel (#163895 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163895 Approved by: https://github.com/lezcano	2025-10-01 12:21:13 +00:00
atalman	69fa26d9b4	Triton 3.5.x pin update (#164268 ) Updates triton pin to latest: https://github.com/triton-lang/triton/commits/release/3.5.x/ This updates contains 2 cherry-pick to remove Python 3.9 from list of supported python versions: https://github.com/triton-lang/triton/pull/8288 https://github.com/triton-lang/triton/pull/8287 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164268 Approved by: https://github.com/aakhundov	2025-10-01 11:41:50 +00:00
Robert Hardwick	d9c80ef97d	Build and Install Arm Compute Library in manylinux docker image (#159737 ) ---- This PR will be part of a series of PRs that aims to remove `.ci/aarch64_linux` folder entirely, such that Aarch64 manylinux build happens as part of `.ci/manywheel/build.sh`, the same as other platforms. In this PR: - We prebuild + install Arm Compute Library in the manylinux docker image ( at /acl ), instead of building it at build time for every pytorch build. Also updated jammy install path to be /acl too. - We can therefore remove build_ArmComputeLibrary functions from the ci build scripts. - There is also some refactoring of install_openblas.sh and install_acl.sh to align them together ( similar formatting, similar variable names, same place for version number update ) - We had 2 places to define openblas version, this has been reduced to 1 now ( install_openblas.sh ). - ACL_VERSION and OPENBLAS_VERSION are now able to be overridden at build.sh level for developers, but there is only 1 version of each hardcoded for ci. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159737 Approved by: https://github.com/seemethere, https://github.com/aditew01	2025-10-01 11:33:51 +00:00
William Wen	ac1bc51608	[dynamo] do not pop from framelocals dict in Python 3.10 (#164316 ) Followup to https://github.com/pytorch/pytorch/pull/164038 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164316 Approved by: https://github.com/anijain2305	2025-10-01 10:20:46 +00:00
Syed Tousif Ahmed	ed90040d33	Releases multicast object before releasing mapped buffers in CUDASymmetricMemory (#163750 ) Fixes: https://github.com/pytorch/pytorch/issues/162429. In B200, cuMulticastUnbind can error if the mapped buffers are freed before the multicast object is freed. The only documentation I could find is here: `e11d7f77c1/src/transport/nvls.cc (L113)`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163750 Approved by: https://github.com/ngimel, https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/cyyever ghstack dependencies: #163575	2025-10-01 09:07:48 +00:00
Syed Tousif Ahmed	4dab208d97	Adds Issue#153109 as a test for CUDAPluggableAllocator (#163575 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163575 Approved by: https://github.com/ngimel	2025-10-01 09:07:48 +00:00
Tristan Trouwen	9fd53a2bdc	Register MTIA kernel for all_all_out (#164293 ) Reviewed By: srsuryadev Differential Revision: D83517879 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164293 Approved by: https://github.com/Skylion007, https://github.com/malfet	2025-10-01 09:05:08 +00:00
Xuehai Pan	17ab99463a	[Easy] Add notes for setting up dev venv with specific Python version (#164214 ) Resolves https://github.com/pytorch/pytorch/issues/164010#issuecomment-3340751377 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164214 Approved by: https://github.com/ezyang ghstack dependencies: #162324	2025-10-01 08:25:13 +00:00
Xuehai Pan	eca6ac2293	[BE][Easy] update CUDA and ROCm sources in nightly tool (#162324 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162324 Approved by: https://github.com/ezyang	2025-10-01 08:25:13 +00:00
Xuanteng Huang	12d4cb0122	Suppress `FutureWarning`s in `torch.distributed.algorithms.ddp_comm_hooks` (#163939 ) Fixes #163938 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163939 Approved by: https://github.com/cyyever, https://github.com/kwen2501	2025-10-01 07:51:12 +00:00
Haifeng Jin	590224f83c	Improve repeat op to a single copy (#163842 ) In #163455 , the `reshape` was not a pure view op. The `permute` before it created a non-contiguous tensor, which would trigger a data copy during the reshape. This PR improved the implementation by removing the `urtensor` intermediate tensor completely. Simply expanding the `xtensor` achieves the `repeat` effect. Before this PR, there were two data copies (in `urtensor.copy_` and `urtensor.reshape`). Now, there is only one data copy in the `.copy_()`. Reshape would not copy data because it is on a contiguous tensor. One more note is that we do want at least one copy because we want to duplicate the elements for the repeats. Users can modify single elements in place without affecting others. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163842 Approved by: https://github.com/Skylion007 Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>	2025-10-01 06:27:53 +00:00
Yuanyuan Chen	cc8b14d09a	[2/N] Simplify "in" operation for containers of a single item (#164323 ) These issues are detected by ruff [FURB171](https://docs.astral.sh/ruff/rules/single-item-membership-test/#single-item-membership-test-furb171). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164323 Approved by: https://github.com/justinchuby, https://github.com/Skylion007	2025-10-01 05:39:11 +00:00
Animesh Jain	96c3b9e275	[dynamo] Use strings instead of modules for fqn info tracking (#164272 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164272 Approved by: https://github.com/Skylion007, https://github.com/williamwen42, https://github.com/mlazos	2025-10-01 04:22:57 +00:00
Nikita Shulga	9ddfc59b9b	[BE] Delete stale non-ephemeral runners workarounds (#164285 ) As all Win runners are ephemeral, no need to cleanup leftover processes or uninstall PyTorch at the end of the test Pull Request resolved: https://github.com/pytorch/pytorch/pull/164285 Approved by: https://github.com/Skylion007	2025-10-01 03:47:36 +00:00
Nikita Shulga	6d4dfa0878	[CI] Push `viable/strict/${time}` tags (#164183 ) Every time viable strict is updated Pull Request resolved: https://github.com/pytorch/pytorch/pull/164183 Approved by: https://github.com/seemethere	2025-10-01 03:41:10 +00:00
Banit Agrawal	11ccb95ccb	[PyTorch Pinned Allocator] Pinned memory stats and perf fixes around allocating blocks (#163777 ) Summary: This diff adds bucket stats for pinned memory and also a perf fix to not check for sizes when background thread is enabled Differential Revision: D83162186 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163777 Approved by: https://github.com/bbus	2025-10-01 03:28:58 +00:00
Nikita Shulga	bd0907dc4c	[BE][CI] Unify requirements (#163396 ) Linux, Windows and MacOS CI workflows should all use `.ci/docker/requirements-ci.txt` TODOS: - Investigate why `choco install cmake` is needed to successfully detect MKL - Move `psutil` installation from specific scripts into requirements-ci.txt Pull Request resolved: https://github.com/pytorch/pytorch/pull/163396 Approved by: https://github.com/Skylion007	2025-10-01 03:28:48 +00:00
Alexander Grund	8bb71c07c4	Skip symmetric memory tests calling `_scaled_mm` on CCC < 8.9 (#164251 ) This avoids them failing on e.g. A100 GPUs with > RuntimeError: torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+ Pull Request resolved: https://github.com/pytorch/pytorch/pull/164251 Approved by: https://github.com/Skylion007, https://github.com/kwen2501	2025-10-01 03:26:21 +00:00
Yuanyuan Chen	fa90090735	Use dataclass features in two classes (#164221 ) This PR completes two TODO items by using features of `dataclass`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164221 Approved by: https://github.com/Skylion007, https://github.com/mlazos Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>	2025-10-01 03:20:39 +00:00
Aaron Gokaslan	591997490a	[BE][Easy]: Add prims common TypeGuard (#164263 ) Slightly improves typing by adding a TypeGuard. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164263 Approved by: https://github.com/albanD	2025-10-01 03:13:10 +00:00
mansiag05	531f3bf5e1	Adding check for square matrix for input tensor in matrix_exp backwar… (#163357 ) …d op. Fixes #146796 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163357 Approved by: https://github.com/lezcano	2025-10-01 03:12:30 +00:00
ankushwahaRH	2a5ce2feb4	Add algorithm in header (#164295 ) Fixes #163307. Added ```#include <algorithm>``` to vulkan QueryPool for the std::for_each call Pull Request resolved: https://github.com/pytorch/pytorch/pull/164295 Approved by: https://github.com/Skylion007	2025-10-01 03:09:50 +00:00
Yiming Zhou	3787a5a60e	[export] Explicitly passing requires_grad to nn.Parameter() in deserialization (#164290 ) Summary: `nn.Parameter()` by default has `requires_grad=True` and would cause issues when there are non-float parameters. Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_non_float_weight Differential Revision: D83598796 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164290 Approved by: https://github.com/angelayi	2025-10-01 02:55:20 +00:00
Animesh Jain	c66d18d24d	[dynamo][sac] Support functools partial context_fn for sac (#164308 ) Fixes https://github.com/pytorch/pytorch/issues/164300 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164308 Approved by: https://github.com/Lucaskabela, https://github.com/soulitzer	2025-10-01 02:47:55 +00:00
eellison	e0f118585f	skip non memory deps in memory estimator (#164294 ) Differential Revision: [D83601030](https://our.internmc.facebook.com/intern/diff/D83601030) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164294 Approved by: https://github.com/mlazos	2025-10-01 02:44:58 +00:00
bobrenjc93	10a005e87f	[torchfuzz] add layout operators (#164210 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164210 Approved by: https://github.com/pianpwk ghstack dependencies: #164034, #164209, #164211	2025-10-01 02:33:19 +00:00
bobrenjc93	1f3995cdc8	[torchfuzz] raise if Operator abstract method is not implemented (#164211 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164211 Approved by: https://github.com/pianpwk ghstack dependencies: #164034, #164209	2025-10-01 02:33:19 +00:00
bobrenjc93	abfcce58a4	[torchfuzz] remove erroneous can_produce check (#164209 ) can_produce is an abstract method that always returns false Pull Request resolved: https://github.com/pytorch/pytorch/pull/164209 Approved by: https://github.com/pianpwk ghstack dependencies: #164034	2025-10-01 02:33:19 +00:00
Jane Xu	5b1c39f5a1	Add smoke tests to verify that stable ABI FA3 wheel runs w/ newer torch (#163782 ) Passing CI: https://github.com/pytorch/pytorch/actions/runs/18141589975/job/51635340255?pr=163782 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163782 Approved by: https://github.com/huydhn, https://github.com/mikaylagawarecki	2025-10-01 02:30:38 +00:00
Simon Layton	8df3f2fa98	Revert new-test part of #163829 (#164259 ) Summary: New test sizes for `test_scaled_mm_vs_emulated_block_wise` all fail with ``` RuntimeError: Invalid scaling configuration ``` Disable these new tests for now (the remaining test is a parametrized version of the original test case) Test Plan: `pytest test/test_scaled_matmul_cuda.py` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164259 Approved by: https://github.com/jananisriram ghstack dependencies: #164266	2025-10-01 02:23:21 +00:00
Simon Layton	7a9119948e	Split scaled-mm tests into separate file (#164266 ) Summary: * Split scaled-mm-specific tests into `test/test_scaled_matmul.py` Test Plan: ``` pytest test/test_matmul_cuda.py pytest test/test_scaled_matmul_cuda.py ``` Reviewers: Subscribers: Tasks: Tags: Signed-off-by: Simon Layton <simonlayton@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164266 Approved by: https://github.com/Skylion007, https://github.com/albanD	2025-10-01 02:23:21 +00:00
Shangdi Yu	28c1d2f81b	[aoti] AOTI mingw cross compilation (#163188 ) To run this, you need to install `mingw64-gcc-c++` and download windows cuda library toolkit. See design doc and demo instructions in https://docs.google.com/document/d/1iDaChqA5nNKkBFTzsdkmoomvQlXHbnlb1Z4yEp7xaJA/edit?tab=t.0 If cross_platform_target is windows, we do the following: - do not link to `sleef`. This can be improved in the future if we need it. Currently I avoid it because that requires extra setup on the linux side - Use `mingw64-gcc-c++` to compile - Use `WINDOWS_CUDA_HOME` instead of `CUDA_HOME` when linking to cuda ``` python test/inductor/test_aot_inductor_windows.py -k so ``` Other changes: - de-couples compile_standalone config and dynamic link flag - create a new aot_inductor_mode config module, which is used to control configs in aot_inductor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163188 Approved by: https://github.com/desertfire	2025-10-01 02:22:06 +00:00
Banit Agrawal	c4bbc6433e	[PyTorch CCA] Add an API to get expandable segment sizes (#163771 ) Summary: This diff adds an API to query the expandable segment size for each stream so that we can use this info to warm up the segment in advance, so we don't incur any performance penalty during steady state inference for new CUDA memory allocations. Differential Revision: D76447308 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163771 Approved by: https://github.com/bbus	2025-10-01 02:16:58 +00:00
Jeff Daily	ad7e3c93b1	[ROCm][CD] librocroller.so missing from ROCm 7 wheel (#164244 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164244 Approved by: https://github.com/jeffdaily, https://github.com/Skylion007 Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-10-01 00:02:34 +00:00
Jane Xu	7f3dc45300	Migrate DeviceType to torch/headeronly (#163999 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163999 Approved by: https://github.com/mikaylagawarecki	2025-09-30 23:13:27 +00:00
PyTorch UpdateBot	ff715366aa	[vllm hash update] update the pinned vllm hash (#164190 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164190 Approved by: https://github.com/pytorchbot	2025-09-30 22:43:49 +00:00
Sherlock Huang	60a4961ff4	[DTensor] Allow redistribute to Partial if src matches (#164253 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164253 Approved by: https://github.com/zpcore	2025-09-30 22:42:49 +00:00
Frank Lin	bec6541d84	[CUDA][CUDAGraph] Reduce capture overhead in CUDA Graph memory reuse (#162186 ) Previous work #158352 delivered CUDAGraph memory footprint reduction with no replay-time impact, but capture time regressed (up to 20× slower) due to repeated full-graph traversals. See previous benchmark results [here](https://github.com/pytorch/pytorch/pull/158352#issuecomment-3215947565) This PR removes capture/replay overhead while preserving the memory savings: 1. Terminals as free markers We stop inserting empty nodes and instead record the current stream terminals as free markers. This avoids mutating the user’s graph and keeps semantics unchanged. 2. Incremental, cached reachability We add a per-graph reuse context that caches reverse-traversal state: * `graph_reuse_context[graph].visited[stream]` tracks nodes already seen from that stream’s terminal frontier. * On each allocation during capture, we resume traversal from the latest terminals and only visit unseen nodes. * A block is freed when all its recorded markers are in the visited set of its allocation stream—i.e., all markers are proven predecessors of future work. See [the performance results here](https://docs.google.com/spreadsheets/d/e/2PACX-1vRPvdd9Xa8W87ixbiA0da_qvOhrUAjUpFz0G-_j-MsDnoeRyhEa4_ut_W3rqcg1VVZVFJ-gucwov-3b/pubhtml?gid=1468302443&single=true), we sweep synthetic multi-stream CUDA Graphs built by `capture_benchmark.py` (same as before, we generate random interleaving of alloc/free/join with given probabilities, see [gist here](https://gist.github.com/eee4017/e2092d215b1d4bd46534148939af39e3)), and we compare median capture/replay times and memory. On an NVIDIA H100 PCIe across 24 configs, the optimization preserves reserved memory reduction at ~24–98%, leaves allocated memory unchanged, and brings capture time back to baseline (range 0.96–1.04× vs. baseline) with replay time unchanged (range 0.97–1.11×). 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162186 Approved by: https://github.com/eqy, https://github.com/ngimel	2025-09-30 22:28:46 +00:00
fduwjj	1f1de20ba9	[c10d][BE][ez] Update tensor ptr inside nccl.cpp (#164276 ) This is mostly a cosmetic change which replaces the deprecated `data_ptr` API with the mutable or const one. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164276 Approved by: https://github.com/Skylion007, https://github.com/eqy, https://github.com/kwen2501	2025-09-30 22:05:12 +00:00
Anshul Sinha	2810977d3a	[FSDP][Replicate] tests replicate type casting behavior and edge cases in mixed precision (#162861 ) Summary: Ensures that replicate can handle the same type casting behavior and edge cases that fully shard can when mixed precision is used Test Cases 1. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k test_float16_on_one_submodule 2. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k test_submodules_with_external_inputs 3. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k test_norm_modules_bf16 4. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k test_norm_modules_fp16 5. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k test_clamp_reduce_dtype 6. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k test_dataclass_input Pull Request resolved: https://github.com/pytorch/pytorch/pull/162861 Approved by: https://github.com/mori360 ghstack dependencies: #162830, #162836, #162839, #162851, #162853, #162855	2025-09-30 22:03:23 +00:00
Wei Feng	ae4fd4ea75	[FSDP2] support AC(FSDP) for torchtitan's MOE (#164009 ) for fsdp2 + EP, titan has fully_shard(AC(layer)) and fully_shard(layer.moe.experts): https://github.com/pytorch/torchtitan/issues/1624 for implicit prefetching, backward order is * _pre_backward unshard (norm, output) * _backward_prefetch unshard layers.6 * post_backward reshard (norm, output) * _pre_backward unshard layers.6 (no-op, unsharded already) * _backward_prefetch unshard layers.6.moe.experts * recompute_fn pre_forward unshard layers.6.moe.experts (no-op, unsharded already) * ~~recompute_fn post_forward reshard layers.6.moe.experts~~ <----- this PR make it a no-op * _pre_backward unshard layers.6.moe.experts (no-op, unsharded already) * _backward_prefetch unshard layers.5 * post_backward reshard layers.6.moe.experts * post_backward reshard layers.6 unit test: `pytest -s test/distributed/_composable/fsdp/test_fully_shard_comm.py -k test_set_modules_to_backward_prefetch_inside_ac` before fix: `NGPU=4 CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh --parallelism.expert_parallel_degree=2` ``` [rank0]:[titan] 2025-09-30 11:43:01,714 - root - INFO - step: 1 loss: 12.0162 grad_norm: 1.7315 memory: 45.64GiB(48.05%) tps: 1,028 tflops: 10.87 mfu: 1.10% [rank0]:[titan] 2025-09-30 11:43:01,714 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 [rank0]:[titan] 2025-09-30 11:43:35,233 - root - INFO - [GC] Performing periodical GC collection 0.06 seconds [rank0]:[titan] 2025-09-30 11:43:35,987 - root - INFO - step: 50 loss: 6.9302 grad_norm: 0.9985 memory: 59.66GiB(62.80%) tps: 11,712 tflops: 123.89 mfu: 12.53% ``` after fix: `NGPU=4 CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh --parallelism.expert_parallel_degree=2` ``` [rank0]:[titan] 2025-09-30 11:38:57,377 - root - INFO - step: 1 loss: 12.0134 grad_norm: 1.6916 memory: 38.42GiB(40.45%) tps: 805 tflops: 8.51 
mfu: 0.86% [rank0]:[titan] 2025-09-30 11:38:57,377 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 [rank0]:[titan] 2025-09-30 11:39:28,541 - root - INFO - [GC] Performing periodical GC collection 0.06 seconds [rank0]:[titan] 2025-09-30 11:39:29,279 - root - INFO - step: 50 loss: 6.9346 grad_norm: 1.1875 memory: 52.58GiB(55.36%) tps: 12,583 tflops: 133.10 mfu: 13.46% ``` for explicit prefetching, layers.6 backward prefetch layers.5 and layers.5.moe.experts. layers.6.moe.experts does not have explicit prefetch. backward order is like this * _pre_backward unshard (norm, output) * _prefetch_unshard layers.6 * post_backward reshard (norm, output) * _pre_backward unshard layers.6 (no-op, unsharded already) * _prefetch_unshard layers.5 * _prefetch_unshard layers.5.moe.experts * recompute_fn pre_forward unshard layers.6.moe.experts * ~~recompute_fn post_forward reshard layers.6.moe.experts~~ <----- this PR makes it a no-op * _pre_backward unshard layers.6.moe.expert (no-op, unsharded already) * post_backward reshard layers.6.moe.expert * post_backward reshard layers.6 before fix: `NGPU=4 CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh --parallelism.expert_parallel_degree=2` ``` [rank0]:[titan] 2025-09-30 11:53:24,574 - root - INFO - step: 1 loss: 12.0180 grad_norm: 1.6948 memory: 45.77GiB(48.18%) tps: 849 tflops: 8.98 mfu: 0.91% [rank0]:[titan] 2025-09-30 11:53:24,574 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 [rank0]:[titan] 2025-09-30 11:53:57,768 - root - INFO - [GC] Performing periodical GC collection 0.07 seconds [rank0]:[titan] 2025-09-30 11:53:58,515 - root - INFO - step: 50 loss: 6.9358 grad_norm: 1.0528 memory: 59.80GiB(62.95%) tps: 11,827 tflops: 125.10 mfu: 12.65%``` ``` after fix: `NGPU=4 CONFIG_FILE="./torchtitan/models/deepseek_v3/train_configs/deepseek_v3_16b.toml" ./run_train.sh --parallelism.expert_parallel_degree=2` ``` 
[rank0]:[titan] 2025-09-30 12:08:39,404 - root - INFO - step: 1 loss: 12.0143 grad_norm: 1.7030 memory: 38.55GiB(40.58%) tps: 988 tflops: 10.45 mfu: 1.06% [rank0]:[titan] 2025-09-30 12:08:39,404 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 [rank0]:[titan] 2025-09-30 12:09:10,482 - root - INFO - [GC] Performing periodical GC collection 0.06 seconds [rank0]:[titan] 2025-09-30 12:09:11,168 - root - INFO - step: 50 loss: 6.9356 grad_norm: 0.9911 memory: 52.81GiB(55.59%) tps: 12,637 tflops: 133.68 mfu: 13.52% ``` Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/164009 Approved by: https://github.com/soulitzer	2025-09-30 22:02:24 +00:00
Animesh Jain	adc11a7634	[export] avoid checks during tracing of export verification (#164219 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/164219 Approved by: https://github.com/Lucaskabela	2025-09-30 21:46:59 +00:00
Anshul Sinha	99e28ffab3	[FSDP][Replicate] tests replicate core functionality with mixed precision (#162855 ) Summary: Ensures that replicate functionality works the same as fully shard's when mixed precision is used Test Cases 1. pytest test/distributed/_composable/test_replicate_mixed_precision.py -k TestReplicateMixedPrecisionTraining Pull Request resolved: https://github.com/pytorch/pytorch/pull/162855 Approved by: https://github.com/mori360 ghstack dependencies: #162830, #162836, #162839, #162851, #162853	2025-09-30 21:45:58 +00:00
Anshul Sinha	01dd2c2b42	[FSDP][Replicate] tests replicate is composable with tp (#162853 ) Summary: Proof that new replicate API is composable with TP Test Case 1. pytest test/distributed/_composable/test_replicate_training.py -k test_replicate_tp Pull Request resolved: https://github.com/pytorch/pytorch/pull/162853 Approved by: https://github.com/mori360 ghstack dependencies: #162830, #162836, #162839, #162851	2025-09-30 21:29:54 +00:00
Anshul Sinha	d3bdf8c32e	[FSDP][Replicate] tests replicate with custom forward method (#162851 ) Summary: tests replicate works when users use custom forward methods Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_register_fsdp_forward_method Pull Request resolved: https://github.com/pytorch/pytorch/pull/162851 Approved by: https://github.com/mori360 ghstack dependencies: #162830, #162836, #162839	2025-09-30 21:15:34 +00:00
Anshul Sinha	1ce9563ff6	[FSDP][Replicate] tests replicate gradient accumulation and 1f1b microbatching (#162839 ) Summary: In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. The first test verifies Replicate works with gradient accumulation properly. The second verifies that replicate works correctly with a One-Forward-One-Backward (1F1B) pipeline parallelism schedule Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_gradient_accumulation 2. pytest test/distributed/_composable/test_replicate_training.py -k test_1f1b_microbatching Pull Request resolved: https://github.com/pytorch/pytorch/pull/162839 Approved by: https://github.com/mori360 ghstack dependencies: #162830, #162836	2025-09-30 21:00:16 +00:00
xadupre	9e631392dc	Missing lambda in torch._check (#164225 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164225 Approved by: https://github.com/Skylion007	2025-09-30 20:32:38 +00:00
PaulZhang12	1cce6efdb8	Fix silent incorrectness for bmm/baddmm out_dtype overload (#164095 ) Add input checks like meta functions for standard ops in `ATen/native/LinearAlgebra.cpp` for the `out_dtype` variants. Fixes silent incorrectness in https://github.com/pytorch/pytorch/issues/163816 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164095 Approved by: https://github.com/ngimel	2025-09-30 20:13:13 +00:00
Nikita Shulga	5a93f00c79	[CI] Delete binary smoke workflows (#164260 ) Those were very useful in the past, because: - CI builder jobs did not generate wheels, but rather ran `python setup.py develop` and shared docker layers, which is no longer the case, all CI jobs produce wheels - CD jobs were targeting pre-CXX11 ABI, but this is no longer the case after manylinux2_28 migration Existing, but acceptable gaps: - Windows libtorch debug builds sometimes might fail, but IMO it's ok not to be able to produce those for a few days, as the number of libtorch users is somewhat small - All CD jobs are based on AlmaLinux, while CI are based on Ubuntu, but this could be adjusted if needed, besides AlmaLinux-9 and Ubuntu-22.04 are pretty close in terms of glibc and gcc versions - CD jobs build for all GPU architectures, while CI only for the one being tested, but there are now periodic H100 and B200 jobs, and not a lot of development happens for Voltas or Pascals Besides there are better tools to alert about the nightly failures Pull Request resolved: https://github.com/pytorch/pytorch/pull/164260 Approved by: https://github.com/seemethere, https://github.com/atalman	2025-09-30 20:00:07 +00:00
Yuanyuan Chen	e30f01b5b5	[1/N] Simplify "in" operation for containers of a single item (#164224 ) These issues are detected by ruff [FURB171](https://docs.astral.sh/ruff/rules/single-item-membership-test/#single-item-membership-test-furb171). Pull Request resolved: https://github.com/pytorch/pytorch/pull/164224 Approved by: https://github.com/rec, https://github.com/Skylion007	2025-09-30 19:59:43 +00:00
Jeff Daily	ffc645c870	half support for fused_moving_avg_obs_fake_quant() op (#164175 ) Follow up to https://github.com/pytorch/pytorch/pull/162620. Add half support, as well. This fixes some failures in inductor benchmarks such as from this log https://github.com/pytorch/pytorch/actions/runs/18051942373/job/51376749459. `NotImplementedError: "aminmax_kernel" not implemented for 'Half'` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164175 Approved by: https://github.com/malfet, https://github.com/jerryzh168	2025-09-30 19:35:17 +00:00
Han Qi	60f0a356fd	Update persons of interest for XLA. The previous one is out of date. (#158652 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158652 Approved by: https://github.com/JackCaoG, https://github.com/albanD	2025-09-30 19:21:18 +00:00
Kohaku-Blueleaf	d2c5f231f6	Fix the shape check inside gnll loss (#147522 ) Fixes #147521 This modification allows users to pass a var of any size to GaussianNLLLoss as long as the var is broadcastable (to the input/target's size). Therefore, the demo code in #147521 will result in the expected behaviour and correct output. This allows all input sizes that match: `input.size = (..., n, ...), var.size = (..., 1, ...)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/147522 Approved by: https://github.com/mikaylagawarecki	2025-09-30 18:40:15 +00:00
PyTorch MergeBot	cc5d74c366	Revert "[BE] Remove HermeticPyObjectTLS and Simplify PythonOpRegistrationTrampoline (#163464 )" This reverts commit 94195a37ae4eae9c486a81b0f67725c8970f74d6. Reverted https://github.com/pytorch/pytorch/pull/163464 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/163464#issuecomment-3353307034))	2025-09-30 18:20:20 +00:00
Markus Hoehnerbach	a707042353	fix: inductor non_blocking test - warmup events to make test pass whether it is the first run or not (#164188 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164188 Approved by: https://github.com/williamwen42	2025-09-30 18:20:17 +00:00
Pian Pawakapan	d615f6b935	[inductor] use hint_override in kernel benchmark args (#164207 ) Summary: forward fix T239259207 Test Plan: test_multi_kernel Differential Revision: D83539263 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164207 Approved by: https://github.com/bobrenjc93, https://github.com/mlazos	2025-09-30 18:09:29 +00:00
Nick Riasanovsky	719b64ee8b	Fix TMA transpose logic to handle 1D shapes + string differences (#163966 ) Fixes #163702. This fixes 2 issues: 1. The value may inconsistently be a shape or string. This normalizes to handle both of these. 2. 1D shapes should not transpose data. This fixes the order of operations to prevent this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163966 Approved by: https://github.com/eellison	2025-09-30 17:51:37 +00:00
Mwiza Kunda	1cf1b9138d	[inductor][templates] Template hooks should be finalised inside a kernel context (#164229 ) The prologue buffer added in https://github.com/pytorch/pytorch/pull/160480 is added to template code in the DEF_KERNEL [hook](`29221b9828/torch/_inductor/select_algorithm.py (L742)`). The lines in this buffer may be of type `DeferredLine`, and so require the correct kernel context to determine whether lines should be added or removed. Test plan: Tested with a custom template using tensor descriptors for prologue fused inputs, whose tensor descriptors need to be hoisted to the top of the kernel. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164229 Approved by: https://github.com/njriasan	2025-09-30 17:50:59 +00:00
William Wen	5ed4672477	[dynamo, 3.14] fix _detect_and_normalize_assert_statement for 3.14 (#164005 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164005 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191, #163292, #163796, #163818, #163919, #163920, #164004	2025-09-30 17:43:03 +00:00
William Wen	2600f8b3d1	[dynamo, 3.14] fix tracing typing.Union (#164004 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164004 Approved by: https://github.com/anijain2305, https://github.com/mlazos ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191, #163292, #163796, #163818, #163919, #163920	2025-09-30 17:43:03 +00:00
William Wen	9ce31e4278	[3.14] make unbacked_sym[int/float]_counter integers (#163920 ) 3.14 removed copy/deepcopy/pickle support for `itertools` iterators: https://docs.python.org/3.14/whatsnew/3.14.html#itertools Change unbacked_sym[int/float]_counter from `itertools.count` to regular integers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163920 Approved by: https://github.com/ezyang ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191, #163292, #163796, #163818, #163919	2025-09-30 17:42:55 +00:00
William Wen	0657de9c61	[dynamo, 3.14] support LOAD_COMMON_CONSTANT (#163919 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163919 Approved by: https://github.com/anijain2305, https://github.com/mlazos ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191, #163292, #163796, #163818	2025-09-30 17:42:47 +00:00
William Wen	4ead8ebf70	[dynamo, 3.14] fix BUILD_TUPLE with 0 args (#163818 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163818 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191, #163292, #163796	2025-09-30 17:42:40 +00:00
William Wen	d4b785a6a7	[dynamo, 3.14] fix stack ref copy error (#163796 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163796 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191, #163292	2025-09-30 17:42:33 +00:00
William Wen	9278b18ec0	[dynamo, 3.14] fix WITH_EXCEPT_START (#163292 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163292 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110, #163191	2025-09-30 17:42:26 +00:00
William Wen	008b0a9425	[dynamo, 3.14] fix inactive ctx handling in resume functions (#163191 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163191 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839, #163009, #163109, #163110	2025-09-30 17:42:19 +00:00
William Wen	44677ad917	[dynamo, 3.14] support LOAD_CONST on slice, codegen LOAD_CONST slice instead of BINARY/STORE_SLICE (#163110 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163110 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839, #163009, #163109	2025-09-30 17:42:11 +00:00
William Wen	1c9987fdf4	[dynamo, 3.14] fix context managers (#163109 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163109 Approved by: https://github.com/anijain2305, https://github.com/mlazos ghstack dependencies: #161838, #161555, #161839, #163009	2025-09-30 17:42:03 +00:00
William Wen	7cbc011700	[dynamo, 3.14] support some bytecodes, fix CALL_FUNCTION_EX (#163009 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163009 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838, #161555, #161839	2025-09-30 17:41:56 +00:00
William Wen	09c774145e	[dynamo, 3.14] Python dynamo changes to get basic programs working (#161839 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161839 Approved by: https://github.com/Lucaskabela, https://github.com/anijain2305 ghstack dependencies: #161838, #161555	2025-09-30 17:41:49 +00:00
William Wen	763ab2a6ed	[dynamo, 3.14] compile actual code in C dynamo (#161555 ) No 3.14 CI tests enabled yet, but this was enough to get Dynamo compiling locally and Python Dynamo is at least being called. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161555 Approved by: https://github.com/anijain2305 ghstack dependencies: #161838	2025-09-30 17:41:42 +00:00
William Wen	4b8fe795f8	[dynamo] format cpython_defs.c (#161838 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161838 Approved by: https://github.com/Skylion007, https://github.com/anijain2305	2025-09-30 17:41:35 +00:00
IvanKobzarev	84e1cd7392	[inductor] fx comm overlap: align runtime estimations across dist ranks (#164226 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164226 Approved by: https://github.com/eellison	2025-09-30 17:29:18 +00:00
Yiming Zhou	937869657e	Exporting aten.sdpa with cuda under fake mode on a cuda-less machine (#164162 ) Summary: As titled. sdpa will select backend based on hardware check, and it fails when exporting with cuda under fake mode on a cuda-less machine. We guard `at::cuda::is_available()` check before `at::cuda::getCurrentDeviceProperties()` and give warnings. Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r nn_functional_scaled_dot_product_attention Differential Revision: D83496154 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164162 Approved by: https://github.com/SherlockNoMad	2025-09-30 17:21:31 +00:00
Henry Tsang	7d7ae4d7b2	[submodule] upgrade cutlass version to 4.2.1 and completely resolved python/cutlass name collision (#164156 ) Differential Revision: D83489362 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164156 Approved by: https://github.com/Skylion007, https://github.com/mlazos	2025-09-30 17:04:57 +00:00
Jeff Daily	906fe7b120	[ROCm][CI] no longer build almalinux image for ROCm 6.3 (#164201 ) Missed during ROCm 7 upgrades. We only build N and N-1. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164201 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-30 16:59:31 +00:00
Blaine Burton Rister	7edd18f0fd	[Inductor-FX] Generalize FloorDiv conversion to handle more complex launch grids. Remove python_slow grid mode. (#163828 ) # Problem Inductor's FX backend receives sympy expressions for Triton launch grids, and passes these to a tracer to generate equivalent FX IR. However, the tracer does not support all possible sympy expressions. In particular, it can't handle ops like `floor` and `Pow` which would be found in an expression like `floor(x / y)`. Instead, it expects `FloorDiv(x, y)`, which has the advantage that all intermediate values are integers, unlike `x / y`. Inductor's Python backend uses a trick where `ceil(x / y)` is computed in Python as `-(x // -y)`, which is faster when evaluating Python launch grids at runtime. However, this trick generates more complex sympy expressions, so the FX backend introduced a `"python_slow"` mode using a more familiar form of ceil division. However, this mode is slower to evaluate, which increased production CPU usage. (Internal reviewers see T237853632.) # Solution To get the best of both worlds, this PR removes `"python_slow"` mode, and generalizes the `replace_floor_div` function to handle the more complex expressions resulting from the `"python"` grid mode. The new algorithm is conceptually similar to the existing one, except instead of analyzing only the first argument to a `sympy.Mul` op, it checks all factors, so it can handle expressions containing both `Rational` and `Pow` ops, among other cases. It also uses `Mul.make_args` to handle the case when the argument to `floor` is not a `Mul`. Finally, it uses `expr.is_positive` to check the sign of symbolic exponents. This new algorithm is guaranteed to convert all `floor` ops to an equivalent expression using `FloorDiv`. (To see this, consider that `floor(x) == FloorDiv(x, 1)`.) 
Note it may not remove all `Pow` ops, with a counterexample being `floor(x / (2 + z ** y))`, but it covers everything we've seen in practice for symbolic launch grids. In particular, it covers the typical case where `Pow` is a factor of the argument to `floor`, and the exponent is `-1`. In this situation, we move the `Pow` to the denominator of `FloorDiv` and the exponent becomes `1`, eliminating the `Pow` op. # Test plan This PR adds an end-to-end test for static padding with dynamic outer dimensions, which creates a difficult sympy expression that the existing algorithm would not be able to handle. This PR also adds some unit tests for the `replace_floor_div` function. It can be difficult to construct end-to-end tests that expose all the trickiest expressions, as those tests have to pass through a number of other systems handling dynamic shapes. Therefore, it's easier to expose the edge cases with these new unit tests. The tests check that we can replace all `floor` ops in the input expression with `FloorDiv`, then they expand `FloorDiv` back to `floor` and check equality with the original expression. Note this PR also requires some MTIA changes to pass internal tests. Those will be stacked onto the imported diff. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163828 Approved by: https://github.com/nandesuka, https://github.com/angelayi, https://github.com/jansel	2025-09-30 16:47:49 +00:00
Sherlock Huang	3564cd294c	Fix TestExportOpInfo (#164184 ) Fixes https://github.com/pytorch/pytorch/issues/163699 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164184 Approved by: https://github.com/yiming0416, https://github.com/tugsbayasgalan	2025-09-30 16:12:39 +00:00
Zhengxu Chen	1412a4a42f	[precompile] Add option to disable guard check on aot-compiled function. (#163432 ) Summary: Under some circumstances it seems reasonable to return a callable directly without a guard check when the user uses aot_compile on a function with a single compilation result. When there are multiple entries (aot_compile_module), we should start enabling the guard check to differentiate the compiled functions from one another. Test Plan: CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/163432 Approved by: https://github.com/dolpm, https://github.com/mlazos	2025-09-30 16:10:15 +00:00
clee2000	96330f490d	[testing] Add upload for test status during test stat uploads (#164189 ) Add test status (flaky, success, skipped, failure) upload for easier comparison between test status on two commits Pull Request resolved: https://github.com/pytorch/pytorch/pull/164189 Approved by: https://github.com/huydhn, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-30 15:53:53 +00:00
eqy	66abba8f49	[CUDA][Expandable Segments] Follow-up cleanups for even more expandable segments tests (#163297 ) Gets original setting even earlier in case of crashes, fixes previous get call where set should be Pull Request resolved: https://github.com/pytorch/pytorch/pull/163297 Approved by: https://github.com/Skylion007	2025-09-30 15:39:14 +00:00
Svetlana Karslioglu	e88cca0691	Update Sphinx theme (#164147 ) Fix links in the top nav bar: `71e55749be` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164147 Approved by: https://github.com/albanD	2025-09-30 15:35:58 +00:00
Jonah Bernard	5c020beba4	Update LPPool docs to clarify ceil_mode padding semantics when ceil_mode=True (#163186 ) # Summary - Add a note to each `nn.LPPoold` docstring explaining how `ceil_mode=True` interacts with right padding. - Mirror the same clarification in the `torch.nn.functional.lp_pool` docstrings so the rendered functional docs stay in sync. # Motivation The current PyTorch spec for LPPool does not fully match runtime behavior, which has led to downstream confusion in other specs (e.g., ONNX) and runtimes (e.g., [onnxruntime issue #25848](https://github.com/microsoft/onnxruntime/issues/25848)). A corresponding clarification was also made in the ONNX spec: [onnx/onnx#5741](https://github.com/onnx/onnx/pull/5741). PyTorch’s LPPool implementation calls into AvgPool, which enforces the rule that windows starting entirely in the right padded region are ignored when `ceil_mode=True`. As a result, LPPool inherits the same behavior. This is an edge case where the output size formula shown in the LPPool docs/spec is not sufficient on its own. Without the added caveat, the documentation is technically incorrect. This PR brings the LPPool docs in line with actual behavior. Note that this is a trivial fix to the spec as all major implementers of the spec adhere to this caveat. For comparison, both MaxPool and AvgPool already include this clarification in their spec. Their docstrings explicitly state: > When `ceil_mode=True`, sliding windows are allowed to go off-bounds if they start within the left padding or the input. Sliding windows that would start in the right padded region are ignored. Adding the same note to LPPool ensures consistency across all pooling operators. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163186 Approved by: https://github.com/mikaylagawarecki	2025-09-30 15:22:46 +00:00
atalman	edd9e07aff	[BE] Remove not existing mnist mirror (#164238 ) Looks like original source is empty now: http://yann.lecun.com/exdb/mnist/ Pytorch hosted mirror exist. Hence leaving it as only option. https://ossci-datasets.s3.amazonaws.com/mnist/ Fixes these errors in pytorch/ci: ``` C:\actions-runner\_work\pytorch\pytorch>python tools\download_mnist.py --quiet -d C:\actions-runner\_work\pytorch\pytorch\test\cpp\api\mnist Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz ... Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz ... Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz ... Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz ... Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz ... Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz ... Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz ... Failed to download (trying next): HTTP Error 404: Not Found Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz ... ``` Link to workflow with example: https://github.com/pytorch/pytorch/actions/runs/18109150240/job/51542177282#step:15:2335 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164238 Approved by: https://github.com/jeanschmidt	2025-09-30 15:15:13 +00:00
PyTorch MergeBot	0fb89b84b9	Revert "Consistently use c10_ovrsource in arvr mode everywhere (#164128 )" This reverts commit efd7fd5ed5ac7ec03201a546a09fb19ec59de431. Reverted https://github.com/pytorch/pytorch/pull/164128 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/164128#issuecomment-3352544006))	2025-09-30 14:43:52 +00:00
PyTorch MergeBot	79fcfd49d6	Revert "[CI] Push `viable/strict/${time}` tags (#164183 )" This reverts commit 9f27b0c24515d9cf319d9a728d5009bf9ed035cf. Reverted https://github.com/pytorch/pytorch/pull/164183 on behalf of https://github.com/malfet due to Hmm, didn't work that way ([comment](https://github.com/pytorch/pytorch/pull/164183#issuecomment-3352494098))	2025-09-30 14:32:46 +00:00
PyTorch MergeBot	71b4fada57	Revert "Add less warps config to inner reductions (#162447 )" This reverts commit 84d673ef577d42d6ec20c6c9f09863583c3111f5. Reverted https://github.com/pytorch/pytorch/pull/162447 on behalf of https://github.com/PaulZhang12 due to internal failure ([comment](https://github.com/pytorch/pytorch/pull/162447#issuecomment-3352474768))	2025-09-30 14:28:19 +00:00
Yuanyuan Chen	46ec0664e3	Remove unused PyIntXXX, THPUtils_newReal_BOOL, THPQXXX macros (#164056 ) The removed macros are not used in other places of the `pytorch` GitHub org. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164056 Approved by: https://github.com/albanD	2025-09-30 13:48:25 +00:00
PyTorch MergeBot	410ed3006b	Revert "Add functions to setup PrivateUse1 as a python backend device. (#157859 )" This reverts commit 1310d6a1f9194ddcf6753f7e12fb78f278451f8a. Reverted https://github.com/pytorch/pytorch/pull/157859 on behalf of https://github.com/jeanschmidt due to introduce linting errors ([comment](https://github.com/pytorch/pytorch/pull/157859#issuecomment-3352140098))	2025-09-30 13:24:37 +00:00
zeshengzong	77354e22e1	[OpenReg] Add AMP Integration guide for accelerators (#162050 ) Fix part of #158917 Add AMP integration document and OpenReg code as example to explain steps of integration. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162050 Approved by: https://github.com/albanD Co-authored-by: FFFrog <ljw1101.vip@gmail.com>	2025-09-30 12:27:11 +00:00
ankushwahaRH	7f29c47a4f	Fix cdist export compute mode validation (#161724 ) Fixes #161089. Added '0' as the acceptable value for compute mode in _meta_registrations.py. Also, added a test case in test_export.py file. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161724 Approved by: https://github.com/albanD, https://github.com/angelayi	2025-09-30 12:23:20 +00:00
Mwiza Kunda	ace6c76103	[inductor] Small refactor of CachingAutotuner (#162406 ) This is a simple refactor that just moves some logic in `_precompile_config` to two new functions for separation of concerns. This will allow subclasses e.g. out of tree to configure options and metadata for triton.compile. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162406 Approved by: https://github.com/exclamaforte	2025-09-30 11:29:15 +00:00
Han Qi	1310d6a1f9	Add functions to setup PrivateUse1 as a python backend device. (#157859 ) Fixes #156052 and #156444. This PR setup the privateuseone key in Python to be used as a python backend for pytorch. Meaning that, after calling `setup_privateuseone_for_python_backend('npy')`, one can use a subclass to with that device to hold arbitrary python data as "device data" and use `torch.library` to register ops that takes that Tensor. Changes done in this PR: 1. Register an vanilla Device Guard: I extended NoOpDeviceGuard to have allow device index of 0 and to not raise errors when event related functions are accessed. If I don't do those, when calling backward I would get errors. (CPU backend uses NoOpDeviceGuard just fine, although there seems to be special treatment of CPU in the autograd engine. 2. Tensor subclass allows not having `__torch_dispatch__` if the device is not CUDA or CPU. The comment of the check suggests it was to avoid segfault when calling into ops that expects a storage. Here we have a different device so will not call into those ops. 3. python function that invokes the other incantations to setup the privateusekey backend. This took inspiration of https://github.com/bdhirsh/pytorch_open_registration_example and https://github.com/tinygrad/tinygrad/blob/master/extra/torch_backend/wrapped_tensor.cpp; great thanks to @bdhirsh and @geohot. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157859 Approved by: https://github.com/albanD	2025-09-30 08:39:36 +00:00
Tristan Rice	7f4c3e7d2f	distributed/serialization: support zero sized tensors (#164198 ) Fixes ``` [4] ValueError: both buffer length (0) and count (-1) must not be 0 ``` Test plan: ``` pytest test/distributed/test_serialization.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164198 Approved by: https://github.com/amirafzali	2025-09-30 08:11:29 +00:00
Sherlock Huang	6e5b4249a5	[DTensor][Export] Supporting exporting a model with DTensor params/inputs (#163609 ) I experimented with 3 paths to get joint graph for DTensorized module and input 1. strict_export + aot_export_joint_with_descriptors 2. graph_capture + aot_export_joint_with_descriptors 3. aot_export_joint_with_descriptors alone Added test to guard them. 1 doesn't work, as bw graph region is missing from the joint graph. I am leaning towards making 2 the recommended path. If 2 doesn't work going forward, we can fallback to 3. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163609 Approved by: https://github.com/tugsbayasgalan Co-authored-by: suo <suo@fb.com>	2025-09-30 07:54:13 +00:00
Animesh Jain	5274753873	[dynamo][device_mesh] Support mesh_dim_names (#164200 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164200 Approved by: https://github.com/SherlockNoMad, https://github.com/jansel	2025-09-30 07:16:28 +00:00
Yavuz Yetim	7afcb030d8	Back out "Revert D81959389" (#163905 ) Summary: Original commit changeset: 06888d7ebff0 Original Phabricator Diff: D82932788 Restricted the test to SM90 for scaled_grouped_mm Test Plan: TBD (will share the linux CI results) Differential Revision: D83283991 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163905 Approved by: https://github.com/angelayi	2025-09-30 07:05:13 +00:00
Animesh Jain	bbf6816f35	[dynamo] Special path for cloning of torch dispatch tensors (#164081 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164081 Approved by: https://github.com/tugsbayasgalan, https://github.com/mlazos	2025-09-30 05:15:56 +00:00
vishalgoyal316	ace89350fc	better error handling for rrelu when lower or upper range is infinite (#160965 ) … - issue#153281 Fixes #153281 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160965 Approved by: https://github.com/janeyx99	2025-09-30 05:01:32 +00:00
eellison	7d59e37434	Add Comm-Compute Preserving Bucketer (#163960 ) tl;dr performs bucketing while preserving comm-compute overlap. In comm-compute overlap we will have a graph with: ``` def foo(...): ag = all_gather(...) hiding_compute = mm(...) wait(ag) ``` There is no explicit dependency between the hiding compute and the collectives, but we want to add implicit dependencies from wait->hiding_compute, and from hiding_compute->all_gather to preserve overlap. Additionally, while bucketing, we will merge collective starts and collective waits together. In this case, we will want to treat the two nodes as a single subgraph - each node in the merged set will have the union of all deps in the set. We perform bucketing while augmenting the graph with these relationships. This can be done separably from comm-compute overlap, so long as the hiding compute relationships are passed in. TODO: - need to instrument fx graph so inductor respects these relationships. - the compile time of the bucketing search can be sped up significantly by limiting what portion of the graph we traverse through - more memory aware handling Pull Request resolved: https://github.com/pytorch/pytorch/pull/163960 Approved by: https://github.com/ruisizhang123, https://github.com/v0i0, https://github.com/IvanKobzarev ghstack dependencies: #163215, #163754, #163959	2025-09-30 04:53:58 +00:00
eellison	92108f4abd	Helper to augment graph with additional deps (#163959 ) In comm-compute overlap we will have a graph with: ``` def foo(...): ag = all_gather(...) hiding_compute = mm(...) wait(ag) ``` There is no explicit dependency between the hiding compute and the collectives, but we want to add implicit dependencies from wait->hiding_compute, and from hiding_compute->all_gather to preserve overlap. Additionally, while bucketing, we will merge collective starts and collective waits together. In this case, we will want to treat the two nodes as a single subgraph - each node in the merged set will have the union of all deps in the set. This pr adds `AugmentedGraphHelper` that adds the apis, and allows querying for dependency with this augmented graph. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163959 Approved by: https://github.com/v0i0, https://github.com/IvanKobzarev ghstack dependencies: #163215, #163754	2025-09-30 04:53:58 +00:00
eellison	0b2fdc30a2	refactor bucketing (#163754 ) Preparatory refactor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163754 Approved by: https://github.com/IvanKobzarev ghstack dependencies: #163215	2025-09-30 04:53:58 +00:00
eellison	0d7994ca97	[inductor] do comm compute overlap at aten fx level (#163215 ) This is first part of the stack that does comm/compute reordering, and then uses the exposure analysis to do bucketing. Subsequent prs will handle: - use of exposure analysis to do bucketing - make sure inductor respects comm/compute overlapping done at fx level - non-profiling mm estimation/rank broadcasting of profile results Other mis: - Validate accuracy of nccl estimations ( use ruisi's profiling instead ?) For a llama 2d parallelism test, on forward, we overlap all but 2 of potentially hidden collectives. For backward, we overlap 217/269 of potentially hidden collectives. If you increase `compute_overlap_multipler` (for fudge factor of inaccurate comms estimation), that goes down to all but 16 of potentially hidden collectives. fwd example: https://gist.github.com/eellison/76209c49d8829c5f1e323d34a3f040c3 bwd example: https://gist.github.com/eellison/6cfc2285df53a94cfa4012f5fdae5c51 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163215 Approved by: https://github.com/IvanKobzarev	2025-09-30 04:53:58 +00:00
bobrenjc93	c39357bab6	[torchfuzz] Make scalar and tensor distribution configurable (#164034 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164034 Approved by: https://github.com/pianpwk	2025-09-30 04:50:54 +00:00
Yuanyuan Chen	a293206bd5	Fix invalid f-strings (#164112 ) Fixes invalid f-strings detected by `ruff`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164112 Approved by: https://github.com/Skylion007, https://github.com/mlazos	2025-09-30 04:17:13 +00:00
Nikita Shulga	9f27b0c245	[CI] Push `viable/strict/${time}` tags (#164183 ) Every time viable strict is updated Pull Request resolved: https://github.com/pytorch/pytorch/pull/164183 Approved by: https://github.com/seemethere	2025-09-30 04:00:22 +00:00
Yuanyuan Chen	85012fe167	Remove unnecessary list comprehensions (#164103 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/164103 Approved by: https://github.com/Lucaskabela, https://github.com/mlazos	2025-09-30 03:56:54 +00:00
PyTorch MergeBot	ca19815e3c	Revert "Enable outer reductions in fbcode (#163884 )" This reverts commit 872edd89d62f0095d3fbd8ae9204d7c8bd980460. Reverted https://github.com/pytorch/pytorch/pull/163884 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/163884#issuecomment-3349822031))	2025-09-30 03:42:24 +00:00
Rachel Guo	0b0ed6fd33	[doc] Add AOTInductor intermediate debug printer OSS user manual (#163794 ) Summary: Add a OSS user manual for AOTI intermediate debug printer so we can link it in the Pytorch conference poster. Test Plan: N/A Differential Revision: D83171374 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163794 Approved by: https://github.com/yushangdi	2025-09-30 03:01:03 +00:00
Nikita Shulga	55840fb4bb	[CMake] Fix `USE_FBGEMM_GENAI` option (#164165 ) ---- - `cmake_dependent_option` condition should be `USE_ROCM OR (USE_CUDA AND NOT MSVC)` (similar to the one for flash attention) - Default settings should be user overridable, i.e. even if one builds for SM_10, they should be able to pass `USE_FBGEMM_GENAI=0` and skip the build Pull Request resolved: https://github.com/pytorch/pytorch/pull/164165 Approved by: https://github.com/Skylion007	2025-09-30 02:38:03 +00:00
Jeff Daily	b7419b920d	[ROCm][CI] Upgrade ROCm to 7.0 (#163140 ) Upgrade all the ROCm docker image to ROCm 7.0 release version. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163140 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-30 02:23:26 +00:00
Wei Wang	3b4ad4a17d	[AARCH64][CD][CUDA13][Triton][PTXAS] Turn on BUILD_BUNDLE_PTXAS=1 (#163988 ) See also #163972, which was intended to be this PR. Triton (release/3.5.x) by default ships CUDA12.8 ptxas. This PR tries to bundle a ptxas version for cuda13, so that it can help https://github.com/pytorch/pytorch/issues/163801 when users run on new devices like THOR and Spark. Fixes https://github.com/pytorch/pytorch/issues/163801 Test Plan: Check binary size increase against nightly or v2.9RC Install the binary from into a working THOR and GB200/GH100 machine (reproduce the original issue first on THOR), then install the binary built from this PR and we expect the issue to be gone without any additional user setting. Testing on GB200 is to ensure no regression. Reference: https://github.com/pytorch/pytorch/pull/119750 and `5c814e2527` Note: with this PR, the pytorch world's torch.compile is supposed to find ptxas via "torch/_inductor/runtime/compile_tasks.py" and "_set_triton_ptxas_path". Use cases that do not go through "_set_triton_ptxas_path" may not be able to use the cuda13 ptxas binary. However, as is, the triton world does not know the existence of this new cuda13 ptxas. So IF a users thinks there is already pytorch/bin/ptxas and delete the ptxas from triton, then `c6ad34f7eb/python/triton/knobs.py (L216)` would still complain ptxas not found (if removed - it won't know this new one available) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163988 Approved by: https://github.com/atalman	2025-09-30 01:56:12 +00:00
Jeff Daily	4cf2900474	CUDACachingHostAllocatorImpl skip event query during capture (#164001 ) The CUDACachingAllocator already does this, so there is precedent. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164001 Approved by: https://github.com/eqy	2025-09-30 01:19:53 +00:00
Pian Pawakapan	474d07554a	[dynamic shapes] unbacked-safe slicing (#161414 ) Summary: Generates new unbacked symbols for slice output size & storage offset, when appropriate semantics are unclear. Teaches inductor to codegen the slice with flexible semantics. Test Plan: contbuild & OSS CI, see `56218d85e2` Rollback Plan: Differential Revision: D80948073 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161414 Approved by: https://github.com/laithsakka	2025-09-30 01:15:19 +00:00
Yukio Siraichi	089f9130ed	Install `fmtlib` headers. (#164139 ) `fmtlib` version was updated to 12.0.0 in #163441. In this new version, due to https://github.com/fmtlib/fmt/pull/4536, PyTorch started not installing `fmtlib` headers anymore. Because of that, PyTorch/XLA build CI started to fail https://github.com/pytorch/xla/issues/9653. While we did fix it internally https://github.com/pytorch/xla/pull/9650, I believe that PyTorch should continue installing the `fmtlib` headers, since it is a dependency of its C API [`python_arg_parser.h`][1]. PyTorch/XLA CI was moved to `unstable.yml` in #159272, and later removed in #163564. This PyTorch/XLA build failure went under the radar, since the `fmtlib` update only landed on September 22. [1]: `84d673ef57/torch/csrc/utils/python_arg_parser.h (L42)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164139 Approved by: https://github.com/Skylion007, https://github.com/malfet	2025-09-30 01:10:13 +00:00
Yuanyuan Chen	da003d7b95	[3/N] Import Callable from collections.abc in torch/distributed (#164104 ) This is the result of applying the ruff `UP035` check. `Callable` is imported from `collections.abc` instead of `typing`. This PR is the follow-up of #164054. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164104 Approved by: https://github.com/Skylion007	2025-09-30 00:28:53 +00:00
atalman	cee4e36f9a	[BE] remove manylinuxcxx11-abi-builder:cpu-cxx11-abi docker image (#164187 ) I believe this image is not used anywhere anymore. Test: ``` git grep manylinuxcxx11-abi-builder git grep manylinuxcxx11 ``` Return no results. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164187 Approved by: https://github.com/izaitsevfb, https://github.com/malfet, https://github.com/seemethere	2025-09-30 00:26:20 +00:00
Ke Wen	704cd771f6	[PP] Customize pipeline's submod name (#164037 ) Changing PP submodules' name from `submod_i` to `submod_pp_i` to distinguish from the submodule created by HOP. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164037 Approved by: https://github.com/H-Huang ghstack dependencies: #164045, #164035	2025-09-29 23:29:52 +00:00
eellison	d58f7c3ad1	[Easy] Add pointwise tag to fma (#164149 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164149 Approved by: https://github.com/fmassa	2025-09-29 22:40:04 +00:00
dependabot[bot]	170e0309ca	Bump protobuf from 5.29.4 to 5.29.5 in /.ci/docker (#156157 ) * Bump protobuf from 5.29.4 to 5.29.5 in /.ci/docker Bumps [protobuf](https://github.com/protocolbuffers/protobuf) from 5.29.4 to 5.29.5. - [Release notes](https://github.com/protocolbuffers/protobuf/releases) - [Changelog](https://github.com/protocolbuffers/protobuf/blob/main/protobuf_release.bzl) - [Commits](https://github.com/protocolbuffers/protobuf/compare/v5.29.4...v5.29.5) --- updated-dependencies: - dependency-name: protobuf dependency-version: 5.29.5 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> * Update .ci/docker/requirements-ci.txt --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-29 15:20:44 -07:00
PyTorch MergeBot	0f619c1f89	Revert "[inductor] do comm compute overlap at aten fx level (#163215 )" This reverts commit c9b5af9a384e7ef5f95613abe1622f5f55133c3a. Reverted https://github.com/pytorch/pytorch/pull/163215 on behalf of https://github.com/yangw-dev due to seems fails inductor/test_aten_comm_compute_reordering for macos test, see `c9b5af9a38 (51526707590-box)` ([comment](https://github.com/pytorch/pytorch/pull/163215#issuecomment-3349177940))	2025-09-29 21:53:42 +00:00
PyTorch MergeBot	b28e4f1f87	Revert "refactor bucketing (#163754 )" This reverts commit e1bd5b60cf243d3a026a6c89733488a6d9d4b33d. Reverted https://github.com/pytorch/pytorch/pull/163754 on behalf of https://github.com/yangw-dev due to seems fails inductor/test_aten_comm_compute_reordering for macos test, see `c9b5af9a38 (51526707590-box)` ([comment](https://github.com/pytorch/pytorch/pull/163215#issuecomment-3349177940))	2025-09-29 21:53:42 +00:00
PyTorch MergeBot	84dc54ae5e	Revert "Helper to augment graph with additional deps (#163959 )" This reverts commit b5d4d350f573db12b8181ee13f9386d6ef8a1e57. Reverted https://github.com/pytorch/pytorch/pull/163959 on behalf of https://github.com/yangw-dev due to seems fails inductor/test_aten_comm_compute_reordering for macos test, see `c9b5af9a38 (51526707590-box)` ([comment](https://github.com/pytorch/pytorch/pull/163215#issuecomment-3349177940))	2025-09-29 21:53:42 +00:00
Klaus Zimmermann	50d418f69f	Replace setup.py bdist_wheel with python -m build --wheel (#156712 ) Previously we already replaced most use of `python setup.py develop/install`. This PR also replaces the use of `setup.py bdist_wheel` with the modern `python -m build --wheel` alternative. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156712 Approved by: https://github.com/atalman ghstack dependencies: #156711	2025-09-29 21:51:32 +00:00
Catherine Lee	c332d58184	[testing] upload test stats: Add info to the invoking file summary and some other changes (#164016 ) * Changes some internal logic for grouping so hopefully it's slightly less annoying write code for * Changes the invoking file summary to just use file, which I think is correct most of the time * Adds some fields to the file summary, like skips, errors, etc so I can reuse it for file report regression things Output should be the same, maybe with slightly more fields since I got rid of some of the pops Pull Request resolved: https://github.com/pytorch/pytorch/pull/164016 Approved by: https://github.com/huydhn	2025-09-29 21:20:18 +00:00
Edward Yang	efd7fd5ed5	Consistently use c10_ovrsource in arvr mode everywhere (#164128 ) Summary: Previously, many arvr targets transitively depended on c10, not c10_ovrsource, because they either explicitly depended on c10 (because they didn't know better) or they depended on legacy Caffe2, which never got the ovrsource treatment. So we found all these spots (driven by D82283623) and forced them to query arvr mode to figure out which one they should use. The goal is you NEVER have both targets in the same build rule at the same time. This diff could be reverted if D82224960 works out but I haven't gotten it to work yet. Test Plan: sandcastle Reviewed By: EscapeZero Differential Revision: D82390436 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164128 Approved by: https://github.com/albanD, https://github.com/malfet	2025-09-29 20:47:20 +00:00
eellison	b5d4d350f5	Helper to augment graph with additional deps (#163959 ) In comm-compute overlap we will have a graph with: ``` def foo(...): ag = all_gather(...) hiding_compute = mm(...) wait(ag) ``` There is no explicit dependency between the hiding compute and the collectives, but we want to add implicit dependencies from wait->hiding_compute, and from hiding_compute->all_gather to preserve overlap. Additionally, while bucketing, we will merge collective starts and collective waits together. In this case, we will want to treat the two nodes as a single subgraph - each node in the merged set will have the union of all deps in the set. This pr adds `AugmentedGraphHelper` that adds the apis, and allows querying for dependency with this augmented graph. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163959 Approved by: https://github.com/v0i0, https://github.com/IvanKobzarev ghstack dependencies: #163215, #163754	2025-09-29 20:43:12 +00:00
Nikita Shulga	6db1b9dd21	[MPS] Chunk fillBuffer into 4Gb slices (#164108 ) To avoid regression on MacOS 26, which one could observe by running the following script ```swift import Metal let bufferSize = 1<<32 + 4 guard let device = MTLCreateSystemDefaultDevice() else { fatalError("No Metal device found") } guard let buffer = device.makeBuffer(length: bufferSize, options: .storageModeShared) else { fatalError("Failed to create buffer") } guard let cmdQueue = device.makeCommandQueue() else { fatalError("Failed to create command queue") } guard let cmdBuffer = cmdQueue.makeCommandBuffer() else { fatalError("Failed to create command buffer") } guard let blitEncoder = cmdBuffer.makeBlitCommandEncoder() else { fatalError("Failed to create blit encoder") } blitEncoder.fill(buffer: buffer, range: 0..<bufferSize, value: 0x42) blitEncoder.endEncoding() cmdBuffer.commit() cmdBuffer.waitUntilCompleted() let tailOffs = 8 let hostPtr = buffer.contents().bindMemory(to: UInt8.self, capacity: bufferSize) let tail = Array(UnsafeBufferPointer(start: hostPtr + (bufferSize - tailOffs), count: tailOffs)) for (idx, val) in tail.enumerated() { print("Offs 0x\(String(bufferSize - tailOffs + idx, radix: 16)): 0x\(String(val, radix: 16))") } ``` Test plan: run `test_indexing.py` on MacOS-26 Fixes https://github.com/pytorch/pytorch/issues/161265 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164108 Approved by: https://github.com/Skylion007	2025-09-29 20:19:29 +00:00
PyTorch MergeBot	9e792f583a	Revert "[export] Skip the check instead of disable (#164084 )" This reverts commit c2768d0f5af840a94c342ed9eac3e26c819aa3f0. Reverted https://github.com/pytorch/pytorch/pull/164084 on behalf of https://github.com/yangw-dev due to broke internal tests ([comment](https://github.com/pytorch/pytorch/pull/164084#issuecomment-3348862668))	2025-09-29 20:09:13 +00:00
PyTorch MergeBot	6650f5af74	Revert "[dynamo] Special path for cloning of torch dispatch tensors (#164081 )" This reverts commit 811c693c49f7cd3da2ea174955d12f2f8780bd46. Reverted https://github.com/pytorch/pytorch/pull/164081 on behalf of https://github.com/yangw-dev due to broke internal tests ([comment](https://github.com/pytorch/pytorch/pull/164084#issuecomment-3348862668))	2025-09-29 20:09:13 +00:00
atalman	349c960970	Use linux.g4dn.4xlarge.nvidia.gpu for cuda 12.4 legacy driver tests (#163956 ) Workaround for https://github.com/pytorch/pytorch/issues/163658 Looks like the workflow passes on 12.8 builds that use linux.g4dn.4xlarge.nvidia.gpu but it's failing on 12.6 builds that use linux.4xlarge.nvidia.gpu: https://github.com/pytorch/pytorch/actions/runs/17953843505/job/51080623612#step:13:470 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163956 Approved by: https://github.com/malfet Co-authored-by: Mark Saroufim <marksaroufim@meta.com>	2025-09-29 19:38:17 +00:00
atalman	f090818a40	Rename remaining periodic and xpu workflows py3.9->py3.10 (#164127 ) Fix naming: py3.9 should be py3.10. These jobs were already migrated to 3.10. Please see: https://github.com/pytorch/pytorch/actions/runs/18091356163/job/51472526131#step:16:224 ``` Python version: + python --version Python 3.10.18 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164127 Approved by: https://github.com/malfet	2025-09-29 19:26:21 +00:00
eellison	e1bd5b60cf	refactor bucketing (#163754 ) Preparatory refactor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163754 Approved by: https://github.com/IvanKobzarev ghstack dependencies: #163215	2025-09-29 18:32:41 +00:00
eellison	c9b5af9a38	[inductor] do comm compute overlap at aten fx level (#163215 ) This is first part of the stack that does comm/compute reordering, and then uses the exposure analysis to do bucketing. Subsequent prs will handle: - use of exposure analysis to do bucketing - make sure inductor respects comm/compute overlapping done at fx level - non-profiling mm estimation/rank broadcasting of profile results Other mis: - Validate accuracy of nccl estimations ( use ruisi's profiling instead ?) For a llama 2d parallelism test, on forward, we overlap all but 2 of potentially hidden collectives. For backward, we overlap 217/269 of potentially hidden collectives. If you increase `compute_overlap_multipler` (for fudge factor of inaccurate comms estimation), that goes down to all but 16 of potentially hidden collectives. fwd example: https://gist.github.com/eellison/76209c49d8829c5f1e323d34a3f040c3 bwd example: https://gist.github.com/eellison/6cfc2285df53a94cfa4012f5fdae5c51 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163215 Approved by: https://github.com/IvanKobzarev	2025-09-29 18:18:03 +00:00
Blaine Burton Rister	604da4bb9a	[Inductor-FX] Support unbacked symbol definitions (#163729 ) # Problem Inductor sometimes generates unbacked symints to handle things like mismatched branches of `torch.cond`. This code is represented by `pytree.KeyPath`, with special codegen logic to convert it to Python and C++. This was not previously supported by the FX backend. # Feature This PR adds support for unbacked symbol declarations to the FX backend. The implementation is fairly straightforward. 1. Instead of raw Python/C++, update the wrapper codegen method to emit a new Wrapper IR line called `UnbackedSymbolDefsLine`. This contains all the information needed to generate the Python and C++ code. 2. Move the existing Python/C++ codegen to a private method, which is invoked by `UnbackedSymbolDefsLine.codegen()`. 3. Implement a method to generate FX IR from unbacked symbol definitions. The implementation is based on recursive descent, consuming some keypath entries, emitting an FX IR node, and recursing to the rest of the keypath. It is conceptually identical to the existing algorithm for Python and C++, except it generates FX nodes. 4. The FX backend currently relies on size hints to generate autotuning arguments, and consequently autotuning does not support unbacked SymInts. At some point, we would like to generalize the autotuning logic to support these. But for now, simply emit a warning and skip autotuning when we see them. 5. The new test case exposed some tricky issues reconciling Triton call args with constants stored in `triton_meta`. This PR rewrites the relevant helper function to do this in a more principled way. # Test plan This PR imports an existing control flow test to the FX backend's test suite. The test uses unbacked symbol definitions to handle mismatched dynamic shapes coming from `torch.cond` branches. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163729 Approved by: https://github.com/jansel	2025-09-29 18:10:37 +00:00
Nikita Shulga	8f32adc90a	[MPSHooks] Release pending command encoder (#164093 ) Before returning a command buffer, as subsequent callers are very likely to allocate their own encoder, which results in the following runtime error ``` tryCoalescingPreviousComputeCommandEncoderWithConfig:nextEncoderClass:]:1090: failed assertion `A command encoder is already encoding to this command buffer' ``` Added regression test to `test_mps_extension` Please note that `torch::mps::get_command_buffer()` should be called with dispatch_queue held, both before and after this change, but many implementations skip that Fixes https://github.com/pytorch/pytorch/issues/163721 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164093 Approved by: https://github.com/atalman, https://github.com/Skylion007	2025-09-29 17:50:12 +00:00
Nikita Shulga	3fa3bfbfda	[EZ][BE] Fix unused parameter warnings in EmbeddingBag (#164135 ) Before this change following were emitted during compilation ``` [7/31] Compiling /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal to EmbeddingBag_31.air /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:28:12: warning: unused parameter 'is_first' [-Wunused-parameter] bool is_first) { ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:47:16: warning: unused parameter 'per_sample_weights_index' [-Wunused-parameter] uint32_t per_sample_weights_index, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:48:19: warning: unused parameter 'per_sample_weights' [-Wunused-parameter] constant T* per_sample_weights, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:49:16: warning: unused parameter 'per_sample_weights_stride' [-Wunused-parameter] uint32_t per_sample_weights_stride) { ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:74:19: warning: unused parameter 'weight_val' [-Wunused-parameter] opmath_t<T> weight_val, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:75:19: warning: unused parameter 'out_val' [-Wunused-parameter] opmath_t<T> out_val, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:76:12: warning: unused parameter 'is_first' [-Wunused-parameter] bool is_first, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:77:17: warning: unused parameter 'max_idx' [-Wunused-parameter] thread I& max_idx, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:78:9: warning: unused parameter 'weight_idx' [-Wunused-parameter] I weight_idx, ^ 
/Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/EmbeddingBag.metal:79:12: warning: unused parameter 'pad' [-Wunused-parameter] bool pad) {} ^ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164135 Approved by: https://github.com/Skylion007	2025-09-29 17:44:09 +00:00
Fabian	8701f18bc0	Adjust ...mark_unbacked() -> ...decorators.mark_unbacked() in logs. (#164131 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164131 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-09-29 17:44:00 +00:00
Janani Sriram	a56e7a1920	[Max Autotune][B200] Add addmm config to avoid test OOM (#164020 ) Summary: Add a new `addmm` config that is small enough to not cause an OOM (out of memory error), since the configs for `blackwell_persistent_mm_configs`, which `addmm` used, are too large. Test Plan: `test_max_autotune.py` Differential Revision: D83378477 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164020 Approved by: https://github.com/coconutruben, https://github.com/njriasan	2025-09-29 17:38:46 +00:00
Janani Sriram	e2c894c97d	[Inductor][ATen][FP8] Relax stride check for block-wise scaling when scaling dimension is 1 (#163829 ) Summary: Relax stride check for block-wise scaling (1x128, 128x128) when a dimension of the scaling factor is 1. When the scaling tensor has a dimension of size 1, the stride is effectively "meaningless" to PyTorch, i.e. PyTorch decides to replace its stride with a default of `[1, 1]`. However, the old stride check required the stride to match one of the scaling dimensions. Here, we relax the stride check when the effective stride is 1 in order to allow for cases in which `K <= 128` and `N <= 128`. Test Plan: ``` pytest -s -v test/test_matmul_cuda.py::TestFP8MatmulCUDA::test_scaled_mm_vs_emulated_block_wise_float32_lhs_block_1_rhs_block_128_cuda 2>&1 \| tee ~/personal/stride_check.log ``` Differential Revision: D83023706 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163829 Approved by: https://github.com/lw, https://github.com/eqy	2025-09-29 17:28:26 +00:00
PyTorch MergeBot	6b473c90cf	Revert "[inductor] require shape in TritonCSEVariable (#162275 )" This reverts commit c257570e6cd25753f9f0a640b965148ead2cf918. Reverted https://github.com/pytorch/pytorch/pull/162275 on behalf of https://github.com/jeffdaily due to sorry this broke rocm CI; inductor/test_select_algorithm.py::TestTemplateRender::test_finalized_subclass_hooks [GH job link](https://github.com/pytorch/pytorch/actions/runs/18048893250/job/51366715091) [HUD commit link](`c257570e6c`) ([comment](https://github.com/pytorch/pytorch/pull/162275#issuecomment-3348159095))	2025-09-29 17:26:54 +00:00
Janani Sriram	6bcc6bbc85	[Inductor][FP8] Add op_name for ScaledMM TMA template heuristic (#164019 ) Summary: For H100s and below, add `op_name="scaled_mm"` to the template heuristic for `CUDAScaledTMATemplateConfigHeuristic` such that `scaled_mm` persistent + TMA tests do not default to the "mm" heuristics. Test Plan: `test_max_autotune.py` Differential Revision: D83390775 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164019 Approved by: https://github.com/njriasan	2025-09-29 17:24:26 +00:00
Nikita Shulga	95be302889	Skip test_conv3d_cudnn_broken on ROCM (#164138 ) Followup after https://github.com/pytorch/pytorch/pull/163903 Fixes https://github.com/pytorch/pytorch/issues/164137 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164138 Approved by: https://github.com/Camyll	2025-09-29 16:56:51 +00:00
Yuanyuan Chen	f433e681b9	Remove export of slice_in_dim (#164117 ) Cannot find `slice_in_dim` in OSS. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164117 Approved by: https://github.com/soulitzer	2025-09-29 16:56:14 +00:00
Dev Sashidhar	5ff2387dbe	Fix comment on broadcasting example to clarify dimension mismatch (#162177 ) Fixes #162116 Updated the comment in the broadcasting example to clarify that tensors with mismatched dimension sizes (0 vs 2) are not broadcastable. Removed incorrect reference to missing dimensions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162177 Approved by: https://github.com/soulitzer	2025-09-29 16:47:48 +00:00
Nikita Shulga	84b57c93db	[MPSInductor] Unskip test_repeat_interleave_Tensor_decomp (#164136 ) Not sure what was the problem, but it passes for me locally Fixes https://github.com/pytorch/pytorch/issues/159408 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164136 Approved by: https://github.com/v0i0	2025-09-29 16:20:34 +00:00
Markus Hoehnerbach	069ccf5f1e	[inductor] pdl: enable launch and deduplicate waits (#162014 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162014 Approved by: https://github.com/eellison	2025-09-29 16:10:26 +00:00
Vismai Khanderao	1c12d7416b	[SDPA] [MPS] Fixes regression in 2.8.0 for scaled_dot_product_attention using mps (#163598 ) Fixes #163597 - Updates fast SDPA implementations to take in query tensor stride info similar to key and value instead of assuming stride. - Updated tests with additional transpose/permutation layouts. New tests catch the regression. ### Benchmarking with script found in [implementation PR](https://github.com/pytorch/pytorch/pull/152781#:~:text=19.8%25%20speed%20improvement-,Script%20to%20get%20perf%3A,-import%20torch%0Aimport) Times are averaged over 100000 iterations. This change should not have any significant performance difference. Tested on an M3 Pro ### Vector Fast Path (q_len=1, k_len=256) - Before: 0.160 ms - After: 0.157 ms ### Vector 2-pass (q_len=1, k_len=4096) - Before: 0.342 ms - After: 0.339 ms ### Vector Fast Path (q_len=8, k_len=256) - Before: 0.228 ms - After: 0.231 ms ### Vector 2-pass (q_len=8, k_len=4096) - Before: 0.432 ms - After: 0.436 ms Pull Request resolved: https://github.com/pytorch/pytorch/pull/163598 Approved by: https://github.com/malfet	2025-09-29 16:09:46 +00:00
dolpm	3746039b47	[inductor] fix: 'get_raw_stream' undefined (#163707 ) Summary: ran into this when precompiling baidu/ERNIE-4.5-21B-A3B-PT codegen after fix: ```py import triton import triton.language as tl from torch._inductor.runtime.triton_heuristics import start_graph, end_graph from torch._C import _cuda_getCurrentRawStream as get_raw_stream with torch.cuda._DeviceGuard(0): stream0 = get_raw_stream(0) ... ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163707 Approved by: https://github.com/jamesjwu	2025-09-29 15:48:16 +00:00
Paul Zhang	872edd89d6	Enable outer reductions in fbcode (#163884 ) Summary: Enabling the outer reduction optimization in fbcode Test Plan: Evals in https://docs.google.com/document/d/1-tcItRsyEaibaXL56Zq2-CWh5wCmHXDDgDQT_9uOvXE/edit?tab=t.0#bookmark=id.tkgzaitxacg0 Differential Revision: D81948542 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163884 Approved by: https://github.com/Skylion007	2025-09-29 15:25:17 +00:00
Howard Huang	47ed41109f	Fix PgNccl coalseced profiling (#160680 ) Admittedly I'm a noob when looking at traces, but this looked pretty off to me: <img width="1528" height="824" alt="Screenshot 2025-08-14 at 5 27 49 PM" src="https://github.com/user-attachments/assets/871e7b4c-0e47-4c84-97cc-8198b7b76d4b" /> 1. Why are there so many "nccl:coalesced" on the CPU thread 2. Why is there "nccl:coalesced" on compute stream (stream 7) Here is what is happening: CPU side: In `endCoalescing`, we create a [work object ](`3be70dc30e/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (L3473)`) with the profiling title "nccl:coalesced" GPU side: The CUDA kernels will inherit this profiling title What is missing: We forgot to call the record function [callback](`3be70dc30e/torch/csrc/distributed/c10d/Work.cpp (L35-L38)`). With this change we finishs immediately on the CPU side, but the ncclDevKernel_SendRecv still have the coalesced title. New trace looks like this: <img width="1123" height="637" alt="image" src="https://github.com/user-attachments/assets/f015fd64-85cd-452a-be24-3e7724f84e44" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160680 Approved by: https://github.com/fegin, https://github.com/kwen2501	2025-09-29 15:21:55 +00:00
Klaus Zimmermann	fa54b08cd5	Replace setup.py install with pip install (#156711 ) #156027 already replaced most use of `python setup.py install`. This PR only adds a few more occurrences and adds `--no-build-isolation` in a few places. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156711 Approved by: https://github.com/atalman	2025-09-29 15:15:10 +00:00
Nicolas De Carli	92284fb2ff	Add SVE128 ISA (#158932 ) Summary: Partly Importing and adapting https://github.com/pytorch/pytorch/pull/138388, adding SVE128 as ISA. Intention is to add SVE128 translation layers for Vectorized data types. Idea is to have 1 PR per file, aside from the current one, plus a last one modifying cmake files to enable the new ISA selectively. Tested current changes on a nightly run, to verify no regressions occur on systems leveraging SVE256. No regressions spotted when running test_ops.py, a set of 34k unit tests. A machine leveraging SVE128 was used towards this testing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158932 Approved by: https://github.com/malfet	2025-09-29 14:49:19 +00:00
PaulZhang12	84d673ef57	Add less warps config to inner reductions (#162447 ) Add less warps to ensure proper vectorization + memory coalescing for inner reductions, prefer more work per thread <img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" /> Differential Revision: [D83343892](https://our.internmc.facebook.com/intern/diff/D83343892) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162447 Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314	2025-09-29 13:48:36 +00:00
Jean Schmidt	d633bac252	Update issue templates adding a DISABLE AUTOREVERT option (#163858 ) This should be used to disable autorevert functionality if users feels the need to. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163858 Approved by: https://github.com/izaitsevfb	2025-09-29 13:10:05 +00:00
PyTorch UpdateBot	d81476e211	[xla hash update] update the pinned xla hash (#163494 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163494 Approved by: https://github.com/pytorchbot	2025-09-29 12:31:16 +00:00
PyTorch UpdateBot	a0ae2f9aa0	Update slow tests (#163493 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163493 Approved by: https://github.com/pytorchbot	2025-09-29 11:58:17 +00:00
Ke Wen	615da7b95e	[fx] Allow customization of submod name in split graph (#164035 ) Fixes #164030: HOP and pipelining both name things submod_i by adding an optional argument `partition_affix` to `split_module` API. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164035 Approved by: https://github.com/ezyang ghstack dependencies: #164045	2025-09-29 09:16:36 +00:00
Deng, Daisy	4fd70d4e7b	[1/N]Enable some tests in test_ops.TestCommon on Intel GPU (#159944 ) For https://github.com/pytorch/pytorch/issues/114850, we will port aten unit tests to Intel GPU. This PR will work on some test cases of test/test_ops.py. We could enable Intel GPU with the following methods and try our best to keep the original code styles: 1. Extended XPUTestBase.get_all_devices to support multiple devices 2. Added skipXPU decorator 3. Extended onlyOn to support device list 4. Enabled 'xpu' for some test paths 5. Added allow_xpu=True for supported test class. 6. Replaced onlyCUDA with onlyOn(['cuda', 'xpu']) for supported tests 7. Use skipIfXpu and skipXPU to disable unsupported test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159944 Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/albanD	2025-09-29 09:08:04 +00:00
Animesh Jain	e1e5e040cd	[dynamo][export] Add some missing trace rules (#164080 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164080 Approved by: https://github.com/tugsbayasgalan	2025-09-29 08:47:24 +00:00
Ke Wen	5ddad22196	[PP] Use default export mode (non-strict) (#164045 ) export's default mode has switched from strict to non-strict. We just follow suit in PP. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164045 Approved by: https://github.com/H-Huang	2025-09-29 06:31:06 +00:00
Valentine233	90512fa5bd	[Quant] extend the op list for quant lift up (#163621 ) Add `aten.reshape.default` into the op list of quant lift up, in order to fuse more potential quantized kernels. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163621 Approved by: https://github.com/mingfeima, https://github.com/Xia-Weiwen, https://github.com/jansel	2025-09-29 06:14:45 +00:00
Isalia20	48a5470cf8	[CUDA] fix indexing on large tensor causing invalid configuration argument (#164049 ) Fixes #164048 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164049 Approved by: https://github.com/eqy	2025-09-29 06:07:35 +00:00
CaoE	b9854c9d89	[Inductor][CPP] Fix the test case of test_linear_reuse_kernels (#163723 ) Fixes #163491. Add tolerances to make `test_linear_reuse_kernels` more stable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163723 Approved by: https://github.com/leslie-fang-intel	2025-09-29 05:29:01 +00:00
can-gaa-hou	eb4361a801	[Fix] Adding missing `f` prefixes to formatted strings [1/N] (#164065 ) As stated in the title. * #164068 * #164067 * #164066 * __->__ #164065 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164065 Approved by: https://github.com/Skylion007	2025-09-29 04:53:00 +00:00
PyTorch UpdateBot	d131f213ac	[vllm hash update] update the pinned vllm hash (#164092 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164092 Approved by: https://github.com/pytorchbot	2025-09-29 04:41:06 +00:00
can-gaa-hou	7c7ae86991	[Fix] Adding missing `f` prefixes to formatted strings [2/N] (#164066 ) As stated in the title. * #164068 * #164067 * __->__ #164066 * #164065 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164066 Approved by: https://github.com/Skylion007	2025-09-29 04:40:44 +00:00
can-gaa-hou	ad32ed83b3	[Fix] Adding missing `f` prefixes to formatted strings [3/N] (#164067 ) As stated in the title. * #164068 * __->__ #164067 * #164066 * #164065 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164067 Approved by: https://github.com/Skylion007	2025-09-29 04:35:23 +00:00
Animesh Jain	d8becd1cf4	[dynamo][export] Make the source_stack and fqn info same between dynamo and export (#164085 ) preparing for landing the install_free_tensors flag Pull Request resolved: https://github.com/pytorch/pytorch/pull/164085 Approved by: https://github.com/tugsbayasgalan	2025-09-29 04:35:13 +00:00
can-gaa-hou	e64dd8c694	[Fix] Adding missing `f` prefixes to formatted strings [4/N] (#164068 ) As stated in the title. * __->__ #164068 * #164067 * #164066 * #164065 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164068 Approved by: https://github.com/Skylion007	2025-09-29 04:07:07 +00:00
Xuehai Pan	047ae24e34	Eliminate setup.py install/develop in the codebase (#162329 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162329 Approved by: https://github.com/ezyang	2025-09-29 03:54:28 +00:00
Yuanyuan Chen	3cda34ebde	[2/N] Apply ruff UP035 check in torch files (#164054 ) This is the result of applying the ruff `UP035` check. `Callable` is imported from `collections.abc` instead of `typing`. `TypeAlias` is also imported from `typing`. This PR is the follow-up of #163947. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164054 Approved by: https://github.com/ezyang, https://github.com/Skylion007	2025-09-29 03:35:32 +00:00
Yuanyuan Chen	352197c508	Remove old ROCm skip conditions in tests (#164058 ) This PR removes skip conditions for ROCM <= 3.5. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164058 Approved by: https://github.com/kwen2501	2025-09-29 03:00:58 +00:00
Animesh Jain	811c693c49	[dynamo] Special path for cloning of torch dispatch tensors (#164081 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164081 Approved by: https://github.com/tugsbayasgalan ghstack dependencies: #164084	2025-09-29 01:44:44 +00:00
Animesh Jain	c2768d0f5a	[export] Skip the check instead of disable (#164084 ) Its unclear why we had disable in the first place. With install_free_tensors, we are tracing into this hook. A better way would be to place the tracer without any hook. For now, disable the checking while dynamo is tracing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164084 Approved by: https://github.com/tugsbayasgalan	2025-09-29 01:44:44 +00:00
Yuanyuan Chen	a8c528c105	[1/N] Apply UP035 rule in tests (#163947 ) Apply UP035 `ruff` rule in tests, but some tests for `fx` and `dynamo` are excluded in case the old typing is the test target. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163947 Approved by: https://github.com/ezyang	2025-09-29 01:42:01 +00:00
Animesh Jain	dc54ce7554	[hops] Support unspecialized nn module for export hops (#164082 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/164082 Approved by: https://github.com/tugsbayasgalan ghstack dependencies: #164079	2025-09-29 01:34:10 +00:00
Animesh Jain	1981ed4f60	[dynamo][logging] Add to param_count only if metrics_count is active (#164079 ) This is rare but happens with executorch tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164079 Approved by: https://github.com/tugsbayasgalan	2025-09-29 00:59:18 +00:00
jainapurva	54b38f3b46	Add operator benchmarking run to CI nightly (#162530 ) This PR introduces a new "operator microbenchmark" CI workflow and GitHub Actions for operator microbenchmarks, updating test scripts and job matrices to support new parameters, and broadening the operator benchmark tests to include more data types, larger shapes, and gradient tests. The benchmark configurations now focus more on different cuda hardware and multiple dtypes (bf16, fp16, fp32), for both compile and eager mode. Benchmark Configuration and Coverage: * Expanded operator benchmark configurations in `addmm_test.py`, `bmm_test.py`, `matmul_test.py`, and `mm_test.py` to benchmark multiple dtypes on CUDA devices, in eager and compile mode, for forward and backward run. The configs with tag "long" for the above mentioned files are being run in CI. * The CI benchmarking is running on various hardwares: H100, A100. * The CI job also uploads the microbenchmarking outputs to a [HUD](https://hud.pytorch.org/benchmark/llms?repoName=pytorch%2Fpytorch&benchmarkName=PyTorch+operator+microbenchmark) dashboard. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162530 Approved by: https://github.com/huydhn Co-authored-by: Huy Do <huydhn@gmail.com>	2025-09-29 00:46:38 +00:00
RiyaP-QA	bc5a072ebf	fixes import error 'functionalize' from functorch (#163746 ) Fixes #163637 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163746 Approved by: https://github.com/malfet	2025-09-28 23:16:45 +00:00
RajeshvShiyal	d1b3481131	registraion replaced with registration in jit_type.h file comment (#164072 ) Fixes #164071 typo correction done Pull Request resolved: https://github.com/pytorch/pytorch/pull/164072 Approved by: https://github.com/Skylion007	2025-09-28 22:55:24 +00:00
Yuanyuan Chen	3766513d25	Remove C++ workarounds for Python < 3.10 (#164055 ) Remove two unnecessary `PY_VERSION_HEX` branches. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164055 Approved by: https://github.com/ezyang	2025-09-28 20:00:02 +00:00
FFFrog	ea6846b231	[CI] Remove the unnecessary workflow related functorch (#162581 ) The [docs](https://docs.pytorch.org/functorch/stable/) about `functorch` has been migrated into [PyTorch Doc](https://docs.pytorch.org/docs/stable/func.html) since PyTorch 2.0, so I think we can remove it right now to reduce the compute resources usages. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162581 Approved by: https://github.com/ezyang	2025-09-28 19:56:20 +00:00
Tugsbayasgalan Manlaibaatar	f6537d9616	Move control flow export tests to new tracer (#163259 ) Differential Revision: [D82732614](https://our.internmc.facebook.com/intern/diff/D82732614) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163259 Approved by: https://github.com/avikchaudhuri ghstack dependencies: #163136, #163137, #163258	2025-09-28 19:56:09 +00:00
Tugsbayasgalan Manlaibaatar	cc0332563e	Use new_tracer_experimental for torchao strict export (#163258 ) Export team is fixing up the old strict export implementation, as a result it fails a check where we proxy the whole module under given directories. _WrapperModule is a way for torchao to workaround the issue where export requiring nn.module to trace so it should never get proxied in the graph. Differential Revision: [D82732613](https://our.internmc.facebook.com/intern/diff/D82732613) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163258 Approved by: https://github.com/anijain2305 ghstack dependencies: #163136, #163137	2025-09-28 19:55:54 +00:00
Tugsbayasgalan Manlaibaatar	8239ba4087	Fix various bugs in subclass input in export (#163770 ) This adds basic support for subclass inputs in export (specifically for non-strict). I had to make fakify little more complicated which risks further divergence from dynamo fakification. But dynamo one is so complex, so i feel it is better to do this way. Also improved fake mode detection logic to recursively look into subclass inner tensors. Differential Revision: [D83156489](https://our.internmc.facebook.com/intern/diff/D83156489) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163770 Approved by: https://github.com/avikchaudhuri	2025-09-28 18:03:32 +00:00
Tugsbayasgalan Manlaibaatar	1fdd99de71	Building guards should be under metrics_context (#163967 ) Differential Revision: [D83354042](https://our.internmc.facebook.com/intern/diff/D83354042) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163967 Approved by: https://github.com/avikchaudhuri	2025-09-28 16:28:34 +00:00
lichuyang	38ed608956	Better error handling in torch/nativert/* (#163308 ) Replace the runtime_error of the vanilla C++ exceptions with TORCH_CHECK in torch/nativert/* Vanilla C++ exceptions should not exist in the core part of PyTorch because of its cross-language nature. Compared with vanilla C++ exceptions, TORCH_CHECK has richer error context and a unified error handling mechanism. This commit replaces runtime_error with TORCH_CHECK in the files under torch/nativert/* . Fixes part of #148114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163308 Approved by: https://github.com/dolpm	2025-09-28 14:23:44 +00:00
Alessandro Fanfarillo	238dc65368	[ROCm] use hipSolver instead of MAGMA for Cholesky (#163977 ) Currently, the Cholesky factorization and least squares operation defaults to magma when Pytorch is compiled for ROCm. This shows suboptimal performance. This change allows PyTorch to rely on hipSolver instead of Magma. @jeffdaily Pull Request resolved: https://github.com/pytorch/pytorch/pull/163977 Approved by: https://github.com/Skylion007	2025-09-28 06:53:06 +00:00
Laith Sakka	7bbde0c094	Remove unused argument from DEFINE_BINARY macro. (#163868 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163868 Approved by: https://github.com/Skylion007 ghstack dependencies: #163822	2025-09-28 06:32:41 +00:00
Laith Sakka	dfcab0e7e1	Handle DDE in infer_size_impl (#163822 ) hit this while running VLLM with unbacked for model Qwen/Qwen2-1.5B-Instruct Pull Request resolved: https://github.com/pytorch/pytorch/pull/163822 Approved by: https://github.com/bobrenjc93, https://github.com/Skylion007	2025-09-28 06:32:41 +00:00
PyTorch UpdateBot	1cc9263f52	[vllm hash update] update the pinned vllm hash (#164053 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164053 Approved by: https://github.com/pytorchbot	2025-09-28 04:35:17 +00:00
Yuanyuan Chen	c2862c8e66	[distributed] Remove python code older than 3.10 (#163613 ) Because now that the minimum Python version is 3.10 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163613 Approved by: https://github.com/XuehaiPan, https://github.com/kwen2501	2025-09-28 04:15:24 +00:00
Laith Sakka	b377c9e365	graph break on tolist if capture_scalar_outputs is false (#163807 ) address https://github.com/pytorch/pytorch/issues/163798 its problematic to not graph break because: 1. break current contract. 2. well dynamo trace then we have .item call then if we ever re-trace later in autograd for example we hit a failure (We do not know where to graph break at that point)! see the added unit test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163807 Approved by: https://github.com/bobrenjc93	2025-09-28 04:02:52 +00:00
Avik Chaudhuri	3059b08012	[inductor] add subsystem to pattern matcher (#163922 ) Summary: Running a toy example through `torch.compile(fullgraph=True, backend="inductor")` with default inductor config, I tried to see what passes are run in each of pre-grad, joint-graph, and post-grad phases by printing out the subsystem in `GraphTransformObserver`. However the subsystem showed up as None in a bunch of transforms that were run in each of those phases, so this PR adds some additional annotations. Note that these annotations are probably not a complete set, since other transforms may run based on changes to the config that are not covered here. Hopefully this doesn't change behavior. However, I did notice that bisecting relies on disabling various phases, which means that while before some passes would not be disabled (because their subsystem was `None`), now they would. Test Plan: existing tests + manual test described in summary Differential Revision: D83306676 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163922 Approved by: https://github.com/jansel	2025-09-28 03:15:23 +00:00
Aaron Gokaslan	5504a06e01	[BE]: Update NCCL to 2.28.3 (#162351 ) @eqy New NCCL has a bunch of bugfixes for features, including reducing the number of SMs needed by NVLINK collectives as well as some very useful new APIs for SymmetricMemory. Also allows FP8 support for non-reductive operations on pre-sm90 devices. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162351 Approved by: https://github.com/ezyang, https://github.com/malfet, https://github.com/atalman	2025-09-28 01:38:59 +00:00
lichuyang	1ad491dd88	Better error handling in torch/csrc/jit/ir/* (#163757 ) Refactor error handling to use TORCH_CHECK for improved clarity in constants and scope management Fixes some parts of ISSUE #148114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163757 Approved by: https://github.com/albanD	2025-09-28 01:18:24 +00:00
Bob Ren	fd20889d0b	Add type annotations to MPS profiler utilities (#163486 ) ## Summary - drop the local mypy allow-untyped-defs escape hatch in the MPS profiler helpers - annotate the context managers and bool helpers so they type-check cleanly ## Testing - python -m mypy torch/mps/profiler.py --config-file mypy-strict.ini ------ https://chatgpt.com/codex/tasks/task_e_68d0ce4df2e483268d06673b65ef7745 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163486 Approved by: https://github.com/Skylion007	2025-09-27 23:00:53 +00:00
fduwjj	2ce2e48a05	[WIP][symm_mem] Add a wait for signal and put signal for one side API (#159837 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159837 Approved by: https://github.com/kwen2501	2025-09-27 21:20:13 +00:00
Aart J.C. Bik	1d98be6abf	[NFC] fixed typo in sparse semi structured filename (#163904 ) Make sure all semi structured files use "SparseSemiStructured" Pull Request resolved: https://github.com/pytorch/pytorch/pull/163904 Approved by: https://github.com/Skylion007	2025-09-27 21:19:48 +00:00
Chien-Chin Huang	dfda239cce	[DTensor] Raise an RuntimeError when checkpointing APIs are used with Partial placement (#163941 ) A DTensor that contains partial placement shouldn't be checkpointed (DCP.save) -- the result is not correct and DCP doesn't know how to handle it. There are several APIs that are only used by checkpointing, e.g.,`__create_write_items__`. These APIs should raise an exception if the DTensor, `self`, has Partial placement. Ideally, we want to add the following test: ``` with self.assertRaisesRegex( RuntimeError, "Any checkpointing related operations are not supported for" ): dcp.save({"dtensor": dtensor}, checkpoint_id=tempfile.gettempdir()) ``` While we do see the RuntimeError is raised, the error was raised in another thread due to DTensor checkpoint APIs are called by DCP in a separate thread, which assertRaisesRegex cannot capture. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163941 Approved by: https://github.com/tianyu-l	2025-09-27 19:50:16 +00:00
Animesh Jain	991e3d0d16	[dynamo][guards] Revert introduction of different types of lambda_guards (#163385 ) With https://fb.workplace.com/groups/260102303573409/permalink/787294574187510/ issue, it might be a better idea to just speedup _realize_dict and keep the changes very local. So reverting this PR as well, to return to clean slate. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163385 Approved by: https://github.com/jansel	2025-09-27 18:20:48 +00:00
Yidi Wu	8f6dbc0ba8	[scan] create fw and bw graphs via partitioning (#162754 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162754 Approved by: https://github.com/zou3519 ghstack dependencies: #161557, #161664, #161808, #162025, #161732	2025-09-27 18:13:15 +00:00
Yidi Wu	3413490f53	[scan] materialize combine_fn in forward add more autograd tests (#161732 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161732 Approved by: https://github.com/zou3519 ghstack dependencies: #161557, #161664, #161808, #162025	2025-09-27 18:13:15 +00:00
Yidi Wu	b85bee3bbb	[hop] refactor check input alias and mutation to be a graph pass (#162025 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162025 Approved by: https://github.com/zou3519 ghstack dependencies: #161557, #161664, #161808	2025-09-27 18:13:15 +00:00
Yidi Wu	66dbf2c9f5	[scan][autograd] clone outputs that's aliasing with inputs or outputs in bw (#161808 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161808 Approved by: https://github.com/zou3519 ghstack dependencies: #161557, #161664	2025-09-27 18:13:15 +00:00
Yidi Wu	f5d85874dd	[scan][be] remove unnecessary tensor checks (#161664 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161664 Approved by: https://github.com/Skylion007, https://github.com/zou3519 ghstack dependencies: #161557	2025-09-27 18:13:14 +00:00
Yidi Wu	8f15d6a0c9	[test][scan] refactor inductor test and prepare for adding bw tests (#161557 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161557 Approved by: https://github.com/zou3519	2025-09-27 18:13:14 +00:00
redwrasse	e78792a70d	Update ctc loss docs float32 input required for CuDNN (#162042 ) Discovered while working on https://github.com/pytorch/pytorch/pull/159106 the non-obvious requirement that inputs must be float32 to use CuDNN (https://github.com/pytorch/pytorch/pull/159106#issuecomment-3189981705), otherwise the native CUDA implementation is called. Updates the docs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162042 Approved by: https://github.com/mikaylagawarecki Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>	2025-09-27 18:10:17 +00:00
Ke Wen	d9db838f58	[CI] Re-enable test_all_to_all_vdev_2d_offset (#163985 ) Fixes https://github.com/pytorch/pytorch/issues/163847 Moving allocations upfront and collectives later. The hang goes away. My investigation indicates that the hang is inside the last call `torch.testing.assert_close(out_expected, out[:out_numel])`. Rank 3 calls into it, but never gets out. Don't know why yet. I will investigate more. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163985 Approved by: https://github.com/fegin	2025-09-27 16:56:25 +00:00
FFFrog	6ba83e06a5	[AMP] Add deprecated decorator for torch.xxx.amp.autocast class (#163654 ) As the title stated. Changes: - torch.cuda.amp.autocast - torch.cpu.amp.autocast - add explicit `__new__` and `__init_subclass__` for those class above for inspect.signature to retrieve correct signature Pull Request resolved: https://github.com/pytorch/pytorch/pull/163654 Approved by: https://github.com/Skylion007	2025-09-27 14:37:12 +00:00
FFFrog	960290d629	[Docs] Add standard-imghdr for PyTorch Doc (#163944 ) As the title states. Python [PEP 594](https://peps.python.org/pep-0594) removed imghdr from the Python standard library, and older versions of Sphinx do not declare it as an installation dependency, so we need to add it to the requirements as a temporary dependency. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163944 Approved by: https://github.com/albanD, https://github.com/svekars	2025-09-27 08:14:51 +00:00
Min Si	b1a4efc302	[amd] Add cudaHostFn_t to cuda_to_hip_mappings (#164007 ) Summary: See title Test Plan: ``` buck build --flagfile fbcode//mode/opt-amd-gpu fbcode//comms/ctran/algos/common/tests:ctran_algo_gpe_kernel_sync_test ``` After fix: https://www.internalfb.com/buck2/362ff91e-53f2-4b82-9536-cb84c91384a2 Before fix: failed in D83294731 (version 1): https://www.internalfb.com/sandcastle/workflow/1792432651703947243 Differential Revision: D83375414 Pull Request resolved: https://github.com/pytorch/pytorch/pull/164007 Approved by: https://github.com/llxxee	2025-09-27 06:09:50 +00:00
Wei Wang	96182faf96	[CI][Distributed][CUDA][Symm-Mem] Enable B200 Symm Mem Test (#162988 ) Inspired by https://github.com/pytorch/pytorch/pull/162981 and motivated by https://github.com/pytorch/pytorch/pull/159323 taking a total of 20 hours to finish (and unlikely to make it in short time due to https://github.com/pytorch/pytorch/issues/162178 ) Creating this subtest to get something distributed on B200. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162988 Approved by: https://github.com/malfet	2025-09-27 05:12:05 +00:00
bobrenjc93	dcb8af7501	[torchfuzz] fix bool propagation (#164003 ) bools can't propagate through the current pointwise ops such as add/mul. once we add more that can, we'll probably want to add an additional subclass that supports pointwise bools, but for now just don't allow it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164003 Approved by: https://github.com/pianpwk ghstack dependencies: #163743, #163812, #163890, #164002	2025-09-27 04:51:29 +00:00
PyTorch UpdateBot	280e712c13	[vllm hash update] update the pinned vllm hash (#164029 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164029 Approved by: https://github.com/pytorchbot	2025-09-27 04:34:57 +00:00
Arsh Zahed	254d2864d6	Add runtime_overhead PR Time Benchmark (#163866 ) This adds a PR time benchmark that checks for runtime overhead on a very small graph. This will help track regressions in runtime overhead. Example Results: ``` runtime_overhead_inductor,instruction_count,222645 runtime_overhead_inductor_inference_mode,instruction_count,234998 runtime_overhead_inductor_requires_grad,instruction_count,293556 runtime_overhead_inductor_requires_grad_backward,instruction_count,78181 runtime_overhead_inductor_dynamic,instruction_count,234870 runtime_overhead_inductor_inference_mode_dynamic,instruction_count,248711 runtime_overhead_inductor_requires_grad_dynamic,instruction_count,309979 runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77599 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163866 Approved by: https://github.com/jansel, https://github.com/mlazos, https://github.com/anijain2305	2025-09-27 03:26:59 +00:00
Eli Uriegas	9dac6437da	lint: Filter out /usr/include from results (#164012 ) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164012 Approved by: https://github.com/ZainRizvi ghstack dependencies: #164008	2025-09-27 00:54:07 +00:00
Eli Uriegas	8a0e8cad5f	lint: Only include files in pytorch (#164008 ) We were seeing instances of stdlib files in clang-tidy output so this just essentially removes them from the things that lintrunner will report up. Longer term fix here would be to just modify the clang-tidy configuration in order to do the correct thing here but that requires a bit more investigation as to why this is only happening in CI and is not reproducible locally. Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/164008 Approved by: https://github.com/ZainRizvi	2025-09-27 00:54:07 +00:00
bobrenjc93	3a115da3e6	[torchfuzz] ones over zero (#164002 ) reduces likelihood of divide by zero errors. long term we'll probably want to just fuzz these values entirely Pull Request resolved: https://github.com/pytorch/pytorch/pull/164002 Approved by: https://github.com/pianpwk ghstack dependencies: #163743, #163812, #163890	2025-09-27 00:53:02 +00:00
fduwjj	b48a3d0a38	[CuTe] Add layout overlap checking util function in _MeshLayout (#163367 ) While refactoring the bookkeeping for DeviceMesh while leveraging CuTe layout, we found that we need to have two more util functions. One is to check whether one layout has overlap inside it or not. For example, (2,2):(2:1) has no overlap while (2,2):(2:2) has overlap. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163367 Approved by: https://github.com/fegin ghstack dependencies: #163212, #163288, #163928, #163930	2025-09-27 00:22:14 +00:00
Nan Zhang	8d474bdc14	Change python grid calc for MTIA back to python mode (#163601 ) Differential Revision: D83000165 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163601 Approved by: https://github.com/blaine-rister	2025-09-27 00:12:53 +00:00
Chang Pan	008051b13c	[Dynamic Shape][BE] trim _DimHint serialization (#163891 ) Summary: current serialization is a bit hard to read ``` Exporting with the dynamic shape spec: {getitem_123: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=1, max=64, _factory=False)), getitem_118: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=489, max=31232, _factory=False)), getitem_117: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=489, max=31232, _factory=False)), getitem_116: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=489, max=31232, _factory=False)), getitem_115: ( _DimHint(type=<_DimHintType.STATIC: 2>, min=None, max=None, _factory=True), _DimHint(type=<_DimHintType.DYNAMIC: 3>, min=1, max=64, _factory=False)), getitem_46: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=29, max=1792, _factory=False), _DimHint(type=<_DimHintType.STATIC: 2>, min=None, max=None, _factory=True)), _predict_module__base_model_model_ro_sparse_arch_ebc__output_dists_0__dist: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=1, max=64, _factory=False), _DimHint(t ype=<_DimHintType.STATIC: 2>, min=None, max=None, _factory=True)), _predict_module__base_model_model_nro_sparse_arch_ebc__output_dists_0__dist: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=29, max=1792, _factory=False)... ``` Test Plan: UT Differential Revision: D83175131 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163891 Approved by: https://github.com/pianpwk	2025-09-27 00:08:01 +00:00
rraminen	e4ffd718ec	Fix setting of memory fraction in test_garbage_collect_expandable (#164000 ) Fixes #160598 Fixes #160551 Fixes #160507 This PR fixes a bug in the `test_garbage_collect_expandable` unit test where the finally block incorrectly re-reads the current per process memory fraction instead of setting the original value. Without the fix the other tests in the `test/test_cuda.py` test suite were impacted and failed with OOM error on ROCm. This ensures proper cleanup and isolation of test state, maintaining test correctness and avoiding side effects like the below OOM error that it caused. For example, `test_autocast_checkpointing` failed with the below error https://github.com/pytorch/pytorch/actions/runs/17982223758/job/51153974194 on ROCm `torch.OutOfMemoryError: HIP out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacity of 255.69 GiB of which 252.97 GiB is free. 1.20 GiB allowed; Of the allocated memory 1.14 GiB is allocated by PyTorch, with 17.00 MiB allocated in private pools (e.g., HIP Graphs), and 18.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/164000 Approved by: https://github.com/jeffdaily	2025-09-26 23:57:32 +00:00
Eddie Yan	ed3085814a	[cuDNN][SDPA] Disable dropout for cuDNN SDPA on 9.11 - 9.13 (#163903 ) cuDNN introduced some broken heuristics for these cases so we need to disable dropout to avoid unexpected crashes due to heuristics refusing to proceed Pull Request resolved: https://github.com/pytorch/pytorch/pull/163903 Approved by: https://github.com/ngimel, https://github.com/malfet, https://github.com/atalman	2025-09-26 23:50:09 +00:00
Eddie Yan	e2817ac204	[cuDNN][Convolution] Disable cuDNN for 3D convolutions with kernel size != 1 for cuDNN 9.8+ (#163581 ) To workaround #163539 Still confirming whether 9.10 is affected. The original test states that the convolution is "large," but note that the input size does not appear to require 64-bit indexing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163581 Approved by: https://github.com/ngimel, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-26 23:47:29 +00:00
Chang Pan	1d138e658d	[AOTI] log error triton kernel name during autotune (#163889 ) Summary: can't tell from current error msg which kernel got exception Test Plan: lint & pyre Reviewed By: muchulee8 Differential Revision: D83246522 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163889 Approved by: https://github.com/jansel	2025-09-26 23:29:49 +00:00
Taras	f9095fb285	[Windows] Update libuv version from 1.39 to 1.51 (#160318 ) Fixes: [#148315](https://github.com/pytorch/pytorch/issues/148315) The PR updates `libuv` version as `conda-forge` channel doesn't contain `libuv=1.39` for Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160318 Approved by: https://github.com/iremyux, https://github.com/malfet	2025-09-26 23:29:21 +00:00
Kurt Mohler	a0136f149c	[MPS] Fix nan behavior in `grid_sampler_3d` (#163881 ) Fixes #163851 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163881 Approved by: https://github.com/malfet	2025-09-26 23:08:00 +00:00
Isalia20	62b0ebd8f9	[MPS] [Sparse] unique_dim and sparse broadcast (#163694 ) Implements unique_dim, sparse broadcast ops and adds dtypes for mps for tests where we expect to fail, otherwise they would always fail due to being run in double precision Pull Request resolved: https://github.com/pytorch/pytorch/pull/163694 Approved by: https://github.com/malfet	2025-09-26 23:03:13 +00:00
bobrenjc93	19f16a65b4	[torchfuzz] Add support for fuzz templates (#163890 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163890 Approved by: https://github.com/pianpwk ghstack dependencies: #163743, #163812	2025-09-26 22:51:45 +00:00
Joel Schlosser	0ebfa3d7d2	Avoid fast path mask left-align check in compiled TransformerEncoder (#163773 ) Fixes #163640 This PR avoids a mask left align check in the case that we're operating under torch.compile / torch.export. Originally, I planned to make a more invasive change to auto-disable the fast path entirely underneath torch.compile / torch.export, but I realized during testing that the fast path wasn't actually causing compile issues outside of the narrow issue identified here. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163773 Approved by: https://github.com/mikaylagawarecki	2025-09-26 22:29:37 +00:00
eqy	0ea10f9912	[cuDNN][conv][64-bit] Disable cuDNN for 64-bit depthwise convs again (#163171 ) test is breaking, will check if there's an older version that we can enable on to avoid completely dropping support Pull Request resolved: https://github.com/pytorch/pytorch/pull/163171 Approved by: https://github.com/ngimel, https://github.com/malfet	2025-09-26 22:12:17 +00:00
Bin Bao	48a852b7ae	[AOTI] Update AOTInductor tutorial (#163808 ) Summary: Remove the BC breaking warning. Add inductor_config to the example code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163808 Approved by: https://github.com/yushangdi	2025-09-26 22:01:31 +00:00
Jeff Daily	f1260c9b9a	[ROCm][CI/CD] upgrade nightly wheels to ROCm 7.0 (#163937 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163937 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-26 21:42:09 +00:00
Zhijing Li	28c7d11428	[AOTI] Pass in shape_env for get_stride_order (#163925 ) Summary: As titled. Without the diff, we got P1963055009 With the diff passing in the environment, we can do correct sym_int deduction: https://fburl.com/mlhub/p5zy7o28 Test Plan: ``` buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:unbacked_symints -- test_sdfpa_unbacked_strides --print-passing-details --env TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --env TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(u0, 0)" ``` Without the fix: P1964887260 With the fix: P1964888579 Differential Revision: D83211018 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163925 Approved by: https://github.com/ColinPeppler	2025-09-26 21:10:03 +00:00
fduwjj	a60c6ed99f	[DeviceMesh][ez] Extract the pg creation as a util function (#163930 ) This is just to extract common logic into a util function because we will use it many times for the following stack of Device Mesh refactoring. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163930 Approved by: https://github.com/fegin ghstack dependencies: #163212, #163288, #163928	2025-09-26 20:42:58 +00:00
Isuru Fernando	c257570e6c	[inductor] require shape in TritonCSEVariable (#162275 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275 Approved by: https://github.com/mlazos	2025-09-26 20:41:12 +00:00
Shangdi Yu	2f85de0b42	Fix preserve annotation with decomp (#163896 ) If we use `fx_traceback.preserve_node_meta()`, we will have a few extra node.meta fields on nodes, such as "seq_nr", added from `fx/proxy.py`. As a result, there might be non-empty node.meta on graph nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163896 Approved by: https://github.com/SherlockNoMad, https://github.com/ydwu4	2025-09-26 20:28:47 +00:00
Shangdi Yu	e21b037756	Add tests for aot_export_joint_with_descriptors annotation (#163893 ) As title, test 1) Annotation works with aot_export_joint_with_descriptor API 2) Annotation works with the 2 step "strict export.export + aot_export_joint_with_descriptor" Pull Request resolved: https://github.com/pytorch/pytorch/pull/163893 Approved by: https://github.com/SherlockNoMad	2025-09-26 19:25:44 +00:00
q1l1	f8c7505855	[inductor] Fix unbounded number of substitutions when equality checks contain Max expr (#163685 ) ## Issue From an internal use case, we found that if we have an equality rule like: ``` Max(15, u0) == s0 * Max(15, u0) ``` This would lead to wrong substitution rule being generated in the substitution table, the result would be the process got stuck in the substitution loop as if it hangs indefinitely, as it's doing the following substitutions: ``` Max(15, u0) --> s0 * Max(15, u0) --> s0 ** 2 * Max(15, u0) --> s0 ** 3 * Max(15, u0) --> s0 ** 4 * Max(15, u0) ... ``` The root cause is with SymPy expression comparison: as `Max` is [not inside the op class table](https://github.com/sympy/sympy/blob/1.14/sympy/core/basic.py#L50-L86), it'll take the [UNKNOWN](https://github.com/sympy/sympy/blob/1.14/sympy/core/basic.py#L120) order, and considered bigger than any other types of expressions. ## Fix 1. Added a breaking-out from the substitution while-loop to warn about any exccessive substitutions, what threshold should be used here and how to pass it are open to suggestion, using a hard-coded static value to be simple for now 2. Enhanced the sympy expression comparison logic, so that we first check if one expr "has" the other one or not, to help work around the issue with `Max` here ## Testing - with the unittiest alone --> unittest stuck - with the unittest and while-loop breakout, we could see tests finished with warning "Substitution limit reached": ``` test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_unbounded_expr_substitutions_cpu W0923 13:00:37.864000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:70] +============================+ W0923 13:00:37.864000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:71] \| !!! WARNING !!! 
\| W0923 13:00:37.865000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:72] +============================+ W0923 13:00:37.865000 46140 /data/users/q1l1/pytorch/torch/_export/__init__.py:73] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. stats [('calls_captured', 5), ('unique_graphs', 1)] inductor [('extern_calls', 2)] graph_break [] aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)] PASSED [5.6947s] test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_unbounded_expr_substitutions_cuda W0923 13:00:39.633000 46140 /data/users/q1l1/pytorch/torch/_inductor/sizevars.py:765] [0/0] Substitution limit (30) reached w/ u1*30Max(15, u0) W0923 13:00:39.679000 46140 /data/users/q1l1/pytorch/torch/_inductor/sizevars.py:765] [0/0] Substitution limit (30) reached w/ 64u130Max(15, u0) stats [('calls_captured', 5), ('unique_graphs', 1)] inductor [('extern_calls', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('async_compile_cache_miss', 1)] graph_break [] aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)] PASSED [5.6278s] test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleMps::test_unbounded_expr_substitutions_mps SKIPPED [0.0002s] ============================ 2 passed, 1 skipped, 870 deselected in 19.66s ============================ ``` - with the unittest + comparison logic enhanced, we don't see the warning any more: ``` Running 3 items in this shard test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleCpu::test_unbounded_expr_substitutions_cpu W0923 13:15:39.560000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:70] +============================+ W0923 13:15:39.561000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:71] \| !!! WARNING !!! 
\| W0923 13:15:39.561000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:72] +============================+ W0923 13:15:39.562000 290812 /data/users/q1l1/pytorch/torch/_export/__init__.py:73] torch._export.aot_compile()/torch._export.aot_load() is being deprecated, please switch to directly calling torch._inductor.aoti_compile_and_package(torch.export.export())/torch._inductor.aoti_load_package() instead. stats [('calls_captured', 5), ('unique_graphs', 1)] inductor [('extern_calls', 2)] graph_break [] aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)] PASSED [6.6093s] test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_unbounded_expr_substitutions_cuda stats [('calls_captured', 5), ('unique_graphs', 1)] inductor [('extern_calls', 2), ('benchmarking.InductorBenchmarker.benchmark_gpu', 2), ('async_compile_cache_miss', 1)] graph_break [] aten_mm_info [('aten.mm_Max(15, u0)_16_64', 1)] PASSED [6.0502s] test/inductor/test_aot_inductor.py::AOTInductorTestABICompatibleMps::test_unbounded_expr_substitutions_mps SKIPPED [0.0002s] ============================ 2 passed, 1 skipped, 870 deselected in 21.99s ============================ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163685 Approved by: https://github.com/jansel	2025-09-26 18:46:36 +00:00
Catherine Lee	425ea90f95	[testing] Add test owner labels for some cuda? tests (#163296 ) I am trying to give some test files better owner labels than `module: unknown`. I am not sure about them, but they seem pretty reasonable Pull Request resolved: https://github.com/pytorch/pytorch/pull/163296 Approved by: https://github.com/eqy, https://github.com/msaroufim	2025-09-26 18:26:56 +00:00
Catherine Lee	5b764267f4	[testing] Add test owner labels for some distributed tests (#163174 ) I am trying to give some test files better owner labels than `module: unknown`. I am not sure about them, but they seem pretty reasonable Pull Request resolved: https://github.com/pytorch/pytorch/pull/163174 Approved by: https://github.com/ezyang	2025-09-26 18:19:04 +00:00
Scott Wolchok	50c0550f5a	Add magic TORCH_MAKE_PYBIND_ENUM_FASTER macro (#163527 ) See comment on the macro definition. In short, pybind11 3.x added `py::native_enum`, and also had to add overhead for that new way to bind enums on the critical path for calling functions that take regular old `py::enum_`s as arguments (for example, `__eq__`). Differential Revision: [D82873169](https://our.internmc.facebook.com/intern/diff/D82873169/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163527 Approved by: https://github.com/ezyang	2025-09-26 17:59:22 +00:00
arkadip-maitra	d7491fb1c1	Fix tensor creation with empty names crash (#163957 ) Partially fixes #148324 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163957 Approved by: https://github.com/malfet, https://github.com/janeyx99	2025-09-26 17:41:00 +00:00
Huamin Li	9534c59311	[Inductor] address comments from https://github.com/pytorch/pytorch/pull/163803 (#163901 ) Summary: address comments from https://github.com/pytorch/pytorch/pull/163803 Differential Revision: D83291637 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163901 Approved by: https://github.com/desertfire	2025-09-26 17:18:44 +00:00
hanchchch	5880996b4c	Expose torch.nn.utils.parametrize (#163835 ) `torch.nn.utils.parametrize` is not imported from `torch/nn/utils/__init__.py`, thus is not exposed and make it hard for code editors to statically analyze the code and provide auto-completion based on the function signature. <img width="615" height="292" alt="Screenshot 2025-09-25 at 12 01 52 PM" src="https://github.com/user-attachments/assets/a276f6f0-87f3-4732-943d-2a92ea871974" /> after the fix: <img width="964" height="393" alt="Screenshot 2025-09-25 at 12 02 16 PM" src="https://github.com/user-attachments/assets/ca47f09e-dc4e-4420-a2d2-11669e07471a" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163835 Approved by: https://github.com/albanD	2025-09-26 16:38:18 +00:00
Tugsbayasgalan Manlaibaatar	1d26eb0fcc	Move inductor.aot_compile to use new tracer (#163137 ) Differential Revision: [D82603768](https://our.internmc.facebook.com/intern/diff/D82603768) I feel no one probably uses this API now but still useful path for more test cases. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163137 Approved by: https://github.com/avikchaudhuri ghstack dependencies: #163136	2025-09-26 15:54:24 +00:00
Tugsbayasgalan Manlaibaatar	a05f6ecfec	Fix bug with renaming submodules in dynamo for new tracer (#163136 ) Differential Revision: [D82603767](https://our.internmc.facebook.com/intern/diff/D82603767) Previously, i forgot to add handle call_module case which now will have export_root prepended to their names. Basically i want to clean up sth like: ``` graph(): %l_self_export_root_sub_mod = call_module[target=l_self_export_root_sub_mod](%x, %y) %l_self_export_root_sub_mod_1 = call_module[target=l_self_export_root_sub_mod](%x, %y) ``` Dynamo graph can have call_module nodes that have messed up name due to our wrapper. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163136 Approved by: https://github.com/avikchaudhuri	2025-09-26 15:54:24 +00:00
thenumberouscode	c106ee8515	[FakeTensor] Supplement the relevant logic for converting conv1d to conv2d in meta_conv (#160408 ) ## Fixes https://github.com/pytorch/pytorch/issues/159462 also fixes #163569 , #163604 ## summary the issue is caused by the wrong stride of conv1d's result generated by meta_conv: `4d5b3f2d5a/torch/_meta_registrations.py (L2453-L2471)` and the wrong stride will be used to codegen size assert in inductor: `4d5b3f2d5a/torch/_inductor/ir.py (L6152-L6163)` ## reason So why is the computed stride wrong in the meta_conv function? Because the corresponding backend will convert conv1d to conv2d and change the input tensor's size and memory_format (channel last), but meta_conv does not do this transformation, so a mismatch happened. `4d5b3f2d5a/aten/src/ATen/native/Convolution.cpp (L1502-L1510)` just add corresponding logic in meta_conv. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160408 Approved by: https://github.com/eellison, https://github.com/jansel, https://github.com/mlazos	2025-09-26 15:45:02 +00:00
Isalia20	8aba513506	[MPS] test sparse add MPS dtypes so we get proper expected failure (#163951 ) Adds dtypeIfMPS so if op is supported we get proper error like unexpected success. Before we would never get unexpected success because tests were run in torch.double dtype which will always fail on MPS due to it not supporting the dtype Pull Request resolved: https://github.com/pytorch/pytorch/pull/163951 Approved by: https://github.com/malfet	2025-09-26 14:48:58 +00:00
fduwjj	8c194a367e	[DeviceMesh][ez] Add a type alias for backend config (#163928 ) Create a type alias for `tuple[Optional[str], Optional[C10dBackend.Options]]` since it is too long. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163928 Approved by: https://github.com/fegin ghstack dependencies: #163212, #163288	2025-09-26 14:46:53 +00:00
Yiming Zhou	33f3413bd3	[WIP][precompile] Set fake_mode of base tensor in fx graph pickler (#163738 ) Summary: When unpickling a fake tensor in fx graph pickler. It only sets the fake mode of the current tensor's metadata to the one that is consistent with pickler's `unpickle_state`. However, it doesn't set the fake mode of a tensor's base tensor when that tensor is a view. This will cause an issue when dumping and loading the following graph ``` class GraphModule(torch.nn.Module): def forward(self, s77: "Sym(s77)", L_x_: "f32[s77, 8]"): l_x_ = L_x_ chunk = l_x_.chunk(2, dim = -1); l_x_ = None y: "f32[s77, 4]" = chunk[0]; chunk = None y_repeat: "f32[s77, 8]" = y.repeat_interleave(2, dim = -1); y = None return (y_repeat,) ``` because `repeat_interleave` will create an intermediate fake tensor of size `[s77, 2, 4]` and it will become the base of the node `y_repeat`'s `meta['val']`. This causes issues during the deserialization phase when applying AOT precompile to DeepSeek in vLLM. Test Plan: This has been tested in vLLM with DeepSeek. As for unittest, ideally it should be `test_aot_compile_repeat_interleave` with mark_dynamic turned on. However, that's leading to some other pickle issues. ``` python test/dynamo/test_aot_compile.py -k test_aot_compile_repeat_interleave ``` I have yet to figure out a more appropriate unittest. 
But a proof-of-concept demo would be the following: ``` import inspect import sympy import torch from torch.fx._graph_pickler import GraphPickler from torch.fx.experimental.symbolic_shapes import ShapeEnv from torch._subclasses import FakeTensorMode from torch.fx._graph_pickler import GraphPickler, Options from unittest.mock import patch class M(torch.nn.Module): def forward(self, x): chunk = x.chunk(2, dim=-1) y = chunk[0] y_repeat = y.repeat_interleave(2, dim=-1) return y_repeat def my_custom_backend(gm, example_inputs): global gm_global gm_global = gm return gm.forward m = M() m_opt = torch.compile(m, backend=my_custom_backend, fullgraph=True) sample_inputs = (torch.randn(2, 8),) torch._dynamo.mark_dynamic(sample_inputs[0], [0]) opt_out = m_opt(*sample_inputs) graph_reducer_override = GraphPickler.reducer_override def _graph_reducer_override(self, obj): if (inspect.isclass(obj) and issubclass(obj, sympy.Function) and hasattr(obj, "_torch_unpickler")): return obj._torch_unpickler, (obj._torch_handler_name, ) if isinstance(obj, FakeTensorMode): return type(None), () return graph_reducer_override(self, obj) with patch.object(GraphPickler, "reducer_override", _graph_reducer_override): pickled_gm = GraphPickler.dumps(gm_global, Options(ops_filter=None)) fake_mode = FakeTensorMode(shape_env=ShapeEnv()) loaded_gm = GraphPickler.loads(pickled_gm, fake_mode) ``` Differential Revision: D83112599 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163738 Approved by: https://github.com/zhxchen17	2025-09-26 14:36:37 +00:00
mansiag05	d4e4f70768	Fix overflow in slow_conv3d when kernel size is too large. (#162718 ) Also, adding check for padding to avoid segmentation fault caused by overflow. Fixes #141846 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162718 Approved by: https://github.com/jgong5, https://github.com/Skylion007	2025-09-26 13:39:29 +00:00
PyTorch MergeBot	bfd21cd3e6	Revert "Add less warps config to inner reductions (#162447 )" This reverts commit 768361e67f0eb36491d7b763ef38d7c928ebefe6. Reverted https://github.com/pytorch/pytorch/pull/162447 on behalf of https://github.com/PaulZhang12 due to failed to land internally ([comment](https://github.com/pytorch/pytorch/pull/162447#issuecomment-3338680532))	2025-09-26 13:16:04 +00:00
Yuanyuan Chen	7441a1b9b1	Update ruff to 0.13.1 (#163744 ) Update ruff to 0.13.1 so that we can remove `UP038` from `pyproject.toml` because it has been removed from supported rules of ruff. There are some fixes, the most notable one is [(PYI059)](https://docs.astral.sh/ruff/rules/generic-not-last-base-class/#generic-not-last-base-class-pyi059) ``` Checks for classes inheriting from typing.Generic[] where Generic[] is not the last base class in the bases tuple. ``` A BC-breaking change is introduced to change the typing of `OrderedSet .storage` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163744 Approved by: https://github.com/Skylion007, https://github.com/jingsh	2025-09-26 10:12:21 +00:00
jianyizh	6a2bd1f4ee	[inductor] skip bmm when converting channel last (#159459 ) Workaround of #159458 by remove some nodes output channel last set Pull Request resolved: https://github.com/pytorch/pytorch/pull/159459 Approved by: https://github.com/etaf, https://github.com/eellison, https://github.com/shunting314	2025-09-26 09:11:40 +00:00
Cui, Yifeng	4783e3ff49	Update torch-xpu-ops commit pin (#163758 ) Update the torch-xpu-ops commit to [intel/torch-xpu-ops@229e8b](`229e8ba104`), includes: - Revert tracking of Work status for FlightRecorder in ProcessGroupXCCL to fix memory leak - Enable SYCL warnings on Linux - Fix accuracy issues with CTC loss - Enable aten::nonzero_static on XPU backend - Stop recursive calculations in polynomial kernels if tensor has NaNs Pull Request resolved: https://github.com/pytorch/pytorch/pull/163758 Approved by: https://github.com/EikanWang	2025-09-26 09:05:08 +00:00
CaoE	c8e5b7dabb	Add SDPA patterns for T5 variants when batch size is 1 (#163252 ) As mentioned in https://github.com/pytorch/pytorch/blob/main/torch/_inductor/fx_passes/fuse_attention.py#L838, this PR generates patterns for the cases batch size == 1. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163252 Approved by: https://github.com/Valentine233, https://github.com/jansel	2025-09-26 08:50:06 +00:00
Valentine233	04b51499f7	[CPU] Support transpose and packing fusion for bit8 (#163233 ) To be used by CPU INT8 SDPA in TorchAO https://github.com/pytorch/ao/pull/3025. This change has a kernel improvement of about 9%. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163233 Approved by: https://github.com/mingfeima, https://github.com/jansel	2025-09-26 07:15:04 +00:00
Arsh Zahed	54461a53bd	[Inductor] Check if profiling before using record_function in CompiledFxGraph (#163747 ) The call to `record_function` adds overhead even if profiling is disabled, which can as much as double the total runtime overhead of a compiled function. #163566 aims to make `record_function` more efficient, but doesn't fully eliminate overhead. This change adds a check if profiling is active before using `record_function`, which avoids this issue all together. `TestExecutionTrace.test_execution_trace_with_pt2` in https://github.com/pytorch/pytorch/blob/main/test/profiler/test_execution_trace.py#L372 already checks that the `record_function` region is tracked during profiling. Comparison of the `benchmarks/dynamo/microbenchmarks/overheads.py ` results: Before Change: ``` requires_grad=False compiled 56.9us (warmup=10.7s) requires_grad=True compiled 99.4us (warmup=0.2s) inference_mode() compiled 55.7us (warmup=0.1s) ``` After Change: ``` requires_grad=False eager 6.9us (warmup=0.0s) compiled 23.9us (warmup=22.3s) requires_grad=True eager 8.7us (warmup=0.0s) compiled 56.8us (warmup=0.1s) inference_mode() eager 6.3us (warmup=0.0s) compiled 22.2us (warmup=0.1s) ``` Additionally, #163866 introduces an instruction count benchmark. 
Because that is not merged and activated yet, here is a comparison: Before Change: ``` runtime_overhead_inductor,instruction_count,222645 runtime_overhead_inductor_inference_mode,instruction_count,234998 runtime_overhead_inductor_requires_grad,instruction_count,293556 runtime_overhead_inductor_requires_grad_backward,instruction_count,78181 runtime_overhead_inductor_dynamic,instruction_count,234870 runtime_overhead_inductor_inference_mode_dynamic,instruction_count,248711 runtime_overhead_inductor_requires_grad_dynamic,instruction_count,309979 runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77599 ``` After Change: ``` runtime_overhead_inductor,instruction_count,149997 runtime_overhead_inductor_inference_mode,instruction_count,163397 runtime_overhead_inductor_requires_grad,instruction_count,220722 runtime_overhead_inductor_requires_grad_backward,instruction_count,78276 runtime_overhead_inductor_dynamic,instruction_count,161177 runtime_overhead_inductor_inference_mode_dynamic,instruction_count,175495 runtime_overhead_inductor_requires_grad_dynamic,instruction_count,235674 runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77475 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163747 Approved by: https://github.com/mlazos, https://github.com/anijain2305	2025-09-26 06:49:40 +00:00
Lu Fang	d1403250c9	Fix specialize_impl from triton.runtime.jit (#163844 ) Summary: In https://github.com/triton-lang/triton/pull/7771/ , create_specialize_impl is removed. We extend the support using native_specialize_impl. Otherwise, PyTorch won't work with trunk triton. Test Plan: scripts/lufang/llm/launch_qwen3_vl_235b_a22b_thinking_2507_h100.sh No more error message like ``` (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Traceback (most recent call last): (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 924, in identify_mutated_tensors (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] ttir_module, ordered_tensor_names = generate_ttir( (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 419, in generate_ttir (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] 
[0/0] specialization = _get_specialization(ordered_args.values()) (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 390, in _get_specialization (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] from triton.runtime.jit import specialize_impl as specialize_impl_orig (Worker_TP0_EP0 pid=190353) [rank0]:W0924 23:24:48.190000 190353 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] ImportError: cannot import name 'specialize_impl' from 'triton.runtime.jit' (/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inf erence_platform_sp/llm_predictor_gpu/__service__/service#link-tree/triton/runtime/jit.py) (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Traceback (most recent call last): (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 924, in identify_mutated_tensors (Worker_TP1_EP1 pid=190354) [rank1]:W0924 
23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] ttir_module, ordered_tensor_names = generate_ttir( (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 419, in generate_ttir (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] specialization = _get_specialization(ordered_args.values()) (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 390, in _get_specialization (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] from triton.runtime.jit import specialize_impl as specialize_impl_orig (Worker_TP1_EP1 pid=190354) [rank1]:W0924 23:24:48.210000 190354 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] ImportError: cannot import name 'specialize_impl' from 'triton.runtime.jit' (/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inf erence_platform_sp/llm_predictor_gpu/__service__/service#link-tree/triton/runtime/jit.py) (Worker_TP5_EP5 pid=190359) [rank5]:W0924 23:24:48.216000 190359 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] 
Encountered an exception in identify_mutated_tensors, assuming every input is mutated (Worker_TP5_EP5 pid=190359) [rank5]:W0924 23:24:48.216000 190359 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] Traceback (most recent call last): (Worker_TP5_EP5 pid=190359) [rank5]:W0924 23:24:48.216000 190359 /data/users/lufang/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:948] [0/0] File "/data/users/lufang/fbsource/buck-out/v2/gen/fbcode/4e83bca020adbfd7/smart/inference_platform_sp/llm_predictor_gpu/__service__/service#link-tree/to rch/_higher_order_ops/triton_kernel_wrap.py", line 924, in identify_mutated_tensors ``` Differential Revision: D83229128 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163844 Approved by: https://github.com/henryoier, https://github.com/davidberard98, https://github.com/BoyuanFeng	2025-09-26 06:37:26 +00:00
Laith Sakka	b42e81def5	Allow unbacked to unbacked replacements if rhs unbacked symbols are all inputs (#163652 ) This partially solves the issue https://github.com/pytorch/pytorch/issues/163641. We do not need to ban unbacked to unbacked replacement if all rhs symbols are inputs, since we know those symbols are seen by the whole program. This issue was found as I was tracing some vllm models with unbacked, namely Qwen/Qwen2-1.5B-Instruct; doing those replacements makes the reasoning logic easier. As for the similar data-dependent pattern, I am thinking of creating a set of replacements that we apply only during static eval instead of none, to make reasoning better. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163652 Approved by: https://github.com/bobrenjc93	2025-09-26 06:23:22 +00:00
Yiming Zhou	2a45f30ae7	Exporting aten.conv with cuda under fake mode on a cuda-less machine (#163912 ) Summary: Improve op coverage of exporting a CUDA model on a CPU-only machine under fake tensor mode. For `torch.nn.functional.conv2d`, it will `_select_conv_backend` based on input and weight shapes. When calling into `supportsDepthwiseConvolutionWithCuDNN()`, it calls `at::cuda::getCurrentDeviceProperties()` and fails on a CPU-only machine. So we check if CUDA is actually enabled first. Test Plan: TORCH_SHOW_CPP_STACKTRACES=1 buck2 run fbcode//caffe2/test:test_export -- --r nn_functional_conv2d Reviewed By: angelayi, henryoier Differential Revision: D80562984 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163912 Approved by: https://github.com/SherlockNoMad	2025-09-26 06:04:20 +00:00
angelayi	11b4c0eb9e	[aoti] Save compute information (#163792 ) Metadata looks like: ``` { 'AOTI_DEVICE_KEY': 'cpu', 'AOTI_PLATFORM': 'linux', 'AOTI_MACHINE': 'x86_64', 'AOTI_CPU_ISA': 'AVX512', 'AOTI_COMPUTE_CAPABILITY': '90' } ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163792 Approved by: https://github.com/yushangdi, https://github.com/desertfire ghstack dependencies: #163779	2025-09-26 05:40:44 +00:00
angelayi	fb93491ddc	[aoti] Load metadata w/o loading package (#163779 ) Add a function to load the metadata stored in aoti without needing to load the .so. This can be used to store what platform we are compiling the .so on which we can check before loading the .so. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163779 Approved by: https://github.com/yushangdi, https://github.com/desertfire	2025-09-26 05:40:44 +00:00
Karhou Tam	39df24fe04	[Code Clean] Replace `std::runtime_error` with `TORCH_CHECK` (#163610 ) Including: - `torch/csrc/instruction_counter` - `torch/csrc/lazy` - `torch/csrc/monitor` - `torch/csrc/profiler` - `torch/csrc/dynamo` Fixes part of #148114 Personal mistake about (PR #163317), this PR does the same thing and PR #163317 has already been approved by @albanD. This is a personal mistake on my part, and I'm so sorry about that. Hope you won't mind @albanD. 🥹 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163610 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-09-26 04:52:48 +00:00
PyTorch UpdateBot	bbde16fe98	[vllm hash update] update the pinned vllm hash (#163823 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163823 Approved by: https://github.com/pytorchbot	2025-09-26 04:29:52 +00:00
Nick Riasanovsky	1b78ca2ef5	[Triton] [Inductor] Prune template selection based on decompose_k (#163781 ) Summary: Triton templates tend to perform very poorly on large K, hence the introduction of decompose_k. As a result, when decompose_k is selected, we disable exploring the Triton templates. We may want to consider an override in the future. Note: Based on the timing results it may be desirable to better refine/prune the decompose k decisions. Testing: Tested by looking at the autotune/compilation time using a single shape in TritonBench. `TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 python run --op gemm --rep 1000 --sleep 1.0 --m 512 --n 512 --k 300000 --only pt2_matmul_maxautotune` Before this change: `SingleProcess AUTOTUNE benchmarking takes 13.5368 seconds and 0.1595 seconds precompiling for 38 choices` With this change: `SingleProcess AUTOTUNE benchmarking takes 9.9626 seconds and 0.0020 seconds precompiling for 11 choices` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163781 Approved by: https://github.com/eellison, https://github.com/PaulZhang12	2025-09-26 04:09:35 +00:00
fduwjj	082eaf4aae	[DeviceMesh] Add extra check in flatten result cache lookup (#163288 ) While refactoring DeviceMesh bookkeeping, we found one corner case in which we don't check whether the dims to be flattened are the same as the dims that an existing flattened name maps to. So we need to add extra cases in the unit test and extra check logic in the code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163288 Approved by: https://github.com/wz337, https://github.com/ezyang, https://github.com/fegin ghstack dependencies: #163212	2025-09-26 03:41:58 +00:00
fduwjj	f1f2e3e4da	[DeviceMesh] Introduce CuTe layout into devicemesh code base for internal bookkeeping (#163212 ) DeviceMesh essentially is a way to specify how devices interact with each other or device layout. They are all integers, but because they can have various shapes and meshes, it makes internal bookkeeping way more challenging. Currently our internal bookkeeping inside DeviceMesh is not scalable, so in order to support new functions like `_unflatten`, we need to introduce very complicated logic inside DeviceMesh as pointed out per comment (https://github.com/pytorch/pytorch/pull/159482/files#r2256025452). So thanks to @lw 's suggestion and PoC PR (https://github.com/pytorch/pytorch/pull/160429), we realize that leveraging CuTe layout algebra ([ref](https://docs.nvidia.com/cutlass/media/docs/cpp/cute/02_layout_algebra.html)) from Cutlass will greatly simplify our internal mechanical bookkeeping and make the abstraction ops way easier on top of it. So to make things go incrementally, we propose a couple of steps here https://github.com/pytorch/pytorch/issues/160337#issuecomment-3195106243. On top of what we have been doing about PyCute, we want to continue adding methods to the wrapper class so that we can get rank indexes needed for ProcessGroup Creation with a layout object. We also added detailed explanations and comments (thanks to llm) and unit tests to showcase that the code is indeed working as expected. More PRs are on the way. This is a continuation of https://github.com/pytorch/pytorch/pull/161016 (originally messed with EasyCLA) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163212 Approved by: https://github.com/ezyang, https://github.com/fegin, https://github.com/lw	2025-09-26 03:32:19 +00:00
Kevin Fu	67cc0e0ac9	Add Static Dispatch Kernels (#163676 ) (#163870 ) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/1951 X-link: https://github.com/pytorch/FBGEMM/pull/4927 Add a few missing static dispatch kernels for remote_ro. Test Plan: Tested with scripts in D83028841. Differential Revision: D83258808 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163870 Approved by: https://github.com/henryoier	2025-09-26 03:00:07 +00:00
Ke Wen	bbf8aa43ef	[a2av] Separate in/out splits into two tensors (#163837 ) Old signature: `all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor(a!) in_out_splits, str group_name)` New signature: `all_to_all_vdev(Tensor input, Tensor(a!) out, Tensor in_splits, Tensor(a!) out_splits_offsets, str group_name)` i.e. split `in_out_splits` into IN tensor and OUT tensor so that we can define the TORCH_LIBRARY signature better. Also to be in line with the 2D version. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163837 Approved by: https://github.com/fduwjj ghstack dependencies: #163886	2025-09-26 01:03:54 +00:00
Yuanyuan Chen	5daa79fd6e	Remove dataclass_slots (#163623 ) `dataclass` now has `slots` kwarg. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163623 Approved by: https://github.com/Skylion007	2025-09-26 00:54:42 +00:00
Jeff Daily	b776e0c71e	[ROCm][CI/CD] create ROCm 7.0 magma tarball (#163883 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163883 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-26 00:51:17 +00:00
Yiming Zhou	5c2f09d1f9	[export] _detect_attribute_assignment gives warning instead of raising ValueError (#163809 ) Summary: LSTM was not exportable with non-strict export as it failed at `_detect_attribute_assignment` This is because the `_flat_weights` attribute in LSTM is a list of registered parameters and will be updated by the `_update_flat_weights` method in `forward`. However, in `_detect_attribute_assignment`, we manually restore the state of the module by `mod.__dict__.update(snapshot)`. Therefore, it should be fine to turn the `ValueError` into a warning so that RNN models are exportable with non-strict export. Added test to verify that there is no lifted tensor constant and no fake tensor leakage. Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_export_rnn_variants_with_warning Differential Revision: D83196971 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163809 Approved by: https://github.com/tugsbayasgalan	2025-09-26 00:43:29 +00:00
Jerry Mannil	b4be380480	[ROCm] Implement float32 copy kernel (#163869 ) * Add `float32_copy_kernel` for vectorizing float16/bfloat16 to float32 conversion Pull Request resolved: https://github.com/pytorch/pytorch/pull/163869 Approved by: https://github.com/jeffdaily	2025-09-26 00:39:30 +00:00
Randy Shuai	5b8fef3f17	Extend triton_mm auto-tune options for HIM shapes (#163273 ) Summary: Add an option to auto-tune for shape: ``` M=1024 N=171712 K=1024 ``` Test Plan: ``` TRITON_PRINT_AUTOTUNING=1 buck2 run mode/opt-amd-gpu -c fbcode.enable_gpu_sections=true //pytorch/tritonbench:run -- --op fp8_gemm_rowwise --no_use_tma --no_use_persistent --m 1024 --n 171712 --k 1024 --bias ``` Before: {F1982074581} After, saw 10%~ boost: {F1982074585} Differential Revision: D82687336 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163273 Approved by: https://github.com/jananisriram, https://github.com/Camyll	2025-09-26 00:05:57 +00:00
Nikita Shulga	ff2f319e6e	[MPS] Fix conv layout handling (#162776 ) What started as a simple fix for `mps_convolution_backward_input` resulted in a pretty significant refactor/fixes: - Updated `mps_conv_use_channels_last` to return channels last output if either input or weights are channels last - Use the same primitive throughout `Convolution.mm` to determine whether output should be allocated in channels last format or not But doing only those two resulted in a crash in `test_memory_format_nn_Conv2d_mps_float32`, when weights were backward, and bias is present: ``` % python -c "import torch;print(torch.nn.functional.conv2d(torch.rand(2, 4, 3, 4,device='mps'), torch.rand(5, 4, 3, 3,device='mps').to(memory_format=torch.channels_last), torch.rand(5,device='mps')))" /AppleInternal/Library/BuildRoots/4~B5E4ugDCh2RsPWAjMEoPu8LC5w1yXEwd7XweDhg/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:3619: failed assertion `Error: MLIR pass manager failed' zsh: abort python -c ``` Which requires a more thorough redesign/cleanup, namely: - Do not alter the layout based on MacOS version, but rather do additional copies on MacOS-14 if inputs/output or weight are in channels last format (done by defining `std::optional<Tensor> output_c;` that contains a contiguous copy of the output tensor) - Introduced `input_suggested_layout` which is set to ChannelsLast if and only if input is channels last and is running on MacOS-15+ - Delete unused `memory_layout` and `group` arguments from `fill_depthwise_conv_desc` - Fix bias broadcasting logic for channels last As a result, in addition to adding one more regression test this change removes `expectedFailures` from: - `TestModule.test_memory_format` for `Conv2d`, `ConvTranspose2d`, `LazyConv1d`, `LazyConvTranspose1d` - `test_require_stride_expanded_dynamic_shapes` - `test_mutable_custom_op_fixed_layout2` for MacOS-14 Fixes 
https://github.com/pytorch/pytorch/issues/161905 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162776 Approved by: https://github.com/Skylion007	2025-09-25 23:41:34 +00:00
PaliC	94195a37ae	[BE] Remove HermeticPyObjectTLS and Simplify PythonOpRegistrationTrampoline (#163464 ) Removes HermeticPyObjectTLS, as we no longer need it since torch deploy is no longer supported. PythonOpRegistrationTrampoline is also drastically simplified and is being prepped for removal in a future PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163464 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-09-25 23:30:50 +00:00
suo	c58e096cd0	[DTensor] implement logsumexp (#163879 ) as title, mostly copypasta from internal. I am a dtensor noob, so please scrutinize my added test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163879 Approved by: https://github.com/XilunWu	2025-09-25 23:08:30 +00:00
Anshul Sinha	2a6e6a9e3b	[FSDP][Replicate] tests replicate parity for shared parameters (#162836 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162836 Approved by: https://github.com/mori360 ghstack dependencies: #162830	2025-09-25 23:08:22 +00:00
Ke Wen	6e6c899347	[Reland][163423] Promote `@requires_nvshmem` instead of `enable_triton` (#163549 ) #163423 was approved but reverted due to a revert of base. Relanding without base. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163549 Approved by: https://github.com/wdvr Co-authored-by: Wouter Devriendt <wouterdevriendt@meta.com>	2025-09-25 23:02:00 +00:00
Anshul Sinha	366961df78	[FSDP][Replicate] tests replicate parity with activation checkpointing (#162830 ) Summary: In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. This tests that replicate function works correctly when combined with activation checkpointing Test Case 1. pytest test/distributed/_composable/test_replicate_training.py -k test_train_parity_with_activation_checkpointing Pull Request resolved: https://github.com/pytorch/pytorch/pull/162830 Approved by: https://github.com/mori360	2025-09-25 22:57:00 +00:00
Shangdi Yu	520fca82c8	Refactor Provenance Tracking (#163378 ) Summary: - Move the `provenance_level` flag check to inside the `set_kernel_post_grad_provenance_tracing` call to simplify the code - Move the `set_kernel_post_grad_provenance_tracing` call and `write_provenance_debug_handle` call to `codegen_comment`. - If some `call_kernel` call sites don't have a preceding `codegen_comment` call, add one. Now all `call_kernel` call sites are accompanied by a `codegen_comment` call. - Add a `codegen_comment` method to BaseScheduling and remove the noop `codegen_comment` method in Scheduling - Remove `debug_handle` from `call_kernel`. Test Plan: CI ``` buck run @//mode/opt-split-dwarf fbcode//caffe2/test/inductor:provenance_tracing ``` Differential Revision: D82839271 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163378 Approved by: https://github.com/angelayi	2025-09-25 22:55:59 +00:00
Mu-Chu Lee	908bcfd403	[AOTInductor] Add input information for Triton Kernels in AOTI (#160380 ) Summary: We use record_function to pass in input information to let Kineto show input information. Test Plan: Before: <img width="459" height="582" alt="Screenshot 2025-09-19 at 10 45 10 AM" src="https://github.com/user-attachments/assets/baa0c251-86e9-49ca-8c6c-fcd2619f7f48" /> After: <img width="473" height="1130" alt="Screenshot 2025-09-19 at 10 44 53 AM" src="https://github.com/user-attachments/assets/b7942d84-0362-4b9e-9232-14de92bbdd00" /> Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/160380 Approved by: https://github.com/desertfire ghstack dependencies: #163593	2025-09-25 22:41:04 +00:00
Ke Wen	96275dbf88	[CI] Fix test_triton_wait_until hang (#163886 ) I don't know why `nvshmem_barrier_all_kernel` leads the test to hang. Will investigate. But since it is an unnecessary call here, I am removing it to unblock other PRs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163886 Approved by: https://github.com/fegin	2025-09-25 22:22:16 +00:00
bobrenjc93	b14a14a662	[torchfuzz] make generated code much more concise and cleaner (#163812 ) ``` import torch torch._dynamo.config.capture_scalar_outputs = True torch.manual_seed(42) def fuzzed_program(arg_0, arg_1, arg_2): var_node_3 = arg_0 # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_4 = torch.full((1,), (-0.29262632146522655-0.7687848816195035j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_2 = torch.ops.aten.add(var_node_3, var_node_4) # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_6 = arg_1 # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_7 = arg_2 # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_5 = torch.ops.aten.add(var_node_6, var_node_7) # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_1 = torch.ops.aten.add(var_node_2, var_node_5) # size=(1,), stride=(1,), dtype=complex128, device=cuda var_node_0 = var_node_1.item() # dtype=complex128 return var_node_0 arg_0 = torch.as_strided(torch.randn(1).to(torch.complex128), (1,), (1,)) arg_1 = torch.as_strided(torch.randn(1).to(torch.complex128), (1,), (1,)) arg_2 = torch.as_strided(torch.randn(1).to(torch.complex128), (1,), (1,)) args = (arg_0, arg_1, arg_2) result_original = fuzzed_program(args) print('✅ eager success') compiled_program = torch.compile(fuzzed_program, fullgraph=False, dynamic=True) result_compiled = compiled_program(args) print('✅ compile success') ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163812 Approved by: https://github.com/pianpwk ghstack dependencies: #163743	2025-09-25 22:12:33 +00:00
Tianyu Liu	92f7361e27	[DTensor] fix uneven _StridedShard (#163843 ) Previous uneven `_StridedShard` in https://github.com/pytorch/pytorch/pull/150490 seems failing cases like sharding `tensor = torch.arange(6)` with FSDP 2, TP 2. This PR attempts to reinvent `_StridedShard`. I didn't test nested `_StridedShard`, because there shouldn't be any use cases. I think it will become quite messy when it comes to nested uneven `_StridedShard`. We are probably going to deprecate it anyway after @zpcore 's work https://github.com/pytorch/pytorch/pull/160266 on ordered sharding, so IMO not worth it to make it too general. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163843 Approved by: https://github.com/ezyang	2025-09-25 22:12:29 +00:00
Jane Xu	6a6d838832	Add H100 runner to be recognized in actionlint (#163795 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163795 Approved by: https://github.com/huydhn, https://github.com/seemethere	2025-09-25 22:09:11 +00:00
Huamin Li	183dca423f	[Inductor] add a new config fallback_embedding_bag_byte_unpack (#163803 ) Differential Revision: D82988783 introduce an inductor config fallback_embedding_bag_byte_unpack so we can have options to not let inductor decompose the op Pull Request resolved: https://github.com/pytorch/pytorch/pull/163803 Approved by: https://github.com/henryoier	2025-09-25 22:07:04 +00:00
bobrenjc93	b8efa336d2	[torchfuzz] simplify codegen and runner (#163743 ) much less code. a followup PR will make these repro files even smaller. small is important since it reduces the time for users to understand what the repro is doing. here's a sample: ``` (/home/bobren/local/a/pytorch-env) [21:34] devgpu009:/home/bobren/local/a/pytorch/tools/experimental/dynamic_shapes/torchfuzz [130] python fuzzer.py --seed 42 Running single fuzz_and_execute... Using seed: 42, max_depth: 10 Running generated program... Selected CUDA_VISIBLE_DEVICES=2 === Program Output === ✅ eager success ✅ compile success =============================== === Program Source === import torch import sys import os fuzzer_dir = r'/home/bobren/local/a/pytorch/tools/experimental/dynamic_shapes/torchfuzz' if fuzzer_dir not in sys.path: sys.path.insert(0, fuzzer_dir) from tensor_fuzzer import fuzz_scalar, fuzz_tensor_simple, ScalarSpec, TensorSpec def fuzzed_program(arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10, arg_11, arg_12, arg_13, arg_14, arg_15, arg_16, arg_17, arg_18, arg_19, arg_20, arg_21, arg_22, arg_23, arg_24, arg_25, arg_26): # Node node_4: arg (depth 6) var_node_4 = arg_0 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_7: constant (depth 4) var_node_7 = torch.full((1,), (-0.8353595860703585-0.8384634248041143j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_8: arg (depth 4) var_node_8 = arg_1 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_6: tensor_pointwise (depth 5) var_node_6 = torch.ops.aten.mul(var_node_7, var_node_8) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_9: constant (depth 5) var_node_9 = torch.full((1,), (-0.32478860712861235+0.033909682598544454j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_5: tensor_pointwise (depth 6) var_node_5 = torch.ops.aten.mul(var_node_6, var_node_9) # size=(1,), 
stride=(1,), dtype=complex128, device=cuda # Node node_3: tensor_pointwise (depth 7) var_node_3 = torch.ops.aten.sub(var_node_4, var_node_5) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_11: arg (depth 6) var_node_11 = arg_2 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_18: constant (depth 0) var_node_18 = torch.full((1,), (0.12855308616305575+1.5268033634325642j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_19: arg (depth 0) var_node_19 = arg_3 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_17: tensor_pointwise (depth 1) var_node_17 = torch.ops.aten.mul(var_node_18, var_node_19) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_21: arg (depth 0) var_node_21 = arg_4 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_22: arg (depth 0) var_node_22 = arg_5 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_20: tensor_pointwise (depth 1) var_node_20 = torch.ops.aten.sub(var_node_21, var_node_22) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_16: tensor_pointwise (depth 2) var_node_16 = torch.ops.aten.add(var_node_17, var_node_20) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_25: arg (depth 0) var_node_25 = arg_6 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_26: arg (depth 0) var_node_26 = arg_7 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_24: tensor_pointwise (depth 1) var_node_24 = torch.ops.aten.add(var_node_25, var_node_26) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_27: constant (depth 1) var_node_27 = torch.full((1,), (-0.6315711191260084+1.342004076501214j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_23: tensor_pointwise (depth 2) var_node_23 = torch.ops.aten.mul(var_node_24, var_node_27) # size=(1,), stride=(1,), dtype=complex128, device=cuda 
# Node node_15: tensor_pointwise (depth 3) var_node_15 = torch.ops.aten.mul(var_node_16, var_node_23) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_28: constant (depth 3) var_node_28 = torch.full((1,), (1.064498531874825-0.37289464356501284j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_14: tensor_pointwise (depth 4) var_node_14 = torch.ops.aten.mul(var_node_15, var_node_28) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_30: arg (depth 3) var_node_30 = arg_8 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_32: arg (depth 2) var_node_32 = arg_9 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_33: constant (depth 2) var_node_33 = torch.full((1,), (1.5815627438573372+0.5124667911691704j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_31: tensor_pointwise (depth 3) var_node_31 = torch.ops.aten.div(var_node_32, var_node_33) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_29: tensor_pointwise (depth 4) var_node_29 = torch.ops.aten.div(var_node_30, var_node_31) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_13: tensor_pointwise (depth 5) var_node_13 = torch.ops.aten.div(var_node_14, var_node_29) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_39: arg (depth 0) var_node_39 = arg_10 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_40: constant (depth 0) var_node_40 = torch.full((1,), (-0.5987350493494642-0.5711360569376475j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_38: tensor_pointwise (depth 1) var_node_38 = torch.ops.aten.mul(var_node_39, var_node_40) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_41: arg (depth 1) var_node_41 = arg_11 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_37: tensor_pointwise (depth 2) var_node_37 = 
torch.ops.aten.add(var_node_38, var_node_41) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_42: constant (depth 2) var_node_42 = torch.full((1,), (0.7246044564672116-0.5930730980273312j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_36: tensor_pointwise (depth 3) var_node_36 = torch.ops.aten.mul(var_node_37, var_node_42) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_43: constant (depth 3) var_node_43 = torch.full((1,), (-0.7582976293117148+1.1880929376258396j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_35: tensor_pointwise (depth 4) var_node_35 = torch.ops.aten.mul(var_node_36, var_node_43) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_45: constant (depth 3) var_node_45 = torch.full((1,), (1.0896212896322774+0.3124038130417098j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_46: arg (depth 3) var_node_46 = arg_12 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_44: tensor_pointwise (depth 4) var_node_44 = torch.ops.aten.add(var_node_45, var_node_46) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_34: tensor_pointwise (depth 5) var_node_34 = torch.ops.aten.div(var_node_35, var_node_44) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_12: tensor_pointwise (depth 6) var_node_12 = torch.ops.aten.div(var_node_13, var_node_34) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_10: tensor_pointwise (depth 7) var_node_10 = torch.ops.aten.mul(var_node_11, var_node_12) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_2: tensor_pointwise (depth 8) var_node_2 = torch.ops.aten.div(var_node_3, var_node_10) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_48: constant (depth 7) var_node_48 = torch.full((1,), (-1.047745491289218+0.279447315087422j), 
dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_54: arg (depth 2) var_node_54 = arg_13 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_55: arg (depth 2) var_node_55 = arg_14 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_53: tensor_pointwise (depth 3) var_node_53 = torch.ops.aten.div(var_node_54, var_node_55) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_56: arg (depth 3) var_node_56 = arg_15 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_52: tensor_pointwise (depth 4) var_node_52 = torch.ops.aten.div(var_node_53, var_node_56) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_59: arg (depth 2) var_node_59 = arg_16 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_60: arg (depth 2) var_node_60 = arg_17 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_58: tensor_pointwise (depth 3) var_node_58 = torch.ops.aten.div(var_node_59, var_node_60) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_61: constant (depth 3) var_node_61 = torch.full((1,), (-0.7386327586576402-0.027025998767172658j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_57: tensor_pointwise (depth 4) var_node_57 = torch.ops.aten.add(var_node_58, var_node_61) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_51: tensor_pointwise (depth 5) var_node_51 = torch.ops.aten.sub(var_node_52, var_node_57) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_64: arg (depth 3) var_node_64 = arg_18 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_67: arg (depth 1) var_node_67 = arg_19 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_68: constant (depth 1) var_node_68 = torch.full((1,), (-0.6840241429755998+1.327637020136433j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # 
Node node_66: tensor_pointwise (depth 2) var_node_66 = torch.ops.aten.mul(var_node_67, var_node_68) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_69: arg (depth 2) var_node_69 = arg_20 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_65: tensor_pointwise (depth 3) var_node_65 = torch.ops.aten.sub(var_node_66, var_node_69) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_63: tensor_pointwise (depth 4) var_node_63 = torch.ops.aten.sub(var_node_64, var_node_65) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_70: arg (depth 4) var_node_70 = arg_21 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_62: tensor_pointwise (depth 5) var_node_62 = torch.ops.aten.sub(var_node_63, var_node_70) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_50: tensor_pointwise (depth 6) var_node_50 = torch.ops.aten.mul(var_node_51, var_node_62) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_76: constant (depth 1) var_node_76 = torch.full((1,), (1.864651314238342+0.27066487315113186j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_77: arg (depth 1) var_node_77 = arg_22 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_75: tensor_pointwise (depth 2) var_node_75 = torch.ops.aten.mul(var_node_76, var_node_77) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_78: arg (depth 2) var_node_78 = arg_23 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_74: tensor_pointwise (depth 3) var_node_74 = torch.ops.aten.add(var_node_75, var_node_78) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_79: arg (depth 3) var_node_79 = arg_24 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_73: tensor_pointwise (depth 4) var_node_73 = torch.ops.aten.mul(var_node_74, var_node_79) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node 
node_80: arg (depth 4) var_node_80 = arg_25 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_72: tensor_pointwise (depth 5) var_node_72 = torch.ops.aten.mul(var_node_73, var_node_80) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_82: constant (depth 4) var_node_82 = torch.full((1,), (1.6341547018841247+0.3096989611326181j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_84: constant (depth 3) var_node_84 = torch.full((1,), (0.9609065596935821+0.2920229825681946j), dtype=torch.complex128) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_85: arg (depth 3) var_node_85 = arg_26 # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_83: tensor_pointwise (depth 4) var_node_83 = torch.ops.aten.add(var_node_84, var_node_85) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_81: tensor_pointwise (depth 5) var_node_81 = torch.ops.aten.sub(var_node_82, var_node_83) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_71: tensor_pointwise (depth 6) var_node_71 = torch.ops.aten.sub(var_node_72, var_node_81) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_49: tensor_pointwise (depth 7) var_node_49 = torch.ops.aten.mul(var_node_50, var_node_71) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_47: tensor_pointwise (depth 8) var_node_47 = torch.ops.aten.add(var_node_48, var_node_49) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_1: tensor_pointwise (depth 9) var_node_1 = torch.ops.aten.add(var_node_2, var_node_47) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_0: torch.ops.aten.item (depth 10) var_node_0 = var_node_1.item() # dtype=complex128 # Final result from root node return var_node_0 arg_0 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10042) arg_1 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10043) arg_2 = 
fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10044) arg_3 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10045) arg_4 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10046) arg_5 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10047) arg_6 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10048) arg_7 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10049) arg_8 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10050) arg_9 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10051) arg_10 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10052) arg_11 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10053) arg_12 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10054) arg_13 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10055) arg_14 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10056) arg_15 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10057) arg_16 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10058) arg_17 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10059) arg_18 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10060) arg_19 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10061) arg_20 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10062) arg_21 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10063) arg_22 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10064) arg_23 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10065) arg_24 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10066) arg_25 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10067) arg_26 = fuzz_tensor_simple((1,), (1,), torch.complex128, seed=10068) import torch import sys torch._dynamo.config.capture_scalar_outputs = True args = (arg_0, arg_1, arg_2, arg_3, arg_4, arg_5, arg_6, arg_7, arg_8, arg_9, arg_10, arg_11, arg_12, arg_13, arg_14, arg_15, arg_16, arg_17, arg_18, arg_19, arg_20, arg_21, arg_22, arg_23, 
arg_24, arg_25, arg_26) result_original = fuzzed_program(args) print('✅ eager success') sys.exit(1) compiled_program = torch.compile(fuzzed_program, fullgraph=False, dynamic=True) result_compiled = compiled_program(args) print('✅ compile success') ====================== Program exited with code: 1 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163743 Approved by: https://github.com/pianpwk	2025-09-25 21:42:22 +00:00
Roman Bobniev	1cffa42d4d	PyTorch `histc` fix for values with large magnitudes (#163506 ) Summary: The current implementation of the `histc` function on CPU doesn't take into account the nature of the floating point precision representation when two numbers have very different magnitudes. In the code of `histc` there is the following logic, which tries to fix an issue when automatically calculated `min` and `max` are identical: ``` if (leftmost_edge == rightmost_edge) { leftmost_edge -= 1; rightmost_edge += 1; } ... TORCH_CHECK(leftmost_edge < rightmost_edge, "torch.histc: max must be larger than min"); ``` But, not for all floating point values expanding the range exactly by 1 will give the representable result that is different from the original value. The test code: ``` info = th.finfo(th.float32) f_min = info.min test_tensor = th.ones((224, 224), dtype=th.float64) * f_min res = th.histc(test_tensor, bins=10) ``` Actual result: ``` RuntimeError: torch.histc: max must be larger than min ``` Expected result: Everything should work fine. NOTICE: If we set `f_min` just to small enough number, code works, which demonstrates the correct purpose of the possible range correction. In short, `f_min + 1 == f_min` evaluates to true, since we reach the precision of the floating point representation. Please notice, this is not a limitation of the float32 data type, since all computations happen in float64 (C++ data type `double`). The magnitudes are just different enough, that we reach the precision representation with simple approach of `+/-1`. Interesting is that `histogram` function doesn't throw an exception, because edges range selection is implemented differently. The fix we propose is to use `std::nextafter` which returns next representable floating point value starting from the current one in the direction of the lowest or max numbers. 
In theory, the mathematically correct approach is to use this function without constraints, but to maintain backward compatibility in case there is code which relies on the current logic of `+/-1` offset we call `std::min` and `std::max` to pick the right representable value (i.e. for small floating point values the next representable value has a step smaller than 1; for large values it's larger than 1). We could stick to `histogram` implementation, but again, to avoid possible backward compatibility breaks, we decided to use the fix presented in this change. The real use case scenario: In our project we use the well-known transformer version from HuggingFace which fills up the buffer with float32 min (please note this is not a minimal value closer to 0, it's minimal absolute value which is often like `-max`). The code where it sits is here: https://github.com/huggingface/transformers/blob/v4.51.1/src/transformers/models/mimi/modeling_mimi.py#L1159 Switching to other version of the transformer will lead to other issues in our project and the bug which we fix here may appear in other projects and scenarios. The real world problem appears when for such tensor the CPU version of the `histc` is called. In our usecase, it happens because this tensor is an input to the softmax activation function and as part of the quantisation the input parameter should go through the observer as well. In our case the default Histogram observer is selected, which calls the `histc`. Test Plan: The simple test code snippet doesn't produce failure: ``` f_min = th.finfo(th.float32).min test_tensor = th.ones((224, 224), dtype=th.float32) * f_min th.histc(test_tensor, bins=10) ``` Testing update: The `test_histc` has been updated accordingly. Now when we have +INF as all values of the tensor, the previous representation of the floating number should be <max_float>, hence the assert message is changed from `[inf, inf]` to `[<max_float>\|inf, inf]`. 
The test also extended to check the assert message when tensor is filled with values -INF and with combination of (-INF, +INF). The new regexp assert includes possible output as `inf` and any floating point number in scientific representation for one of the bin edges. We left `inf` as possible value due to possible difference in implementation between CPU and CUDA. Differential Revision: D82955597 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163506 Approved by: https://github.com/jermenkoo, https://github.com/malfet	2025-09-25 20:55:25 +00:00
Shangdi Yu	ebfc87e303	Always produce kernel_info.json (#163715 ) Summary: Always produce kernel_info.json so zoomer can use this json to populate GPU traces Test Plan: CI Differential Revision: D82762435 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163715 Approved by: https://github.com/angelayi	2025-09-25 19:38:49 +00:00
Yidi Wu	21a41edd4f	Add fake_impl for _native_multi_head_attention (#163700 ) Test Plan: See added test in test_export.py Differential Revision: D83099187 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163700 Approved by: https://github.com/angelayi	2025-09-25 19:01:27 +00:00
PyTorch MergeBot	7bad9c5a64	Revert "Update ruff to 0.13.1 (#163744 )" This reverts commit 3dd89a079f2b0c1d39351f98ff5d5ca882523152. Reverted https://github.com/pytorch/pytorch/pull/163744 on behalf of https://github.com/malfet due to Broke lint, see https://github.com/pytorch/pytorch/actions/runs/18016220484/job/51261729375 looks like a landrace with PR that updated min-version to 3.10 ([comment](https://github.com/pytorch/pytorch/pull/163744#issuecomment-3335534084))	2025-09-25 18:54:03 +00:00
catswe	151e66e50d	Update documentation for torch.index_select (#163616 ) Description said "entries in index which is a LongTensor" but index_select can accept an IntTensor as the parameter Pull Request resolved: https://github.com/pytorch/pytorch/pull/163616 Approved by: https://github.com/jbschlosser Co-authored-by: Joel Schlosser <75754324+jbschlosser@users.noreply.github.com>	2025-09-25 18:29:17 +00:00
Svetlana Karslioglu	b61bdc7cc4	Fix cpp build (#162774 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/162774 Approved by: https://github.com/malfet, https://github.com/atalman	2025-09-25 18:21:45 +00:00
Yuanyuan Chen	3dd89a079f	Update ruff to 0.13.1 (#163744 ) Update ruff to 0.13.1 so that we can remove `UP038` from `pyproject.toml` because it has been removed from supported rules of ruff. There are some fixes, the most notable one is [(PYI059)](https://docs.astral.sh/ruff/rules/generic-not-last-base-class/#generic-not-last-base-class-pyi059) ``` Checks for classes inheriting from typing.Generic[] where Generic[] is not the last base class in the bases tuple. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163744 Approved by: https://github.com/Skylion007, https://github.com/jingsh	2025-09-25 17:52:35 +00:00
Jeff Daily	6539537a59	[ROCm][CD] create ROCm 7.0 images for binary builds (#163860 ) Adds gfx950. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163860 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-25 17:26:40 +00:00
Xinya Zhang	3cbfbbd691	[ROCm] Transformer/SDPA unit test parity (#163745 ) ## Major Changes * Efficient Attention on ROCM requires last dimensions of input tensors align with 16 bytes. - Unlike FA, ME does not pad input tensors in `scaled_dot_product_attention` and hence this is required. * Fix `atomic_counter` handling in varlen FA API * Unskips a few unit tests. Fixes #157120 Fixes #157121 Fixes #157122 Fixes #157167 Fixes #155217 Fixes #157043 Fixes #157060 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163745 Approved by: https://github.com/jeffdaily	2025-09-25 17:14:19 +00:00
PyTorch MergeBot	112e204797	Revert "[CUDA] Compare major version of the runtime device arch against the built version of the pytorch binary (#161299 )" This reverts commit 7163dce1e091cb5564c723110314bb372b5e81a8. Reverted https://github.com/pytorch/pytorch/pull/161299 on behalf of https://github.com/nWEIdia due to Incorrectly suppressing useful warnings when running sm89 binary on sm86 ([comment](https://github.com/pytorch/pytorch/pull/161299#issuecomment-3335127621))	2025-09-25 17:13:32 +00:00
Sherlock Huang	f9821b1be7	DebugMode supports_higher_order_operators=True (#163824 ) Make DebugMode support HOP Pull Request resolved: https://github.com/pytorch/pytorch/pull/163824 Approved by: https://github.com/ydwu4	2025-09-25 17:11:43 +00:00
FFFrog	c4312b443f	[Tools] Adapting the Hypothesis library (version 5.x) for use with the PyTorch framework (#163748 ) Starting from version 5.x, the Hypothesis library removed the timeout setting and only retained the deadline. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163748 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-09-25 16:41:50 +00:00
Dmitry Nikolaev	7194d77550	Revert "enable test_sampled_addmm_zero_sized_cuda for rocm (#121940 )" (#163848 ) This reverts commit 5494b2a8d38c3ddbeb2d96a5ac990e20ec4c48fd. Need to skip `test_sparse_csr.py::TestSparseCSRCUDA::test_sampled_addmm_zero_sized_cuda_` again. Tests are failing now with "core dumped" error ``` python test_sparse_csr.py -v -k test_sampled_addmm_zero_sized_cuda_float64 test_sampled_addmm_zero_sized_cuda_float64 (__main__.TestSparseCSRCUDA) ... /tmp/pytorch/test/test_sparse_csr.py:2503: c = torch.empty(m, n, dtype=dtype, device=device, layout=torch.sparse_csr) GPU core dump created: gpucore.186789 :0:rocdevice.cpp :2992: 4701819131755 us: Callback: Queue 0x760cdcd00000 aborting with error : HSA_STATUS_ERROR_EXCEPTION: An HSAIL operation resulted in a hardware exception. code: 0x1016 Aborted (core dumped) ``` These failures are linked to `test_sparse_csr.py::TestSparseCSRCUDA::test_select_SparseBSC_int32_cuda_` due to incorrect test log parsing. We will be able to close these issues also: - Fixes https://github.com/pytorch/pytorch/issues/163663 - Fixes https://github.com/pytorch/pytorch/issues/160786 - Fixes https://github.com/pytorch/pytorch/issues/160785 - Fixes https://github.com/pytorch/pytorch/issues/160784 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163848 Approved by: https://github.com/jeffdaily	2025-09-25 16:38:00 +00:00
can-gaa-hou	22d5f5ff94	[OpenReg][BE] Replacing explicit prefix/suffix with CMake variables (#163850 ) As the title states, suffixes like`.dylib` and `lib` can be replaced by `CMAKE_SHARED_LIBRARY_SUFFIX`, and prefixes like `lib` can be replaced by `CMAKE_SHARED_LIBRARY_PREFIX` on Unix or `CMAKE_IMPORT_LIBRARY_PREFIX` on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163850 Approved by: https://github.com/albanD	2025-09-25 16:33:16 +00:00
fduwjj	c8e75c48b9	[fr] Skip the dtype check for some one to all or all to one collective (#163839 ) As title, in practice we found that sometimes, the dtype of gather does not match when it comes to output among all ranks, which is undefined behavior. Same with broadcast and scatter. And they are all completed, so we should not think they are errors, we can skip it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163839 Approved by: https://github.com/VieEeEw	2025-09-25 16:02:06 +00:00
Aart J.C. Bik	e8f5f1b1a2	[NFC] fixed mistake in comment (#163697 ) I used "floor" instead of "ceil", so fix it. Also fixed other typo. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163697 Approved by: https://github.com/jcaip	2025-09-25 15:53:51 +00:00
Sherlock Huang	10e69a6e17	Preserve user annotation in graph (#163673 ) ``` import torch import torch.fx.traceback as fx_traceback import torch.export class M(torch.nn.Module): def forward(self, x): with fx_traceback.annotate({"pp_stage": 0}): with fx_traceback.annotate({"fdsp_bucket": 0}): x = x + 1 x = x - 2 with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}): x = x * 2 x = x / 3 return x m = M() with fx_traceback.preserve_node_meta(): ep = torch.export.export(m, (torch.randn(10),)) for node in ep.graph.nodes: if node.op == "call_function": print(f"{node.target}, {node.meta.get("custom", {})}") ``` prints ``` aten.add.Tensor, {'pp_stage': 0, 'fdsp_bucket': 0} aten.sub.Tensor, {'pp_stage': 0} aten.mul.Tensor, {'pp_stage': 0, 'cuda_stream': 2, 'fsdp_bucket': 1} aten.div.Tensor, {} ``` TODOs: - run_decomposition is failing - Need to test with the new full graph capture + aot_export_joint apis - Need to make the annotation propagate through autograd engine to reach the bw nodes. Sample impl here: https://github.com/pytorch/pytorch/pull/83558 - Edward wants to restrict the key in custom field to be top-level singleton objects only - also need to take care of metadata merging when passes are fusing nodes Thanks @angelayi for contributing the dynamo fixes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163673 Approved by: https://github.com/albanD, https://github.com/angelayi	2025-09-25 15:50:15 +00:00
Timm Ruland	5fcde74aed	Fix pipeline parallelism not correctly initializing backwards stages when evaluating before training. (#162823 ) Previously, an eval() call before a training step() would not correctly initialize the backward pass of the pipeline stages, leading to errors during the subsequent training step. This PR ensures that the backward stages can still be initialized after an eval() call. Fixes #162822 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162823 Approved by: https://github.com/dcci, https://github.com/H-Huang	2025-09-25 15:13:19 +00:00
Mihai Polceanu	6fa3715c12	Expose Kineto event metadata in PyTorch Profiler events (#161624 ) ## Overview This PR allows the profiler users to access `Kineto` and `TorchOp` metadata in JSON string format through a new `metadata_json` attribute in `FunctionEvent` objects, which is triggered through a new `expose_kineto_event_metadata` flag in `ExperimentalConfig`. ## Testing A unit test was added to validate functionality. ## Documentation Added/updated function doc strings where appropriate. ## Example output ```python import torch from torch.profiler import profile with profile(experimental_config=torch._C._profiler._ExperimentalConfig(expose_kineto_event_metadata=True)) as prof: res = torch.mm(torch.rand(1024, 1024), torch.rand(1024, 1024)) for event in prof.events(): print(f'name: {event.key}, metadata: {event.metadata_json}') ``` ``` name: aten::rand, metadata: "Ev Idx": 0 name: aten::empty, metadata: "Ev Idx": 1 name: aten::uniform_, metadata: "Ev Idx": 2 name: aten::rand, metadata: "Ev Idx": 3 name: aten::empty, metadata: "Ev Idx": 4 name: aten::uniform_, metadata: "Ev Idx": 5 name: aten::mm, metadata: "Ev Idx": 6 name: aten::resolve_conj, metadata: "Ev Idx": 7 name: aten::resolve_conj, metadata: "Ev Idx": 8 name: aten::resolve_conj, metadata: "Ev Idx": 9 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161624 Approved by: https://github.com/sraikund16	2025-09-25 14:58:30 +00:00
atalman	98c4e35f14	[CD] Add statically linked windows libraries to exclude list (#163768 ) Fixes: https://github.com/pytorch/pytorch/issues/159514 Seeing following in the Wheel build logs: ``` Linking CXX static library lib\kineto.lib Linking CXX static library lib\dnnl.lib .... ``` These files are around 800MB uncompressed and 109MB compressed, hence provide ~50% size reduction for Windows CPU builds. Test Plan: Build Pytorch Windows binary. Build vision, audio and torchcodec with this binary. Smoke test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163768 Approved by: https://github.com/albanD, https://github.com/malfet	2025-09-25 14:03:14 +00:00
PyTorch MergeBot	00059db034	Revert "[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 )" This reverts commit 09cb34c1dce8fe1b880bbf3115d8ddad3401d871. Reverted https://github.com/pytorch/pytorch/pull/162594 on behalf of https://github.com/malfet due to reverted internally and now can be safely reverted in OSS ([comment](https://github.com/pytorch/pytorch/pull/162594#issuecomment-3334176367))	2025-09-25 13:47:46 +00:00
IvanKobzarev	22fcc8b76b	[async_tp] Support mm+rs with scatter_dim matmul K by sharding B (#162794 ) Current state: Shape mismatch failure when mm+rs on the last mm scatter dim. Adding separate path to handle lastdim for aten.mm, scaled_mm should be handled similarly, but needs additional PR. So disabling scaled_mm case with filter matmul function. Adding inductor.config for this change that is True by default for fast debuggability of new path. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162794 Approved by: https://github.com/fegin	2025-09-25 12:18:39 +00:00
FFFrog	ab2ce3c50e	[Code Clean] Replace std::runtime_error with TORCH_CHECK (#163264 ) Related ISSUE: https://github.com/pytorch/pytorch/issues/148114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163264 Approved by: https://github.com/albanD, https://github.com/cyyever	2025-09-25 11:28:51 +00:00
Brian Hirsh	7d710403b0	Reapply "Make functionalization `ViewMeta` serializable with pickle. (#143712 )" (#163769 ) ### Summary: NOTE: This is a re-export of https://github.com/pytorch/pytorch/pull/161994 ; the changes between these two PRs is exclusively to the buck/build files (Summary from #161994 ) Attempted rebase of https://github.com/pytorch/pytorch/pull/143712. This reverts commit 6c713ccb5e0df227dd5b630057cbccd373cbe7d6. cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela imported-using-ghimport Test Plan: Imported from OSS Differential Revision: D81524507 Pulled By: Lucaskabela Pull Request resolved: https://github.com/pytorch/pytorch/pull/163769 Approved by: https://github.com/dolpm Co-authored-by: Brian Hirsh <hirsheybar@fb.com>	2025-09-25 10:27:37 +00:00
PaliC	29cbcbac42	[BE] Make PyObjectSlot use a global PyInterpreter (#162659 ) This pr gets rid of the pyobj_interpreter_ variable from PyObjectSlot and saves a word in the process Gonna ask for review from @huydhn as there are some changes to CI. Testing: imported internally and the failed android build seems to work now! Pull Request resolved: https://github.com/pytorch/pytorch/pull/162659 Approved by: https://github.com/albanD, https://github.com/huydhn	2025-09-25 08:53:19 +00:00
Pian Pawakapan	5f90e8c7ae	[PGO] ignore extra PGO key if warm/cold cache present (#163810 ) Summary: avoids PGO profile merges Test Plan: test_pgo Differential Revision: D83200714 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163810 Approved by: https://github.com/bobrenjc93	2025-09-25 07:16:05 +00:00
Klaus Zimmermann	eb7f4e0004	Add PEP 517 compliant Python source distribution to release process (#157815 ) This adds the actual creation of a standards compliant sdist along with its upload to s3 to the create release workflow. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157815 Approved by: https://github.com/malfet, https://github.com/atalman ghstack dependencies: #157814, #160315	2025-09-25 07:15:52 +00:00
Klaus Zimmermann	42928876eb	Add sdist handling to version finding (#160315 ) The version finding logic triggered from `setup.py` generally tries to take the git information into account. This is fine for most situations where we are building from a checkout, but it creates a problem in the case of sdists, as here the version is determined at the time of sdist creation, taking the git information into account, but then later recalculated when building wheels or installing from the sdist, now with the git information missing. The solution is to take the version information directly from the sdist, which this PR adds by means of parsing the `PKG-INFO` which marks an unpacked sdist. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160315 Approved by: https://github.com/atalman ghstack dependencies: #157814	2025-09-25 07:15:51 +00:00
Klaus Zimmermann	c44ec9f4c2	Improve MANIFEST.in for source distribution (#157814 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157814 Approved by: https://github.com/XuehaiPan, https://github.com/atalman	2025-09-25 07:15:42 +00:00
Pian Pawakapan	353991dd92	[PGO] distinguish sticky PGO put (#163799 ) Summary: put_remote_code_state vs. put_extra_remote_code_state Test Plan: test_pgo Differential Revision: D83195687 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163799 Approved by: https://github.com/bobrenjc93	2025-09-25 06:59:25 +00:00
Filip	2b6a74abf1	[optim] prevent unintended aliasing in lr_scheduler; update type annotations/docs (#163120 ) 1. Prevents unintended aliasing of `self._last_lr`/`get_last_lr(...)` with `group["lr"]` when `group["lr"]` is a tensor. 2. Prevents unintended aliasing of `LRScheduler.base_lrs` with the `group["initial_lr"]`s. 3. Updates `test/optim/test_lrscheduler.py` to test tensor LRs. 4. Changes type annotations for `_last_lr`, `get_last_lr()`, `base_lrs`, `get_lr()`, and `_get_closed_form_lr()` from `list[float]` to `list[float \| Tensor]`; adds documentation. Fixes #163103 LR schedulers can behave in unexpected ways when using a tensor LR due to patterns like this: ```python self._last_lr: list[float] = [group["lr"] for group in self.optimizer.param_groups] ``` This PR adds a helper to address this: ```python def _param_groups_val_list(optimizer: Optimizer, key: str) -> list[Any]: """Create a list containing group[key] for each optimizer param_group. Prevents aliasing when group[key] could be a Tensor. Raises a KeyError when group[key] does not exist. """ return [ group[key].clone() if isinstance(group[key], Tensor) else group[key] for group in optimizer.param_groups ] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163120 Approved by: https://github.com/janeyx99	2025-09-25 06:58:58 +00:00
Bob Ren	ad869c58f5	remove allow-untyped-defs from ./torch/utils/benchmark/op_fuzzers/sparse_unary.py (#163476 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163476 Approved by: https://github.com/ezyang, https://github.com/Skylion007 ghstack dependencies: #163478, #163475, #163471	2025-09-25 06:48:44 +00:00
Bob Ren	d5afb9e31a	remove allow-untyped-defs from ./torch/ao/quantization/quantizer/utils.py (#163471 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163471 Approved by: https://github.com/Skylion007 ghstack dependencies: #163478, #163475	2025-09-25 06:48:44 +00:00
Bob Ren	e7d6ea65ca	remove allow-untyped-defs from ./torch/nn/utils/_expanded_weights/embedding_expanded_weights.py (#163475 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163475 Approved by: https://github.com/ezyang, https://github.com/Skylion007 ghstack dependencies: #163478	2025-09-25 06:48:44 +00:00
Bob Ren	a6974195da	remove allow-untyped-defs from ./torch/fx/experimental/unification/multipledispatch/core.py (#163478 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163478 Approved by: https://github.com/ezyang	2025-09-25 06:48:44 +00:00
FFFrog	a213848703	[Code Clean] Remove deadcodes about Python3.9 [8/N] (#163728 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163728 Approved by: https://github.com/albanD, https://github.com/cyyever ghstack dependencies: #163626, #163627, #163629, #163643, #163644, #163645, #163646	2025-09-25 05:12:46 +00:00
dolpm	cde5c9aebd	fix pickling for BitwiseFn (#163571 ) Summary: ran into AttributeError: Can't get local object 'make_opaque_bitwise_fn.<locals>.BitwiseFn' looks like it was fixed for UnaryFn but not BitwiseFn in https://github.com/pytorch/pytorch/pull/138395 Fixes #147841 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163571 Approved by: https://github.com/jamesjwu	2025-09-25 04:52:11 +00:00
Sampath Victor	783a9dcb6d	[6/n] Quantization with min & max bounds support - using fbgemm changes in ATen (#162924 ) Summary: This diff uses the FBGEMM changes made in D78181177 & D81858256 to support using the provided per row min/max values while quantizing float/half to 8-bit, 4-bit & 2-bit in the ATen library. Please find more context on this here: https://fburl.com/gdoc/yutf32a0 Test Plan: ``` buck test mode/opt caffe2/torch/fb/model_transform/splitting/tests:split_dispatcher_test ``` https://www.internalfb.com/intern/testinfra/testrun/7881299640979446 Please refer to D80905814's test plan for integration testing. Rollback Plan: Differential Revision: D81327342 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162924 Approved by: https://github.com/jerryzh168	2025-09-25 02:52:04 +00:00
bobrenjc93	ad2f7315ca	[torchfuzz] print out tensor descriptor as comments in codegen (#163739 ) eg. ``` # Node node_12: tensor_pointwise (depth 6) var_node_12 = torch.ops.aten.mul(var_node_13, var_node_34) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_10: tensor_pointwise (depth 7) var_node_10 = torch.ops.aten.div(var_node_11, var_node_12) # size=(1,), stride=(1,), dtype=complex128, device=cuda # Node node_2: tensor_pointwise (depth 8) var_node_2 = torch.ops.aten.div(var_node_3, var_node_10) # size=(1,), stride=(1,), dtype=complex128, device=cuda ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163739 Approved by: https://github.com/pianpwk ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557, #163558, #163560, #163698	2025-09-25 01:29:29 +00:00
Nikita Shulga	cc660d38ac	[CI] Install libuv for Win testing (#163797 ) The current working theory for why `f0078941cf` caused a regression is that Windows CI could no longer be built with distributed, as it could not find libuv. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163797 Approved by: https://github.com/wdvr	2025-09-25 01:10:14 +00:00
Nikita Shulga	00f96dd84d	[CI] Run CUDA-13 binary builds on trunk (#163787 ) There are numerous other workflows that could be used to catch CUDA-12 build regression (our CI builds are almost identical to CD ones), but not many CUDA-13 builds around, so https://github.com/pytorch/pytorch/issues/163342 are really hard to detect in CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/163787 Approved by: https://github.com/atalman, https://github.com/huydhn	2025-09-25 00:58:17 +00:00
Aaron Pollack	77b9aac6c2	Add rule for typechecking maintainers (#161307 ) Allow the following people merge rights on type checking configs: - @lolpack - @maggiemoss - @ndmitchell - @kinto0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161307 Approved by: https://github.com/albanD, https://github.com/ezyang	2025-09-25 00:14:31 +00:00
Wei Wang	7163dce1e0	[CUDA] Compare major version of the runtime device arch against the built version of the pytorch binary (#161299 ) Fixes misleading warning messages when running on sm12x devices using binaries built with sm120. PyTorch binary built with sm120 is compatible with e.g. sm121, so no need for the warning of incompatibility. Also allow the 'matched_cuda_warn' message to show when e.g. the user is running a binary built with only sm90 on sm12x, so that the user would be prompted to get a build which supports e.g. sm120. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161299 Approved by: https://github.com/eqy, https://github.com/atalman	2025-09-24 23:59:19 +00:00
Sherlock Huang	4ac4a7351e	Shortcut redistribution when num_shards == 1 (#163742 ) Redistribution doesn't need collectives when num_shards == 1 on a mesh dimension. Only placement update is needed, local_tensor remains unchanged. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163742 Approved by: https://github.com/tianyu-l Co-authored-by: tianyu-l <150487191+tianyu-l@users.noreply.github.com>	2025-09-24 23:49:08 +00:00
Raman Kumar	65ddd91421	Fix redundant H2D/D2H memcpy in cpp_wrapper by creating scalar tensors on CPU (#160584 ) Fixes #160520 Summary: When running Inductor with cpp_wrapper under a DeviceContext, non-tensor arguments were being wrapped with torch.tensor(arg) without specifying the device, creating the tensor on the currently active device (like CUDA) and later fetching it back to CPU via .item(), causing unnecessary host-device-host memory transfers. This PR fixes the issue by explicitly creating scalar tensors on the CPU: ``` input_tensors = [ arg if isinstance(arg, torch.Tensor) else torch.tensor(arg, device='cpu') for arg in args ] ``` impact: inductor, codegen Pull Request resolved: https://github.com/pytorch/pytorch/pull/160584 Approved by: https://github.com/benjaminglass1, https://github.com/desertfire, https://github.com/mlazos, https://github.com/jeffdaily	2025-09-24 23:40:37 +00:00
karthickai	8c98aee436	[Inductor] Update DeviceAssert op to behave like store (#163696 ) Updated the DeviceAssert operation to match the behavior of Store, which fixes the issue mentioned in [this PR](https://github.com/pytorch/pytorch/pull/163023), and updated test cases as Elias [suggested](https://github.com/pytorch/pytorch/pull/160677#discussion_r2353834646). Pull Request resolved: https://github.com/pytorch/pytorch/pull/163696 Approved by: https://github.com/mlazos	2025-09-24 23:35:56 +00:00
bobrenjc93	d927e55498	[torchfuzz] refactor multi_process_fuzzer to be more readable (#163698 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163698 Approved by: https://github.com/pianpwk ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557, #163558, #163560	2025-09-24 23:32:34 +00:00
Maggie Moss	754c7e2e88	Update pyrefly configuration file (#163775 ) Related to: https://github.com/pytorch/pytorch/issues/163283 This simply updates the existing pyrefly configuration and opts out additional directories. Running `pyrefly check` with this setup will result in ~100 errors reported. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163775 Approved by: https://github.com/ezyang, https://github.com/Skylion007	2025-09-24 23:14:39 +00:00
Jithun Nair	0ec946a052	[ROCm] Increase binary build timeout to 5 hours (300 minutes) (#163776 ) Despite narrowing down the [FBGEMM_GENAI build to gfx942](https://github.com/pytorch/pytorch/pull/162648), the nightly builds still timed out because they [didn't get enough time to finish the post-PyTorch-build steps](https://github.com/pytorch/pytorch/actions/runs/17969771026/job/51109432897). This PR increases timeout for ROCm builds for both [libtorch ](https://github.com/pytorch/pytorch/actions/runs/17969771026)and [manywheel](https://github.com/pytorch/pytorch/actions/runs/17969771041), because both of those are close to the 4hr mark currently. This PR is a more ROCm-targeted version of https://github.com/pytorch/pytorch/pull/162880 (which is for release/2.9 branch). Pull Request resolved: https://github.com/pytorch/pytorch/pull/163776 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-24 23:02:08 +00:00
Rob Timpe	2b1236de61	[dynamo] Fix handling of kwargs in exception constructor (#163390 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163390 Approved by: https://github.com/guilhermeleobas	2025-09-24 22:44:14 +00:00
Haifeng Jin	bc8680c298	Avoid `at::alias` in the `repeat` op implementation (#163455 ) Avoid `at::alias` in the `repeat` op implementation ## Summary This PR removes the usage of `at::alias` in the implementation and instead uses `permute`+`reshape` on the tensor to fit the specs of the result. This is a less hacky and more readable way of implementing the op. All the new ops we are using are view-only ops, which do not introduce the overhead of changing the storage. ## Who wants this We are using `PrivateUse1` and an accelerator, but this request to avoid `at::alias` in any op should be general enough for any backend that is using XLA, or that does not have explicit control over the memory allocation on the devices. ## Why we/they need this As we support TPU, we are overriding some ATen ops by binding them to PrivateUse1. However, it is not recommended to override the `repeat` op directly, as we saw the following in `RegistrationDeclaration.h`. ``` at::Tensor repeat(const at::Tensor & self, c10::SymIntArrayRef repeats); // {"schema": "aten::repeat(Tensor self, SymInt[] repeats) -> Tensor", "dispatch": "True", "default": "True"} ``` We had to reuse the existing implementation of `repeat` to decompose it into other ops. However, we are unable to support the current implementation, which uses `at::alias`. It has two tensors share the same storage, modifies one of them, and returns the other assuming it has changed too. We cannot support this, as we do not have explicit control over the memory allocation of the tensors using XLA/PJRT. ## Alternatives We are open to alternative solutions that work for us if this PR is not favored by the PyTorch community. For example, we may just bind our version of the `repeat` op implementation to both `PrivateUse1` and `AutogradPrivateUse1`. However, to my understanding, this would not work well with torch dynamo and `torch.compile`. Would you mind guiding us on how to solve this? Thanks!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163455 Approved by: https://github.com/Skylion007	2025-09-24 22:28:24 +00:00
Andrey Talman	1495b35d29	Remove Python 3.9 for Triton builds (#163778 ) Related to https://github.com/pytorch/pytorch/issues/161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163778 Approved by: https://github.com/malfet	2025-09-24 20:19:43 +00:00
zeshengzong	90a282504e	Add `inference_mode` hint message to use `eval` with inference. (#163619 ) Fixes #162923 ## Test Result ### Before <img width="985" height="889" alt="image" src="https://github.com/user-attachments/assets/41de5cfa-7b25-4ba4-ade8-a6df745dcb30" /> ### After <img width="913" height="977" alt="image" src="https://github.com/user-attachments/assets/b6c06860-8db3-4b5d-9d46-31ece01fb04d" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163619 Approved by: https://github.com/jbschlosser	2025-09-24 20:07:14 +00:00
Jeff Daily	0dce2afd44	[ROCm][CI] adjust tf32 tolerance for test_compile_kernel_advanced (#163783 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163783 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-24 19:39:15 +00:00
Natalia Gimelshein	71eec6a0bf	[dist] handle discontiguous allgather/reducescatter inputs (#163712 ) Fixes #163483 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163712 Approved by: https://github.com/ezyang, https://github.com/kwen2501	2025-09-24 19:38:44 +00:00
Xu Han	0456b23b77	[AOTI] Add verbose error information for extract file (#163718 ) This PR optimizes the `extract_file` function: 1. `normalize_path_separator` the dest path for Windows. 2. Add verbose error message: a. On Linux, add mz_zip error string. b. On Windows, add mz_zip error string and Windows error code. For the UT `test_package_user_managed_weight`: <img width="1910" height="442" alt="image" src="https://github.com/user-attachments/assets/6a63eda1-70ce-40fb-9681-adc955463884" /> It still has an issue with error code `32`; I checked https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499- and found that the verbose name is `ERROR_SHARING_VIOLATION`. It is a little complex to debug, so I will continue working on it in a further PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163718 Approved by: https://github.com/desertfire	2025-09-24 19:27:30 +00:00
Benji Beck	c414f75c8b	[WOQ][Inductor] Enable CUDA coverage for _weight_int8pack_mm (#163461 ) Summary: What: Unskip the CUDA path for test_int8_weight_only_quant in test_torchinductor.py as the kernel was added by #159325. Why: Confirm CUDA backend for _weight_int8pack_mm is registered. Test Plan: ``` buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:test_inductor_cuda ``` https://www.internalfb.com/intern/testinfra/testrun/2533275104869494 Differential Revision: D82926440 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163461 Approved by: https://github.com/jerryzh168	2025-09-24 19:20:38 +00:00
PaulZhang12	768361e67f	Add less warps config to inner reductions (#162447 ) Add less warps to ensure proper vectorization + memory coalescing for inner reductions, prefer more work per thread <img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162447 Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314	2025-09-24 19:09:02 +00:00
Nan Zhang	9341ede617	Revert to old behaviour of not padding strides if shape or stride is dynamic (#163639 ) Differential Revision: D83053287 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163639 Approved by: https://github.com/blaine-rister	2025-09-24 18:31:01 +00:00
Sherlock Huang	4c2c401ccf	Record redistribute_local_tensor in DebugMode (#163704 ) An explicit redistribute_local_tensor API call could also result in communication, so record it! Pull Request resolved: https://github.com/pytorch/pytorch/pull/163704 Approved by: https://github.com/ezyang	2025-09-24 16:11:26 +00:00
Frank Lin	5d0f639234	Make `Tensor.__dlpack__(stream=None)` capture-safe during CUDA Graph capture (#163242 ) Many extensions (including pybind helpers) call `Tensor.__dlpack__()` without a stream argument. Before #150217, `stream=None` behaved like “no cross-stream sync” and was safe inside CUDA Graph capture. After #150217, `stream=None` maps to the legacy default stream, adding a cross-stream wait that invalidates capture when running on a non-default stream. See this example ``` import torch s = torch.cuda.Stream() x = torch.randn(8, device="cuda") g = torch.cuda.CUDAGraph() with torch.cuda.stream(s): with torch.cuda.graph(g): _ = x + 1 cap = x.__dlpack__() _ = torch.utils.dlpack.from_dlpack(cap) ``` This PR partially reverts #150217 that stream=None defaults to no sync. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163242 Approved by: https://github.com/ngimel	2025-09-24 16:04:19 +00:00
atalman	9d0d98acfe	Use cuda nvrtc so file based on cuda version used by torch (#163642 ) Fixes https://github.com/pytorch/pytorch/issues/162367 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163642 Approved by: https://github.com/msaroufim	2025-09-24 14:23:39 +00:00
Angel Li	3b73841f43	update test_quantization tests to run weekly (#163077 ) Fixes #162854 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163077 Approved by: https://github.com/huydhn	2025-09-24 11:31:11 +00:00
atalman	141fc7276e	[CD] CUDA 13.0 fix preload logic to include nvidia/cu13/lib/ (#163661 ) Preload logic no longer works with CUDA 13.0 See the installation path: ``` ls /home/ubuntu/.venv/lib/python3.10/site-packages/nvidia/cu13/lib/ libcheckpoint.so libcudadevrt.a libcufft.so.12 libcufile_rdma.so.1 libcusolver.so.12 libnvJitLink.so.13 libnvperf_target.so libnvrtc.alt.so.13 libpcsamplingutil.so libcublas.so.13 libcudart.so.13 libcufftw.so.12 libcupti.so.13 libcusolverMg.so.12 libnvblas.so.13 libnvrtc-builtins.alt.so.13.0 libnvrtc.so.13 libcublasLt.so.13 libcudart_static.a libcufile.so.0 libcurand.so.10 libcusparse.so.12 libnvperf_host.so libnvrtc-builtins.so.13.0 libnvtx3interop.so.1 ls /home/ubuntu/.venv/lib/python3.10/site-packages/nvidia/ cu13 cudnn cusparselt nccl nvshmem ``` Test using script from : https://github.com/pytorch/pytorch/issues/162367 ``` Kernel test passed! ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163661 Approved by: https://github.com/nWEIdia, https://github.com/tinglvv, https://github.com/Camyll	2025-09-24 11:27:05 +00:00
Robert Hardwick	b66aa1ade1	[ARM] Add test_memory_profiler to aarch64 tests (#145260 ) TestMemoryProfilerE2E.test_memory_timeline is failing on AArch64, this fixes it and enables it in the opt-in list of tests for AArch64. Fixes #142371 Pull Request resolved: https://github.com/pytorch/pytorch/pull/145260 Approved by: https://github.com/fadara01, https://github.com/sraikund16	2025-09-24 09:29:13 +00:00
Nick Riasanovsky	207f104594	[Triton] [Inductor] Set default configs for Blackwell Matmul Template (#163740 ) Summary: Sets the default configs for the Blackwell Matmul Templates. Test Plan: NFC Differential Revision: D83116342 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163740 Approved by: https://github.com/jananisriram	2025-09-24 08:17:35 +00:00
Jason Ansel	3e1b1a30f2	Revert "[inductor] Fix issue with scalar arg handling" (#163737 ) This reverts commit a8cd437183142e17ba6fc8d7b5e9dcee462d7904. See https://github.com/pytorch/pytorch/pull/163481#issuecomment-3326310774 This PR might also cause issues with cudagraphs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163737 Approved by: https://github.com/ezyang ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481, #163520, #163482	2025-09-24 07:33:12 +00:00
FFFrog	2390d34c9b	[Code Clean] Remove deadcodes about Python3.9 [7/N] (#163646 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163646 Approved by: https://github.com/jansel ghstack dependencies: #163626, #163627, #163629, #163643, #163644, #163645	2025-09-24 07:30:50 +00:00
FFFrog	a635505a99	[Code Clean] Remove deadcodes about Python3.9 [6/N] (#163645 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163645 Approved by: https://github.com/albanD ghstack dependencies: #163626, #163627, #163629, #163643, #163644	2025-09-24 07:30:50 +00:00
FFFrog	6f34cc040f	[Code Clean] Remove deadcodes about Python3.9 [5/N] (#163644 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163644 Approved by: https://github.com/jansel ghstack dependencies: #163626, #163627, #163629, #163643	2025-09-24 07:30:50 +00:00
FFFrog	ec0cd81c38	[Code Clean] Remove deadcodes about Python3.9 [4/N] (#163643 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163643 Approved by: https://github.com/albanD ghstack dependencies: #163626, #163627, #163629	2025-09-24 07:30:50 +00:00
FFFrog	33aabdd8ac	[Code Clean] Remove deadcodes about Python3.9 [3/N] (#163629 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163629 Approved by: https://github.com/albanD ghstack dependencies: #163626, #163627	2025-09-24 07:30:50 +00:00
FFFrog	0bca77951d	[Code Clean] Remove deadcodes about Python3.9 [2/N] (#163627 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163627 Approved by: https://github.com/jansel ghstack dependencies: #163626	2025-09-24 07:30:50 +00:00
FFFrog	bf0747c6c6	[Code Clean] Remove deadcodes about Python3.9 [1/N] (#163626 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163626 Approved by: https://github.com/Skylion007, https://github.com/albanD	2025-09-24 07:30:50 +00:00
Ke Wen	11a231ef52	[c10d] P2P tensors must be dense (#163719 ) Fixes #161324 by adding `is_non_overlapping_and_dense` check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163719 Approved by: https://github.com/ngimel	2025-09-24 06:58:03 +00:00
angelayi	dad54ca7c0	Add mistral/gpt-oss to benchmarks (#163565 ) Potential issues * gpt-oss-20b is probably too big (I can't run on my devserver) * Mistral requires HF authentication * Mistral also takes a while to run the performance checks (need to wait for CI) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163565 Approved by: https://github.com/huydhn	2025-09-24 06:12:36 +00:00
Edward Yang	2c5a3d7e60	Delete functorch C extension entirely. (#163340 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163340 Approved by: https://github.com/aorenste, https://github.com/wdvr, https://github.com/albanD, https://github.com/malfet	2025-09-24 06:08:58 +00:00
Blaine Burton Rister	f68de58c9d	[Inductor-FX] Support symbol and dynamic scalar graph inputs and outputs (#163596 ) # Problems This PR fixes a few edge cases that the FX converter missed related to dynamic shapes. 1. Inductor graphs can sometimes take `sympy.Symbol` inputs. We have logic to convert these to FX placeholder nodes. However, this logic did not update the `self.expr_to_proxy` table mapping symbols to proxy nodes. (There was existing logic to do this for `ir.TensorBox` inputs, but not `sympy.Symbol`.) This caused sympy tracing to fail when these symbol inputs were used in other expressions. 2. We lacked codegen for `ShapeAsConstantBuffer`. This IR node is seen when the graph input or output is a scalar computed from dynamic shapes. # Fixes a. Update `self.expr_to_proxy` when generating placeholders for `sympy.Symbol` inputs. Change `SymbolBuffer.get_example` to convert the symbol to a `torch.SymInt`, so we can populate `meta["val"]` correctly and use the value in other computations. b. Support `ShapeAsConstantBuffer` by tracing the sympy expression. c. Move output generation inside the metadata hook, allowing us to populate `meta["val"]` for the nodes computing `ShapeAsConstantBuffer`. # Test plan Added several new CI tests: 1. `torch.cond` with dynamic shapes. This exposes both issues, as the predicate is a `ShapeAsConstantBuffer` and one of the subgraphs uses a symbol input, due to the closure. Also tests when the parent and subgraphs have different input shapes. 2. Output dynamic shape scalar. This tests `ShapeAsConstantBuffer` as an output. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163596 Approved by: https://github.com/angelayi, https://github.com/jansel	2025-09-24 06:08:14 +00:00
Shunting Zhang	a8e9ed2407	[inductor] turn on loaf (for oss) by default (#162030 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162030 Approved by: https://github.com/eellison, https://github.com/jansel	2025-09-24 06:02:02 +00:00
Nick Riasanovsky	0390798dad	[Triton] [Inductor] Enable Epilogue Subtiling in the blackwell ws template (#163145 ) Summary: Enables support for epilogue subtiling in the blackwell ws template. This requires the ability to call `store_output` twice in the same kernel and reuse the same tensor descriptor across allocations. Test Plan: Tested with test_max_autotune.py on a Blackwell server. Rollback Plan: Differential Revision: D82610077 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163145 Approved by: https://github.com/eellison	2025-09-24 05:38:02 +00:00
Simon Fan	124dd364e9	[hop] support local_map + SAC (#163322 ) Some ops like local_map hop's deferred mode are not desugared by make_fx, this means that when we apply SAC tags, we will need to define dispatch rules for the SAC torch dispatch modes as pointed out here: https://github.com/pytorch/pytorch/issues/162246#issuecomment-3259176721. This PR adds those rules. Additionally it fixes a pre-existing issue where we weren't coercing tangent layout (that AOTAutograd typically does) when partitioning the HOP joint. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163322 Approved by: https://github.com/ezyang	2025-09-24 04:57:40 +00:00
orangeH25	20eeb54814	Add api info for torch._C._nn.pyi (#162936 ) Fix part of #148404 APIs involved are as follows: - silu - silu_ - smooth_l1_loss - soft_margin_loss Pull Request resolved: https://github.com/pytorch/pytorch/pull/162936 Approved by: https://github.com/FFFrog, https://github.com/ezyang	2025-09-24 04:55:57 +00:00
PyTorch UpdateBot	6f1d962d5b	[vllm hash update] update the pinned vllm hash (#163711 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163711 Approved by: https://github.com/pytorchbot	2025-09-24 04:31:37 +00:00
Eli Uriegas	42e9902a0f	cd: Move arm64 to linux.arm64.r7g.12xlarge.memory (#163681 ) This should reduce the amount of build time we have by a lot by just throwing more hardware at the problem. Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163681 Approved by: https://github.com/huydhn, https://github.com/atalman, https://github.com/malfet	2025-09-24 04:06:09 +00:00
Jason Ansel	d746b987d8	[inductor] Fix divmod error in decomp (#163482 ) Fixes #163457 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163482 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481, #163520	2025-09-24 02:52:36 +00:00
Jason Ansel	6fa972796e	[inductor] Fix bugs in emulate_precision_casts (#163520 ) Fixes #163449 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163520 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422, #163481	2025-09-24 02:52:36 +00:00
Jason Ansel	ca512af3e7	[inductor] Fix issue with scalar arg handling (#163481 ) Fixes #163420 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163481 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412, #163422	2025-09-24 02:52:36 +00:00
Edward Z. Yang	c261c71f3e	Simplify _compute_local_shape_and_global_offset and make it SPMD. (#163344 ) There is only one substantive change: the branch on `global_offset[shard_dim] <= local_offset[shard_dim]` is removed because it is unnecessary: you can always treat the first shard uniformly with the rest of the shards, because your global offset is guaranteed to be zero in this case anyway. I also switch the shard_size case to sym_ite, to make it possible for LocalTensor to deal with the MPMD-ness here, but it's equivalent to the old if-then-else. I tried to rewrite the comments to be more clear what is going on algorithmically here. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163344 Approved by: https://github.com/albanD, https://github.com/zpcore, https://github.com/tianyu-l	2025-09-24 02:24:09 +00:00
drisspg	e2ce79e4cc	[Flex] Fix silent correctness w/ backpropping grads (#163677 ) Fixes #https://github.com/pytorch/pytorch/issues/162228 # Summary Majority of our tests are only compiling flex-attention in isolation. This means that for fake tensor propagation the input primals and all captured buffers don't do any intermediate computation below autograd. As a result, they happen by chance to match the `require_grad`ness of the eager implementation and this check will pass. However, if score_mod is the result of some other intermediate fake tensor prop then it is not guaranteed to have accurate req_gradness, which was happening here. TLDR is that this was a belt-and-suspenders check that was actually harmful and we should just let the joint graph handle creating the correct joint graph Pull Request resolved: https://github.com/pytorch/pytorch/pull/163677 Approved by: https://github.com/ydwu4	2025-09-24 02:12:19 +00:00
Bin Bao	be6c127927	[AOTI] Pass comments from metadata to the autotune block (#163600 ) Summary: When generating Triton kernels in the compile-time autotune blocks, it will be useful to generate source information as code comments. Previously we ignore these comments for autotune code blocks because the generated main output code will contain the same information, but it won't work if the generated autotune code crashes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163600 Approved by: https://github.com/yushangdi	2025-09-24 02:01:59 +00:00
Avik Chaudhuri	1e754d5a80	docs and optional kwargs for full graph capture (#163550 ) Test Plan: existing tests Differential Revision: D82995546 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163550 Approved by: https://github.com/tugsbayasgalan	2025-09-24 01:20:27 +00:00
Nick Riasanovsky	dc9352938b	[Triton] [Inductor] Restrict subprocess autotuning to just Triton (#162688 ) Summary: Restricts subprocess benchmarking to only `TritonTemplateCaller`, which is expected by the underlying `target` method. This triggered a bug with large K shapes because the decompose k is `SubgraphChoiceCaller`. Test Plan: mm autotuning with a large k and `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1` Rollback Plan: Differential Revision: D82181924 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162688 Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/mlazos	2025-09-24 01:03:40 +00:00
Yuanyuan Chen	4535254c28	[3/N] Use std::filesystem in inductor (#163632 ) Continued work to use std::fs in inductor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163632 Approved by: https://github.com/Skylion007	2025-09-24 00:23:34 +00:00
Markus Hoehnerbach	eb3fbf5b08	[inductor] in emulate_precision_casts, disable fma fusion in triton (#163073 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163073 Approved by: https://github.com/eellison, https://github.com/jansel	2025-09-23 23:59:17 +00:00
Suryadev Sahadevan Rajesh	ee75c3d91f	Support for amin, amax, and aminmax (#163669 ) Support for amin, amax, and aminmax Test Plan: E2E tests in the stack with benchmark suite passes. Differential Revision: D83016894 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163669 Approved by: https://github.com/albanD, https://github.com/malfet	2025-09-23 23:45:43 +00:00
Nikita Shulga	f9fa138a39	[BE] Delete all pre py-3.10 checks (#163653 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163653 Approved by: https://github.com/jansel ghstack dependencies: #163648, #163649	2025-09-23 23:22:53 +00:00
drisspg	f3f67ff43a	Fix warn message (#163578 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163578 Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/atalman, https://github.com/v0i0	2025-09-23 22:46:51 +00:00
Mu-Chu Lee	6b5ad5f211	[Kineto] Add list of string parsing for profiler (#163593 ) Summary: We add the parsing for list of string. This is needed for AOTInductor profiling for input information of Triton kernels. Test Plan: Included in commit. test_profiler_op_event_kwargs_list_of_strings Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/163593 Approved by: https://github.com/sraikund16	2025-09-23 22:45:49 +00:00
Kurt Mohler	20149080f2	[MPS] Compute `offset2bag/bag_size/max_indices` in `_embedding_bag` (#163281 ) Part of #162270 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163281 Approved by: https://github.com/malfet	2025-09-23 22:30:48 +00:00
Jeff Daily	b879ef7c0d	[ROCm][CI] skip TestCudaPrimaryCtx.test_set_device_0 (#163693 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163693 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-23 22:15:10 +00:00
eellison	c63e417c79	use reduction hint for aggressive rblock (#163371 ) I had been using tiling scores to essentially check if this is an inner reduction. since that is not fully rolled out for dynamic shapes, use reduction hint when they are not available. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163371 Approved by: https://github.com/PaulZhang12	2025-09-23 22:04:22 +00:00
bobrenjc93	c3d9f089d9	[torchfuzz] introduce multi process fuzzer (#163560 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163560 Approved by: https://github.com/laithsakka ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557, #163558	2025-09-23 22:00:51 +00:00
eellison	29af25844b	Less aggressive persistent reduction when it could induce large masking with dynamic shapes (#163365 ) As per comment in source code: ``` # If we are are coalescing on xblock (not ReductionHint.INNER) and this is not a tiny kernel # (not ReductionHint.OUTER_TINY), do not use persistent reduction if it induces tile # quantization. Peristent reduction forces rblock == rnumel, if the bounds between lower # and upper are large, for the lower values we will be masking off large % of read/writes, # when we could expand the coalescing xblock instead. ``` For the test case in question, this pr improves perf from 0.8573521325143717 -> 0.043151492193814305 because we were egregiously masking out rblock values (58/64 values). Differential Revision: [D82853279](https://our.internmc.facebook.com/intern/diff/D82853279) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163365 Approved by: https://github.com/shunting314, https://github.com/PaulZhang12, https://github.com/jansel, https://github.com/v0i0	2025-09-23 21:58:57 +00:00
Svetlana Karslioglu	8c8416b021	Update pytorch.org links in docs/conf.py (#163682 ) Update links in conf.py to docs.pytorch.org Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163682 Approved by: https://github.com/sekyondaMeta, https://github.com/albanD	2025-09-23 21:40:11 +00:00
Bob Ren	b182365660	[ez] use list initializer syntax in fill_diagonal_ (#163607 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163607 Approved by: https://github.com/Skylion007 ghstack dependencies: #163485	2025-09-23 21:27:12 +00:00
bobrenjc93	5ca563ea09	symintify fill_diagonal_ (#163485 ) Fixes #162271 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163485 Approved by: https://github.com/Skylion007	2025-09-23 21:27:12 +00:00
Tugsbayasgalan Manlaibaatar	e671dcc969	Update tests to check for more robust pattern (#163107 ) Landing this instead of https://github.com/pytorch/pytorch/pull/162994. Here is how i think the whole dynamo + frame construction logic work: 1) There is no way to create a frame object in python land as this is created in runtime from cpython. So that's why aot_compile creates FrameInfo this way. (kind of like simulating the runtime) i guess you could write your own very simple eval_frame.c where you can interject the frame construction but we probably don't want that. 2) When there is no wrapper (the old export or aot_compile), we first assign sources by iterating over f_locals which contain both local args and closure variables (this is implementation details of cpython frame construction). So thats why closure variables end up getting LocalSource names as can be shown in this test case (`f6ea41ead2/test/export/test_export.py (L1369)`). Note that L["self"] here means we are referring to local object self. Important thing to keep in mind here is this self is not actually model self, but the outer self. 3) When we switch to wrapper case, we end up trying to inline the original inner module. When doing so, we need to track all local and closures for this inner module as can be seen here (`f6ea41ead2/torch/_dynamo/variables/functions.py (L463)`) Here we are not looking into inner frame's f_locals but just directly look at closures. I guess this is because we are one more frame up so there is no access to frame f_locals at this point. And it is probably not good idea to change dynamo's logic here. As a result, i get following error message that is different from old export: "While exporting, we found certain side effects happened in the model.forward. 
Here are the list of potential sources you can double check: ["L['self']._export_root.forward.__func__.__closure__[1].cell_contents.bank", "L['self']._export_root.forward.__func__.__closure__[1].cell_contents.bank_dict", "L['self']._export_root.forward.__func__.__closure__[0].cell_contents"]" My initial attempt of solving this was taking inner closures and put them to f_locals for the frame i am constructing which turned out too compilcated because we needed to muck around bytecode instructions as well. So i am thinking we should just update the test to reflect new names and follow up with better post-processing step to have better names. Differential Revision: [D82582029](https://our.internmc.facebook.com/intern/diff/D82582029) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163107 Approved by: https://github.com/avikchaudhuri	2025-09-23 21:11:48 +00:00
Mark Saroufim	fc84743707	Implement CUDA stream protocol (#163614 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163614 Approved by: https://github.com/eqy	2025-09-23 21:02:08 +00:00
Pian Pawakapan	2a9745de3c	[multi-kernel] shape-similarity kernel selection (#163090 ) Introduces a variant of size-hint multi-kernel, where for novel runtime shapes, instead of performing full benchmarking to determine the optimal kernel, selects one of many kernels pre-generated from multi-kernel hints, based off similarity b/w hint / runtime input & output shapes (L1 distance in log2 space). Some caveats/changes: - Size-hint multi-kernel now only kicks in if the kernel has dynamic shapes - Pre-generation still only does 1-d search over specified hints, e.g. `matmul([s0, s1], [s1, s2])` with size-hints `[64, 256]` only generates 2 kernels - based on tuning shapes ([64, 64], [64, 64]) and ([256, 256], [256, 256]). Extending this to reasonable n-d search (via user API?) is an extension Benchmarking results, compared to multi-kernel w/ full benchmarking (hints 64, 4096), and compiling with the ground truth hint: <img width="1902" height="1222" alt="550541081_1088709150049684_6528797079439730237_n" src="https://github.com/user-attachments/assets/056cca48-c16a-4451-9b4a-fa13a7a058a9" /> Full benchmarking doing worse is extremely weird, but we did see similar spikes in #156628 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163090 Approved by: https://github.com/bobrenjc93	2025-09-23 21:00:47 +00:00
PaulZhang12	22c5e8c17c	Add num_store to inductor_meta and use it to scale persistent reduction x block (#162446 ) Scale up XBLOCK for contiguous persistent reductions based on rnumel and number of loads + stores <img width="928" height="656" alt="Screenshot 2025-09-18 at 5 02 57 PM" src="https://github.com/user-attachments/assets/ec3c561f-2a3f-4459-9e14-653715898da3" /> Differential Revision: [](https://our.internmc.facebook.com/intern/diff/) Differential Revision: [](https://our.internmc.facebook.com/intern/diff/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162446 Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314 ghstack dependencies: #162296	2025-09-23 20:36:39 +00:00
Jithun Nair	bcb893acb0	[ROCm] Build FBGEMM_GENAI for gfx942 only (#162648 ) Fixes build timeouts >4h on libtorch build jobs: `75e7f49f9c/1` Brings back code to narrow down CK compilation targets from `69a25f6888 (diff-ce80f3115ab2f6be5142f0678a1fc92c6b2d7727766ce44f48726c99e720f777)` gfx942 supports fp8 Don't enable gfx950 for now, until more optimizations are in place as per https://github.com/pytorch/pytorch/pull/162648/files#r2369588738 Validation: [rocm6.4](https://github.com/pytorch/pytorch/actions/runs/17944766350/job/51028483128) and [rocm6.3](https://github.com/pytorch/pytorch/actions/runs/17944766350/job/51028483093) libtorch builds finished within 3.9h. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162648 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-23 18:55:35 +00:00
Blaine Burton Rister	8e6b0c71fb	[Inductor] Remove `no_type_check` annotation on properties (#163570 ) Some properties with `cache_on_self` were previously annotated with `no_type_check`, to get around mypy limitations. This PR replaces both annotations with `cache_property_on_self`, to enable type checking. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163570 Approved by: https://github.com/mlazos, https://github.com/PaulZhang12, https://github.com/Skylion007	2025-09-23 18:20:04 +00:00
Nikita Shulga	0696a4b0b8	[EZ] Perma-ignore UP038 (#163649 ) As it has been removed, see https://docs.astral.sh/ruff/rules/non-pep604-isinstance/ Pull Request resolved: https://github.com/pytorch/pytorch/pull/163649 Approved by: https://github.com/Skylion007 ghstack dependencies: #163648	2025-09-23 17:58:18 +00:00
Nikita Shulga	ca35dc2fdd	[EZ] Fix UP041 violations (#163648 ) I.e. use `TimeoutError` instead of `socket.timeout` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163648 Approved by: https://github.com/cyyever, https://github.com/Skylion007	2025-09-23 17:58:18 +00:00
Raman-RH	649ceda8a5	[export] handling NamedTuple inputs (#162959 ) Fixes #160547 ### Summary: bug ``` def test_namedtuple(self): from collections import namedtuple Point = namedtuple('Point', 'x y') class M(torch.nn.Module): def forward(self, x, y): return x + y inp = Point(torch.ones(3), torch.ones(3)) print(M()(*inp)) # errors ep = torch.export.export(M(), inp, strict=False) print(ep) # succeeds ep = torch.export.export(M(), inp, strict=True) print(ep) # workaround could be to convert namedtuple to a kwarg inp_kwargs = {field: getattr(inp, field) for field in inp._fields} ep = torch.export.export(M(), (), inp_kwargs) print(ep) ``` Fix: namedtuple is a subclass of tuple, but a namedtuple is not expected. So, this change handles the named tuple case. I have added a 🧪 test case for this as well Pull Request resolved: https://github.com/pytorch/pytorch/pull/162959 Approved by: https://github.com/angelayi Co-authored-by: Angela Yi <angelayi@meta.com>	2025-09-23 17:43:50 +00:00
Jerry Mannil	2aadcea05c	[ROCm] Improve perf for elementwise broadcast with mixed dtype (#163562 ) * Unroll loops manually to hide memory access latency Co-author: @amd-hhashemi Pull Request resolved: https://github.com/pytorch/pytorch/pull/163562 Approved by: https://github.com/jeffdaily	2025-09-23 17:42:48 +00:00
Xu Han	fde929c8a8	[AOTI] Fix model_package_loader get_cpp_compile_command (#163561 ) It should fix AOTI UTs of `test_aot_inductor_package.py`, these cases are failed at `compile_so`. reproducer: ```cmd pytest test\inductor\test_aot_inductor_package.py -v -k test_multiple_methods ``` <img width="1262" height="95" alt="image" src="https://github.com/user-attachments/assets/49458536-1cfe-498e-a12a-2bfd8da67a9e" /> Major fix at `get_cpp_compile_command`. The code is aligned to cpp_builder frontend code: `3ef1bef36c/torch/_inductor/cpp_builder.py (L1780-L1790)` `3ef1bef36c/torch/_inductor/cpp_builder.py (L1959-L1976)` Fixed on Windows: <img width="1261" height="89" alt="Image" src="https://github.com/user-attachments/assets/9bf43b11-aac1-4161-a625-e602e313a299" /> Also validated on Linux: <img width="1039" height="81" alt="Image" src="https://github.com/user-attachments/assets/46063e16-6cf1-4a28-8466-0496871b8619" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163561 Approved by: https://github.com/jansel	2025-09-23 17:38:18 +00:00
Saurabh Mishra	134dfbeaef	[DCP] DTensor slice dequantization with proper block alignment (#163532 ) Summary: When loading quantized tensors with DTensor slicing, the dequantization process was producing numerically incorrect results due to improper block-to-slice coordinate mapping. The previous implementation calculated block boundaries relative to the sliced tensor dimensions instead of the original full tensor dimensions, causing scale factors to be applied to wrong tensor regions. This fix addresses the issue by: 1. Proper coordinate mapping: Added `_get_slice_to_block_mapping()` to correctly map tensor slices to quantization blocks using global coordinates from the full tensor shape. 3. Block-aligned dequantization: Updated `_dequantize_tensor()` to use proper block intersection logic, ensuring scale factors are applied to the correct portions of sliced tensors. The fix ensures that when DTensor requests a slice of a quantized tensor, the dequantization correctly identifies which quantization blocks intersect with the requested slice and applies the appropriate scale factors to the right tensor regions. Test Plan: Tested with DTensor configurations where quantized tensors are sliced across different dimensions. Verified that: 1. Dequantized tensor values are numerically correct 2. Block boundaries are properly calculated relative to full tensor shape 3. Scale factors are applied to correct tensor regions 4. 
Tensor shapes map is built efficiently using only metadata Correctness validation using https://github.com/wwwjn/torchtitan/blob/dsv3-sd-test/tests/fsdp_dequantized_load.py ``` { "model.layers.0.mlp.gate_proj.weight": { "mse": 4.30626645453458e-11, "mae": 9.98388827611052e-07, "max_abs_diff": 0.0009703934192657471, "cosine_similarity": 1.010810375213623, "relative_error": 0.001330620958469808, "kl_divergence_1_to_2": "6.563401e-08", "kl_divergence_2_to_1": "-6.522914e-08", "js_divergence": 1.3711876079014476e-10, "shape": [ 18432, 7168 ], "t1_stats": { "min": -0.4453125, "max": 0.30859375, "mean": -1.2592146958922967e-05 }, "t2_stats": { "min": -0.44529813528060913, "max": 0.3085886240005493, "mean": -1.2624391274584923e-05 } }, "model.layers.0.mlp.up_proj.weight": { "mse": 2.5534721906361746e-11, "mae": 3.118609583907528e-06, "max_abs_diff": 0.00047551095485687256, "cosine_similarity": 1.038962483406067, "relative_error": 0.0013681650161743164, "kl_divergence_1_to_2": "-5.8253768e-08", "kl_divergence_2_to_1": "5.8747577e-08", "js_divergence": NaN, "shape": [ 18432, 7168 ], "t1_stats": { "min": -0.228515625, "max": 0.2333984375, "mean": 8.862222955485777e-08 }, "t2_stats": { "min": -0.2285017967224121, "max": 0.23338991403579712, "mean": 8.824501662729745e-08 } }, "model.layers.0.mlp.down_proj.weight": { "mse": 2.2803769289536646e-11, "mae": 2.8916260816913564e-06, "max_abs_diff": 0.0008973777294158936, "cosine_similarity": 1.0376262664794922, "relative_error": 0.001346255769021809, "kl_divergence_1_to_2": "1.2744896e-07", "kl_divergence_2_to_1": "-1.2736885e-07", "js_divergence": 5.992362162032805e-11, "shape": [ 7168, 18432 ], "t1_stats": { "min": -0.54296875, "max": 0.546875, "mean": -2.9487239316949854e-07 }, "t2_stats": { "min": -0.5429964661598206, "max": 0.5469087362289429, "mean": -2.9507478416235244e-07 } } } ``` https://www.internalfb.com/intern/testinfra/testrun/3940649985202645 Differential Revision: D82975005 Pull Request resolved: 
https://github.com/pytorch/pytorch/pull/163532 Approved by: https://github.com/wwwjn	2025-09-23 16:48:16 +00:00
PyTorch MergeBot	221ac81043	Revert "[precompile] Add option to disable guard check on aot-compiled function. (#163432 )" This reverts commit 539e84e289fa7563032410706ede50a4eaa7a15d. Reverted https://github.com/pytorch/pytorch/pull/163432 on behalf of https://github.com/Camyll due to breaking internal tests ([comment](https://github.com/pytorch/pytorch/pull/163432#issuecomment-3324757069))	2025-09-23 16:31:30 +00:00
dilililiwhy	6e5dddba64	Use accelerator API in common_dtensor (#163498 ) Fixes #ISSUE_NUMBER Try to unify the device checking in common_dtensor (testing module) by accelerator API Pull Request resolved: https://github.com/pytorch/pytorch/pull/163498 Approved by: https://github.com/albanD, https://github.com/H-Huang	2025-09-23 16:30:20 +00:00
Jeff Daily	ebddbe787a	[ROCm][CI] skip test_sparse_triangular_solve (#163651 ) Need more time to debug, but also need clean CI signal. Test was unskipped by #163495, but had been skipped on rocm prior. Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163651 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-23 15:55:51 +00:00
drisspg	5f0c7cb4aa	Add B200 smoke test (#159494 ) Okay, running test_max_autotune locally on B200 is a horrible read; for now, to get something landed, I am focusing on test_matmul_cuda.py and test_fp8 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159494 Approved by: https://github.com/nWEIdia, https://github.com/huydhn ghstack dependencies: #163460, #163537, #163552	2025-09-23 15:45:05 +00:00
drisspg	b3cf5c79dd	Skip on sm100 later since Tests are non-deterministic (#163552 ) This is tracked in https://github.com/pytorch/pytorch/issues/163462 ; skipping since we are seeing sporadic errors locally and on CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163552 Approved by: https://github.com/eqy, https://github.com/Skylion007 ghstack dependencies: #163460, #163537	2025-09-23 15:45:05 +00:00
drisspg	0f674077f4	Large tests failing on bfloat16 (#163537 ) # Summary I ran these tests locally, each 10k Tests takes over 5 mins for an extremely beefy cpu to run. I think that this is overkill feel free to disagree. Also the 1 test I ran that failed earlier up in the stack failed with 1 ulp difference so I think that this is kind of an edgecase on how we do testing (will right up issue for my thoughts later) ``` Shell ==================================================================================================== FAILURES ===================================================================================================== _________________________________________________________ TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublas_cuda_bfloat16 __________________________________________________________ Traceback (most recent call last): File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 58, in testPartExecutor yield File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 634, in run self._callTestMethod(testMethod) File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 589, in _callTestMethod if method() is not None: ^^^^^^^^ File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper method(args, kwargs) File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper method(args, kwargs) File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test result = test(self, param_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1408, in only_fn return fn(slf, args, kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 2024, in wrap_fn return fn(self, args, *kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 190, in test_cublas_addmm_reduced_precision self.cublas_addmm(size, dtype, True) File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 162, in cublas_addmm assert_close_with_ulp(res_cpu, res_cuda, atol=tolerance.atol, rtol=tolerance.rtol) File "/home/dev/meta/transformer_nuggets/transformer_nuggets/numerics/__init__.py", line 222, in assert_close_with_ulp raise AssertionError("\n".join(error_parts)) AssertionError: Tensor-likes are not close! Mismatched elements: 425 / 100030002 (0.0%) Greatest absolute difference: 16 at index (2176, 9325) (up to 10 allowed) Greatest relative difference: 3984 at index (376, 3754) (up to 0.2 allowed) ============================================================ ULP Analysis of Failures: ============================================================ Total failures: 425 ULP distances: min=-32761, max=32763, mean=-11513.7 Top 10 failures by absolute difference: # \| Index \| Abs Diff \| Rel Diff \| ULP \| Expected \| Actual ---------------------------------------------------------------------------------------------------- 1 \| (6923, 1580) \| 1.600000e+01 \| 5.390625e-01 \| 146 \| 29.750000 \| 13.750000 2 \| (4677, 420) \| 1.600000e+01 \| 6.601562e-01 \| 95 \| 24.250000 \| 40.250000 3 \| (2176, 9325) \| 1.600000e+01 \| 6.875000e-01 \| 210 \| 23.250000 \| 7.250000 4 \| (5119, 7865) \| 1.600000e+01 \| 1.164062e+00 \| 146 \| -13.750000 \| -29.750000 5 \| (3218, 8334) \| 1.600000e+01 \| 2.593750e+00 \| 236 \| 6.156250 \| 22.125000 6 \| (5245, 241) \| 1.600000e+01 \| 5.468750e-01 \| 75 \| 29.250000 \| 45.250000 7 \| (7666, 6549) \| 1.600000e+01 \| 1.640000e+03 \| 1376 \| -0.009766 \| -16.000000 8 \| (1663, 1115) \| 1.593750e+01 \| 8.375000e+00 \| -32427 \| 1.898438 \| -14.062500 9 \| (3967, 7708) 
\| 1.593750e+01 \| 1.368750e+01 \| -32510 \| 1.164062 \| -14.750000 10 \| (2874, 2038) \| 1.593750e+01 \| 1.710938e+00 \| 181 \| 9.312500 \| 25.250000 Note: Maximum absolute and relative errors occur at different locations Max abs diff location (2176, 9325): 210 ULP Max rel diff location (376, 3754): 31868 ULP To execute this test, run the following from the base repo dir: python test/test_matmul_cuda.py TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublas_cuda_bfloat16 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 ________________________________________________________ TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublaslt_cuda_bfloat16 _________________________________________________________ Traceback (most recent call last): File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 58, in testPartExecutor yield File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 634, in run self._callTestMethod(testMethod) File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 589, in _callTestMethod if method() is not None: ^^^^^^^^ File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper method(args, *kwargs) File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper method(args, kwargs) File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test result = test(self, param_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1408, in only_fn return fn(slf, args, kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 2024, in 
wrap_fn return fn(self, args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 190, in test_cublas_addmm_reduced_precision self.cublas_addmm(size, dtype, True) File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 162, in cublas_addmm assert_close_with_ulp(res_cpu, res_cuda, atol=tolerance.atol, rtol=tolerance.rtol) File "/home/dev/meta/transformer_nuggets/transformer_nuggets/numerics/__init__.py", line 222, in assert_close_with_ulp raise AssertionError("\n".join(error_parts)) AssertionError: Tensor-likes are not close! Mismatched elements: 425 / 100030002 (0.0%) Greatest absolute difference: 16 at index (2176, 9325) (up to 10 allowed) Greatest relative difference: 3984 at index (376, 3754) (up to 0.2 allowed) ============================================================ ULP Analysis of Failures: ============================================================ Total failures: 425 ULP distances: min=-32761, max=32763, mean=-11513.7 Top 10 failures by absolute difference: # \| Index \| Abs Diff \| Rel Diff \| ULP \| Expected \| Actual ---------------------------------------------------------------------------------------------------- 1 \| (6923, 1580) \| 1.600000e+01 \| 5.390625e-01 \| 146 \| 29.750000 \| 13.750000 2 \| (4677, 420) \| 1.600000e+01 \| 6.601562e-01 \| 95 \| 24.250000 \| 40.250000 3 \| (2176, 9325) \| 1.600000e+01 \| 6.875000e-01 \| 210 \| 23.250000 \| 7.250000 4 \| (5119, 7865) \| 1.600000e+01 \| 1.164062e+00 \| 146 \| -13.750000 \| -29.750000 5 \| (3218, 8334) \| 1.600000e+01 \| 2.593750e+00 \| 236 \| 6.156250 \| 22.125000 6 \| (5245, 241) \| 1.600000e+01 \| 5.468750e-01 \| 75 \| 29.250000 \| 45.250000 7 \| (7666, 6549) \| 1.600000e+01 \| 1.640000e+03 \| 1376 \| -0.009766 \| -16.000000 8 \| (1663, 1115) \| 1.593750e+01 \| 8.375000e+00 \| -32427 \| 1.898438 \| -14.062500 9 \| (3967, 7708) \| 1.593750e+01 \| 1.368750e+01 \| -32510 \| 1.164062 \| -14.750000 10 \| (2874, 2038) \| 1.593750e+01 \| 
1.710938e+00 \| 181 \| 9.312500 \| 25.250000 Note: Maximum absolute and relative errors occur at different locations Max abs diff location (2176, 9325): 210 ULP Max rel diff location (376, 3754): 31868 ULP To execute this test, run the following from the base repo dir: python test/test_matmul_cuda.py TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublaslt_cuda_bfloat16 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 ``` Okay the bfloat16 are for sure real cc @eqy Pull Request resolved: https://github.com/pytorch/pytorch/pull/163537 Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/eqy ghstack dependencies: #163460	2025-09-23 15:45:05 +00:00
Yiming Zhou	720a7b2887	[export] Remove .contiguous() when saving weights to raw bytes (#163587 ) Summary: `.contiguous()` will discard the original storage size of the tensor, and could lead to issues during loading. Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_1D_tensor_slicing buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_2D_tensor_slicing Differential Revision: D83016250 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163587 Approved by: https://github.com/angelayi	2025-09-23 15:44:56 +00:00
Jason Ansel	49e7b2f69d	[inductor] Fix error from custom CUDA allocators (#163422 ) Fixes #163257 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163422 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412	2025-09-23 15:37:45 +00:00
Jason Ansel	6ef74879f6	[dynamo] Fix TorchFunctionMode handling with get_rng_state (#163412 ) Fixes #162624 Fixes #162586 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163412 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393	2025-09-23 15:37:45 +00:00
Jason Ansel	9c4d9f940b	[inductor] Support out_dtype arg to matmul (#163393 ) Fixes #163275 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163393 Approved by: https://github.com/eellison, https://github.com/coconutruben ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434	2025-09-23 15:37:38 +00:00
Jason Ansel	ed84e808f0	[inductor] Freeze layouts in FlexAttention (#163434 ) Fixes #163300 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163434 Approved by: https://github.com/drisspg ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419	2025-09-23 15:37:29 +00:00
Jason Ansel	518c320676	[inductor] libdevice.sqrt => tl.sqrt_rn (#163419 ) Fixes #163082 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163419 Approved by: https://github.com/Skylion007, https://github.com/mlazos ghstack dependencies: #163386, #163398, #163387, #163414, #163415	2025-09-23 15:37:21 +00:00
Scott Wolchok	4264fd34ec	Add basic tests for torch.distributed.tensor._utils.compute_global_tensor_info (#162968 ) Next PR writes a C++ implementation. Seems good to have tests first. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162968 Approved by: https://github.com/ezyang ghstack dependencies: #161695, #162508	2025-09-23 14:56:32 +00:00
Jeff Daily	e05c9c0c84	[ROCm][CI] cudagraph trees ut fixes (#163592 ) Fixes #162125. Fixes #160719. Fixes #157901. Fixes #157871. Fixes #157761. Fixes #157723. Fixes #157643. Fixes #157616. Fixes #157556. Fixes #157533. Fixes #157449. Fixes #157428. Fixes #157413. Fixes #157367. Fixes #157350. Fixes #157339. Fixes #157312. Fixes #157280. Fixes #157258. Fixes #157173. Fixes #157143. Fixes #157112. Fixes #157086. Fixes #157058. Fixes #157035. Fixes #156984. Fixes #156957. Fixes #156954. Fixes #156922. Fixes #156886. Fixes #156838. Fixes #156808. Fixes #156801. Fixes #156778. Fixes #156755. Fixes #156735. Fixes #156693. Fixes #152561. Fixes #130749. Fixes #100074. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163592 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-23 14:45:00 +00:00
PyTorch MergeBot	aff76c046d	Revert "Add fake_impl for _native_multi_head_attention (#163167 )" This reverts commit 27164b6788cab6e6d8095012839e51c958a819d6. Reverted https://github.com/pytorch/pytorch/pull/163167 on behalf of https://github.com/malfet due to This broke in inductor-cpu-test, see `1a42656d6c/1` ([comment](https://github.com/pytorch/pytorch/pull/163167#issuecomment-3324302026))	2025-09-23 14:36:45 +00:00
Isalia20	1a42656d6c	[Flex attention] Fix flex attention head broadcast (#163426 ) Fixes part of #163314 In particular bug: Bug 1: H=None Broadcasting Produces Incorrect Results This fixes a shape bug when slicing BlockMask on the Q-tile axis with an int (mask[:, :, i]). That form of indexing collapses the Q dimension, so kv_num_blocks/kv_indices lose their expected [B, H, Q_tiles, …] shape. Due to them losing shape, even though the mask_mod remains "interpretable", the kernel’s stride math then reads wrong offsets. Due to this we get silent numerical mismatches compared to regular SDPA, especially when single position decoding/H broadcasting. The B=None, H=None works case is accidental: with singleton batch/head the kernel maps to index 0 via `sparse_idx_z = off_zq % 1` and `sparse_idx_hq = off_hq % 1` and with a single Q tile `q_start // SPARSE_Q_MULTIPLE = 0`. The missing Q-tiles stride is multiplied by 0, so the bad offset from the collapsed Q axis doesn’t move the pointer and it happens to read the first tile correctly. Once H > 1 or there are multiple Q tiles, those terms become nonzero and the kernel indexes with wrong strides which causes silent error Pull Request resolved: https://github.com/pytorch/pytorch/pull/163426 Approved by: https://github.com/drisspg	2025-09-23 13:01:51 +00:00
Simon Fan	bda9ab291d	[inductor] fix as_strided lowering with .view(dtype) inputs (#163319 ) FIXES https://github.com/pytorch/pytorch/issues/163286 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163319 Approved by: https://github.com/eellison	2025-09-23 12:50:57 +00:00
atalman	3c64b2abab	CUDA 13.0 Warning update for supported architectures (#163585 ) Please see build script: `8da008678f/.ci/manywheel/build_cuda.sh (L69-L71)` This should display correct warning: `` Please install PyTorch with a following CUDA configurations: 12.6 12.8 13.0 following instructions at https://pytorch.org/get-started/locally/ `` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163585 Approved by: https://github.com/malfet	2025-09-23 11:27:11 +00:00
Yuanyuan Chen	5d749ceb92	Remove test conditions for CUDA<12 (#163495 ) Because CUDA >=12 is required. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163495 Approved by: https://github.com/janeyx99	2025-09-23 07:52:00 +00:00
Nicolas Macchioni	8d81564df5	[pt2][cache] rework cache for true generic usage + better tests (#163488 ) Differential Revision: D82933509 over the weekend I realized that some of the cache implementation was a bit silly, and too constrained to be actually generic. for example, InMemoryCache[str, bytes] was odd since we'd probably want to be able to store more than just str keys with bytes values. so tldr; everything is now generic, with the one constraint being that Key and Value must both be pickle-able types. this makes things a lot simpler for us, since all caches can now be str -> bytes caches under the hood if we'd like, and Key/Value just get pickled on the way in and out. with this change, there were also some improvements made to the testing; mainly better coverage, but now we also test each cache across every combination of Key/Value types to ensure that they will work with the types we might specify later I also hardened some things here and there, for example we now use literal_eval (forgot who mentioned this on the first PR, but thank you for the suggestion!), and all errors coming from the caching will be wrapped in CacheError from now on (although we still raise from the original error context where possible) putting this PR up now for feedback, in the process of generalizing the code I did remove the documentation since it was becoming outdated but I will add that back in after the PR is green I have the next PR ready as well (implements a fresh cache context manager), will export once this lands Pull Request resolved: https://github.com/pytorch/pytorch/pull/163488 Approved by: https://github.com/aorenste, https://github.com/masnesral	2025-09-23 07:31:48 +00:00
bobrenjc93	b426ba1d5e	[torchfuzz] introduce tensor and scalar pointwise ops (#163558 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163558 Approved by: https://github.com/laithsakka ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557	2025-09-23 06:20:13 +00:00
KarhouTam	375f3e3a61	[OpenReg][Docs] Correct docs about `openreg` usage example. (#163235 ) ## Why this PR? I've tried to follow the guidance of the `OpenReg` [usage example](https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg/third_party/openreg) and found that the command for compiling `example.cpp` (`g++ -o out example/example.cpp -L ./build -lopenreg`) is not compatible with my `gcc` (v11.4). Since I installed my `gcc` through `apt install build-essential`, and I think that's a common way to install `gcc` for a few developers? I believe it's necessary to slightly modify the command to add `-I ./` to explicitly indicate the header file search path. ## What I've changed? - I added `-I ./` to correctly search for `./include/openreg.h`. - I also added a `pwd` comment for better readability and removed unused imports in `example/example.cpp`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163235 Approved by: https://github.com/FFFrog, https://github.com/albanD Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>	2025-09-23 06:16:45 +00:00
Shivam Raikundalia	45d9dcccc5	Update Kineto Submodule (#162222 ) Summary: Update Test Plan: CI Rollback Plan: Differential Revision: D81727392 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162222 Approved by: https://github.com/sanrise	2025-09-23 06:08:55 +00:00
bobrenjc93	309fe03f4b	[torchfuzz] remove unneeded try catch (#163557 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163557 Approved by: https://github.com/laithsakka ghstack dependencies: #163547, #163553, #163554, #163555, #163556	2025-09-23 06:05:08 +00:00
bobrenjc93	1545bb1c00	[torchfuzz] shuffle compatible ops (#163556 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163556 Approved by: https://github.com/laithsakka ghstack dependencies: #163547, #163553, #163554, #163555	2025-09-23 05:53:44 +00:00
bobrenjc93	d5e51d34f7	[torchfuzz] decompose -> fuzz_inputs_specs (#163555 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163555 Approved by: https://github.com/laithsakka ghstack dependencies: #163547, #163553, #163554	2025-09-23 05:44:59 +00:00
bobrenjc93	08c5efde5f	[torchfuzz] cache operators (#163554 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163554 Approved by: https://github.com/laithsakka ghstack dependencies: #163547, #163553	2025-09-23 05:28:07 +00:00
PyTorch MergeBot	19b754dff8	Revert "Update cutlass version for fbcode (#163091 )" This reverts commit 509c4e86270cc4decca58905d0f446e1fc0cf618. Reverted https://github.com/pytorch/pytorch/pull/163091 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/163091#issuecomment-3322428791))	2025-09-23 05:08:42 +00:00
Yuanyuan Chen	d3a1345ed8	Use functools.cache on has_efa (#163439 ) Cache the result of `has_efa` by `functools.cache`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163439 Approved by: https://github.com/janeyx99	2025-09-23 05:03:03 +00:00
Yuanyuan Chen	e3b392bdfd	[BC breaking] Remove deprecated imports for torch.utils.data.datapipes.iter.grouping (#163438 ) This PR removes import tricks of `SHARDING_PRIORITIES` and `ShardingFilterIterDataPipe` from `torch.utils.data.datapipes.iter.grouping`. They were declared to be removed in PyTorch 2.1 but were not. Before change: ``` import torch.utils.data.datapipes.iter.grouping.SHARDING_PRIORITIES import torch.utils.data.datapipes.iter.grouping.ShardingFilterIterDataPipe ``` works After change: there is an import error exception. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163438 Approved by: https://github.com/janeyx99	2025-09-23 05:02:06 +00:00
Valentin Andrei	bb5be56619	[torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942 ) In various benchmarks scattered across the repo, the limits for flops/second and memory bandwidth are usually hardcoded for a single device. This utility could help in providing a more structured way to query the device capabilities. If this is approved, we can use it when reporting flops efficiency and bandwidth relative to peak in the benchmarks and tests. The intent is to add more devices, more parameters (e.g. L2 cache bandwidth, NVLink, etc.) for both CPUs and accelerators. Testing: ``` import torch if torch.cuda.is_available(): device = torch.cuda.current_device() mod = torch.get_device_module('cuda') hw = mod._device_limits.GPULimits(device) print(hw.get_tflops_per_second(torch.float16)) print(hw.get_tflops_per_second(torch.float32)) print(hw.get_tflops_per_second(torch.float64)) print(hw.get_tflops_per_second(torch.bfloat16)) print(hw.get_tflops_per_second(torch.int8)) print(hw.get_memory_bandwidth_Bps() / 1e9) print(hw.get_shared_memory_bandwidth_Bps() / 1e9) # Output on an H100 GPU 1070.53056 535.26528 66.90816 1070.53056 2141.06112 4893.696 33454.08 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162942 Approved by: https://github.com/ngimel, https://github.com/albanD	2025-09-23 04:48:19 +00:00
bobrenjc93	0e122380c2	[torchfuzz] remove supports_variable_inputs for now (#163553 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163553 Approved by: https://github.com/laithsakka ghstack dependencies: #163547	2025-09-23 04:44:54 +00:00
PyTorch UpdateBot	fcd79d5228	[vllm hash update] update the pinned vllm hash (#163590 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163590 Approved by: https://github.com/pytorchbot	2025-09-23 04:44:15 +00:00
Sherlock Huang	95ac7d724e	Rename to _debug_mode.py to make it private (#163534 ) rename debug_mode.py to _debug_mode.py to make it private, per @alban's request. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163534 Approved by: https://github.com/albanD	2025-09-23 04:27:10 +00:00
bobrenjc93	0b75a16200	[torchfuzz] Encapsulate fuzzing and codegen logic into ops (#163547 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163547 Approved by: https://github.com/laithsakka	2025-09-23 04:26:00 +00:00
Yidi Wu	27164b6788	Add fake_impl for _native_multi_head_attention (#163167 ) Test Plan: See added test in test_export.py Rollback Plan: Reviewed By: henryoier Differential Revision: D77747446 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163167 Approved by: https://github.com/angelayi	2025-09-23 04:02:20 +00:00
cyy	447b8fc56d	[2/N] Use filesystem in inductor (#163465 ) Use std::filesystem in most inductor code. This is follow-up of https://github.com/pytorch/pytorch/pull/152288 . Pull Request resolved: https://github.com/pytorch/pytorch/pull/163465 Approved by: https://github.com/Skylion007	2025-09-23 03:56:16 +00:00
Yuanyuan Chen	6a48f57d2f	[1/N] Remove 'type: ignore' suppressions (#163468 ) Remove some unnecessary 'type: ignore' suppressions from python code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163468 Approved by: https://github.com/Skylion007, https://github.com/janeyx99	2025-09-23 03:53:11 +00:00
Bob Ren	e9300b2b7c	remove allow-untyped-defs from ./torch/onnx/_internal/torchscript_exporter/_globals.py (#163472 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163472 Approved by: https://github.com/Skylion007 ghstack dependencies: #163246, #163469, #163470	2025-09-23 03:50:29 +00:00
Mu-Chu Lee	8f30a8dc47	[AOTInductor] Add grid information for Triton Kernels (#160131 ) Summary: Add grid information for Triton Kernels for profiling in Kineto. Test Plan: Before change: <img width="539" height="625" alt="Screenshot 2025-08-07 at 1 09 07 PM" src="https://github.com/user-attachments/assets/dd0778a9-2ff3-4819-acd3-de585cf7f9d1" /> After change: <img width="550" height="898" alt="Screenshot 2025-08-07 at 1 05 49 PM" src="https://github.com/user-attachments/assets/d84988df-bb83-41ed-80ac-8a6d843a1a9d" /> *Note we can extract grid size etc. from device side trace, but we're focusing host side specifically for this PR, mainly to add more host side information in the future needed for performance profiling. Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/160131 Approved by: https://github.com/desertfire	2025-09-23 02:15:24 +00:00
Bob Ren	2c7959eee9	[ignore][codex-test] Add typing to simple library registry (#161367 ) ## Summary - add type annotations for simple library registry and dispatch rule holder - remove allow-untyped-defs directive ## Testing - `python -m mypy torch/_library/simple_registry.py` (fails: repo expects mypy==1.16.0) - `lintrunner -a torch/_library/simple_registry.py` (fails: attr-defined error in torchgen/gen_schema_utils.py) - `python test/test_torch.py TestTorch.test_dir` (fails: ModuleNotFoundError: No module named 'torch') ------ https://chatgpt.com/codex/tasks/task_e_68aa3cc210488326befdd992c79115a0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161367 Approved by: https://github.com/Skylion007	2025-09-23 02:08:55 +00:00
Colin Peppler	3ef1bef36c	[sdpa] make sure to recompile if alignment is different than before (#163083 ) ## Context An example from Qwen2-7B - This comes from running torch.compile with a sequence length that is divisible by 8 (no padding needed). Call this `Run1`. - If we then run the compiled model with a different length that isn't divisible by 8 (requires padding). Call this `Run2`. - Then we'll see this error. ``` File "/var/tmp/torchinductor_nobody/2w/c2wby7ilxbna45xrtrrfjqpeutwouruviu2742ockunnd2bleeiz.py", line 1963, in call buf24 = torch.ops.aten._scaled_dot_product_efficient_attention_backward.default(reinterpret_tensor(buf18, (s85, 3584 // s19, s48, 512 // (512 // s19)), (s48(512 // (512 // s19))(3584 // s19), 512 // (512 // s19), (512 // (512 // s19))(3584 // s19), 1), 0), buf20, buf21, buf22, buf23, getitem, getitem_1, getitem_2, getitem_3, 0.0, [True, True, True, False], scale=0.08838834764831845) File "torch/_ops.py", line 841, in __call__ return self._op(args, *kwargs) RuntimeError: attn_bias is not correctly aligned (strideM). attn_bias.stride(2) = 6102, and should be a multiple of 4. ``` - We only see the error because we did not recompile on `Run2`. Instead we ran the inputs on the same graph as `Run1`. ### A bit more on why. Here we check whether to realize the unpadded buffer (unwrapped slice) which we want for `Run1` but not for `Run2`. `0897affcd5/torch/_inductor/lowering.py (L2687-L2694)` ## Fix Size hint doesn't guard, so the fix is to use `guard_or` to guard. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163083 Approved by: https://github.com/eellison	2025-09-23 01:33:33 +00:00
Zhengxu Chen	539e84e289	[precompile] Add option to disable guard check on aot-compiled function. (#163432 ) Summary: In some circumstances it seems reasonable to return a callable directly without a guard check when the user uses aot_compile on a function with a single compilation result. When there are multiple entries (aot_compile_module), we should start enabling the guard check to differentiate the compiled functions. Test Plan: CI Differential Revision: D82904540 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163432 Approved by: https://github.com/dolpm	2025-09-23 01:00:05 +00:00
Svetlana Karslioglu	68e75be86a	Update pytorch_sphinx_theme2 to latest hash (#163269 ) The updated theme: - Fixes articleBody in the json+ld that caused previous Google Search issues - Other minor fixes - 404.html fixes Pull Request resolved: https://github.com/pytorch/pytorch/pull/163269 Approved by: https://github.com/albanD	2025-09-22 23:20:23 +00:00
Yuanyuan Chen	8da008678f	Remove outdated commented CMake code (#163442 ) Policies `CMP0023` and `CMP0022` have been removed in CMake 4. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163442 Approved by: https://github.com/janeyx99	2025-09-22 23:07:36 +00:00
Nikita Shulga	fa15fb01ab	[EZ] Remove XLA from unstable.yml (#163564 ) It runs for 30 min on linux.12xlarge and then fails and it has been like that since Aug 7th Besides, there are no more python-3.9 builds left. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163564 Approved by: https://github.com/seemethere, https://github.com/atalman, https://github.com/huydhn	2025-09-22 22:11:50 +00:00
clr	33daaad7d0	dynamo: Handle objects in graph that do not support weakref (#163168 ) We are seeing crashes of the form ``` Traceback (most recent call last): File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 1487, in run while self.step(): File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 1348, in step self.dispatch_table[inst.opcode](self, inst) File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 2437, in LOAD_ATTR self._load_attr(inst) File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 2425, in _load_attr result = BuiltinVariable(getattr).call_function( File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 1347, in call_function return handler(tx, args, kwargs) File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 967, in <lambda> tx, [v.realize() for v in args], kwargs File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 967, in <listcomp> tx, [v.realize() for v in args], kwargs File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/lazy.py", line 72, in realize self._cache.realize() File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/lazy.py", line 33, in realize self.vt = builder.VariableBuilder(tx, self.source)(self.value) File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builder.py", line 445, in __call__ vt = self._wrap(value) File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builder.py", line 1043, in _wrap torch._dynamo.utils.store_user_object_weakref(value) File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/utils.py", line 4694, 
in store_user_object_weakref user_obj_id_to_weakref[obj_id] = weakref.ref(obj) torch._dynamo.exc.InternalTorchDynamoError: TypeError: cannot create weak reference to 'torch.Event' object ``` This pull request makes us gracefully graph break, vs explicitly crashing. I've added a test which reproduces the issue. There is a side discussion re: how did torch.Event support ever work here, since it appears you cannot take a weakref to a torch.Event Pull Request resolved: https://github.com/pytorch/pytorch/pull/163168 Approved by: https://github.com/Lucaskabela, https://github.com/jansel	2025-09-22 22:11:09 +00:00
Yuanyuan Chen	60c2bdedcd	Replace Literal[None] with None in typing (#163489 ) This PR replaces Literal[None] with None in typing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163489 Approved by: https://github.com/Skylion007, https://github.com/mlazos	2025-09-22 22:10:08 +00:00
Tugsbayasgalan Manlaibaatar	b756b580fb	Improve fake tensor leakage detection in export by not relying on gc too much (#163516 ) Previously we relied on gc to get the snapshot of fake tensors before and after export to get the list of fake tensors that are created during export. This caused some flakiness in our test suite (https://github.com/pytorch/pytorch/issues/162232). It seems super hard to make gc deterministic, so we just instrument fake tensor creation, which seems a lot better. In addition, it is also considerably faster than the previous approach because we are no longer manually triggering the garbage collector. Differential Revision: [D82966648](https://our.internmc.facebook.com/intern/diff/D82966648) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163516 Approved by: https://github.com/ezyang	2025-09-22 22:04:24 +00:00
Chang Pan	e0cbab46ad	[Inductor] avoid CUDA__equal when constant tensors are from different device (#163529 ) Summary: otherwise, may hit ``` Exception: Expected all tensors to be on the same device, but got other is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__equal) ``` Test Plan: UTs Reviewed By: yushangdi Differential Revision: D82974062 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163529 Approved by: https://github.com/yushangdi, https://github.com/Skylion007	2025-09-22 22:04:11 +00:00
Jason Ansel	4fc271e559	[inductor] Don't require_dense for grid_sampler_2d_backward (#163415 ) Fixes #163372 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163415 Approved by: https://github.com/Skylion007 ghstack dependencies: #163386, #163398, #163387, #163414	2025-09-22 21:53:01 +00:00
Jason Ansel	c8fd2b45e5	[inductor] Skip test_baddmm on XPU (#163414 ) Fixes #161484 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163414 Approved by: https://github.com/Skylion007 ghstack dependencies: #163386, #163398, #163387	2025-09-22 21:53:01 +00:00
Jason Ansel	a1bd9248eb	[inductor] Fallback on strided complex add (#163387 ) Fixes #163243 Fixes #162561 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163387 Approved by: https://github.com/eellison ghstack dependencies: #163386, #163398	2025-09-22 21:52:53 +00:00
Jason Ansel	36c2a1325c	[inductor] Fix bug where viewed outputs get padded (#163398 ) Fixes #163328 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163398 Approved by: https://github.com/eellison ghstack dependencies: #163386	2025-09-22 21:52:45 +00:00
Jason Ansel	7ea8998c0b	Better decomp for torch.eye (#163386 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163386 Approved by: https://github.com/eellison	2025-09-22 21:52:37 +00:00
PaulZhang12	2b036632ca	Allow add_persistent_r_block to scale up rblock up to a limit (#162296 ) <img width="654" height="392" alt="Screenshot 2025-09-18 at 4 22 53 PM" src="https://github.com/user-attachments/assets/975650ec-f769-43a6-bdf5-2885a8d40d3c" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162296 Approved by: https://github.com/eellison	2025-09-22 21:41:46 +00:00
can-gaa-hou	0256f91558	[BUG] MaxUnpool2d/3d should check output dim before accessing its elements (#163507 ) Fixes #163409 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163507 Approved by: https://github.com/malfet, https://github.com/Skylion007	2025-09-22 21:36:48 +00:00
Nikita Shulga	da05aa7a9d	[BE] Use `output_t` directly (#163518 ) Rather than deref the safe tensor wrapped in `TensorArg` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163518 Approved by: https://github.com/Skylion007	2025-09-22 21:33:42 +00:00
PyTorch UpdateBot	e558f7a222	[vllm hash update] update the pinned vllm hash (#163463 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163463 Approved by: https://github.com/pytorchbot Co-authored-by: Huy Do <huydhn@gmail.com>	2025-09-22 21:24:56 +00:00
Edward Yang	09cb34c1dc	[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 ) Summary: Original: D81957844 and D81957923 Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well #buildall Test Plan: sandcastle and oss ci Rollback Plan: Reviewed By: H-Huang Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594 Approved by: https://github.com/H-Huang, https://github.com/dcci	2025-09-22 21:12:18 +00:00
Nikita Shulga	4027e97791	[BE] Delete `skipIfMPSOnMacOS13` (#163515 ) As PyTorch needs MacOS-14 or newer to use MPS Pull Request resolved: https://github.com/pytorch/pytorch/pull/163515 Approved by: https://github.com/Skylion007	2025-09-22 21:10:22 +00:00
Svetlana Karslioglu	8e62d01f7a	Add dynamic shapes doc (#159428 ) This PR adds new Dynamic Shapes documentation and expands on the existing one. - Adds a new structure with Intro, Core Concepts, Troubleshooting Pull Request resolved: https://github.com/pytorch/pytorch/pull/159428 Approved by: https://github.com/bobrenjc93 Co-authored-by: bobrenjc93 <bobren@meta.com>	2025-09-22 21:01:27 +00:00
Pearu Peterson	8abc2af9b9	[STABLE ABI] Add clone method to torch::stable::Tensor (#161896 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161896 Approved by: https://github.com/janeyx99	2025-09-22 20:39:24 +00:00
drisspg	02da4753f5	Triton template IMA reads on B200 (#163460 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163460 Approved by: https://github.com/eqy, https://github.com/alexsamardzic	2025-09-22 20:34:39 +00:00
Bob Ren	cf28ab2c88	remove allow-untyped-defs from ./torch/ao/quantization/pt2e/duplicate_dq_pass.py (#163470 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163470 Approved by: https://github.com/aorenste ghstack dependencies: #163246, #163469	2025-09-22 20:29:09 +00:00
Bob Ren	46e1b7d70b	remove allow-untyped-defs from ./torch/utils/data/datapipes/iter/fileopener.py (#163469 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163469 Approved by: https://github.com/aorenste, https://github.com/Skylion007 ghstack dependencies: #163246	2025-09-22 20:29:09 +00:00
Aaron Gokaslan	e065d35fd3	[BE]: Add a few more missing move from return indices (#163456 ) @ezyang A follow up where I found a few more missing returns of this style in the codebase. Follow up to #163416 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163456 Approved by: https://github.com/cyyever, https://github.com/albanD	2025-09-22 20:24:23 +00:00
adabeyta	fd785b1762	Add NestedTensor dispatch for _is_any_true/_is_all_true (#162096 ) Fixes: https://github.com/pytorch/pytorch/issues/161818 ### Summary Add NestedTensor support for `_is_any_true` and `_is_all_true`. ### Changes - Register dispatch for `aten._is_any_true.default` and `aten._is_all_true.default` - Add CPU tests: - `test_is_any_true_jagged`: dispatch_matches_values_buffer, all_false_returns_false, one_true_returns_true - `test_is_all_true_jagged`: dispatch_matches_values_buffer, all_true_returns_true, any_false_returns_false ### Testing Before Fix: `pytest -q test/test_nestedtensor.py -k "test_is_any_true_jagged or test_is_all_true_jagged" -v` Output: ``` FAILED [0.0129s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_all_true_jagged_cpu - NotImplementedError: aten._is_all_true.default FAILED [0.0007s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_any_true_jagged_cpu - NotImplementedError: aten._is_any_true.default ``` After Fix: `pytest -q test/test_nestedtensor.py -k "test_is_any_true_jagged or test_is_all_true_jagged" -v` Output: ``` Running 2 items in this shard test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_all_true_jagged_cpu PASSED [0.0277s] [ 50%] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_any_true_jagged_cpu PASSED [0.0013s] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162096 Approved by: https://github.com/jbschlosser	2025-09-22 20:22:44 +00:00
David Berard	d0086708dd	[triton] update 3.5 pin to bbb06c0334a6772b92d24bde54956e675c8c6604 (#163382 ) Includes: * https://github.com/triton-lang/triton/pull/8211 to work around a PTXAS bug that was causing 03-matrix-multiplication tutorial matmuls to underperform due to excessive WGMMA waits * https://github.com/triton-lang/triton/pull/8157 to fix a convert_layout bug Verified that this passes Triton CI in https://github.com/pytorch/pytorch/pull/159158 and improves gemm perf (see https://github.com/pytorch/pytorch/issues/159704) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163382 Approved by: https://github.com/Camyll, https://github.com/atalman	2025-09-22 20:20:59 +00:00
Sherlock Huang	6f9aef5fef	[2/n] Support module.to("cuda:0") in FakeTensorMode on cuda-less machine (#163433 ) Summary: To support exporting a cuda model on a CPU-only machine under fake tensor mode. User commonly need to move sample inputs to the cuda device with .to("cuda:0") or .to("cuda") call. This diff supports this. I expect the following pattern to work ``` with FakeTensorMode(allow_non_fake_inputs=True): cuda_module = module.to("cuda:0") cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs]) with torch.no_grad(): ep = torch.export.export(cuda_module, cuda_sample_inputs) ``` Before Moving module.to("cuda:0") under fake tensor mode would have parameter on `meta` device. After parameters would be on "cuda:0" . Test Plan: buck2 run fbcode//caffe2/test:fake_tensor -- --r test_move_module Reviewed By: mikaylagawarecki Differential Revision: D80102876 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163433 Approved by: https://github.com/albanD	2025-09-22 20:16:32 +00:00
angelayi	d15048493c	[opaque_obj] Add set_payload + docs (#163276 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163276 Approved by: https://github.com/zou3519 ghstack dependencies: #162660	2025-09-22 20:02:29 +00:00
Adrian Abeyta	bf28990c3d	Add support for NestedTensor share_memory_ (#162272 ) Fixes: https://github.com/pytorch/pytorch/issues/161915 ### Summary Implements share_memory_() support for NestedTensor! ### Changes - Added share_memory_() method to NestedTensor class. - Shares storage for all NestedTensor components: _values, _offsets, _lengths, and cached seqlen tensors. - Guard for CUDA Tensors. ### Testing Before Fix: `pytest -q test/test_nestedtensor.py -k "test_share_memory" -v` Output: ``` Running 1 items in this shard test/test_nestedtensor.py Fatal Python error: Segmentation fault ``` After Fix: `pytest -q test/test_nestedtensor.py -k "test_share_memory" -v` Output: ``` Running 1 items in this shard test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_share_memory_cpu PASSED [0.0753s] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162272 Approved by: https://github.com/jbschlosser	2025-09-22 19:59:58 +00:00
PyTorch MergeBot	eaa613bf66	Revert "[opaque_obj] Add set_payload + docs (#163276 )" This reverts commit dd30667f6c2204a15e91eaeb61c84f9080be7748. Reverted https://github.com/pytorch/pytorch/pull/163276 on behalf of https://github.com/ZainRizvi due to Sorry but this fails lint on trunk: [GH job link](https://github.com/pytorch/pytorch/actions/runs/17924886989/job/50968430537) [HUD commit link](`dd30667f6c`) ([comment](https://github.com/pytorch/pytorch/pull/163276#issuecomment-3321054061))	2025-09-22 19:32:30 +00:00
Kathryn-cat	1818c36d6e	[Fix] Restrict stride normalization to 1D tensors on export (#163282 ) This change restricts the DLPack stride normalization to apply only to 1D tensors of shape (1,). ### Rationale The previous implementation normalized the strides for any multi-dimensional tensor containing a dimension of size 1. While well-intentioned, this "over-normalization" discards critical memory layout information, causing issues for downstream consumers who rely on strides to infer alignment and contiguity. For example: * A row-major tensor with `shape=(1, 128)` and `stride=(128, 1)` would be incorrectly normalized to `stride=(1, 1)`. * A column-major tensor with `shape=(1024, 1)` and `stride=(1, 1024)` would also be normalized to `stride=(1, 1)`. This loss of stride information makes it impossible for consumers to detect the original memory layout (e.g., row-major vs. column-major) and breaks assumptions about memory alignment needed for optimized indexing or specialized hardware APIs like GPU TMA. The original intent of the normalization was to handle the simple case of a 1D tensor with shape=(1,) and a non-standard stride. This fix reverts to that specific, non-problematic behavior, ensuring that multi-dimensional tensors retain their precise stride information during DLPack export. ### Related Issues #163274 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163282 Approved by: https://github.com/eqy	2025-09-22 19:10:05 +00:00
angelayi	7e9781174c	Fix lint (#163542 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163542 Approved by: https://github.com/malfet	2025-09-22 19:10:00 +00:00
Basil Wong	4941719061	Enable logging for absolute memory estimation (#158799 ) Summary: Update the Auto AC logging so that it also provides the absolute memory estimations for each node. Test Plan: (aps-gem_omnifm_v2_mwb_dynamic_005_budget-f23a84c3d8): https://fburl.com/ai_infra/0r738h5r {F1980393481} * Memory Recorded in bytes --- ``` buck2 test //caffe2/test/functorch:test_ac_logging ``` https://www.internalfb.com/intern/testinfra/testrun/14918173863021573 Rollback Plan: Differential Revision: D78580107 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158799 Approved by: https://github.com/jansel	2025-09-22 18:36:49 +00:00
angelayi	dd30667f6c	[opaque_obj] Add set_payload + docs (#163276 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163276 Approved by: https://github.com/zou3519 ghstack dependencies: #162660	2025-09-22 18:30:28 +00:00
angelayi	3be9c86c74	[opaque obj] Initial OpaqueObject (#162660 ) A big pain point ppl have with custom ops is that they do not accept arbitrary input/outputs. In this PR we create the concept of an "OpaqueObject" which allows users to pass arbitrary python objects into custom operators. Some still slightly annoying parts with this implementation: - The schema of the operator is `__torch__.torch.classes.aten.OpaqueObject` instead of whatever python type - `@torch.library.custom_op` doesn't work.. yet? UX: ```python from torch._library.opaque_object import make_opaque, get_payload # your custom python class class OpaqueQueue: def __init__(self, queue: list[torch.Tensor], init_tensor_: torch.Tensor) -> None: super().__init__() self.queue = queue self.init_tensor_ = init_tensor_ def push(self, tensor: torch.Tensor) -> None: self.queue.append(tensor) def pop(self) -> torch.Tensor: if len(self.queue) > 0: return self.queue.pop(0) return self.init_tensor_ def size(self) -> int: return len(self.queue) queue = OpaqueQueue([], torch.zeros(3)) obj: torch._C.ScriptObject = make_opaque(queue) # obj.payload stores a direct reference to this python queue object self.assertEqual(get_payload(obj), queue) # This is able to be passed through the dispatcher torch.ops._TestOpaqueObject.queue_push(obj, torch.ones(3)) self.assertTrue(queue.size(), 1) ``` Authoring a custom op: ```python lib = torch.library.Library("_TestOpaqueObject", "FRAGMENT") torch.library.define( f"_TestOpaqueObject::queue_push", "(__torch__.torch.classes.aten.OpaqueObject a, Tensor b) -> ()", tags=torch.Tag.pt2_compliant_tag, lib=lib, ) @torch.library.impl(f"{libname}::queue_push", "CompositeExplicitAutograd", lib=lib) def push_impl(q: torch._C.ScriptObject, b: torch.Tensor) -> None: # We can get the payload directly by get_payload(q) queue = get_payload(q) assert isinstance(queue, OpaqueQueue) queue.push(b) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162660 Approved by: 
https://github.com/zou3519	2025-09-22 18:30:28 +00:00
Yuanyuan Chen	bec967eaa4	Remove C++ and test branches for CUDA<12 (#163443 ) Remove conditional branches for CUDA<12. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163443 Approved by: https://github.com/eqy	2025-09-22 18:20:08 +00:00
Eli Uriegas	d279a6a6f1	ci: Add a way to lint all files in a PR from label (#163525 ) Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163525 Approved by: https://github.com/ZainRizvi	2025-09-22 18:06:39 +00:00
Ben Niu	281f8f407e	Combine strong and weak refcounts in intrusive_ptr in a single refcount (#163394 ) Summary: Currently, we assume that refcount_ and weakcount_ are always stored in an 8-byte aligned address right next to each other. Based on this assumption, we load 8 bytes in intrusive_ptr::reset_ to check the values of both counts. However, that assumption is not part of C++ language standard so it's essentially undefined behavior. This change eliminates that assumption by combining refcount_ and weakcount_ in a single 64-bit count and we use the lower 32 bits for refcount_ and upper 32 bits for the weakcount_. In addition to eliminating the undefined behavior, the change also eliminates the read of weakcount_ after decrementing refcount_ in intrusive_ptr::reset_. This claws back lost performance introduced in https://github.com/pytorch/pytorch/pull/162784 for non-final refcount_ decrementing. Reviewed By: yfeldblum Differential Revision: D82869192 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163394 Approved by: https://github.com/Skylion007	2025-09-22 17:53:28 +00:00
Nikita Shulga	5e7be98800	[BE] Update Python min version to 3.10 (#162310 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162310 Approved by: https://github.com/atalman, https://github.com/Skylion007, https://github.com/ZainRizvi	2025-09-22 17:04:21 +00:00
Xu Han	06fe5b9025	[AOTI] fix TestAOTInductorPackage temp file locked handler. (#163499 ) Fix `test\inductor\test_aot_inductor_package.py` common class `TestAOTInductorPackage`'s `check_model` function, temp file locked file handler on Windows. It would caused c++ backend open file failed: ```cmd FAILED [4.5918s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_add - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmp21sjnnhl.pt2 cannot be opened. FAILED [4.1703s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_bool_input - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmp5kd3apub.pt2 cannot be opened. FAILED [4.2266s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_linear - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmpkyy3pxow.pt2 cannot be opened. FAILED [4.2134s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_metadata - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmphyer7wi9.pt2 cannot be opened. ...... ``` Fix it via `WritableTempFile`, it can release file handler for backend use. After fixed: <img width="1904" height="176" alt="image" src="https://github.com/user-attachments/assets/e71b3182-0204-497b-9aca-cbbb33bc4687" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163499 Approved by: https://github.com/jansel, https://github.com/desertfire	2025-09-22 16:54:18 +00:00
Laith Sakka	9ca183e933	switch from stack based to graph based approach (#163459 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163459 Approved by: https://github.com/bobrenjc93 ghstack dependencies: #163417	2025-09-22 16:41:35 +00:00
Chris Thi	e310cc5e06	Update fbgemm submodule (#163411 ) Test Plan: As titled, includes some new changes fbgemm to see if CUDA13 breakage is fixed. Reviewers: Subscribers: Tasks: Tags: Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/163411 Approved by: https://github.com/Skylion007	2025-09-22 15:46:11 +00:00
Xinya Zhang	eaac218b64	[ROCm] Fix environment variable AOTRITON_INSTALLED_PREFIX (#163373 ) Early assignment of `__AOTRITON_LIB` breaks the usage of environment variable `$AOTRITON_INSTALLED_PREFIX` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163373 Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily	2025-09-22 15:01:18 +00:00
henrylhtsang	509c4e8627	Update cutlass version for fbcode (#163091 ) Differential Revision: [D82567751](https://our.internmc.facebook.com/intern/diff/D82567751/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163091 Approved by: https://github.com/drisspg	2025-09-22 14:31:11 +00:00
PyTorch MergeBot	10adeb9044	Revert "[BE] Update Python min version to 3.10 (#162310 )" This reverts commit 9f5a644f0768258bc81f8b38492754d297399f74. Reverted https://github.com/pytorch/pytorch/pull/162310 on behalf of https://github.com/malfet due to Broke lint, but to the best of my knowledge it's no longer possible to run lint for all files on PRs ([comment](https://github.com/pytorch/pytorch/pull/162310#issuecomment-3319289031))	2025-09-22 14:13:59 +00:00
Nikita Shulga	9f5a644f07	[BE] Update Python min version to 3.10 (#162310 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162310 Approved by: https://github.com/atalman, https://github.com/Skylion007, https://github.com/ZainRizvi	2025-09-22 13:37:02 +00:00
Isalia20	60b4791d08	[MPS] Fix compile linalg inv (#163452 ) Fixes #161969 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163452 Approved by: https://github.com/Skylion007	2025-09-22 10:36:52 +00:00
Yuanyuan Chen	96a3afb8ec	Simplify BFLOAT16_AVAILABLE (#163445 ) Simplify `BFLOAT16_AVAILABLE` by using `torch.cuda.is_bf16_supported()` and `torch.xpu.is_bf16_supported()`. Outdated comments are also removed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163445 Approved by: https://github.com/Skylion007, https://github.com/kwen2501	2025-09-22 07:31:46 +00:00
PyTorch MergeBot	edafc902d7	Revert "[BE] Make PyObjectSlot use a global PyInterpreter (#162659 )" This reverts commit d1993c27ae59842c887d549a3f8936fbcd769498. Reverted https://github.com/pytorch/pytorch/pull/162659 on behalf of https://github.com/wdvr due to reverted internally, please see D82771705 @PaliC ([comment](https://github.com/pytorch/pytorch/pull/162659#issuecomment-3317110247))	2025-09-22 06:22:37 +00:00
PyTorch MergeBot	ae5be038a6	Revert "Delete functorch C extension entirely. (#163340 )" This reverts commit 1faf6367e396b1d0894e8735912a47ac465f469d. Reverted https://github.com/pytorch/pytorch/pull/163340 on behalf of https://github.com/wdvr due to temporary revert to pull out #162659 ([comment](https://github.com/pytorch/pytorch/pull/163340#issuecomment-3317105243))	2025-09-22 06:20:04 +00:00
PyTorch MergeBot	f0078941cf	Revert "[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 )" This reverts commit 6c334885d48725197b5d35e2c1543efc0f4198d0. Reverted https://github.com/pytorch/pytorch/pull/162594 on behalf of https://github.com/wdvr due to reverted internally - @ezyang see D82281294 ([comment](https://github.com/pytorch/pytorch/pull/162594#issuecomment-3317017530))	2025-09-22 05:39:07 +00:00
PyTorch MergeBot	3a7db34cf9	Revert "[SymmMem] Promote `@requires_nvshmem` instead of `enable_triton` (#163423 )" This reverts commit 5d8a226e23339e7243a2a84afd174f685f145b68. Reverted https://github.com/pytorch/pytorch/pull/163423 on behalf of https://github.com/wdvr due to temporary reverting to back out #162594 ([comment](https://github.com/pytorch/pytorch/pull/163423#issuecomment-3317011500))	2025-09-22 05:35:41 +00:00
Yuanyuan Chen	281bb56cc5	Enable half precision types on test_conv_cudnn_nhwc_support (#163444 ) This PR adds float16 and bfloat16 cases to `test_conv_cudnn_nhwc_support` and removes outdated comments. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163444 Approved by: https://github.com/Skylion007	2025-09-22 04:11:20 +00:00
Yuanyuan Chen	01f927eb40	Remove workarounds for Python 3.6 (#163440 ) This PR removes tuple unpacking workarounds for Py 3.6 from two distributed files. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163440 Approved by: https://github.com/ezyang	2025-09-22 04:08:04 +00:00
angelayi	0b59492853	[export] Fix wrap_with_set_grad_enabled retracing (#163295 ) Fixes https://github.com/pytorch/pytorch/issues/163294 The code `with torch.set_grad_enabled(enable_grad)` calls `torch._C._set_grad_enabled` three times -- (1) when [initializing set_grad_enabled](`bb7c9a2d41/torch/autograd/grad_mode.py (L187C9-L187C35)`), (2) when [entering the context](`bb7c9a2d41/torch/autograd/grad_mode.py (L194)`), and (3) when [exiting the context](`bb7c9a2d41/torch/autograd/grad_mode.py (L197)`). This results in the the retraced export module to have a duplicate `torch._C._set_grad_enabled` like: ``` def forward(self, arg0_1): add = torch.ops.aten.add.Tensor(arg0_1, 1); arg0_1 = None _set_grad_enabled = torch._C._set_grad_enabled(False); _set_grad_enabled = None _set_grad_enabled = torch._C._set_grad_enabled(False); _set_grad_enabled = None add_1 = torch.ops.aten.add.Tensor(add, 2); add = None _set_grad_enabled_1 = torch._C._set_grad_enabled(True); _set_grad_enabled_1 = None add_2 = torch.ops.aten.add.Tensor(add_1, 3); add_1 = None return (add_2,) ``` When export runs the `replace_set_grad_with_hop_pass`, it will look through the graph for `torch._C._set_grad_enabled` and create subgraphs. The duplicate `torch._C._set_grad_enabled` results in an empty submod in the graph, which resulted in an error in [this post](https://fb.workplace.com/groups/1028545332188949/posts/1844720036398281/?comment_id=1862175381319413). Pull Request resolved: https://github.com/pytorch/pytorch/pull/163295 Approved by: https://github.com/yushangdi	2025-09-21 22:54:40 +00:00
Yuanyuan Chen	8a281d7214	[submodule] Bump libfmt to 12.0.0 (#163441 ) libfmt 12.0 brings new optimisations and fixes some compilation issues for clang 21 (https://github.com/fmtlib/fmt/pull/4477). For a detailed release log, see https://github.com/fmtlib/fmt/releases/tag/12.0.0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163441 Approved by: https://github.com/Skylion007	2025-09-21 22:37:25 +00:00
Jiannan Wang	6ac2b3ae35	[BE] Adding aliases for CUDA and XPU API documentation (#162984 ) This PR reorganizes CUDA and XPU API documentation with additional aliases pages. Multiple entries of APIs under torch.cuda are thus removed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162984 Approved by: https://github.com/janeyx99	2025-09-21 22:28:27 +00:00
Yedidya Feldblum	8b14f43da9	[torch] DRY a couple of lines in unpickler (#163447 ) Test Plan: CI. Reviewed By: dolpm Differential Revision: D82660989 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163447 Approved by: https://github.com/Skylion007	2025-09-21 20:29:33 +00:00
Laith Sakka	4d3d32f14c	Add torchfuzz initial impl. (#163417 ) all details are in readme.md Note: one thing i want to do soonest is to switch to graph representation instead of stack representation for the fuzzed ops should make things easier as things get more complicated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163417 Approved by: https://github.com/bobrenjc93	2025-09-21 19:17:54 +00:00
Scott Wolchok	5599f487ef	Fully native DTensor.__new__ (#162508 ) Move the entirety of `__new__` into C++, saving a layer of disable_dynamo and making progress toward all-C++. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162508 Approved by: https://github.com/ezyang ghstack dependencies: #161695	2025-09-21 18:36:05 +00:00
Yuanyuan Chen	51152efa67	Remove autograd code for Python < 3.9 (#163313 ) As PyTorch is moving to Python 3.10, it is safe to remove code for Python < 3.9. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163313 Approved by: https://github.com/ezyang	2025-09-21 15:35:06 +00:00
Markus Hoehnerbach	f34744d2a5	[inductor] bugfix: keep WeakDeps (WAR deps) during fusion (#162316 ) fixes #159855, was not triggered in other tests since it took more than one round of fusion to get to the problematic code which prunes WeakDeps. The WeakDeps are important to inhibit fusion of kernels that read/write data into mutated buffers with different indexing. We modify the code to a) always prune before fusion, rather than after, which improves its coverage and makes our basic vertical fusion tests surface this issue as well and b) check whether the weak dep is fusable before eliminating it (which basically means checking that the producing code and the consuming code are sufficiently compatible). The tests that trigger this with change (a) is: test_fusing_write_into_disjoint_read introduced in #118210. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162316 Approved by: https://github.com/eellison, https://github.com/mlazos, https://github.com/shunting314	2025-09-21 13:08:11 +00:00
Ke Wen	5d8a226e23	[SymmMem] Promote `@requires_nvshmem` instead of `enable_triton` (#163423 ) ### Issue The previous `enable_triton` UI requires the user-defined Triton kernel have a "nvshmem" in its name. If users did not do so, the kernel would miss the NVSHMEM init, and silently hit CUDA IMA. The `@requires_nvshmem` decorator eliminates the above name requirement (and the `enable_triton` call). ### Usage: ``` @requires_nvshmem @triton.jit def foo(...): ... foo[(1, 1)](...) ``` It also removes the need to pass `extern_lib` to `foo` (handled by the decorator now). Pull Request resolved: https://github.com/pytorch/pytorch/pull/163423 Approved by: https://github.com/ngimel ghstack dependencies: #163025, #163152, #163194	2025-09-21 10:03:20 +00:00
FFFrog	d8cbbc0f70	[Easy][AMP] Refactor the AMP logic for getting dtype (#162796 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162796 Approved by: https://github.com/ezyang	2025-09-21 06:32:35 +00:00
orangeH25	9ba918082a	Add api info for torch._C._nn.pyi (#162707 ) Fix part of #148404 APIs involved are as follows: - multilabel_margin_loss - multi_margin_loss - nll_loss_nd - relu6 - relu6_ Pull Request resolved: https://github.com/pytorch/pytorch/pull/162707 Approved by: https://github.com/ezyang	2025-09-21 06:17:15 +00:00
Edward Yang	1faf6367e3	Delete functorch C extension entirely. (#163340 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163340 Approved by: https://github.com/aorenste ghstack dependencies: #160236	2025-09-21 06:02:21 +00:00
windsonsea	4a96a6fa4a	[Docs] Fix indentations in cond.md (#156147 ) This is a follow-up PR to fix indentations mentioned by https://github.com/pytorch/pytorch/pull/155653#issuecomment-2971660356 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156147 Approved by: https://github.com/svekars, https://github.com/cyyever	2025-09-21 05:50:50 +00:00
Yuanyuan Chen	f591bb5056	Remove data_source argument from Sampler (#163134 ) `data_source` was declared as being removed in PT 2.2, but it was not. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163134 Approved by: https://github.com/ezyang	2025-09-21 05:44:41 +00:00
Aaron Gokaslan	1ca9445229	[BE][Ez]: Prevent copies of std::vector in CUDA ForeachOps (#163416 ) No need for unnecessary copy of std::vectors. This Tensor list is copied throughout the foreach paths and this code is on a hot path for torch optimizers. Auto move elision will not happen on the return statement since it's a subelement of a vector that needs to be copied out before the std::vector is dtor'd. This should reduce quite a few list copies along this path. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163416 Approved by: https://github.com/ezyang	2025-09-21 05:24:13 +00:00
PyTorch UpdateBot	5b386ee16e	[vllm hash update] update the pinned vllm hash (#163392 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163392 Approved by: https://github.com/pytorchbot	2025-09-21 04:34:14 +00:00
Edward Yang	97eb7a281d	torchdim Python port (#160236 ) The big semantic change (and the reason for this port) is that we no longer monkeypatch Tensor with torchdim's special methods. The new algorithm for handling dispatch is that we first land in `__torch_function__` and we see if a special FCD implementation needs to be dispatch to first, and if there is nothing we fallback to the standard level strategy. Because there is no longer C binding equivalent of classes, we've condensed _C.Dim and Dim together, and similar for Tensor. This resulted in some bugs as the Python API is sometimes different from the C API. I've attempted to disambiguate these but there may still be mistakes (many early bugs were due to this problem). Dim and DimEntry are especially painful as Dim must abide by Tensor equality semantics, but is pointer equality in C (DimEntry doesn't have this problem). Another difference between C/Python that is subtle is we no longer get implicit conversions from Dim to DimEntry, this also caused some bugs. Much of the mechanical porting work was done by claude code. I have a separate PR that deletes functorch._C, but it was useful having dim.cpp to point claude at it so I haven't done it in this PR. From a reviewing perspective, I need to re-review that I didn't forget to port anything, some noticeably missing "small" things are patched_dim_method. I am still in progress of carefully doing a side-by-side review of ports; "simplifications" from claude code were also a major source of bugs. There are two major feature gaps in the implementation: - DelayedTensor and dot handling are not implemented yet. This should be reasonably easy, just need to do it. However, for the purposes of sharded propagation it is actually better not to reconstruct matmuls. - Splitting dimensions with an index like `[x, y]` doesn't work. 
The problem is that `__getitem__` interprets this as advanced indexing and sends the list to torch.tensor to turn into a tensor, instead of being eligible for `__torch_function__`. I think I might need to hard code a special case for this or something? Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160236 Approved by: https://github.com/zdevito, https://github.com/albanD	2025-09-21 03:01:04 +00:00
Edward Yang	2887f3fde4	[BE] Slight improvements to documentation in python_dispatch (#162963 ) I was briefly confused which way I should iterate stack, here's the comments I wanted. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162963 Approved by: https://github.com/albanD, https://github.com/SherlockNoMad	2025-09-21 01:45:46 +00:00
eqy	e37b600007	[CUDA][cuBLAS][FP8] Forward-fix #162022 (#163354 ) @ngimel is right, `ciflow/h100` doesn't actually appear to test the PR :( Pull Request resolved: https://github.com/pytorch/pytorch/pull/163354 Approved by: https://github.com/ngimel, https://github.com/Skylion007	2025-09-21 00:55:12 +00:00
Yedidya Feldblum	8e3fd3d4f9	[AI Codemod][DevmatePerfOptimizationVectorReallocation] fbcode/caffe2/torch/csrc/jit/serialization/unpickler.cpp (#163240 ) Reviewed By: marksantaniello, yfeldblum Differential Revision: D82140619 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163240 Approved by: https://github.com/Skylion007	2025-09-20 23:26:24 +00:00
Avik Chaudhuri	9e3725e8e5	make fullgraph_capture work on mod, args, kwargs (#162849 ) Summary: Today `fullgraph_capture` takes a frame, but clients usually take a callable (`nn.Module`, function, or method) and example inputs (args and kwargs) and then explicitly set up the frame to pass. This is boilerplate—and potentially tricky to get right—that can be hidden inside the API. The original `fullgraph_capture` now becomes `_fullgraph_capture_frame`. Test Plan: existing tests Rollback Plan: Differential Revision: D82339400 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162849 Approved by: https://github.com/zhxchen17	2025-09-20 22:48:06 +00:00
Sherlock Huang	3938175ec1	[1/n] Support cpu_tensor.to("cuda:0") in FakeTensorMode on cuda-less machine (#160431 ) Summary: To support exporting a cuda model on a CPU-only machine under fake tensor mode. User commonly need to move sample inputs to the cuda device with .to("cuda:0") call. This diff supports this. Notice that .to("cuda") doesn't work yet, as it enquery current device idx by calling cuda API. I expect the following pattern to work ``` with FakeTensorMode(allow_non_fake_inputs=True): cuda_module = module.to("cuda:0") cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs]) with torch.no_grad(): ep = torch.export.export(cuda_module, cuda_sample_inputs) ``` Test Plan: buck2 run fbcode//caffe2/test:fake_tensor -- --r test_fake_gpu_no_init Rollback Plan: Differential Revision: D80101283 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160431 Approved by: https://github.com/henryoier, https://github.com/ezyang	2025-09-20 21:33:53 +00:00
Avik Chaudhuri	d70c0babf5	minimize graph capture output (#162211 ) Currently OutputGraphGuardsState is separated out as a serializable interface for OutputGraph, but some of the typing around it is incorrect in dynamo's guards.py and output_graph.py: more fields are used by code than claimed by OutputGraphGuardsState, and it works because either the full OutputGraph is passed in or the parts that use those fields are dead when OutputGraphGuardsState is passed in. In this PR we try to further separate the necessary fields of OutputGraph that should be retained by a full graph capture mechanism, not just limited to dynamo (as it is currently) but also something like make_fx (in the future). Since these fields do not need to be serialized, the result is an intermediate "common" data structure that is between OutputGraphGuardsState and OutputGraph in the inheritance hierarchy. Differential Revision: D81718791 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162211 Approved by: https://github.com/zhxchen17	2025-09-20 15:52:28 +00:00
Pearu Peterson	f9074c7332	[STABLE ABI] Add copy_ operation. (#161895 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161895 Approved by: https://github.com/janeyx99	2025-09-20 10:30:33 +00:00
Nicolas De Carli	eb11d172e3	[Caffe2] Improve SVE batch box cox by 2% (#163360 ) Summary: Improve bound checking on exp computation, decreasing the longest dependency chain by 1. Box-cox benchmarks show about 2% of improved throughput. Precision remains unaltered. before: NonZeroLambdaBatch 155.30us 6.44K after: NonZeroLambdaBatch 151.78us 6.59K Test Plan: Correctness: buck2 test @//mode/opt //koski/functions_contrib/df4ai/tests:batch_box_cox_test Performance: buck2 run @//mode/opt //koski/functions_contrib/df4ai/benchmark:boxcox_benchmark Differential Revision: D82847111 Privacy Context Container: L1208939 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163360 Approved by: https://github.com/Skylion007	2025-09-20 06:42:26 +00:00
Menglu Yu	5050cfa363	[Opitmus] fix fp8 activation quatization for duplicates forward output (#163364 ) Summary: We observe a case then the fwd graph has duplicated return nodes, which will lead to errors due to fx renaming the node, thus we add poi info into the node name. Test Plan: ### unit test ``` CUDA_VISIBLE_DEVICES=3 buck2 test mode/opt -m ovr_config//triton:beta -c fbcode.nvcc_arch=b200a -c fbcode.platform010_cuda_version=12.8 //caffe2/test/functorch:test_aotdispatch -- test_quantize_activation_duplicate_nodes ``` Buck UI: https://www.internalfb.com/buck2/de5eccc6-4064-4214-843d-70b8e3829afe Test UI: https://www.internalfb.com/intern/testinfra/testrun/4503599937670844 Network: Up: 217KiB Down: 72KiB (reSessionID-73e5c269-4f4d-4a54-896a-79c077eea326) Executing actions. Remaining 0/2 0.1s exec time total Command: test. Finished 1 local Time elapsed: 45.9s Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0 ### E2E before f798417700 after Differential Revision: D82844100 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163364 Approved by: https://github.com/Yuzhen11	2025-09-20 06:33:20 +00:00
Chien-Chin Huang	d55c9d52cd	[CP] Fix cuDNN CP LSE dimension bug (#163231 ) We should only unsqueeze if necessary. Fix https://github.com/pytorch/pytorch/issues/162743 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163231 Approved by: https://github.com/eqy ghstack dependencies: #162539, #162540, #162541, #163115, #163131	2025-09-20 06:13:45 +00:00
Ruben Rodriguez Buchillon	0ee331b523	[inductor][choices] move extra kwargs out of get_template_configs (#163209 ) # why - extra kwargs are input/op dependent and not config dependent. We don't plan to serialize/deserialize them, and so they need to be fed in later before making the KTC, rather than when getting the config values directly # what - move extra_kwargs into the KTC and get_ktc interface directly # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k "_addmm" ``` Differential Revision: [D82871310](https://our.internmc.facebook.com/intern/diff/D82871310) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163209 Approved by: https://github.com/nmacchioni ghstack dependencies: #163305	2025-09-20 05:30:40 +00:00
Ruben Rodriguez Buchillon	df5d6d57c9	[inductor][triton heuristics] move allow tf32 out of config params (#163305 ) # why - this is not directly controlled by the config arg but rather by the input and by the inductor wide setting - it's always the same for every choice - we want the config kwargs to be programmable and this is not programmable in that sense but rather needs to use inductor config # what - move generating the ALLOW_TF32 kwarg in Triton templates into get_extra_kwargs # testing with some annotations, this is now the kwargs and extra_kwargs on addmm ``` {'EVEN_K': True, 'USE_FAST_ACCUM': False, 'ACC_TYPE': 'tl.float32', 'num_stages': 1, 'num_warps': 2, 'BLOCK_M': 32, 'BLOCK_N': 32, 'BLOCK_K': 16, 'hint_override': None, 'GROUP_M': 8} # choice/config kwargs {'ALLOW_TF32': True, 'epilogue_fn': <function addmm_epilogue.<locals>.epilogue at 0x7f64d54ff600>, 'epilogue_fn_hash': "['addmm_epilogue', torch.float32, 1, 1]", 'prefix_args': 1} # extra kwargs ``` they're both passed onto the template Differential Revision: [D82871312](https://our.internmc.facebook.com/intern/diff/D82871312) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163305 Approved by: https://github.com/nmacchioni	2025-09-20 05:30:40 +00:00
Parshant Sharma	0b5a99be88	remove duplicate import for defaultdict (#160519 ) Fixes #160518 This PR aims to remove the duplicate import of defaultdict in the following file: `ecde76c764/functorch/op_analysis/gen_data.py (L36)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160519 Approved by: https://github.com/malfet	2025-09-20 04:06:39 +00:00
dsashidh	a87aea03f7	Update RandomSampler docstring. data_source must be Sized not Dataset (#158857 ) Fixes #158631 The docstring said data_source was a Dataset, but RandomSampler only needs something that implements __len__. This updates the docstring to use Sized instead, which matches the actual type used in the constructor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158857 Approved by: https://github.com/divyanshk	2025-09-20 04:05:25 +00:00
Blaine Burton Rister	e56dd5d770	[Inductor-FX] Support torch.cond (#163234 ) # Feature Support `torch.cond` in the FX converter. The generated FX IR is conceptually identical to what would come from `torch.export`: - Submodules are stored as attributes, and accessed via `getattr`. - The conditional is represented as `torch.ops.higher_order.cond`, which takes in the subgraphs, a predicate and submodule inputs. # Implementation overview The FX backend generates code for subgraphs using the following steps: 1. When `codegen_conditional` is called in `WrapperFxCodegen`, we emit a `ConditionalLine`. a. We also codegen the true/false subgraphs at this time, storing their subgms for later. 2. At the beginning of FX conversion, generate `get_attr` nodes accessing each subgraph. It's important to do this at the start, before registering the node metadata hook. This also matches the convention followed by torch.export. 3. When we see the `ConditionalLine` in the FX converter, we generate a corresponding `torch.ops.higher_order.cond`. # Implementation details This ended up being a substantial change, as wrapper codegen has some special logic for subgraphs. Certain methods of `PythonWrapperCodegen` are overridden by `SubgraphPythonWrapperCodegen`. To apply these overrides, we use multiple inheritance with the registered subclass of `WrapperFxCodegen`. Unlike most other wrapper codegen methods, which map 1:1 to Wrapper IR lines, subgraph codegen generates a number of wrapper lines including `EnterSubgraphLine` and `ExitSubgraphLine`, along with Python or C++ code calling the subgraph as a function. These lines are used for some backends' memory planning. In contrast, FX IR typically represents a subgraph call as a single HOP node, or a `call_module` op. To account for this difference, this PR introduces a new wrapper IR line called `ConditionalLine`, which is only used by the FX backend. We override the `codegen_conditional` method to emit this line. 
This sidesteps having to port the existing subgraph codegen and associated memory planning to Wrapper IR. (In principle, it seems possible to adapt the existing backends to `ConditionalLine`, but it could be a larger refactor, since we'd also have to update the memory planning.) Some of the lower-level subgraph codegen methods are still shared between the FX and Python backends, such as `generate_subgraph_common`. Those were easier to port to Wrapper IR. This also required generalizing the way the FX converter handles graph inputs and outputs. Previously, it assumed the IO signature was the same as `V.graph.module`, but this is only true for the parent graph, and not subgraphs. Instead, we need to call `get_graph_inputs` and `get_graph_outputs` to populate the inputs and outputs for subgraphs. # Test plan This PR adds a couple of tests using torch.cond. Here's an example graph generated by one of them: ``` graph(): %arg0_1 : [num_users=1] = placeholder[target=arg0_1] %arg1_1 : [num_users=1] = placeholder[target=arg1_1] %true_graph_0 : [num_users=1] = get_attr[target=true_graph_0] %false_graph_0 : [num_users=1] = get_attr[target=false_graph_0] %cond : [num_users=1] = call_function[target=torch.ops.higher_order.cond](args = (%arg0_1, %true_graph_0, %false_graph_0, (%arg1_1,)), kwargs = {}) %buf1 : [num_users=2] = call_function[target=operator.getitem](args = (%cond, 0), kwargs = {}) %triton_kernel_wrapper_mutation : [num_users=0] = call_function[target=torch.ops.higher_order.triton_kernel_wrapper_mutation](args = (), kwargs = {kernel_idx: 6, constant_args_idx: 6, grid: [(1, 1, 1)], tma_descriptor_metadata: {}, kwargs: {in_out_ptr0: %buf1, xnumel: 6, XBLOCK: 8}}) return buf1 ``` It also removes an existing negative test which checked that a certain error was raised when subgraphs were encountered. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163234 Approved by: https://github.com/angelayi, https://github.com/jansel	2025-09-20 03:52:31 +00:00
Huy Do	a31acf32bd	Clean up obsoleted vLLM tests (#163383 ) They have been removed in https://github.com/vllm-project/vllm/pull/25117 and https://github.com/vllm-project/vllm/pull/22772, thus failing in trunk at the moment after the latest pin commit update Pull Request resolved: https://github.com/pytorch/pytorch/pull/163383 Approved by: https://github.com/wdvr, https://github.com/seemethere, https://github.com/malfet	2025-09-20 02:48:36 +00:00
Sherlock Huang	a1df0b42ce	Lazy import to avoid circular import issue for DebugMode (#163381 ) as title. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163381 Approved by: https://github.com/dolpm	2025-09-20 01:54:57 +00:00
James Wu	bfe9e60ffb	Simplify PrecompileContext to no longer be a CacheArtifactManager (#162886 ) Summary: This diff does a big refactor of PrecompileContext to make it considerably simpler: instead of being a CacheArtifactManager and managing a bunch of bytes, it simply stores two things: dynamo cache entries and backend cache entries. When asked, it stitches them together into PrecompileCacheEntries, which are stored by DynamoCache. This structure then allows us to register DynamoCache to the regular Megacache API, instead of having two separate APIs that are confusing. It also lets us remove the autotune cache integration, since MegaCache API will automatically store autotune cache entries. The intent here is that users who want to use caching precompile will simply be able to use torch.compiler.save_cache_artifacts as before, just with `torch.dynamo.config.caching_precompile` set to True. They can also directly interact with PrecompileContext if they wish to specifically only load Precompile entries, using PrecompileContext.create_cache_entries(). Saving single entries and such with DynamoCache still works normally. Test Plan: All existing unit tests pass. Rollback Plan: Differential Revision: D82380307 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162886 Approved by: https://github.com/zhxchen17	2025-09-20 01:24:37 +00:00
Jason Ansel	8225a26835	[dynamo] Fix issue with namedtuple slicing (#163351 ) Fixes #163253 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163351 Approved by: https://github.com/williamwen42, https://github.com/mlazos	2025-09-20 00:42:02 +00:00
Chien-Chin Huang	093f0642aa	[CP][BE] Correct an incorrect docstring (#163131 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163131 Approved by: https://github.com/tianyu-l, https://github.com/XilunWu ghstack dependencies: #162539, #162540, #162541, #163115	2025-09-19 23:55:03 +00:00
rzou	ee7bdd8f2f	[graph partition] Add way to register custom rule (#163310 ) This PR adds an experimental way to register a custom rule for if inductor should partition the graph around an operator. Test Plan: - new test Pull Request resolved: https://github.com/pytorch/pytorch/pull/163310 Approved by: https://github.com/ProExpertProg, https://github.com/BoyuanFeng, https://github.com/eellison ghstack dependencies: #162117, #162307, #162651	2025-09-19 23:28:03 +00:00
Nikita Shulga	0098e5636d	[CI] Move Windows build/tests to Python-3.10 (#162862 ) What was supposed to be a very simple change ended up being quite involved, as the current Windows CI framework is quite inflexible, i.e. it takes a lot of arguments, but later on ignores them, namely: - `PYTHON_VERSION` used to be a no-op that is simply ignored by the scripts - With this change, `setup-win` action will create an environment called `py_tmp` with specific python version + intel-openmp (that is a hard runtime requirement, but for some reason not packaged into the wheel nor marked as such) - Copied test type dependencies from `be01a40157/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1 (L16)` into `win-test.sh`, but made some adjustments to be compatible with 3.10 runtime (scipy version update) and just make rerun-tests compatible with the rest of the deps I think in the long run, one needs to update `4432e2cacd/aws/ami/windows/scripts/Installers/Install-Miniconda3.ps1` that currently pins Miniconda python to 3.9, but also figure out how CI can still create a new environment without having to download all the dependencies all the time Pull Request resolved: https://github.com/pytorch/pytorch/pull/162862 Approved by: https://github.com/wdvr, https://github.com/huydhn ghstack dependencies: #163339, #163341	2025-09-19 22:51:38 +00:00
Aart J.C. Bik	9b5ec0ff7c	Use computed buffer sizes of torch for cusparseLt metadata (#163125 ) Making sure buffer allocation matches what is computed by cusparseLt compression Pull Request resolved: https://github.com/pytorch/pytorch/pull/163125 Approved by: https://github.com/jcaip	2025-09-19 22:12:40 +00:00
Svetlana Karslioglu	e6a9db58d7	Add analytics ID to cpp docs (#163370 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163370 Approved by: https://github.com/albanD	2025-09-19 21:45:19 +00:00
Mikayla Gawarecki	fab8455943	Don't use declarations in global namespace in stable headers (#163352 ) Fixes https://github.com/pytorch/pytorch/issues/163338 Configured https://clang.llvm.org/extra/clang-tidy/checks/google/global-names-in-headers.html for torch/csrc/stable Note that doesn't error for the DeleterFnPtr case, but will generate the following for the `using torch::stable::Tensor;` ``` >>> Lint for torch/csrc/stable/ops.h: Error (CLANGTIDY) [google-global-names-in-headers,-warnings-as-errors] using declarations in the global namespace in headers are prohibited 10 \|#include <torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h> 11 \|#include <torch/headeronly/core/ScalarType.h> 12 \| >>> 13 \|using torch::stable::Tensor; 14 \| 15 \|namespace torch::stable { 16 \| ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163352 Approved by: https://github.com/janeyx99	2025-09-19 21:15:52 +00:00
xinan.lin	9f8a311af0	[Inductor][Intel GPU] Save `threads_per_warp` from Triton compiled kernel for launching kernel correctly in cpp wrapper. (#163315 ) On the Inductor XPU backend, `threads_per_warp` is not always 32. For Intel GEMM Triton kernels, it can be 16. This information must be preserved for XPU so that the Cpp wrapper can launch the kernel with the correct configuration. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163315 Approved by: https://github.com/EikanWang, https://github.com/desertfire	2025-09-19 21:06:56 +00:00
Samuel Park	df9a4824e6	Bugfix for doing negative padding (#161639 ) Fixes #161014 This bug fix introduces a fix that is consistent with the exception handling. Outlined in issue #161014, there is an edge case where the negative padding does not make the tensor size negative but still triggers the exception that the size is negative. The fix is simply adding `new_dim >=0` to include the zero dim and letting the operator return an empty tensor. In the PR I have added the edge case where the test will now check the negative padding where the dimension gets reduced to zero. But the sample is only for the `constant` type of padding. I would like some feedback if it is necessary to put the same sample on the `reduce` type as well. This is my first PR to contribute to PyTorch and any help/feedback will be welcome! Thank you! @malfet @manuelcandales @janeyx99 @ezyang Pull Request resolved: https://github.com/pytorch/pytorch/pull/161639 Approved by: https://github.com/manuelcandales	2025-09-19 20:57:05 +00:00
Shunting Zhang	248156ed06	[Inductor] do loop reordering in a separate final round (#162355 ) The previous LOAF after fusion algorithm is not guaranteed to create more fusion opportunities even if loop reordering happens. I cannot find an example where LOAF reduces the amount of fusion, but here is an example where reordering loops does not add more fusions: `a1f7639922/test/inductor/test_loop_ordering.py (L612-L641)` Move LOAF to a separate final round of fusion so that we are guaranteed not to reduce the amount of fusions. Hopefully this also helps compilation time since LOAF kicks in when there are fewer nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162355 Approved by: https://github.com/eellison, https://github.com/jansel ghstack dependencies: #162101, #162126	2025-09-19 20:21:33 +00:00
Shunting Zhang	e88460f453	[Inductor] don't call sympy_str when not needed (#162126 ) I see torch.compile spend 2% of time on sympy_str when compiling the bwd graph for MobileBertForQuestionAnswering. Most of the time sympy_str is called when extracting read/write dependencies. But when we are extracting read/write deps, the result of sympy_str is just discarded (correct me if I'm wrong). To make things simple, I just remove those calls. But if people think it may be useful for debugging, I can add a flag to only call sympy_str when it's explicitly set. <img width="667" height="409" alt="Screenshot 2025-09-03 at 6 21 52 PM" src="https://github.com/user-attachments/assets/a5929473-873d-4540-8f1e-c29f92be7125" /> (scuba link: https://fburl.com/scuba/pyperf_experimental/on_demand/3k2rduh9 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162126 Approved by: https://github.com/jansel, https://github.com/eellison ghstack dependencies: #162101	2025-09-19 20:21:33 +00:00
Shunting Zhang	466122b92c	[inductor] avoid creating LoopBody twice (#162101 ) Previously in merge_loops, we had to construct LoopBody twice to make sure we can use the same symbol prefix as before. This PR changes it to create LoopBody only once by allowing using the same symbol prefix for the new LoopBody. It looks like it's ok to have duplicate symbols in sympy replacement: ``` >>> x, y = sympy.symbols("x y") >>> (x + y).xreplace({x: 0, y: x + 1}) x + 1 >>> (x + y).xreplace({x: y * y, y: x + 1}) x + y**2 + 1 >>> (x + y + x * x).xreplace({x: 0, y: x}) x ``` UPDATE: add the same optimization for LoopBody.reorder_iter_loops Pull Request resolved: https://github.com/pytorch/pytorch/pull/162101 Approved by: https://github.com/jansel, https://github.com/eellison	2025-09-19 20:21:33 +00:00
ankushwahaRH	ba3c2c80ab	SDP Backend function fix (#161169 ) The issue cannot be reproduced using the original repro code provided in the issue description. However, the underlying issue mentioned by the maintainer (missing functions in `builder.py` and `trace_rules.py`) was never addressed and can still be reproduced with this test case: ```python import torch from torch.nn.attention import _cur_sdpa_kernel_backends @torch.compile(fullgraph=True) def test_function_that_triggers_error(): return _cur_sdpa_kernel_backends() print("Calling torch.compile function...") try: result = test_function_that_triggers_error() print(f"Success: {result}") except Exception as e: print(f"ERROR: {e}") print(f"Error type: {type(e)}") ``` The original repro likely no longer triggers the issue due to code path changes in the SDPA implementation, while the direct call to `_cur_sdpa_kernel_backends()` exposes the underlying problem where certain torch._C functions returning non-Tensor values aren't properly handled by dynamo tracing. I have implemented the changes by adding the missing functions to both `builder.py` and `trace_rules.py` to properly handle these cases during compilation. @guilhermeleobas Pull Request resolved: https://github.com/pytorch/pytorch/pull/161169 Approved by: https://github.com/guilhermeleobas, https://github.com/StrongerXi	2025-09-19 20:19:59 +00:00
Ke Wen	7130b174e0	[SymmMem] Fix memory allocation hold-up (#162680 ) Problem: Without MemPool it looks like nvshmem backend never deallocates memory. Cause: Handles in `symm_mems_` (a map) keep references to memory allocations. Solution: - Remove reference to allocation from handles -- the reference is never used anyway. - Use `unique_ptr` instead of `shared_ptr` to wrap allocation to ensure single ownership. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162680 Approved by: https://github.com/ezyang ghstack dependencies: #163298	2025-09-19 20:19:47 +00:00
Ke Wen	f8fb437197	[SymmMem] Barrier on team instead of world (#163298 ) As titled. Avoiding a potential hang when running dispatch and combine in subgroups. The rest is just re-arrange of the tests to create a sub-group test class. (no substantial change) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163298 Approved by: https://github.com/fegin	2025-09-19 20:19:47 +00:00
PyTorch MergeBot	2a308c7dee	Revert "Improve device info with new flops and bandwidth formula based on hardware libraries (#162245 )" This reverts commit 35d7b321597ed00245aad533a8fa6b7fdadd73ea. Reverted https://github.com/pytorch/pytorch/pull/162245 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162245#issuecomment-3313669412))	2025-09-19 20:09:12 +00:00
thenumberouscode	4a160dae3c	[CUDA] revert PR 130472 (#162950 ) This change may also resolve https://github.com/pytorch/pytorch/issues/161789, though verification is still needed. PR #130472 introduced the problem of freeing the same address without cleaning the metadata. According to the discussion below, it is reverted. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162950 Approved by: https://github.com/ngimel, https://github.com/eqy, https://github.com/syed-ahmed	2025-09-19 19:50:44 +00:00
Nikita Shulga	a273475b01	[BE] Introduce `CONDA_ROOT_DIR` (#163341 ) Which is equal to `%CONDA_PARENT_DIR%/Miniconda3`, and replace this pattern with `%CONDA_ROOT_DIR%` throughout the codebase Pull Request resolved: https://github.com/pytorch/pytorch/pull/163341 Approved by: https://github.com/clee2000 ghstack dependencies: #163339	2025-09-19 19:45:32 +00:00
Lucas Kabela	979e10f7d6	[Bugfix] Match eager stride semantics for cloned tensors with preserve_format in compile (#163017 ) Fixes #161010 by making `clone_meta` match the semantics of strides for eager mode. This is: * Case 1: Tensor is_non_overlapping_and_dense; in this case, stride should match input tensor stride * Case 2: Otherwise, stride should be contiguous computed from input tensor using `compute_elementwise_output_strides` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163017 Approved by: https://github.com/williamwen42, https://github.com/xmfan Co-authored-by: morrison-turnansky <mturnans@redhat.com>	2025-09-19 19:41:33 +00:00
Guilherme Leobas	bc7b17a36d	Realize LazyVariableTracker before raising exception (#163350 ) Improves error message reported on #163321 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163350 Approved by: https://github.com/Skylion007, https://github.com/xmfan	2025-09-19 19:25:17 +00:00
dsashidh	03f34fd307	Add explicit typing to nn.Module.__init__() parameters (#157389 ) Fixes #156740 Adds explicit `Any` typing to `*args` and `**kwargs` in `nn.Module.__init__()` to fix type checker errors in strict mode. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157389 Approved by: https://github.com/Skylion007, https://github.com/Raman-RH	2025-09-19 19:02:28 +00:00
Nikita Shulga	52dd7a898c	Move ROCM trunk wheel builds to 3.10 (#163339 ) This code is a delicious spaghetti: Sometimes python version is defined in jinja template (see https://github.com/pytorch/pytorch/pull/162297 ) sometimes in shell script (see https://github.com/pytorch/pytorch/pull/162877 ), but this time around it's in a python file (and there is another one called `generate_binary_build_matrix.py` that defines `FULL_PYTHON_VERSIONS`) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163339 Approved by: https://github.com/clee2000	2025-09-19 18:52:00 +00:00
Nikita Shulga	b8c5ec582f	[CD] Simplify NVIDIA driver installation step (#163349 ) Undo changes introduced in https://github.com/pytorch/pytorch/pull/160956 as driver has been updated to 580 for both fleets Fixes https://github.com/pytorch/pytorch/issues/163342 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163349 Approved by: https://github.com/seemethere	2025-09-19 18:50:47 +00:00
arkadip-maitra	a0d2d84846	Handling overflow for long int overflow for the product of kernel_hei… (#155989 ) …ght and kernel_width that overflows to be exactly 0 Fixes [#155981](https://github.com/pytorch/pytorch/issues/155981) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155989 Approved by: https://github.com/malfet	2025-09-19 18:15:01 +00:00
PyTorch MergeBot	607469bdad	Revert "[ROCm] Bump FBGEMM commit to avoid CK errors (#162590 )" This reverts commit c9b80c4d4b48deb1931e5f8641ab345d7cc7b639. Reverted https://github.com/pytorch/pytorch/pull/162590 on behalf of https://github.com/malfet due to This breaks CUDA 13 builds ([comment](https://github.com/pytorch/pytorch/pull/162590#issuecomment-3313263772))	2025-09-19 18:13:00 +00:00
PyTorch MergeBot	a3b68c7c57	Revert "Fix boxcox to return same result for same input in one batch (#162772 )" This reverts commit 49d30f9a234f0816a1ece278c8450d119e417714. Reverted https://github.com/pytorch/pytorch/pull/162772 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162772#issuecomment-3313213011))	2025-09-19 17:58:29 +00:00
Catherine Lee	2984bfe3da	[ez][CI] Run vllm workflow on vllm pin updates (#163353 ) As in title The auto pin update was merged without running vllm workflow Pull Request resolved: https://github.com/pytorch/pytorch/pull/163353 Approved by: https://github.com/malfet, https://github.com/wdvr	2025-09-19 17:32:49 +00:00
Janani Sriram	3e663ce5da	[Inductor][Triton][FP8] Add a Blackwell-specific scaled persistent + TMA template for GEMMs (#163147 ) Summary: X-link: https://github.com/meta-pytorch/tritonbench/pull/432 Add a Blackwell-specific scaled persistent + TMA Triton template to Inductor. This diff builds on D82515450 by adding a new set of mixins which inherit the scaling epilogue and add scaled persistent + TMA kwargs to the template. This diff also adds a benchmark for the scaled Blackwell persistent + TMA template to TritonBench `fp8_gemm`. Note that this diff is a minimal extension to the above diff; rather than adding a new kernel for the scaled version, we opted to simply extend the epilogue to account for scaling. This template is accurate for per-tensor and per-row scaling but may require modifications for other scaling modes, such as deepseek-style scaling, which apply scaling prior to the GEMM computation. In addition, note that epilogue subtiling is currently unsupported for both the scaled and non-scaled Blackwell templates, and functionality will be added in a subsequent diff. Test Plan: Verified that the scaled Blackwell template adds the scaling epilogue to the generated Triton kernel by inspecting the Inductor-generated Triton kernel. 
Benchmarking command: ``` TRITON_PRINT_AUTOTUNING=1 TORCHINDUCTOR_CACHE_DIR=~/personal/cache_dir_inductor TRITON_CACHE_DIR=~/personal/cache_dir_triton TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 buck2 run mode/{opt,inplace} pytorch/tritonbench:run -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -- --op fp8_gemm --only torch_fp8_gemm,blackwell_pt2_fp8_gemm --metrics tflops,accuracy --input-loader=/home/jananisriram/personal/fp8_shapes_testing.json --scaling_rowwise --output="/home/jananisriram/personal/fp8_shapes_testing_results.csv" --atol=1e-2 --rtol=0.5 2>&1 \| tee ~/personal/fp8_shapes_testing.log ``` Rollback Plan: Differential Revision: D82597111 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163147 Approved by: https://github.com/njriasan	2025-09-19 17:23:37 +00:00
Boyuan Feng	4967ad8baa	[Graph Partition] improve custom op output alias (#163227 ) For a custom op with multiple outputs, we will see the following generated code: ``` buf1 = op1(arg0) buf3 = buf0[0] buf4 = buf0[1] del buf1 # <--- if buf1 is not accessed in the future ``` If `buf1` is not accessed in the future, it's good to deallocate early. So we don't delay `del` until both buf3 and buf4 are not used anymore. Note that buf3 and buf4 hold reference to the data such that `del buf1` does not prevent their usage. However, when there are mutating args, we don't see `del buf1` immediately. ```python @torch.library.custom_op( "mylib::op1", mutates_args=["x"], schema="(Tensor(a!)? x) -> (Tensor, Tensor)", device_types="cuda", ) def op1(x) -> tuple[torch.Tensor, torch.Tensor]: x = x + 1 return (x + 1, x + 2) ``` <img width="661" height="821" alt="image" src="https://github.com/user-attachments/assets/3d1d1f5a-9749-4652-bb02-da593c78702d" /> Why? Because `buf3` is a MultiOutput with `buf1` as input and believes `buf1` (an output of FallbackKernel op1) has inputs that alias output. `72fedf0575/torch/_inductor/ir.py (L7976-L7982)` According to `[NOTE: FallbackKernel supported operators]`, as a mutating op that are auto-functionalizable, buf1's output should NOT alias any of the inputs. This PR improves get_inputs_that_alias_output of Fallback Kernel. Use case: [moe custom op in vllm](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/layer.py#L2057-L2064) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163227 Approved by: https://github.com/zou3519	2025-09-19 17:01:36 +00:00
drisspg	e631d76002	[Flex] Changing how bwd configs are setup and updating default b200 config (#163318 ) ```Shell Up to 4x perf boost 🔝 Top 5 Performance Differences (by absolute %): shape: (5, 7) ┌───────────┬────────────────┬────────────────────────────────┬───────────────────┬─────────────────────────────┬─────────────────────────────────┬────────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops BWD (base) ┆ TFlops BWD (better_configs) ┆ better_configs_speedup_over_ba… ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞═══════════╪════════════════╪════════════════════════════════╪═══════════════════╪═════════════════════════════╪═════════════════════════════════╪════════════╡ │ noop ┆ torch.bfloat16 ┆ (4, 16, 32768, 16, 32768, 128) ┆ 124.775035 ┆ 532.580435 ┆ 4.268325 ┆ 326.832527 │ │ noop ┆ torch.bfloat16 ┆ (4, 16, 16384, 16, 16384, 128) ┆ 124.494557 ┆ 519.798488 ┆ 4.175271 ┆ 317.527078 │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 32768, 16, 32768, 128) ┆ 123.984189 ┆ 512.877391 ┆ 4.136635 ┆ 313.663544 │ │ noop ┆ torch.bfloat16 ┆ (4, 16, 8192, 16, 8192, 128) ┆ 122.827725 ┆ 496.195958 ┆ 4.039772 ┆ 303.977164 │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 16384, 16, 16384, 128) ┆ 123.826738 ┆ 484.244647 ┆ 3.910663 ┆ 291.066303 │ └───────────┴────────────────┴────────────────────────────────┴───────────────────┴─────────────────────────────┴─────────────────────────────────┴────────────┘ 🔺 Top 5 Cases Where better_configs (change) is Faster than base (baseline): shape: (5, 7) ┌───────────┬────────────────┬────────────────────────────────┬───────────────────┬─────────────────────────────┬─────────────────────────────────┬────────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops BWD (base) ┆ TFlops BWD (better_configs) ┆ better_configs_speedup_over_ba… ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ 
╞═══════════╪════════════════╪════════════════════════════════╪═══════════════════╪═════════════════════════════╪═════════════════════════════════╪════════════╡ │ noop ┆ torch.bfloat16 ┆ (4, 16, 32768, 16, 32768, 128) ┆ 124.775035 ┆ 532.580435 ┆ 4.268325 ┆ 326.832527 │ │ noop ┆ torch.bfloat16 ┆ (4, 16, 16384, 16, 16384, 128) ┆ 124.494557 ┆ 519.798488 ┆ 4.175271 ┆ 317.527078 │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 32768, 16, 32768, 128) ┆ 123.984189 ┆ 512.877391 ┆ 4.136635 ┆ 313.663544 │ │ noop ┆ torch.bfloat16 ┆ (4, 16, 8192, 16, 8192, 128) ┆ 122.827725 ┆ 496.195958 ┆ 4.039772 ┆ 303.977164 │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 16384, 16, 16384, 128) ┆ 123.826738 ┆ 484.244647 ┆ 3.910663 ┆ 291.066303 │ └───────────┴────────────────┴────────────────────────────────┴───────────────────┴─────────────────────────────┴─────────────────────────────────┴────────────┘ 🔻 Top 5 Cases Where better_configs (change) is Slower than base (baseline): shape: (5, 7) ┌───────────────┬────────────────┬───────────────────────────────┬───────────────────┬─────────────────────────────┬─────────────────────────────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops BWD (base) ┆ TFlops BWD (better_configs) ┆ better_configs_speedup_over_ba… ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞═══════════════╪════════════════╪═══════════════════════════════╪═══════════════════╪═════════════════════════════╪═════════════════════════════════╪═══════════╡ │ document_mask ┆ torch.bfloat16 ┆ (4, 16, 8192, 16, 8192, 128) ┆ 267.502004 ┆ 250.728732 ┆ 0.937297 ┆ -6.270335 │ │ document_mask ┆ torch.bfloat16 ┆ (4, 16, 8192, 4, 8192, 128) ┆ 248.510516 ┆ 235.210874 ┆ 0.946483 ┆ -5.351742 │ │ document_mask ┆ torch.bfloat16 ┆ (4, 16, 16384, 4, 16384, 128) ┆ 282.856295 ┆ 271.806926 ┆ 0.960936 ┆ -3.906354 │ │ document_mask ┆ torch.bfloat16 ┆ (4, 16, 8192, 16, 8192, 64) ┆ 282.212695 ┆ 280.519092 ┆ 0.993999 ┆ -0.600116 │ │ document_mask ┆ 
torch.bfloat16 ┆ (4, 16, 32768, 4, 32768, 128) ┆ 295.864073 ┆ 294.477894 ┆ 0.995315 ┆ -0.468519 │ └───────────────┴────────────────┴───────────────────────────────┴───────────────────┴─────────────────────────────┴─────────────────────────────────┴───────────┘ 📊 Performance Summary: ============================================================ Baseline: base Change: better_configs Geometric Mean Speedup (change over baseline): 1.9954x Geometric Mean % Change: +99.54% Median Speedup (change over baseline): 2.1590x Speedup Std Dev: 0.9800 Valid Comparisons: 60/60 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163318 Approved by: https://github.com/BoyuanFeng	2025-09-19 16:57:21 +00:00
Eddie Yan	f8f230a801	[FP8][cuBLAS][H100] only test fp32 outputs for rowwise `_scaled_mm` on H100 (#162022 ) only cuBLAS supports float32 output and cuBLAS only supports rowwise for SM 9.0 Intended to land after #161305 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162022 Approved by: https://github.com/ngimel	2025-09-19 15:18:13 +00:00
Jagadish Krishnamoorthy	264e7f68a0	[ROCm] Fix mx fp8 and fp4 code after scaling refactor changes. (#163127 ) PR #151360 added mx fp8 and fp4 support on ROCm. 1. However, on recent upstream, scaling function in Blas.cpp along with test_matmul_cuda changes triggered failures. This patch corrects is_blockwise_1x32_scaling function code. 2. Fixes the m, n, k dimensions for ROCm mx case. 3. Modify FP4E2M1FN_LARGEST_POW2 (largest power of 2 representable in `torch.float4_e2m1fn_x2`) to 2. This resulted in higher SQNR value for mx fp4 test. Testing result on gfx950 w/ ROCm7.0 PYTORCH_TEST_WITH_ROCM=1 python test/test_matmul_cuda.py -k test_blockwise -v Ran 452 tests in 22.698s OK passed 111 This is same as before. (when PR 151360 was merged) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163127 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-19 12:29:52 +00:00
Prachi Gupta	bee362c381	[ROCm][SymmMem] Fix skip condition for PLATFORM_SUPPORTS_SYMM_MEM (#163205 ) It seems `TEST_CUDA` is set to true even for ROCm (MI200) jobs. Changing if TEST_CUDA to an else condition to avoid running symmetric memory UTs on MI200. For other non-rocm arch, it should return true and can be skipped using other skip decorators. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163205 Approved by: https://github.com/ezyang Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-19 12:12:47 +00:00
dependabot[bot]	33e6c5a93d	[Dependabot] Update(deps): Bump transformers from 4.54.0 to 4.56.0 in /.ci/docker/ci_commit_pins (#162063 ) * [Dependabot] Update(deps): Bump transformers Bumps [transformers](https://github.com/huggingface/transformers) from 4.54.0 to 4.56.0. - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.54.0...v4.56.0) --- updated-dependencies: - dependency-name: transformers dependency-version: 4.56.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> * Refresh results Signed-off-by: Huy Do <huydhn@gmail.com> * Another round of updates Signed-off-by: Huy Do <huydhn@gmail.com> * Another round of update Signed-off-by: Huy Do <huydhn@gmail.com> * Hopefully the last round of update Signed-off-by: Huy Do <huydhn@gmail.com> * Plz Signed-off-by: Huy Do <huydhn@gmail.com> --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: Huy Do <huydhn@gmail.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Huy Do <huydhn@gmail.com>	2025-09-19 02:50:36 -07:00
Xiao, Wang	ab5086a7ae	[WOQ] Add XPU kernel for _weight_int8pack_mm (#160938 ) Summary: This issue proposes implementing a XPU kernel for aten._weight_int8pack_mm, a weight-only quantized (WOQ) linear operation that is currently only supported on CPU and CUDA. Motivation: Same as https://github.com/pytorch/pytorch/pull/159325. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160938 Approved by: https://github.com/EikanWang, https://github.com/ZhiweiYan-96, https://github.com/liangan1, https://github.com/jerryzh168	2025-09-19 07:37:14 +00:00
Chien-Chin Huang	0815091d86	[CP][BE] Cosmetic refactors for CP code base (#163115 ) Summary: This PR is extracted from https://github.com/pytorch/pytorch/pull/162542, to make the original PR easier to review. This PR only contains cosmetic changes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163115 Approved by: https://github.com/tianyu-l ghstack dependencies: #162539, #162540, #162541	2025-09-19 07:21:46 +00:00
PyTorch MergeBot	32ad29b72a	Revert "[dynamo][guards] Fail on an unknown framelocals to dict conversion (#162695 )" This reverts commit a8432bcaadd6dea52a94429dced1fb4550f2f560. Reverted https://github.com/pytorch/pytorch/pull/162695 on behalf of https://github.com/anijain2305 due to internal failure at https://fburl.com/workplace/qiitdlp6 ([comment](https://github.com/pytorch/pytorch/pull/162695#issuecomment-3310757225))	2025-09-19 06:18:27 +00:00
PyTorch MergeBot	1302637a23	Revert "[dynamo][guards] Do not construct entire framelocals dict for LAMBDA_GUARD (#162525 )" This reverts commit 5f630d28d7ff9fdd8bd6cdbe2438e5c821007845. Reverted https://github.com/pytorch/pytorch/pull/162525 on behalf of https://github.com/anijain2305 due to internal tests fail ([comment](https://github.com/pytorch/pytorch/pull/162525#issuecomment-3310748980))	2025-09-19 06:15:28 +00:00
Xiaotian Hu	e0bcd58f57	[MTIA] Add MTIA dispatch for kernel foreach_maximum(Add D80022242 back) (#161571 ) Summary: dispatch MTIA to function foreach_tensor_maximum_scalar_kernel_mtia_ Test Plan: CI Rollback Plan: Differential Revision: D81086607 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161571 Approved by: https://github.com/malfet	2025-09-19 05:57:09 +00:00
PyTorch MergeBot	17081209e5	Revert "[CI] Move Windows build/tests to Python-3.10 (#162862 )" This reverts commit 2dcd153342d27b0981ff79eb2ccb8d8962e79c48. Reverted https://github.com/pytorch/pytorch/pull/162862 on behalf of https://github.com/malfet due to Breaks some windows tests ([comment](https://github.com/pytorch/pytorch/pull/162862#issuecomment-3310606135))	2025-09-19 05:16:49 +00:00
PyTorch MergeBot	578047838c	Revert "[BE] Update Python min version to 3.10 (#162310 )" This reverts commit 3016616ccbba3dc9bb6a80eb4a81a846ddf49cc9. Reverted https://github.com/pytorch/pytorch/pull/162310 on behalf of https://github.com/malfet due to Breaks some windows tests ([comment](https://github.com/pytorch/pytorch/pull/162862#issuecomment-3310606135))	2025-09-19 05:16:49 +00:00
can-gaa-hou	ce5637be29	Fix invalid indices bug for max_unpool2d/3d on MPS (#163036 ) Fixes #163035 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163036 Approved by: https://github.com/kulinseth, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-19 05:13:21 +00:00
Edward Z. Yang	c91f59b1a0	Fix performance regression when indexing by Numpy arrays (#163280 ) Benchmark script: ``` import time import numpy as np import torch def main() -> None: for i in range(10): block_indices = np.arange(16384, dtype=np.int32) block_indices = block_indices.reshape(-1).clip(max=255) batch_indices = np.zeros(16384, dtype=np.int64) virtual_batches = 32 block_table = torch.randn(32, 256) start = time.perf_counter() block_table[batch_indices, block_indices].view(virtual_batches, -1) end = time.perf_counter() time_elapsed_ms = (end - start) * 1000 print(f"Function execution time: {time_elapsed_ms:.1f}ms") if __name__ == "__main__": main() ``` Before: ``` (a) [ezyang@devvm006.dkl0 ~/local/b/pytorch] python ben.py Function execution time: 28.5ms Function execution time: 12.9ms Function execution time: 12.6ms Function execution time: 13.5ms Function execution time: 12.0ms Function execution time: 13.4ms Function execution time: 12.9ms Function execution time: 12.9ms Function execution time: 13.1ms Function execution time: 13.0ms ``` After: ``` Function execution time: 17.8ms Function execution time: 2.5ms Function execution time: 1.3ms Function execution time: 2.5ms Function execution time: 2.3ms Function execution time: 1.3ms Function execution time: 2.4ms Function execution time: 2.5ms Function execution time: 2.5ms Function execution time: 2.4ms ``` Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163280 Approved by: https://github.com/SherlockNoMad, https://github.com/cyyever	2025-09-19 05:02:58 +00:00
Nikita Shulga	3016616ccb	[BE] Update Python min version to 3.10 (#162310 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162310 Approved by: https://github.com/atalman, https://github.com/Skylion007, https://github.com/ZainRizvi ghstack dependencies: #162862	2025-09-19 04:28:56 +00:00
PyTorch UpdateBot	46c647d1ee	[vllm hash update] update the pinned vllm hash (#163304 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163304 Approved by: https://github.com/pytorchbot	2025-09-19 04:25:43 +00:00
Scott Wolchok	76a841fd47	Port OpSchema.__post_init__ and OpSchema._recompute_comparison_key to C++ (#161695 ) I initially didn't see good results porting this, but it was apparently because of pybind11 function calling overhead. (pybind11's object-handling primitives seem fine enough.) I'm interested in setting up nanobind, but this demonstrates it's not blocking. Differential Revision: [D81530102](https://our.internmc.facebook.com/intern/diff/D81530102) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161695 Approved by: https://github.com/ezyang	2025-09-19 04:07:30 +00:00
Animesh Jain	bd964cbbfb	[functionalize] Avoid one more call to custom get_device on FunctionalTensorWrapper (#163019 ) Trying to reduce the number of `__torch_dispatch__` calls of FakeTensorMode in the AOT metadata collection pass. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163019 Approved by: https://github.com/Lucaskabela, https://github.com/bdhirsh, https://github.com/zou3519 ghstack dependencies: #162987	2025-09-19 02:52:08 +00:00
Jae Ku	5f25dbe7fd	Rm pytorch deps platform args (#163086 ) Summary: Platform args was a buck1 concept that we decided to port over to buck2 in order to make the migration easier. However, platforms args existing in the repo blocks some buck modernization like modefile free efforts, so we're trying to get rid of the usage. Test Plan: CI Rollback Plan: Differential Revision: D82470032 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163086 Approved by: https://github.com/malfet, https://github.com/8Keep	2025-09-19 02:13:03 +00:00
Han Chao	e134bb340a	Update torch-xpu-ops commit pin (#163244 ) Update the torch-xpu-ops commit to `24fab67b6e`, includes: - Clean up getDeviceIndexOfCurrentQueue - Fix hardswish gradients corner case - Fix xccl contiguous check - Move checks from nonzero kernel to operator - support high priority stream for xccl Pull Request resolved: https://github.com/pytorch/pytorch/pull/163244 Approved by: https://github.com/EikanWang	2025-09-19 02:04:40 +00:00
Xuan Zhang	6e680ae8de	add more restriction to fusion with large accumulate reads (#163163 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163163 Approved by: https://github.com/yf225	2025-09-19 01:20:30 +00:00
James Wu	3c9e220f34	Refactor PrecompileContext to be considerably more debuggable (#162740 ) Summary: This diff does a few things: - It refactors PrecompileContext to store DynamoCacheEntries directly on the context. This allows us at serialization time to check if the dynamo cache entry has all its backends ready for serialization, and if not, skip unnecessarily serializing it - It also gives us the ability to print out a `debug` JSON, which contains a mapping for everything being serialized and deserialized. Here's an example of what that JSON looks like: ``` { "artifacts": { "precompile_aot_autograd": [ "__compiled_fn_8_306d538b_f7f8_4ab4_98a1_b5ff4493f99d" ], "precompile_dynamo": [ { "backend_ids": [ "__compiled_fn_8_306d538b_f7f8_4ab4_98a1_b5ff4493f99d" ], "fn_name": "TorchBenchmarkRunner.forward_and_backward_pass", "num_codes": "10", "python_version": "3.12.11+meta", "torch_version": "2.10.0a0+fb" } ] }, "num_entries": 1 } ``` Test Plan: Existing tests pass. NanoGPT tlparse showing the new debug: https://manifold.edge.x2p.facebook.net/v0/read/tree/logs/.tmpeIsL5G/index.html?bucketName=tlparse_reports&apiKey=tlparse_reports-key&withPayload=1&timeoutMsec=10000 Note that there aren't compile ids since we're logging this in PrecompileContext.serialize() for now, where there isn't a compile yet. I think this is fine for now, as no compile ID makes sense here. If anything, these kind of belong in a "Global" compile ID, which I will not implement in this PR. Rollback Plan: Differential Revision: D82232574 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162740 Approved by: https://github.com/zhxchen17	2025-09-19 01:14:28 +00:00
Prachi Gupta	c9b80c4d4b	[ROCm] Bump FBGEMM commit to avoid CK errors (#162590 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/162590 Approved by: https://github.com/jeffdaily	2025-09-19 01:14:20 +00:00
Chien-Chin Huang	cd4303afc6	[CP][BE] Correct the names of some tests (#162541 ) We are not doing ring attention but only using allgather to do CP for Flex. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162541 Approved by: https://github.com/ezyang, https://github.com/Skylion007, https://github.com/tianyu-l, https://github.com/XilunWu ghstack dependencies: #162539, #162540	2025-09-19 00:38:04 +00:00
Nikita Shulga	2dcd153342	[CI] Move Windows build/tests to Python-3.10 (#162862 ) What was supposed to be a very simple change ended up being quite involved, as the current Windows CI framework is quite inflexible, i.e. it takes a lot of arguments, but later on ignores them, namely: - `PYTHON_VERSION` used to be a no-op that is simply ignored by the scripts - With this change, `setup-win` action will create an environment called `py_tmp` with specific python version + intel-openmp (that is a hard runtime requirement, but for some reason not packaged into the wheel nor marked as such) - Introduced `CONDA_ROOT_DIR` env variable in `activate_miniconda3.bat` to avoid `%CONDA_PARENT_DIR%\Miniconda3` invocations throughout the codebase - Copied test type dependencies from `be01a40157/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1 (L16)` into `win-test.sh`, but made some adjustments to be compatible with 3.10 runtime (scipy version update) and just make rerun-tests compatible with the rest of the deps I think in the long run, one needs to update `4432e2cacd/aws/ami/windows/scripts/Installers/Install-Miniconda3.ps1` that currently pins Miniconda python to 3.9, but also figure out how CI can still create a new environment without having to download all the dependencies all the time Pull Request resolved: https://github.com/pytorch/pytorch/pull/162862 Approved by: https://github.com/wdvr, https://github.com/huydhn	2025-09-19 00:33:03 +00:00
Sherlock Huang	04842ac2b0	Change DebugMode record_torchfunction default to False (#163293 ) Result is too noisy with `record_torchfunction = True`. Change it to False, to make it clean. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163293 Approved by: https://github.com/zpcore	2025-09-19 00:30:53 +00:00
Jacob Szwejbka	17c16537e2	Deprecate Lite Interpreter (#163289 ) Summary: Point people lowering to lite interpreter to the existence of ExecuTorch. Added the typing deprecation, a warnings deprecation Test Plan: Try using it, see deprecation warning Reviewed By: lucylq Differential Revision: D82759566 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163289 Approved by: https://github.com/larryliu0820	2025-09-18 23:56:21 +00:00
Animesh Jain	ddc56f6f92	[functional] Use the saved device on storage instead for device_custom (#162987 ) Trying to reduce the number of __torch_dispatch__ calls of FakeTensorMode in the AOT metadata collection pass. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162987 Approved by: https://github.com/Lucaskabela, https://github.com/bdhirsh, https://github.com/zou3519	2025-09-18 23:43:20 +00:00
Chien-Chin Huang	096d35c44c	[CP] Remove the need of recording cp_dim in the global var (#162540 ) This information can be obtained during the dispatching. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162540 Approved by: https://github.com/ezyang, https://github.com/tianyu-l, https://github.com/XilunWu ghstack dependencies: #162539	2025-09-18 23:40:48 +00:00
Pian Pawakapan	4c007073e6	[dynamic shapes] DynamicInts prototype (#162194 ) Initial prototype for dynamic int inputs, allows users to run with `torch.compile(f)(DynamicInt(4))`, compiling dynamically and using the underlying hint at runtime. Current behavior: - Also works in eager (mostly by subclassing int), as scalar input to torch functions, or numpy/math/etc. For example, `x = DynamicInt(3); torch.randn(x); torch.add(y, z, alpha=x); np.arange(x)` all act as if x = 3. - Behavior for arithmetic ops is to return new DynamicInts rather than static ints; `DynamicInt(3) * 2 = DynamicInt(6)`. This is via SymNode magic methods, but coverage might not be 100% - for example, I had to explicitly override floordiv to avoid int casting. This is not necessarily the case for non-magic method ops (e.g. `math.cos(x)`). The alternative here is to int cast on all operations, but I opted for this for dynamism propagation in non-compiled regions. - Doesn't ban fullgraph=False; DynamicInt objects might be leaked back to the user, but I guess this is fine, because they can be casted to ints when needed? - Dynamo only allocates one symbol per DynamicInt; specifying the same DynamicInt for multiple inputs leads to input deduplication, and a guard installed. - We don't raise on int specialization (in allowlist/maybe_mark_dynamic style) - but an easy change if needed. - DynamicInts as nn.Module attributes are handled. - We don't guard on the DynamicInt id, e.g. users can do the following without recompiling (maybe we should guard?) ```python x = DynamicInt(4) f(x) f(1) f(DynamicInt(3)) # same as f(3) ``` Follow-up work: - Specifying shape constraints, either at the int-level, e.g. ```python DynamicInt(64, name="s0", constraints=["s0 % 32 == 0", "s0 <= 1024"] ``` or at the compilation level, e.g. 
something like ```python s0 = DynamicInt(64, name="s0") s1 = DynamicInt(128, name="s1") with some_compiler_config.dynamic_int_constraints(["s1 == 2*s0", "s0 % 32 == 0"]): f(s0, s1) ``` This should subsume the need for specifying derived SymInts? - SymFloat support - currently it seems backed floats are specialized by the tensorify float pass, and there's no handling in inductor. - Propagating dynamism in tensor constructors, e.g. `x = DynamicInt(4); torch.randn(x)` could annotate `_dynamo_dynamic_indices`. Differential Revision: D81698719 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162194 Approved by: https://github.com/bobrenjc93	2025-09-18 23:26:28 +00:00
Mergen Nachin	f4eca0e3b3	Try updating ET pin in PT/PT (#159664 ) Looking into resolving this: https://github.com/pytorch/pytorch/issues/159599 Test Plan: Wait for executorch CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/159664 Approved by: https://github.com/malfet	2025-09-18 21:55:16 +00:00
bobrenjc93	ed3438ff13	Turn on capture_dynamic_output_shape_ops when fullgraph=True (#163123 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163123 Approved by: https://github.com/laithsakka ghstack dependencies: #163121	2025-09-18 21:24:15 +00:00
bobrenjc93	7dcb568c8f	Turn on capture_scalar_outputs when fullgraph=True (#163121 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163121 Approved by: https://github.com/laithsakka	2025-09-18 21:24:15 +00:00
Sherlock Huang	bb7c9a2d41	[DTensor] Fix DTensor.mean with uneven sharding (#163241 ) Fixes #162692 When input is uneven sharded, redistribute input as Replicated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163241 Approved by: https://github.com/dcci	2025-09-18 19:53:51 +00:00
Edward Yang	159c2140f7	test: ensure editable cached wrapper is respected (#160943 ) ## Summary - add a test verifying that editing the local cache wrapper is picked up after Dynamo reset ## Testing - `lintrunner -a` (fails: FLAKE8 failure, TEST_HAS_MAIN failure, CODESPELL failure, PYFMT failure) - `PYTHONPATH=. python test/inductor/test_codecache.py TestPyCodeCache.test_editable_cached_wrapper -v` ------ https://chatgpt.com/codex/tasks/task_e_68a3aa3fcc9883239b17d1f4250d1e89 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160943 Approved by: https://github.com/xmfan, https://github.com/albanD	2025-09-18 19:24:51 +00:00
Jeff Daily	62a746f62c	[ROCm] update ci_expected_accuracy for dynamo benchmarks (#163256 ) Some tests that were already failing changed status to skipped. Some model entries were missing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163256 Approved by: https://github.com/malfet Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-18 19:05:19 +00:00
Edward Yang	8627454c84	Add local file path to inductor_output_code trace metadata (#160920 ) ## Summary - include local file path in `inductor_output_code` structured trace metadata - adjust structured trace tests for new `file_path` field ## Testing - `python test/dynamo/test_structured_trace.py StructuredTraceTest.test_compile_id_serialization_deserialization` - `lintrunner -a torch/_inductor/codecache.py torch/_inductor/graph.py test/dynamo/test_structured_trace.py` (fails: MYPY failure) ------ https://chatgpt.com/codex/tasks/task_e_68a2b02b54ec8323ae820120605a9f1c Pull Request resolved: https://github.com/pytorch/pytorch/pull/160920 Approved by: https://github.com/oulgen	2025-09-18 18:39:46 +00:00
thenumberouscode	93964ed6ab	[unit test] correct wrong input shape in test_flop_fx (#163148 ) The input tensor shape does not match the weight tensor shape, which was detected by the validation logic implemented in my other PR(https://github.com/pytorch/pytorch/pull/160408). The input tensor should have a shape of (2, 2, 3), since dimension 1 of the input (representing input channels) must match dimension 0 of the weight tensor (representing input channels). ref https://docs.pytorch.org/docs/stable/generated/torch.nn.ConvTranspose1d.html Pull Request resolved: https://github.com/pytorch/pytorch/pull/163148 Approved by: https://github.com/eellison	2025-09-18 18:38:01 +00:00
Ke Wen	80f8be9840	[SymmMem] Fix put_signal + wait_until hang (#163194 ) The test used a wrong ptr to refer to remote address: ``` dst_ptr = out_hdl.buffer_ptrs[peer] src_ptr = inp_hdl.buffer_ptrs[rank] sig_ptr = out_hdl.signal_pad_ptrs[peer] ``` All three indices should be `rank` instead of `peer` because NVSHMEM APIs accept local address as input and perform translation internally. Without correct signal address, the peer would be waiting, thus hang. Also adjusted the signature of `nvshmem.putmem_signal_block` to accept tensor instead of pointer. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163194 Approved by: https://github.com/ngimel ghstack dependencies: #163025, #163152	2025-09-18 18:18:58 +00:00
Edward Z. Yang	e36a6fcf0f	Massive hack to make autograd shut up about threaded PG mutations (#163238 ) See the Note for explanation. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/163238 Approved by: https://github.com/albanD	2025-09-18 18:12:57 +00:00
Benji Beck	23af32a078	[WOQ] Integrate CUDA support for concat linear int8pack_mm woq optimization pattern (#161848 ) Summary: What: Enables CUDA support for concat linear int8_mm woq optimization pattern by: - Updating pattern validation to accept CUDA devices - Adding test coverage for CUDA Why: Extend WOQ to more device types Test Plan: ``` buck2 run 'fbcode//mode/opt' //caffe2/test/inductor:cuda_select_algorithm ``` Rollback Plan: Differential Revision: D80884518 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161848 Approved by: https://github.com/jerryzh168	2025-09-18 18:08:07 +00:00
henrylhtsang	a81a2e54ed	[submodule] CUTLASS upgrade to 4.2.0 and change cutlass to cutlass_cppgen (#163092 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163092 Approved by: https://github.com/drisspg, https://github.com/Skylion007	2025-09-18 18:03:51 +00:00
PyTorch MergeBot	4b7aed89d8	Revert "[torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942 )" This reverts commit 627482a7b7780752c0e7aea034a2eb2db5899fcc. Reverted https://github.com/pytorch/pytorch/pull/162942 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it needs some fixes for CUDA 13 ([comment](https://github.com/pytorch/pytorch/pull/162942#issuecomment-3308784448))	2025-09-18 17:49:16 +00:00
Robert Hardwick	1aeac304b8	Move prioritized text linker optimization code from setup.py to cmake (#160078 ) Note. This is a replica PR of #155901 which will be closed. I had to create a new PR in order to add it into my ghstack as there are some later commits which depend on it. ### Summary 🚀 This PR moves the prioritized text linker optimization from setup.py to cmake ( and enables by default on Linux aarch64 systems ) This change consolidates what was previously manual CI logic into a single location (cmake), ensuring consistent behavior across local builds, CI pipelines, and developer environments. ### Motivation Prioritized text layout has measurable performance benefits on Arm systems by reducing code padding and improving cache utilization. This optimization was previously triggered manually via CI scripts (.ci/aarch64_linux/aarch64_ci_build.sh) or user-set environment variables. By detecting the target architecture within setup.py, this change enables the optimization automatically where applicable, improving maintainability and usability. Note: Due to ninja/cmake graph generation issues we cannot apply the linker file globally to all targets, so the targets must be manually defined. See CMakeLists.txt: the main libraries torch_python, torch, torch_cpu, torch_cuda, torch_xpu have been targeted, which should be enough to maintain the performance benefits outlined above. Co-authored-by: Usamah Zaheer <usamah.zaheer@arm.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160078 Approved by: https://github.com/seemethere	2025-09-18 17:09:48 +00:00
Tugsbayasgalan Manlaibaatar	56893ca1f6	Don't register wrong overload to prim decomp (#163138 ) These decompositions take precedence before CIA decomps in fake tensor prop, as a result, we would hit this implementation for all where overloads which is wrong in some cases. For the overloads that can't be implemented by this decomp, we just run the default CIA impl. Previously this doesn't matter because in post-dispatch IR, aten.where would have decomposed but when user tries to preserve aten.where this issue will surface because fake tensor will start seeing aten.where. Differential Revision: [D82604702](https://our.internmc.facebook.com/intern/diff/D82604702) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163138 Approved by: https://github.com/henryoier, https://github.com/ezyang	2025-09-18 17:01:19 +00:00
Catherine Lee	af8c232b75	[CI] reuse old whl: fix metadata file not getting version replaced (#163214 ) In the .dist-info/METADATA file, the version was not being written with the new sha. On python <3.11 (I think), the glob `*` will only match directories, so change this to ``, which I checked that it will match both files and directories on py3.9 and py3.13. There's probably also a bunch of mismatches in RECORD, but that's a problem for later. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163214 Approved by: https://github.com/huydhn	2025-09-18 16:08:29 +00:00
Catherine Lee	4908fb53c3	[testing] Add test owner labels for some ao sparse tests (#163203 ) I am trying to give some test files better owner labels than `module: unknown`. I am not sure about them, but they seem pretty reasonable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163203 Approved by: https://github.com/jcaip	2025-09-18 16:08:13 +00:00
Colin Peppler	3c8b90542c	support unbacked softmax / logsoftmax (#162216 ) ### DDE ``` GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(3u0, 0) (unhinted: Eq(3u0, 0)). (Size-like symbols: u0) Caused by: (_decomp/decompositions.py:1185 in _softmax) ``` ``` torch._dynamo.exc.UserError: Could not guard on data-dependent expression Eq(u0, 0) (unhinted: Eq(u0, 0)). (Size-like symbols: u0) Caused by: logsoft = torch.nn.functional.log_softmax(nz, dim=0) # test/inductor/test_unbacked_symints.py:573 in fn (_decomp/decompositions.py:1212 in _log_softmax) ``` ``` GuardOnDataDependentSymNode: Could not guard on data-dependent expression Ne(u0, 0) (unhinted: Ne(u0, 0)). (Size-like symbols: u0) Caused by: (_refs/__init__.py:2218 in _reduction) ``` ### Cannot convert symbols to int ``` File "torch/_inductor/lowering.py", line 7160, in prepare_softmax_online and V.graph.sizevars.size_hint(rnumel) >= config.unroll_reductions_threshold ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "orch/_inductor/sizevars.py", line 591, in size_hint return int(out) ^^^^^^^^ File "sympy/core/expr.py", line 342, in __int__ raise TypeError("Cannot convert symbols to int") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162216 Approved by: https://github.com/laithsakka, https://github.com/eellison	2025-09-18 15:43:20 +00:00
morrison-turnansky	1f21f8544c	fixing graph break for namedtuple._replace (#160139 ) Fixes #158772 _replace works without graph break Pull Request resolved: https://github.com/pytorch/pytorch/pull/160139 Approved by: https://github.com/mlazos	2025-09-18 14:32:36 +00:00
Saman Khatir	1330c638be	Update Microsoft C++ Redistributable to the latest version (#161430 ) Update Microsoft C++ Redistributable link to the latest version as one of the libraries used by AMD currently has a dependency on that. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161430 Approved by: https://github.com/malfet	2025-09-18 14:22:03 +00:00
Jagadish Krishnamoorthy	8bc4a467a7	[ROCm] test_aot_inductor: Enable fp8 tests. (#163050 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163050 Approved by: https://github.com/jeffdaily	2025-09-18 14:05:21 +00:00
Xinya Zhang	e769026bcb	[ROCm] Remove HIPBLASLT_ALLOW_TF32 from codebase (#162998 ) A few UT failures are caused by `HIPBLASLT_ALLOW_TF32` Fixes #157094 Fixes #157093 Fixes #157092 Fixes #157091 Fixes #157064 Fixes #157063 Fixes #157062 Fixes #157061 Fixes #157042 Fixes #157041 Fixes #157039 Fixes #157004 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162998 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-18 13:53:48 +00:00
Natalia Gimelshein	14f8d86136	Reland #161649 , vectorize stored in cat for all dtypes (#162440 ) Per title Pull Request resolved: https://github.com/pytorch/pytorch/pull/162440 Approved by: https://github.com/Skylion007	2025-09-18 13:50:44 +00:00
PaliC	c43ccfbc2d	[BE] Remove bottleneck (#163210 ) Some cleanup related to this RFC: https://github.com/pytorch/pytorch/issues/68742 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163210 Approved by: https://github.com/ezyang	2025-09-18 12:08:13 +00:00
Wei Feng	cfb8aec1a4	[FSDP2] idempotent reset_sharded_param: no-op if _local_tensor is already padded (#163130 ) resolves https://github.com/pytorch/torchtitan/issues/1136 torchtitan use cached state dict for ft. reset_sharded_param should be idempotent if model.parameters() are padded already ``` # pad DTensor._local_tensor fully_shard(model) sd = fsdp_model.state_dict() # reset_sharded_param should be a no-op in lazy_init loss = fsdp_model(inp).sum() ``` this PR make `reset_sharded_param` idempotent by checking storage data ptr and return early unit test ``` pytest -s test/distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_cached_state_dict ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163130 Approved by: https://github.com/tianyu-l	2025-09-18 09:20:37 +00:00
rzou	98ce93db0b	[DTensor] Add guide for what to do about mixed torch.Tensor and DTensor operations (#162651 ) Also updates the error message to point to the guide. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162651 Approved by: https://github.com/ezyang ghstack dependencies: #162117, #162307	2025-09-18 06:41:02 +00:00
vandrei	627482a7b7	[torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942 ) In various benchmarks scattered across the repo, the limits for flops/second and memory bandwidth are usually hardcoded for a single device. This utility could help in providing a more structured way to query the device capabilities. If this is approved, we can use it when reporting flops efficiency and bandwidth relative to peak in the benchmarks and tests. The intent is to add more devices, more parameters (e.g. L2 cache bandwidth, NVLink, etc.) for both CPUs and accelerators. Testing: ``` import torch if torch.cuda.is_available(): device = torch.cuda.current_device() mod = torch.get_device_module('cuda') hw = mod._device_limits.GPULimits(device) print(hw.get_tflops_per_second(torch.float16)) print(hw.get_tflops_per_second(torch.float32)) print(hw.get_tflops_per_second(torch.float64)) print(hw.get_tflops_per_second(torch.bfloat16)) print(hw.get_tflops_per_second(torch.int8)) print(hw.get_memory_bandwidth_Bps() / 1e9) print(hw.get_shared_memory_bandwidth_Bps() / 1e9) # Output on an H100 GPU 1070.53056 535.26528 66.90816 1070.53056 2141.06112 4893.696 33454.08 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162942 Approved by: https://github.com/ngimel	2025-09-18 06:40:07 +00:00
Markus Hoehnerbach	c5e7bb08b0	[inductor] pdl inductor option (disabled by default) (#160928 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160928 Approved by: https://github.com/eellison	2025-09-18 06:35:28 +00:00
Dmitry Nikolaev	6f9b4ccf8f	Fix SEMI_STRUCTURED_SUPPORTED_BACKENDS selection on CUDA and ROCm (#163223 ) It should work with the current CUDA/ROCm device_capability enumeration anyway. But it will help to avoid unexpected triggering in the future Pull Request resolved: https://github.com/pytorch/pytorch/pull/163223 Approved by: https://github.com/jeffdaily	2025-09-18 06:29:29 +00:00
Chien-Chin Huang	708dc6e3cd	[CP][BE] Remove _AttentionContextParallel (#162539 ) This is not an API we want to support. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162539 Approved by: https://github.com/ezyang, https://github.com/tianyu-l	2025-09-18 06:20:18 +00:00
Anshul Sinha	7803d2c244	[FSDP][Replicate] tests replicate synchronization after optimizer states (#162785 ) Summary: In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. Verify replicate correctly handles post-optimizer events. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_post_optim_event Pull Request resolved: https://github.com/pytorch/pytorch/pull/162785 Approved by: https://github.com/mori360 ghstack dependencies: #162631, #162636, #162650, #162654, #162656, #162658	2025-09-18 04:47:09 +00:00
Sherlock Huang	033b7d1e1a	[Reland] Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available (#163187 ) Reland of #160532 Summary: To support exporting a cuda model on a CPU-only machine under fake tensor mode. User commonly need to move sample inputs to the cuda device with .to("cuda:0") or .to("cuda") call. This diff supports this. I expect the following pattern to work ``` with FakeTensorMode(allow_non_fake_inputs=True): cuda_module = module.to("cuda:0") cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs]) with torch.no_grad(): ep = torch.export.export(cuda_module, cuda_sample_inputs) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163016 Approved by: https://github.com/huydhn Pull Request resolved: https://github.com/pytorch/pytorch/pull/163187 Approved by: https://github.com/angelayi	2025-09-18 04:46:26 +00:00
PyTorch UpdateBot	d734b26141	[vllm hash update] update the pinned vllm hash (#163218 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163218 Approved by: https://github.com/pytorchbot	2025-09-18 04:31:47 +00:00
Nick Riasanovsky	a27c002186	[BE] [Triton] [Inductor] Add an assert for store_output val_shape to use a tuple (#162887 ) Summary: Updates the remaining tests to all ensure val_shapes is always passed a tuple and not a list. Enables adding an assert consistent with the other function arguments. Test Plan: Depends on CI. Rollback Plan: Differential Revision: D82383319 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162887 Approved by: https://github.com/NikhilAPatel	2025-09-18 04:30:36 +00:00
Laith Sakka	0f462740a0	replace more // with FloorDiv in inductor code (#162969 ) see this https://github.com/pytorch/pytorch/pull/162869 for more context, sympy div representation can make reasoning fail. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162969 Approved by: https://github.com/ezyang, https://github.com/eellison, https://github.com/jansel	2025-09-18 03:28:31 +00:00
Ruben Rodriguez Buchillon	d6aaf08344	[inductor][heuristics] add kernel template params (#162781 ) # why - enable a clear interface for kernel templates to declare all their instantiation parameters and any potential defaults - simplify KernelTemplateChoice to just have a single params, and not kwargs and extra_kwargs # what - KernelTemplateParams interface - placeholder implementation where we just pass through a dict # testing - existing ci tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/162781 Approved by: https://github.com/jansel	2025-09-18 02:15:42 +00:00
Er-Xin (Edwin) Shang	13304401df	Port 4 dynamo test files for the intel XPU (#160953 ) # Description Fixes #114850, we will port dynamo tests to Intel GPU We could enable Intel GPU with following methods and try the best to keep the original code styles: # Changes 1. Get device type from accelerator method. 2. Replace the requires cuda statement with requires_gpu. 3. Add HAS_XPU_AND_TRITON into the scope. 4. Add several wrapper methods in cuda module into the accelerator. # Notify Pull Request resolved: https://github.com/pytorch/pytorch/pull/160953 Approved by: https://github.com/EikanWang, https://github.com/guangyey, https://github.com/jansel Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>	2025-09-18 01:54:45 +00:00
Huy Do	8e48d1ba25	Skip reuse PyTorch wheel when building vLLM (#163232 ) This issue started surfacing in [trunk](`b26d4c9a7a/1`). When building vLLM, uv doesn't like that we rename the CI wheel without changing its metadata to match. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163232 Approved by: https://github.com/izaitsevfb	2025-09-18 01:42:32 +00:00
Zhengxu Chen	6189a5f731	[dynamo][ez] Initialize tracer_output to None by default. (#163169 ) Summary: In edge cases, tracer_output can be left unset if there's double exception raised which causes the following issue: ``` UnboundLocalError: local variable 'tracer_output' referenced before assignment ``` Default initialize this variable so that it's always present. Test Plan: CI Rollback Plan: Differential Revision: D82652815 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163169 Approved by: https://github.com/tugsbayasgalan	2025-09-18 01:30:23 +00:00
Xia, Weiwen	48a7e8cc70	[CPU][GEMM Template] Improve A16W8 performance (#162479 ) Summary Improve A16W8 performance by 1. supporting GQA concat linear 2. using smaller cache blocking size 3. improving code for dequantization of weight (reducing instructions and adding prefetch) We saw > 5% E2E next token performance gain when running Llama3.1-8B-instruct. Test plan Already covered by UT Pull Request resolved: https://github.com/pytorch/pytorch/pull/162479 Approved by: https://github.com/mingfeima, https://github.com/CaoE, https://github.com/jansel	2025-09-18 01:28:37 +00:00
Anshul Sinha	f17e2ab1f9	[FSDP][Replicate] tests replicate with prefetching (#162658 ) Summary: Prefetching tests validate that distributed training systems can correctly overlap communication and computation by pre-loading parameters or data before they're needed. This test ensures the prefetching mechanism doesn't break training correctness while potentially improving performance by reducing idle time where computation waits for communication to complete. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_explicit_prefetching Pull Request resolved: https://github.com/pytorch/pytorch/pull/162658 Approved by: https://github.com/mori360 ghstack dependencies: #162631, #162636, #162650, #162654, #162656	2025-09-18 01:05:16 +00:00
Anshul Sinha	e14b290d1e	[FSDP][Replicate] tests replicate module functionality when used multiple times in a forward pass (#162656 ) Summary: Verifies that Replicate works correctly when a module is used multiple times in a single forward pass. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_multi_forward_module Pull Request resolved: https://github.com/pytorch/pytorch/pull/162656 Approved by: https://github.com/mori360 ghstack dependencies: #162631, #162636, #162650, #162654	2025-09-18 01:02:08 +00:00
Laith Sakka	04ddea44fd	Fix: ShapeEnv not propagated properly to inductor SizeVars (#162927 ) Summary: I am really skeptical about inductor sizevars creating an empty shape env when not provided with one i think we should fail there if the graph has dynamic shapes and no shape env is provided. however i wonder if there are actually use cases that depends on the shape env not being there? Reasoning APIs depends on facts in the shape env. and assumes some stuff exists for specific symbols. Test Plan: Fix the bug reported in creating simple e2e unit test is not trivial https://www.internalfb.com/diff/D82337184 Rollback Plan: Differential Revision: D82412384 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162927 Approved by: https://github.com/ezyang, https://github.com/eellison, https://github.com/jansel	2025-09-18 00:56:22 +00:00
Ke Wen	57a54a04b6	[SymmMem] Fix NVSHMEM plugin + Triton 3.5 (#163152 ) 1. The dispatch signatures defined in `core.extern_elementwise` call must match the C signature of the NVSHMEM functions, in particular the dtypes. Otherwise, there would be weird errors, such as IMA or hang. When matched, most of time the NVSHMEM device function will be inlined into the generated PTX. When not matched, it is represented as a function call in the PTX (not sure if it is the function call that goes wrong). 2. When calling the `core.extern` wrappers from the `triton.jit` kernels, the input must be cast to match the signatures defined in 1, e.g. via `nbytes.to(tl.int64)`. Otherwise, Triton will report a key error when searching for such kernel. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163152 Approved by: https://github.com/ngimel ghstack dependencies: #163025	2025-09-18 00:50:22 +00:00
Anshul Sinha	6edfb3062c	[FSDP][Replicate] tests replicate runs forward/backward for root and non-root module (#162654 ) Summary: Verifies that Replicate correctly handles the scenario where forward and backward passes are run through both the root module and a non-root module. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_non_root_forward_backward Pull Request resolved: https://github.com/pytorch/pytorch/pull/162654 Approved by: https://github.com/mori360 ghstack dependencies: #162631, #162636, #162650	2025-09-18 00:47:19 +00:00
Tugsbayasgalan Manlaibaatar	72fedf0575	Move export_db to use new tracer, remove restriction on optional inputs (#162993 ) Differential Revision: [D82478644](https://our.internmc.facebook.com/intern/diff/D82478644) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162993 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162557, #162558, #162559, #162682, #162992	2025-09-18 00:43:32 +00:00
Tugsbayasgalan Manlaibaatar	b26d4c9a7a	Make dynamo preserving stack trace work with inlined nn modules (#162992 ) Differential Revision: [D82478646](https://our.internmc.facebook.com/intern/diff/D82478646) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162992 Approved by: https://github.com/williamwen42 ghstack dependencies: #162557, #162558, #162559, #162682	2025-09-18 00:43:23 +00:00
Anshul Sinha	bb25c60945	[FSDP][Replicate] tests replicate parity for single and multigroup (#162650 ) Summary: The parity tests train two identical models with the same inputs - one using a reference approach and one using the test approach (replicate) - then check that both models produce identical losses. This ensures the distributed training methods don't change the mathematical results compared to standard training. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_train_parity_single_group 2. pytest test/distributed/_composable/test_replicate_training.py -k test_train_parity_multi_group 3. pytest test/distributed/_composable/test_replicate_training.py -k test_train_parity_multi_group_cpu_offload_eager Pull Request resolved: https://github.com/pytorch/pytorch/pull/162650 Approved by: https://github.com/mori360 ghstack dependencies: #162631, #162636	2025-09-18 00:38:49 +00:00
Nicolas Macchioni	fdcef1477c	[pcache] Generalize testing + All Caches thread-safe (#163173 ) Summary: 1. Generalized testing by auto-detecting Cache types and splitting testing by abstract base class - Now checks that all Cache types are thread-safe - Will fail tests if any new Cache is added and is untested (for example, any cache with non-str key or non-bytes value) 2. All Caches are thread-safe - InMemoryCache was the only one not thread-safe, so added a lock for access - Realized that to implement MultiCache we should just have this requirement. * Also, OnDiskCache is now a functioning AsyncCache with a default base_dir using Python's tempfile.gettempdir, i.e. OnDiskCache is no longer an abstract cache class Test Plan: ``` [nmacchioni@* / ()]$ buck test fbcode//mode/opt caffe2/test/inductor:pcache Tests finished: Pass 28. Fail 0. Fatal 0. Skip 0. Build failure 0 [nmacchioni@* / ()\|remote/fbcode/warm_gpu_od_stable...)]$ ``` Rollback Plan: Differential Revision: D82660240 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163173 Approved by: https://github.com/masnesral	2025-09-18 00:32:46 +00:00
xinan.lin	e93706c2c8	[Intel GPU][pre_compile] Add XPU toolkit version and hardware info in compiled model check. (#162951 ) Following #162438, this PR generalizes the original CUDA-only check and adds an XPU check. Fixes #162939, Fixes #162938, Fixes #163032, Fixes #163045 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162951 Approved by: https://github.com/EikanWang, https://github.com/jansel	2025-09-18 00:04:22 +00:00
ghostspiders	26eefd5ae2	Fix windows path escape characters (#162761 ) Fixes #135954 Torch Inductor Windows Path Escape Characters Pull Request resolved: https://github.com/pytorch/pytorch/pull/162761 Approved by: https://github.com/jansel, https://github.com/mlazos	2025-09-17 23:39:39 +00:00
Mark Saroufim	28c42cc280	compile_kernel: Add DLPack test (#163166 ) Note to self: i should probably. start using gh stack This is rebased on top of https://github.com/pytorch/pytorch/pull/163165 so you only need to review this commit `7387c1becf` This test doesn't add any new functionality it just ensures DLPack conversion is working well Pull Request resolved: https://github.com/pytorch/pytorch/pull/163166 Approved by: https://github.com/janeyx99, https://github.com/albanD	2025-09-17 22:55:48 +00:00
bobrenjc93	0661ecdb38	add support for hint_override in mark_unbacked (#162652 ) Very similar to https://github.com/pytorch/pytorch/pull/161007 except now for mark_unbacked. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162652 Approved by: https://github.com/laithsakka	2025-09-17 22:29:54 +00:00
Anshul Sinha	7a0f93344e	[FSDP][Replicate] tests replicate casting module after init (#162636 ) Summary: In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. This test is important as it verifies we can cast a replicated module to a different type after initialization, an important feature for enabling mixed precision. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_to_float64_after_init Pull Request resolved: https://github.com/pytorch/pytorch/pull/162636 Approved by: https://github.com/mori360 ghstack dependencies: #162631	2025-09-17 20:36:13 +00:00
Shaobin Ma	63276edb7c	[Inductor] support mixed dtype in the native_layer_norm_backward meta function (#159830 ) Fixes #159829 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159830 Approved by: https://github.com/albanD	2025-09-17 20:29:12 +00:00
Hanchen Zhang	dfda2dfd53	very small typo in fsdp2 comment (#163155 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163155 Approved by: https://github.com/awgu, https://github.com/Skylion007	2025-09-17 20:19:41 +00:00
Tugsbayasgalan Manlaibaatar	876824f174	Make inline tests to use new exporter and fix some issues around it (#162682 ) inline_and_install_module export variant is our long term state so it is better to use the new tracer for this. It also uncovered bunch of minor bugs because with inline_and_install_module, the nn_module_stack generation is changed a bit. Differential Revision: [D82478648](https://our.internmc.facebook.com/intern/diff/D82478648) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162682 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162557, #162558, #162559	2025-09-17 20:09:28 +00:00
Mark Saroufim	a89d5e97ec	compile_kernel remove header_code arg (#163165 ) We previously asked users to separate these because we didn't have any way of adding extern C declarations. Now we don't and we don't need this confusing flag anymore BC breaking but is fine for this API since it doesn't have major users yet. Please just put all your code in `kernel_source` moving forward ## BC note The header_code parameter has been removed from torch.cuda._compile_kernel. Previously, users could pass separate header code that would be prepended to the kernel source. Now, header code must be included directly in the kernel_source parameter. Note this only affects torch.cuda._compile_kernel, which is a private API. Example: Before ```python kernel = compile_kernel( kernel_source="__global__ void my_kernel() { ... }", kernel_name="my_kernel", header_code="#define SCALE 2.0f\n__device__ float scale(float x) { return x * SCALE; }" ) ``` After ```python kernel_source = """ #define SCALE 2.0f __device__ float scale(float x) { return x * SCALE; } __global__ void my_kernel() { ... } """ kernel = _compile_kernel(kernel_source, "my_kernel") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163165 Approved by: https://github.com/janeyx99, https://github.com/albanD	2025-09-17 19:47:32 +00:00
Shangdi Yu	4660e38e5a	write conv1d decomposition (#163080 ) In Unified Runtime, we cannot have any fallback ops (for now). Not all conv1d ops can avoid fallbacks now, so we write a decomposition for it. it's not registered to the default decomposition table as currently only executorch/unified runtime needs it. But it might benefit inductor as well because conv2d can generate triton kernels while there's no triton codegen for conv1d. I don't know if the conv2d triton kernel will have better perf compared to aten::conv1d, so it's not registered by default yet. To register it, one just needs to do `import torch._decomp as decomp;decomp.register_decomposition(torch.ops.aten.conv1d.default, conv1d_to_conv2d)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163080 Approved by: https://github.com/angelayi	2025-09-17 19:22:38 +00:00
Kurt Mohler	5236007806	[MPS] Add `embedding_bag` forward pass (#163012 ) Part of #162270 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163012 Approved by: https://github.com/kulinseth, https://github.com/malfet	2025-09-17 19:00:47 +00:00
Filip	167ad09be5	[optim] override SWALR.state_dict and load_state_dict (#163122 ) Fixes #163105 Note that the new `SWALR.load_state_dict` is not backwards compatible: ```python @override def load_state_dict(self, state_dict: dict[str, Any]) -> None: """Load the scheduler's state. Args: state_dict (dict): scheduler state. Should be an object returned from a call to :meth:`state_dict`. """ self.__dict__.update(state_dict) self._set_anneal_func(self._anneal_strategy) ``` If we'd like to maintain compatibility with old state_dicts (loaded with `weights_only=False`), we could use something along these lines: ```python @override def load_state_dict(self, state_dict: dict[str, Any]) -> None: """Load the scheduler's state. Args: state_dict (dict): scheduler state. Should be an object returned from a call to :meth:`state_dict`. """ anneal_func = state_dict.pop("anneal_func", None) strategy = state_dict.get("_anneal_strategy") self.__dict__.update(state_dict) if anneal_func is not None: state_dict["anneal_func"] = anneal_func if strategy is None: if anneal_func == self._linear_anneal: strategy = "linear" elif anneal_func == self._cosine_anneal: strategy = "cos" if strategy is None: strategy = getattr(self, "_anneal_strategy", "cos") self._set_anneal_func(strategy) ``` But given the fact that loading an `SWALR` state_dict before this PR would have caused an error, this seems okay. A GitHub/Google search for `SWALR.load_state_dict` had no results. Happy to change if not, or add a warning just in case. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163122 Approved by: https://github.com/janeyx99	2025-09-17 18:17:26 +00:00
Jane Xu	bcbb45b746	remove tolerance override for dynamo test_mixed_device_dtype in SGD (#163088 ) In reaction to https://github.com/pytorch/pytorch/issues/116202#issuecomment-3145929113 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163088 Approved by: https://github.com/albanD	2025-09-17 18:17:23 +00:00
Ruben Rodriguez Buchillon	3cad2403cb	[inductor][choices] pass through annotations from KTC to ChoiceCaller (#163117 ) # why - KTC might regenerate a choicecaller e.g. through FlexibleLayout optimization. This in turn would delete any annotations # what - provide an annotations dict inside KTC - forward that dict towards the ChoiceCaller's annotations - ChoiceCaller users e.g. in selectalgorithm now have access to the KTC and can register handlers to record/make decisions based on the KTC # testing n/a Differential Revision: [D82587631](https://our.internmc.facebook.com/intern/diff/D82587631) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163117 Approved by: https://github.com/nmacchioni	2025-09-17 18:06:50 +00:00
Nakul Iyer	e63476b236	[MTIA Runtime] Add foreach_div ops to native_functions.yaml (#162732 ) Summary: Quick fix for runtime support on foreach_div, see D81274963. Fixed an issue that I created in that diff so that the CIs pass. Test Plan: CIs created in D81274963 and D81286593 pass. Added some logs in [aten_mtia_ops.py](https://www.internalfb.com/code/fbsource/[c56272ba042c43c65517dcac254364cf732fcfa9]/fbcode/mtia/host_runtime/torch_mtia/aten_mtia_ops.cpp?lines=3676) to all the foreach_div ops. We can see that the correct MTIA kernels are being invoked in the tests. https://www.internalfb.com/intern/testinfra/testrun/15481123829281588 Rollback Plan: Pull Request resolved: https://github.com/pytorch/pytorch/pull/162732 Approved by: https://github.com/danielhou0515	2025-09-17 17:44:03 +00:00
Amandeep Chhabra	4f641aa1a2	capturing exit codes after sigterm/sigkill from torch elastic. (#160908 ) Summary: Background Torch Elastic sends SIGKILL/SIGTERM signals if any process fails while others are still running. However, processes terminated by these signals do not generate termination logs, causing confusion. Solution Capture exit codes after SIGTERM signals to ensure complete and accurate termination logging. Test Plan: unit tests https://www.internalfb.com/mlhub/pipelines/runs/mast/f773486907-TrainingApplication__13_D79584569?job_attempt=1&version=0&tab=summary&env=PRODUCTION Rollback Plan: Differential Revision: D79584569 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160908 Approved by: https://github.com/d4l3k	2025-09-17 17:41:35 +00:00
Nikita Shulga	8dbac62edb	[CI] Update NVIDIA driver to `580.82.07` (#163111 ) To make CI machines capable of running CUDA-13 tests. Unfortunately, this upgrade regresses NUMBA integration, so live patch it with `6e08c9d08e` This fix was suggested in https://github.com/pytorch/pytorch/issues/162878#issuecomment-3288635745 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163111 Approved by: https://github.com/huydhn	2025-09-17 17:37:06 +00:00
Tugsbayasgalan Manlaibaatar	7a1e267d4a	Fix set_grad_enabled HOP in strict mode with new tracer (#162559 ) previous graph seems wrong probably because dynamo bytecode running might be changing the grad state unintentionally. Differential Revision: [D82478643](https://our.internmc.facebook.com/intern/diff/D82478643) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162559 Approved by: https://github.com/zhxchen17, https://github.com/ydwu4 ghstack dependencies: #162557, #162558	2025-09-17 17:13:03 +00:00
Mu-Chu Lee	2291199e9b	[AOTInductor] Use CudaCachingAllocator for memory allocation (#162893 ) Summary: Use c10::CudaCachingAllocator for AOTInductor's initial constant buffer allocation. Test Plan: Activate test under test/cpp/aoti_inference/test.cpp Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/162893 Approved by: https://github.com/desertfire	2025-09-17 17:08:20 +00:00
Tugsbayasgalan Manlaibaatar	0e9f9c3a61	Fix inconsistent test and add new tracer as config (#162558 ) It is better to have the new tracer as global config that can be manipulated easily. Also I believe dynamo-like config infra is useful instead of relying on custom way of patching stuff. Differential Revision: [D82478649](https://our.internmc.facebook.com/intern/diff/D82478649) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162558 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162557	2025-09-17 17:01:48 +00:00
Tugsbayasgalan Manlaibaatar	0e9e3cf996	Don't skip register_dataclass unflatten in dynamo (#162557 ) We changed how we are tracing, as a result, we need to trace into register_data_class now. Differential Revision: [D82478651](https://our.internmc.facebook.com/intern/diff/D82478651) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162557 Approved by: https://github.com/zhxchen17	2025-09-17 16:53:02 +00:00
Animesh Jain	c5c9e20f11	[dtensor][compile] Disable proxy mode in sharding prop rules (#163126 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163126 Approved by: https://github.com/bdhirsh	2025-09-17 16:49:35 +00:00
Sahan Paliskara	d1993c27ae	[BE] Make PyObjectSlot use a global PyInterpreter (#162659 ) This pr gets rid of the pyobj_interpreter_ variable from PyObjectSlot and saves a word in the process Gonna ask for review from @huydhn as there are some changes to CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162659 Approved by: https://github.com/albanD, https://github.com/huydhn	2025-09-17 16:40:55 +00:00
Syed Tousif Ahmed	928ac57c2a	Upgrades dlpack to v1.1 to include fp8/fp4 (#162195 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162195 Approved by: https://github.com/eqy, https://github.com/albanD, https://github.com/Skylion007, https://github.com/rgommers	2025-09-17 16:39:11 +00:00
zpcore	f2206b1ed8	fix wait() missing in redistribute tensor (#162749 ) We notice that the wait() op is missing after collective op call: https://github.com/pytorch/pytorch/pull/162665#discussion_r2338460562. The issue is that `_maybe_wrap_tensor` calls AsyncCollectiveTensor in `3ad3bfe11d/torch/distributed/_functional_collectives.py (L829)` We need to check whether the wait() is required after collective op call. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162749 Approved by: https://github.com/ezyang, https://github.com/SherlockNoMad, https://github.com/wconstab	2025-09-17 16:24:26 +00:00
PyTorch MergeBot	4ca3f435fb	Revert "[CI] Update NVIDIA driver to `580.82.07` (#163111 )" This reverts commit 16475a829f7fe3b1dc3c74573740df09ffaec650. Reverted https://github.com/pytorch/pytorch/pull/163111 on behalf of https://github.com/malfet due to It started to fail now, but worked just fine in PR CI ([comment](https://github.com/pytorch/pytorch/pull/163111#issuecomment-3303707671))	2025-09-17 16:20:31 +00:00
PyTorch MergeBot	79fd497423	Revert "[Reland] Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#163016 )" This reverts commit f1eb99e2e4363f20eb5896433e1eb7f7500aadea. Reverted https://github.com/pytorch/pytorch/pull/163016 on behalf of https://github.com/jeffdaily due to broke rocm CI, see export/test_export_opinfo.py::TestExportOnFakeCudaCUDA::test_fake_export_nonzero_cuda_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/17787208381/job/50564369696) [HUD commit link](`f1eb99e2e4`) ([comment](https://github.com/pytorch/pytorch/pull/163016#issuecomment-3303707552))	2025-09-17 16:17:53 +00:00
Eddie Yan	9b7a8c4d05	[cuDNN][SDPA][submodule] Roll-back cuDNN frontend upgrade, update Meta registration (#163104 ) For https://github.com/pytorch/torchtitan/issues/1713 Also note that we will need to rollback the cuDNN frontend upgrade in 2.9 as it currently introduces a segmentation fault by assuming tensors have their strides and sizes populated at graph creation time `1a7b4b78db/include/cudnn_frontend/node/sdpa_support_surface.h (L447%C2%A0)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163104 Approved by: https://github.com/drisspg	2025-09-17 15:48:54 +00:00
Nikita Shulga	16475a829f	[CI] Update NVIDIA driver to `580.82.07` (#163111 ) To make CI machines capable of running CUDA-13 tests. Unfortunately, this upgrade regresses NUMBA integration, so live patch it with `6e08c9d08e` This fix was suggested in https://github.com/pytorch/pytorch/issues/162878#issuecomment-3288635745 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163111 Approved by: https://github.com/huydhn	2025-09-17 14:44:06 +00:00
Nikita Shulga	6cfb080d84	[CD] Do not enable GenAI on Windows (#163116 ) Follow up after https://github.com/pytorch/pytorch/pull/162209 as looks like it causes some of the Windows builds to fail with ``` C:/actions-runner/_work/pytorch/pytorch/pytorch/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include\fbgemm_gpu/quantize/utils.h(19): error C3861: '__builtin_clz': identifier not found ``` May be fixes https://github.com/pytorch/pytorch/issues/162881 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163116 Approved by: https://github.com/wdvr, https://github.com/danielvegamyhre	2025-09-17 14:09:10 +00:00
Filip	bc38c5baa1	[optim] prevent problematic tensor aliasing in lr_scheduler (#163098 ) Prevents edge cases in SequentialLR and ReduceLROnPlateau which could corrupt learning rates or trigger recompilation. Supersedes #162360 Fixes #162359 Fixes #163093 While putting #162360 together, I noticed the class of issue I was fixing (i.e. unintended aliasing in lr_schedulers when using Tensor lrs) appeared in several other places. @janeyx99 suggested I put together a follow-up PR. There are several bugs resembling the one fixed in #162360. I added a helper to fix these: ```python def _update_param_group_val(param_group: dict[str, Any], key: str, val: float \| Tensor): """Set param_group[key] to val without aliasing or assignment when they're both tensors. Raises a KeyError if param_group[key] does not exist. """ if isinstance(param_group[key], Tensor): param_group[key].fill_(_to_scalar(val)) else: param_group[key] = val ``` And applied it to fix bugs in `SequentialLR.__init__` and `LRScheduler._update_lr`. I also added it to `CyclicLR.__init__` which was using an equivalent pattern, and `CosineAnnealingWarmRestarts.step` which should have had a similar issue: ```python for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): param_group["lr"] = lr ``` But did not, because `get_lr()` actually returns tensors when using a tensor lr (despite its `list[float]` return type annotation). Relying on this propagation seems fragile, so I conservatively added the method here as well. I'll be fixing the type annotations and several related issues in followup PRs built off of this one. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163098 Approved by: https://github.com/janeyx99	2025-09-17 13:40:23 +00:00
Amandeep Chhabra	607489f3d0	logging exit code for failures to ease debugging (#160907 ) Summary: Problem Some processes are terminated by other processes using signals. These signal terminations often lack stack traces, causing confusion during debugging. Solution Log exit codes to simplify and improve the debugging process failures. Test Plan: unit tests https://www.internalfb.com/mlhub/pipelines/runs/mast/f773486907-TrainingApplication__13_D79777290?version=0&env=PRODUCTION Rollback Plan: Differential Revision: D79777290 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160907 Approved by: https://github.com/d4l3k	2025-09-17 12:52:48 +00:00
Chris Sidebottom	89a6dbe73a	Filter out local timer tests which are unimplemented in Python on AArch64 (#158342 ) This stems from using a conda build of Python, which incorrectly detects this as unimplemented: https://github.com/conda-forge/python-feedstock/issues/804 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158342 Approved by: https://github.com/malfet	2025-09-17 11:31:57 +00:00
Zeng, Xiangdong	c6392fcc06	[2/N] Port 3 fsdp distributed test cases to Intel GPU (#160940 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This is the second PR for fsdp distributed test cases, the first is https://github.com/pytorch/pytorch/pull/160158. We could enable Intel GPU with following methods and try the best to keep the original code styles: - Use "torch.accelerator.current_accelerator()" to determine the accelerator backend - Enabled XPU for some test path Pull Request resolved: https://github.com/pytorch/pytorch/pull/160940 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-17 10:45:28 +00:00
Benji Beck	c52c4052d8	[WOQ] Integrate CUDA support for int8pack_mm woq optimization pattern (#161680 ) Summary: What: Enables CUDA support for int8_mm woq optimization pattern by: - Fixing dtype conversion in weight_int8pack_mm_kernel to match CPU - Updating pattern validation to accept CUDA devices - Adding test coverage for CUDA Why: Extend WOQ to more device types Test Plan: ``` buck2 run 'fbcode//mode/opt' //caffe2/test/inductor:cuda_select_algorithm ``` Rollback Plan: Reviewed By: jerryzh168 Differential Revision: D80882442 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161680 Approved by: https://github.com/jerryzh168	2025-09-17 10:24:13 +00:00
Simon Fan	175299416b	[mypy] add some import ignores to onnx (#163133 ) these keep appearing when I run `lintrunner` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163133 Approved by: https://github.com/justinchuby ghstack dependencies: #161458, #162702	2025-09-17 09:32:38 +00:00
Simon Fan	a97cefac15	[dtensor] do not mutate specs when doing sharding prop (#162702 ) Because these specs are cached by reference. So by reusing them and mutating them, we're overwriting the cached specs of another op. I'm just fixing these 2, there are more instances, we'll need to do an audit separately. This fixes a few opinfo tests, but side note that `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=0 python test/distributed/tensor/test_dtensor_ops.py TestDTensorOpsCPU.test_dtensor_op_db_nn_functional_multi_head_attention_forward_cpu_float32` fails for me locally even on the base commit, but it is not marked as xfail NOTE: I am renaming `_wrap_output_spec_tensor_meta` so that external libraries will loudly fail. You should migrate to the functional `_create_output_spec_with_new_tensor_meta` or create your own mutation wrapper and take responsibility for the cache! This should be improved in https://github.com/pytorch/pytorch/issues/162731 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162702 Approved by: https://github.com/ezyang, https://github.com/Skylion007, https://github.com/dcci ghstack dependencies: #161458	2025-09-17 09:32:38 +00:00
Simon Fan	821458d97a	[dynamo][hop] Introduce Local Map HOP (#161458 ) Can't actually deploy it because of: https://github.com/pytorch/pytorch/issues/161456 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161458 Approved by: https://github.com/ydwu4	2025-09-17 09:32:38 +00:00
Shangdi Yu	c9f16f201a	[inductor] Fix convolution autotune check when groups != 1 (#163094 ) When generating the triton template for convolution, we check `V.graph.sizevars.statically_known_equals(in_chan * groups, x.get_size()[1]) `. Note that in this check, we should consider the groups. This check verifies, at compile time, that the total number of input channels expected by the convolution weights (in_chan * groups) exactly matches the number of channels in the input tensor (x.get_size()[1]). This fix is good in general as it allows for conv triton template to be generated when `groups> 1`. It's also required for unified runtime to use AOTI as a backend delegate, because unified runtime is libtorch-free, so we cannot use the ATEN fallback of conv2d. ``` python test/inductor/test_select_algorithm.py -k test_convolution2_group ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163094 Approved by: https://github.com/SherlockNoMad	2025-09-17 09:09:32 +00:00
Georgia Phillips	b229455ddd	Update placement utils and weights to handle meta device (#162842 ) Summary: This diff fixes two things which come up when testing a tgif-published pt2 model remote net: 1) Updates isSameDevice to handle meta device to avoid this error: ``` what(): Unsupported device typemeta and meta Exception raised from isSameDevice at fbcode/caffe2/torch/nativert/executor/PlacementUtils.cpp:20 ``` 2. Updates xl weight v2 loading logic in Weights.cpp to handle non-TBE xl-weights. Today, we enforce the device is the same for an old weight and new weight when replacing with ModelRunnerAdapter.setAttr(). However, the way we replace non-TBE xl weights is to find any weights on "meta" device and then replace them with their correct weight with real device from xl_weights folder. Therefore, the new weight and old weight will always have different devices and the device check is invalid. I don't think we've run into this so far bc non-TBE xl weights have not been thoroughly tested until now. Test Plan: Run MRS you model merge net, which uses non-TBE xl weights. 
Confirm that before change #1 we get error: ``` Unsupported device typemeta and meta ``` Then after change #1 and before change #2 we get: ``` what(): Mismatched device for merge.user_tower.linear.weight: meta vs cpu Exception raised from validateValue at fbcode/caffe2/torch/nativert/executor/Weights.cpp:374 ``` After change run is successful Command: ``` MODEL_ENTITY_ID=921242082 SNAPSHOT_ID=1269 module_name=merge SAMPLE_INPUT_DIR=/data/users/georgiaphillips/models/921242082/${SNAPSHOT_ID}/${module_name}_archive/package/data/sample_inputs buck2 run mode/dev-nosan -c fbcode.nvcc_arch=h100,a100 -c fbcode.enable_gpu_sections=true caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=Benchmark --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}.predictor.${module_name} --moduleName=${module_name} --submodToDevice="merge\|cuda0" --benchmarkEnableProfiling=false --disableStaticRuntime=true --doNotRandomizeSampleInputs=true --benchmarkDontRebatchSamples=true --pytorch_predictor_sigmoid_static_dispatch_enable=false --pytorch_predictor_sigmoid_graph_passes_enable=false --sampleInputFilePath=${SAMPLE_INPUT_DIR}/${module_name}.pt ``` Rollback Plan: Differential Revision: D80713052 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162842 Approved by: https://github.com/henryoier	2025-09-17 08:12:32 +00:00
PyTorch MergeBot	a5419743c6	Revert "remove unnecessary sync point in AveragedModel update (#158017 )" This reverts commit cb7f45fd34b890fa7665837573ebb25744889568. Reverted https://github.com/pytorch/pytorch/pull/158017 on behalf of https://github.com/wdvr due to discussed with author - expecting this to break checkpointing ([comment](https://github.com/pytorch/pytorch/pull/158017#issuecomment-3301790645))	2025-09-17 08:02:02 +00:00
Scott Wolchok	a63221a335	Fix TODO in make_tensor_for_subclass_helper (#162336 ) The constructor does accept a DataPtr (had to fix the DataPtr variant not accepting a SymInt, though). Pull Request resolved: https://github.com/pytorch/pytorch/pull/162336 Approved by: https://github.com/ezyang ghstack dependencies: #162298	2025-09-17 06:46:34 +00:00
Deng, Daisy	c9485f8ff3	[Reland][2/N]Port several test files under test/distributed to Intel GPU (#159473 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR will work on some test files under test/distributed. We could enable Intel GPU with following methods and try the best to keep the original code styles: - instantiate_device_type_tests() - use "torch.accelerator.current_accelerator()" to determine the accelerator backend - use requires_accelerator_dist_backend to allow both nccl and xccl test - enabled XPU for some test path - Change the hardcoded world_size according to device_count. - Unify some common code under torch/testing/_internal for multiple backend, for example: Added xpu for Backend.backend_capability and dist.Backend.register_backend() Pull Request resolved: https://github.com/pytorch/pytorch/pull/159473 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-17 06:42:27 +00:00
Edward Yang	71b272e4a3	[BE] Use init_device_mesh over DeviceMesh (#162960 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162960 Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/dcci	2025-09-17 06:12:19 +00:00
xinan.lin	39450e7b00	[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#162933 ) Fixes #162937 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162933 Approved by: https://github.com/EikanWang, https://github.com/jansel	2025-09-17 05:35:06 +00:00
Sherlock Huang	f1eb99e2e4	[Reland] Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#163016 ) Reland of #160532 Summary: To support exporting a cuda model on a CPU-only machine under fake tensor mode. Users commonly need to move sample inputs to the cuda device with .to("cuda:0") or .to("cuda") call. This diff supports this. I expect the following pattern to work ``` with FakeTensorMode(allow_non_fake_inputs=True): cuda_module = module.to("cuda:0") cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs]) with torch.no_grad(): ep = torch.export.export(cuda_module, cuda_sample_inputs) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/163016 Approved by: https://github.com/huydhn	2025-09-17 05:01:33 +00:00
PyTorch UpdateBot	bb635a11f8	[vllm hash update] update the pinned vllm hash (#163128 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163128 Approved by: https://github.com/pytorchbot	2025-09-17 04:26:07 +00:00
Blaine Burton Rister	3bfa35d62e	[AOTI-FX] Solve for undefined symbols in dynamic input shapes (#163044 ) # Problem When dynamic shapes are passed to AOTInductor, they usually have a very basic form like `(s0, 5, 27)`. In these cases it's straightforward to generate code defining the symbol `s0` as a specific dimension of the input tensor. However, AOTI can handle slightly more generic expressions than this, such as `(2 * s0, 5, 27)`. In these cases, we don't immediately know the value of `s0`, but we need to solve for it, since it may be referenced in other parts of the program such as kernel call arguments, launch grids, etc. # Feature This PR adds support for more generic dynamic input expressions in the FX backend, following the implementation already present in AOTI's C++ backend: 1. Check if the expression contains one undefined symbol, as multiple variables would make the equation underdetermined. Let's call this `s0`. (We could potentially generalize this, but this PR focuses on cases AOTI can already handle.) 2. Generate a new symbol for the relevant size or stride of the input tensor. Let's call this `size`. This is computed with FX nodes just as a normal symbol would be. 3. Use sympy to solve for `s0` in terms of `size`. Let's call the resulting expression `solution`. 4. Since we know `s0` is an integer, `solution == floor(solution)`. Take the floor and then convert division to `FloorDiv`. This is required to trace through the expression, since the return value of regular division is not guaranteed to be an integer. 5. Generate FX for the modified `solution`, which defines the value `s0`. 6. Override the relevant method of `PythonWrapperCodegen` to a no-op, since the FX converter handles the above on its own. # Test plan In addition to the existing dynamic shapes tests, this PR adds new test cases where the input shape contains a non-trivial expression. 
This dynamic input dimension is then multiplied by other dimensions to form the argument to a `reshape`. Here's an example graph from one of the CI tests. In this case, the input expression was `2*x + 1`, and the solution is `x = (sym_size_int - 1) / 2`: ``` graph(): %arg0_1 : [num_users=2] = placeholder[target=arg0_1] %sym_size_int : [num_users=1] = call_function[target=torch.ops.aten.sym_size.int](args = (%arg0_1, 0), kwargs = {}) %sym_sum : [num_users=1] = call_function[target=torch.sym_sum](args = ([-1, %sym_size_int],), kwargs = {}) %floordiv : [num_users=1] = call_function[target=operator.floordiv](args = (%sym_sum, 2), kwargs = {}) %mul : [num_users=2] = call_function[target=operator.mul](args = (8, %floordiv), kwargs = {}) %sym_sum_1 : [num_users=2] = call_function[target=torch.sym_sum](args = ([4, %mul],), kwargs = {}) %buf0 : [num_users=2] = call_function[target=torch.empty_strided](args = ([%sym_sum_1], [1]), kwargs = {dtype: torch.float32, device: cuda:0}) %sym_sum_2 : [num_users=1] = call_function[target=torch.sym_sum](args = ([35, %mul],), kwargs = {}) %floordiv_1 : [num_users=1] = call_function[target=operator.floordiv](args = (%sym_sum_2, 32), kwargs = {}) %triton_kernel_wrapper_mutation : [num_users=0] = call_function[target=torch.ops.higher_order.triton_kernel_wrapper_mutation](args = (), kwargs = {kernel_idx: 0, constant_args_idx: 0, grid: [(%floordiv_1, 1, 1)], tma_descriptor_metadata: {}, kwargs: {in_ptr0: %arg0_1, out_ptr0: %buf0, xnumel: %sym_sum_1, XBLOCK: 32}}) return buf0 ``` The `sym_size_int` node returns the first dimension of the input tensor. Next, `floordiv` computes the input symbol in terms of the input size. Then, the launch grid is computed by `floordiv_1`, the kernel argument by `sym_sum_1`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163044 Approved by: https://github.com/jansel	2025-09-17 04:12:03 +00:00
Daniel Galvez	7a3791c5d0	Make torch.cuda.rng_set_state() and torch.cuda.rng_get_state() work during stream capture. (#162505 ) Note that this works only in a limited case, where you don't change the seed, but change only the offset of the philox generator. This captures the main use case we're interested in: Rewinding the RNG to a previous state. This is done by torch.utils.checkpoint.checkpoint in particular. Calls to increase() change only the offset, not the seed. Thus, we allow for "no-op" calls to set_seed where the new seed is the same as the old seed. If a user does happen to try to change the seed during stream capture, they will receive an error. Fixes #162504 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162505 Approved by: https://github.com/ngimel, https://github.com/eqy, https://github.com/eellison, https://github.com/eee4017, https://github.com/cyyever	2025-09-17 03:57:34 +00:00
Tugsbayasgalan Manlaibaatar	e28983be76	Add decomp rule to assert_tensor_metadata for BatchedTensors (#163008 ) Whenever there is device move, export introduces assert_tensor_metadata aten operator to make sure to guard for device specialization. This aten op didn't work with Vmap because we didn't register explicit decomp rule saying we just skip BatchedTensor and call it on underlying tensor Differential Revision: [D82483979](https://our.internmc.facebook.com/intern/diff/D82483979) Pull Request resolved: https://github.com/pytorch/pytorch/pull/163008 Approved by: https://github.com/huydhn	2025-09-17 03:49:41 +00:00
Nicolas De Carli	794b48c9f4	[PyTorch] Compile SVE's box-cox only when building targeting SVE (#163078 ) Summary: Internally, we are building PyTorch on the compat layer. Need to avoid compiling sve's box-cox, as sve is not marked as build target. Rollback Plan: Reviewed By: rraometa, YifanYuan3 Differential Revision: D82544412 Privacy Context Container: L1208939 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163078 Approved by: https://github.com/Skylion007, https://github.com/malfet	2025-09-17 03:35:11 +00:00
Nikita Shulga	65845d7291	Update Gloo submodule (#163112 ) Which makes PyTorch buildable with gcc-15, tested by running the build inside `fedora:44` docker ``` docker run --rm -it fedora:44 bash -c "yum install -y g++ python3-devel git; git clone https://github.com/pytorch/pytorch; cd pytorch; git checkout 8f710acce8332979c9a7bf97e72666dfd35c43e6; python3 -mpip install -r requirements.txt; python3 setup.py bdist_wheel" ``` Fixes https://github.com/pytorch/pytorch/issues/156595 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163112 Approved by: https://github.com/huydhn	2025-09-17 03:04:09 +00:00
Anshul Sinha	3009b6959a	[FSDP][Replicate] tests replicate parameter registration (#162631 ) Summary Tests parameter state management after forward and backward passes for single and multiple replicate groups Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_param_registration_after_forward 2. pytest test/distributed/_composable/test_replicate_training.py -k test_param_registration_after_backward Pull Request resolved: https://github.com/pytorch/pytorch/pull/162631 Approved by: https://github.com/mori360	2025-09-17 02:46:30 +00:00
Sherlock Huang	df4ebddbe0	DisableTorchFunction in debug_string (#163096 ) debug_string() invokes some torch functions under the hood. Use DisableTorchFunction() to avoid re-invoking __torch_function__ when calling debug_string() inside DebugMode() Pull Request resolved: https://github.com/pytorch/pytorch/pull/163096 Approved by: https://github.com/zpcore	2025-09-17 00:19:49 +00:00
PyTorch MergeBot	e13cf68d03	Revert "[Triton] [Inductor] Restrict subprocess autotuning to just Triton (#162688 )" This reverts commit 082d3dd9d53a60deb022e203892f0c492cf2cce7. Reverted https://github.com/pytorch/pytorch/pull/162688 on behalf of https://github.com/mlazos due to H100 tests didn't run internally for some reason, rerun with ciflow/h100 ([comment](https://github.com/pytorch/pytorch/pull/162688#issuecomment-3300634763))	2025-09-16 23:17:14 +00:00
Huy Do	814338826e	Set the credential to upload vLLM nightly wheels on schedule and workflow_dispatch (#163018 ) The build is ok, but uploading is failing at the moment https://github.com/pytorch/pytorch/actions/runs/17734972779/job/50416387786 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163018 Approved by: https://github.com/wdvr, https://github.com/malfet	2025-09-16 22:26:22 +00:00
Nikita Shulga	c527292c43	[CI] Remove functorch doc build jobs (#163101 ) As repo has been archived, there couldn't be any doc updates Pull Request resolved: https://github.com/pytorch/pytorch/pull/163101 Approved by: https://github.com/svekars, https://github.com/zou3519, https://github.com/ZainRizvi	2025-09-16 22:25:59 +00:00
PyTorch MergeBot	d4554bc284	Revert "Set the credential to upload vLLM nightly wheels on schedule and workflow_dispatch (#163018 )" This reverts commit 61be0f1c11ef59ff8cf39138b594efe3672816c0. Reverted https://github.com/pytorch/pytorch/pull/163018 on behalf of https://github.com/huydhn due to Missed another update on the environment ([comment](https://github.com/pytorch/pytorch/pull/163018#issuecomment-3300444271))	2025-09-16 21:44:11 +00:00
can-gaa-hou	f6ea41ead2	[CPU] Adding missing brackets in native MaxUnpool log (#163039 ) As stated in the title. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163039 Approved by: https://github.com/Skylion007	2025-09-16 21:28:15 +00:00
Tugsbayasgalan Manlaibaatar	489860f3c2	Prefer_deferred_runtime_asserts should be propagated to new tracer (#162556 ) Differential Revision: [D82478650](https://our.internmc.facebook.com/intern/diff/D82478650) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162556 Approved by: https://github.com/avikchaudhuri ghstack dependencies: #162487	2025-09-16 21:25:00 +00:00
Angel Li	9494b09549	bf16 support for fused_moving_avg_obs_fake_quant() op (#162620 ) enabling bf16 support for `torch.fused_moving_avg_obs_fake_quant()` op on cuda testing `python test/quantization/pt2e/test_quantize_pt2e.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162620 Approved by: https://github.com/andrewor14, https://github.com/jerryzh168	2025-09-16 21:22:44 +00:00
Ruben Rodriguez Buchillon	c230ac7300	[inductor][ez] add ChoiceCaller annotations (#162672 ) # why - enable ChoiceCaller generation to provide extra information that feedback_saver_fns (functions registered to run at the end of benchmarking) can use afterwards - users that extend ChoiceCaller creation e.g. by creating their own InductorChoices can use this to shuttle through information # what - add an annotations dictionary to ChoiceCaller class # testing n/a Pull Request resolved: https://github.com/pytorch/pytorch/pull/162672 Approved by: https://github.com/nmacchioni	2025-09-16 20:49:55 +00:00
Hari Krishna Sai Kodali	77cafe105a	enable sync batchnorm for HPU device (#163047 ) Add HPU to list of supported devices for SyncBatchNorm Pull Request resolved: https://github.com/pytorch/pytorch/pull/163047 Approved by: https://github.com/albanD	2025-09-16 20:45:38 +00:00
PyTorch MergeBot	66308fb470	Revert "[ROCm] Remove HIPBLASLT_ALLOW_TF32 from codebase (#162998 )" This reverts commit cef815dc2ce37f98e01a6469a15b69f15995c1f9. Reverted https://github.com/pytorch/pytorch/pull/162998 on behalf of https://github.com/huydhn due to Sorry for reverting this, but it seems to break a test in trunk ([comment](https://github.com/pytorch/pytorch/pull/162998#issuecomment-3300280242))	2025-09-16 20:39:41 +00:00
fduwjj	232dd65c15	[CuTe] Change the logic of pycute manipulation ops like coalesce, complement from co-lex to lex (#162690 ) PyTorch tensor iteration (.view, contiguous, broadcasting) and NumPy array indexing all follow lexicographic (row-major) order. In Lexicographic (lex) on (i0, i1, …, i{k-1}): the leftmost index(stride is larger) changes fastest and the rightmost index changes slowest and usually last dim is contiguous. However original pycute is all based on co-lex, after porting their code into pytorch and some cosmetic change, we now make it lex so that we can use it for use cases like device mesh internal bookkeeping and other stuff as well. Changes included in this PR: 1. We changes all API ported in, included prefix_product(stride inferring and rename it to suffix_product), idx2crd, crd2idx, coalesce, composition, complement, right_inverse and left_inverse to make sure they are working in the lex way. 2. Added more unit test cases for some API mentioned above since existing unit tests do not have full coverage. 3. One bug fix inside composition, which will lead to infinite recursive call. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162690 Approved by: https://github.com/ezyang ghstack dependencies: #162413, #162534, #162414	2025-09-16 19:53:45 +00:00
Boyuan Feng	505ee42570	[Graph Partition] allow sharing default device context (#162873 ) Entering a device context takes 30 us and exiting a device context takes 11 us. If all graph partitions and cudagraph-unsafe ops happen on the same device, we can share the device context. ## Trace Use vLLM as an example. The first trace shows dynamo graph partition. <img width="1338" height="453" alt="image" src="https://github.com/user-attachments/assets/b81815fd-cdcb-4024-846a-5b64164f8bac" /> The second trace shows inductor graph partition prior to this PR. <img width="1331" height="270" alt="image" src="https://github.com/user-attachments/assets/8d98b127-2053-4eae-9a31-5491661f14d8" /> Comparing with fx graph partition, we can see inductor graph partition shows extra overhead from enter/exit device contexts (13+6 us -> 30+11 us), but smaller runtime overhead (13 us -> 7 us). This motivates the PR to share default device context. The third trace shows Inductor graph partition after this PR. We observe that the extra overhead from enter/exit device contexts have been fixed. At the same time, we observe the smaller runtime overhead. <img width="1336" height="276" alt="image" src="https://github.com/user-attachments/assets/77be2237-34dd-4bac-ad9c-d9af3be36417" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162873 Approved by: https://github.com/shunting314	2025-09-16 19:36:42 +00:00
Henry	9babcae1ed	fix f-string in errors.py (#163074 ) Add missing "f" for formatted f-string in UnsupportedOperandError, change "op_name" (undefined) to "name" for more descriptive error message in case of an unsupported operand with an unrecognized namespace. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163074 Approved by: https://github.com/justinchuby, https://github.com/Skylion007	2025-09-16 19:19:30 +00:00
Shangdi Yu	69a5a5ac02	Add to inductor provenance tracking doc (#162975 ) As title Pull Request resolved: https://github.com/pytorch/pytorch/pull/162975 Approved by: https://github.com/desertfire, https://github.com/mlazos	2025-09-16 19:09:06 +00:00
Tugsbayasgalan Manlaibaatar	a4e74f416b	Fix error message (#162487 ) A more proper fix here would be to directly replace shape_env with correct sources, but it is a bit involved as we have to manually construct dynamo sources by hand (need to handle list/dict etc), whereas it is quite easy if we are operating on a string, so I do this as a post-processing step for now. Differential Revision: [D82478647](https://our.internmc.facebook.com/intern/diff/D82478647) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162487 Approved by: https://github.com/zhxchen17	2025-09-16 19:06:30 +00:00
Gael Le Lan	cb7f45fd34	remove unnecessary sync point in AveragedModel update (#158017 ) Summary: The test `bool(self.n_averaged == 0)` is a CPU/GPU synchronization point that is called for each update. This test is only meant to know whether the AveragedModel copy has been initialized or not. This diff introduces a CPU-based variable for that purpose. When loading from checkpoint we also make sure the parameter is refreshed. After this fix, each `update_parameter` call is reduced to 6ms from 333ms (98% reduction). Test Plan: contbuild & OSS CI Test plan from GitHub: CI Rollback Plan: Differential Revision: D78074709 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158017 Approved by: https://github.com/janeyx99	2025-09-16 18:57:55 +00:00
Aidyn-A	5937861eba	[TEST][CUDA] Use proper dtype in test_cuda_tensor_pow_scalar_tensor_cuda (#163070 ) The test `test_binary_ufuncs.py::TestBinaryUfuncsCUDA::test_cuda_tensor_pow_scalar_tensor_cuda` fails with a mismatched `dtype`: ```Python AssertionError: The values for attribute 'dtype' do not match: torch.float32 != torch.float64. ``` This PR forces both arguments to use the same `dtype` to fix the test failure. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163070 Approved by: https://github.com/eqy	2025-09-16 18:28:50 +00:00
Zhengxu Chen	bb3f3cc65e	[precompile] Store traced file information with CompileArtifacts. (#162983 ) Summary: Add some metadata to CompileArtifacts, so that it contains the source code information about the original code while they are being traced. For now, we will not provide a verification method to end user and instead we just provide which files are inlined. It's up to user to verify the content from these files are not changed (because it's optional for many users to validate source code changes anyway in aot precompile) Test Plan: buck run @mode/opt test/dynamo:test_dynamo -- -k test_file_change buck run @mode/opt test/dynamo:test_dynamo -- -k test_aot_compile_source_info Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/162983 Approved by: https://github.com/yushangdi	2025-09-16 18:27:48 +00:00
Yu, Guangye	0819de412d	Add a new API torch.xpu.can_device_access_peer for Intel GPU (#162705 ) # Motivation Aligned with other backends, this PR introduces a new API `torch.xpu.can_device_access_peer`, which is used in vllm distributed [scenarios](`2048c4e379/vllm/distributed/device_communicators/custom_all_reduce.py (L37)`) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162705 Approved by: https://github.com/EikanWang, https://github.com/ezyang	2025-09-16 18:00:22 +00:00
Isalia20	6db37d7206	[MPS] zeros like, narrow and enable tests (#163011 ) zeros like, narrow and enable tests for SparseMPS Pull Request resolved: https://github.com/pytorch/pytorch/pull/163011 Approved by: https://github.com/malfet	2025-09-16 17:48:04 +00:00
joshuamarkovic	559e8d1c20	[doc]: Small typos (#162982 ) Small typo fixes Pull Request resolved: https://github.com/pytorch/pytorch/pull/162982 Approved by: https://github.com/ezyang, https://github.com/zou3519	2025-09-16 17:42:19 +00:00
Alexander Grund	6702f545d8	Restore environment after NcclUserBufferRegistrationTest (#163063 ) This test sets "NCCL_ALGO=NVLS" in NcclUserBufferRegistrationTest which affects tests run in the same process such as `test_on_completion_hook_*` that fail with > invalid usage (run with NCCL_DEBUG=WARN for details), NCCL version 2.26.2 > ncclInvalidUsage: This usually reflects invalid usage of NCCL library. > Last error: > Error : no algorithm/protocol available for function Broadcast with datatype ncclInt8. NCCL_ALGO was set to NVLS. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163063 Approved by: https://github.com/ezyang	2025-09-16 17:37:09 +00:00
Anshul Sinha	ddf3124b05	[FSDP][Replicate] tests replicate input device movements (#162629 ) Summary: This test verifies that the replicate function automatically moves forward pass inputs to the correct device. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_root_move_forward_input_to_device Pull Request resolved: https://github.com/pytorch/pytorch/pull/162629 Approved by: https://github.com/mori360	2025-09-16 17:35:27 +00:00
Anshul Sinha	457b27f92f	[FSDP][Collectives] skipping reduce_scatter when world size is 1 (#162021 ) Summary: In its current state, FSDP collectives use cuda synchronizations and communication ops regardless of what the world size is. However, now that replicate will use FSDP, there will be instances where group size = 1 and these synchronizations and ops will be used needlessly. I have updated fsdp_collectives to skip reduce_scatter in the foreach_reduce API when world_size = 1. I have edited a test that uses CommDebugMode to verify that the reduce_scatter has been removed. I also edited an affected test which used 1-way FSDP by verifying and changing its assert statements for CommDebugMode. I have also added a test command. Test Cases 1. pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_single_worldsize1 2. pytest test/distributed/_composable/test_composability/test_2d_composability.py -k test_tp_with_fsdp_offloading Pull Request resolved: https://github.com/pytorch/pytorch/pull/162021 Approved by: https://github.com/mori360	2025-09-16 17:18:07 +00:00
jiannanWang	b6a48ff69f	[BE] Add Documentation for Device APIs (#162834 ) Added documentation for torch.cuda APIs. Fixed docstring for xpu and mtia is_bf16_supported API. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162834 Approved by: https://github.com/janeyx99 Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>	2025-09-16 17:01:06 +00:00
Howard Huang	9de22bc5da	Inspect schedule IR comms (#162996 ) Small change to util to allow us to see comms (e.g. `SEND`, `RECV`, etc.) in the schedule IR Pull Request resolved: https://github.com/pytorch/pytorch/pull/162996 Approved by: https://github.com/fegin	2025-09-16 16:59:06 +00:00
Prachi Gupta	f638854e1d	[ROCm][SymmMem] re-enable UTs (#162811 ) After the UT suite moved to `MultiProcContinuousTest`, `skipIfRocm` decorator started failing rather than skipping UTs because now we spawn multiple threads before the skip decorator is taken into account and the skip decorator was raising an exception to exit the process. But, the parent process treated the child process exiting as a crash rather than a skip. Additionally, in `MultiProcContinuousTest`, if one UT fails all subsequent ones are also skipped which makes sense since there's one setup for the entire suite. However, this showed up as many failing/skipped UTs in the parity. I added multiprocess version of skip decorators for ROCm, including, `skip_if_rocm_arch_multiprocess` and `skip_if_rocm_ver_lessthan_multiprocess`. These are needed as symmetric memory feature is only supported on MI300 onwards and we need to skip them for other archs and some UTs only work after ROCm7.0. Fixes #161249 Fixes #161187 Fixes #161078 Fixes #160989 Fixes #160881 Fixes #160768 Fixes #160716 Fixes #160665 Fixes #160621 Fixes #160549 Fixes #160506 Fixes #160445 Fixes #160347 Fixes #160203 Fixes #160177 Fixes #160049 Fixes #159921 Fixes #159764 Fixes #159643 Fixes #159499 Fixes #159397 Fixes #159396 Fixes #159347 Fixes #159067 Fixes #159066 Fixes #158916 Fixes #158760 Fixes #158759 Fixes #158422 Fixes #158138 Fixes #158136 Fixes #158135 Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/162811 Approved by: https://github.com/jeffdaily	2025-09-16 15:35:39 +00:00
James Wu	3ee071aa85	Allow aot_module_simplified to return a serializable output (#162527 ) This PR refactors AOTAutograd slightly: - It adds `simple_wraps` to various wrappers so that the reference to inner functions is stored in the output of AOTAutograd. - It saves a `serialize()` method on the result of `aot_stage2`, in the event of an eager backward compile. I discussed the lazy backward case with @bdhirsh, and we agreed that serialization in that case would probably use a different, more AOT API anyway, so we do not implement a serialize function for the lazy backward case. AOT precompile, at least initially, will always eagerly compile the backward. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162527 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162171	2025-09-16 15:22:05 +00:00
PyTorch MergeBot	e7c3f802ff	Revert "[dynamo][hop] Introduce Local Map HOP (#161458 )" This reverts commit 505458db803e1ffabac08a2fc150b566d3ea3a57. Reverted https://github.com/pytorch/pytorch/pull/161458 on behalf of https://github.com/jeffdaily due to broke rocm tests ([comment](https://github.com/pytorch/pytorch/pull/161458#issuecomment-3299230458))	2025-09-16 15:14:36 +00:00
PyTorch MergeBot	4db203f875	Revert "[BE] Make PyObjectSlot use a global PyInterpreter (#162659 )" This reverts commit 05ee8114f818a95745c812c3cd7aa8e784e61a9a. Reverted https://github.com/pytorch/pytorch/pull/162659 on behalf of https://github.com/jeanschmidt due to seems to have introduced errors in linting see https://github.com/pytorch/pytorch/actions/runs/17750689989/job/50444910643 ([comment](https://github.com/pytorch/pytorch/pull/162659#issuecomment-3298626136))	2025-09-16 12:52:57 +00:00
Xinya Zhang	cef815dc2c	[ROCm] Remove HIPBLASLT_ALLOW_TF32 from codebase (#162998 ) A few UT failures are caused by `HIPBLASLT_ALLOW_TF32` Fixes #157094, #157093, #157092, #157091, #157064, #157063, #157062, #157061, #157042, #157041, #157039, #157004 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162998 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-16 12:48:45 +00:00
zeshengzong	fa127d9b20	Fix `LBFGS` wolfe max iteration (#161488 ) Fixes #91581 , based on #135026 ## Test Result ```bash pytest test/test_optim.py ......... ========================== 1473 passed, 242 skipped in 2412.49s (0:40:12) =========================== ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161488 Approved by: https://github.com/albanD	2025-09-16 12:07:50 +00:00
Aidyn-A	6926710adf	[ATen][CUDA] CUTLASS matmuls: add sm_103a flag (#162956 ) This PR adds an `sm_103a` flag for GroupMM and RowwiseScaledMM. Contrary to just #161399, this simply adds the flag as the support for `sm_103a` matmuls is going to be added to CUTLASS v4.2 (see https://github.com/pytorch/pytorch/pull/161399#issuecomment-3252892937). Pull Request resolved: https://github.com/pytorch/pytorch/pull/162956 Approved by: https://github.com/eqy, https://github.com/Skylion007	2025-09-16 10:29:55 +00:00
zeshengzong	e3783a9575	Replace `std::runtime_error` with `TORCH_CHECK` (#159344 ) Fixes part of #148114 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159344 Approved by: https://github.com/cyyever, https://github.com/albanD	2025-09-16 09:00:06 +00:00
Blaine Burton Rister	9aca0ba027	[Inductor-FX] Support IndexPutFallback (#162863 ) # Feature This PR supports lowering `IndexPutFallback` through Inductor's FX converter. The approach is very similar to the one taken in https://github.com/pytorch/pytorch/pull/162686. Compared to `ScatterFallback`, this required one additional change: the value of `self.op_overload` for `IndexPutFallback` was inaccurate. Previously, it used `aten.index_put`, which would result in unsound FX IR. The existing Python/C++ codegen use `aten.index_put_`, since the fallback mutates its input. This PR changes `self.op_overload` to match that. # Test plan Added a CI test lowering deterministic index put via the FX converter. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162863 Approved by: https://github.com/angelayi	2025-09-16 08:52:47 +00:00
FFFrog	de143bf79b	[C10d] Code clean for torch.distributed.init_process_group (#163038 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163038 Approved by: https://github.com/msaroufim	2025-09-16 08:15:25 +00:00
Eddie Yan	fb1e0321da	[CUDA] fix shared memory race in `reduce_kernel` (#162995 ) Reported by compute-sanitizer, otherwise it looks like `block_y_reduce` and `block_x_reduce` both use `shared_memory` for temporaries without synchronization between them reproduces in e.g., `compute-sanitizer --tool=racecheck python test/test_matmul_cuda.py -k test_scaled_mm_vs_emulated_block_wise_float32_lhs_block_128_rhs_block_1_cuda` (note that this test requires H100 to run unless only the non-emulated (cuBLAS impl.) is commented out) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162995 Approved by: https://github.com/msaroufim	2025-09-16 07:53:21 +00:00
Sherlock Huang	f8d379d29e	[DTensor] Introduce DebugMode (#162665 ) Introduce a lightweight TorchDispatchMode for understanding the magic behind DTensor. - Tracks redistribution, see `redistribute_input(input_idx, from_placement, to_placement)` - Optionally tracks torch-level functions, via `__torch_function__` - Optionally tracks FakeTensor operations, which was needed for propagating tensor meta as a step of sharding propagation - Optionally tracks real tensor operations, including functional c10d op, and regular ops - Calls are shown in the hierarchical structure! - shorthand representation - dt: DTensor, ft: FakeTensor, t: Tensor - DM(2, 2) == DeviceMesh(shape = [2, 2]) - [R, P, S(0)] == Placement[Replicate, Partial, Shard(0)] - f32[8,8] == float32 with shape[8, 8] ``` debug_mode = DTensorDebugMode(record_faketensor=False, record_realtensor=True) with debug_mode: torch.mm(x_dtensor, y_dtensor) print(debug_mode.debug_string()) ``` produces: ``` torch.mm(dt: f32[8, 8][S(0)], dt: f32[8, 32][S(0)]) aten::mm(dt: f32[8, 8][S(0)], dt: f32[8, 32][S(0)]) redistribute_input(1, [S(0)], [R]) _c10d_functional::all_gather_into_tensor(t: f32[1, 32], 8, 0) _c10d_functional::wait_tensor(t: f32[8, 32]) aten::mm(t: f32[1, 8], t: f32[8, 32]) ``` Another example, for torch.einsum ``` torch.functional.einsum(bld,dnh->blnh, dt: f32[16, 6, 8][P, R], dt: f32[8, 4, 4][R, P]) aten::unsqueeze(dt: f32[16, 6, 8][P, R], 3) aten::unsqueeze(t: f32[16, 6, 8], 3) aten::unsqueeze(dt: f32[16, 6, 8, 1][P, R], 4) aten::unsqueeze(t: f32[16, 6, 8, 1], 4) aten::permute(dt: f32[16, 6, 8, 1, 1][P, R], [0, 1, 3, 4, 2]) aten::permute(t: f32[16, 6, 8, 1, 1], [0, 1, 3, 4, 2]) aten::unsqueeze(dt: f32[8, 4, 4][R, P], 3) aten::unsqueeze(t: f32[8, 4, 4], 3) aten::unsqueeze(dt: f32[8, 4, 4, 1][R, P], 4) aten::unsqueeze(t: f32[8, 4, 4, 1], 4) aten::permute(dt: f32[8, 4, 4, 1, 1][R, P], [3, 4, 1, 2, 0]) aten::permute(t: f32[8, 4, 4, 1, 1], [3, 4, 1, 2, 0]) aten::permute(dt: f32[16, 6, 1, 1, 8][P, R], [0, 1, 4, 2, 
3]) aten::permute(t: f32[16, 6, 1, 1, 8], [0, 1, 4, 2, 3]) aten::view(dt: f32[16, 6, 8, 1, 1][P, R], [1, 96, 8]) aten::view(t: f32[16, 6, 8, 1, 1], [1, 96, 8]) aten::permute(dt: f32[1, 1, 4, 4, 8][R, P], [4, 2, 3, 0, 1]) aten::permute(t: f32[1, 1, 4, 4, 8], [4, 2, 3, 0, 1]) aten::view(dt: f32[8, 4, 4, 1, 1][R, P], [1, 8, 16]) aten::view(t: f32[8, 4, 4, 1, 1], [1, 8, 16]) aten::bmm(dt: f32[1, 96, 8][P, R], dt: f32[1, 8, 16][R, P]) redistribute_input(0, [P, R], [S(2), S(2)]) aten::chunk(t: f32[1, 96, 8], 4, 2) aten::cat(['t: f32[1, 96, 2]', 't: f32[1, 96, 2]', 't: f32[1, 96, 2]', 't: f32[1, 96, 2]']) _c10d_functional::reduce_scatter_tensor(t: f32[4, 96, 2], sum, 4, 2) aten::clone(t: f32[1, 96, 1]) redistribute_input(1, [R, P], [S(1), S(1)]) aten::chunk(t: f32[1, 8, 16], 4, 1) aten::clone(t: f32[1, 2, 16]) aten::chunk(t: f32[1, 2, 16], 2, 1) aten::cat(['t: f32[1, 1, 16]', 't: f32[1, 1, 16]']) _c10d_functional::reduce_scatter_tensor(t: f32[2, 1, 16], sum, 2, 3) _c10d_functional::wait_tensor(t: f32[1, 1, 16]) aten::bmm(t: f32[1, 96, 1], t: f32[1, 1, 16]) aten::view(dt: f32[1, 96, 16][P, P], [16, 6, 1, 4, 4]) aten::view(t: f32[1, 96, 16], [16, 6, 1, 4, 4]) aten::permute(dt: f32[16, 6, 1, 4, 4][P, P], [0, 1, 3, 4, 2]) aten::permute(t: f32[16, 6, 1, 4, 4], [0, 1, 3, 4, 2]) aten::view(dt: f32[16, 6, 4, 4, 1][P, P], [16, 6, 4, 4]) aten::view(t: f32[16, 6, 4, 4, 1], [16, 6, 4, 4]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162665 Approved by: https://github.com/ezyang	2025-09-16 07:30:05 +00:00
Nicolas De Carli	2459da4a64	[Caffe2] Add float batch box cox SVE128 implementation (#159778 ) Introduce SVE128 SIMD batch box-cox computation. We've seen about 65% throughput improvement. Privacy Context Container: L1196524 This is a no-op from OSS point of view, therefore it could be landed without tests (see precedent set by https://github.com/pytorch/pytorch/pull/143627), but we should delete those at some point Pull Request resolved: https://github.com/pytorch/pytorch/pull/159778 Approved by: https://github.com/malfet	2025-09-16 07:25:04 +00:00
angelayi	76fa381eef	[mps] Take into account offset (#163021 ) Fixes issue when running AOTI + MPS on voxtral model Pull Request resolved: https://github.com/pytorch/pytorch/pull/163021 Approved by: https://github.com/malfet	2025-09-16 07:14:33 +00:00
can-gaa-hou	29ea6254a0	[Bug] Add more boundary check for FractionalMaxPool3d (#161876 ) This PR aims to fix the bug mentioned at [#161853](https://github.com/pytorch/pytorch/issues/161853#issuecomment-3240695121) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161876 Approved by: https://github.com/malfet	2025-09-16 06:59:02 +00:00
Kevin Fu	d2ecddf1a3	[PT2]: Overriding Tensor device by SubmodNameToDevice (#162144 ) Summary: A temporary solution mainly for weights that are not moved to cuda in fake mode during publishing, but run on cuda in serving. This has some overlap with placement, but with 2 differences: 1. OverrideWeightsDevice only changes weights, not graph. 2. Placement only handles mapping between non-empty cuda indices, while here we override everything as submodNameToDevice is the ground truth. Test Plan: ICE replayer with custom package: https://www.internalfb.com/intern/unidash/dashboard/ads_infra_cost_estimation/model_infra_cost_estimation/?e[select_ESTIMATION_RUN_ID]=ICE_kevinqfu_1756939411c164_replayeripnext_00 Rollback Plan: Differential Revision: D81284723 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162144 Approved by: https://github.com/henryoier, https://github.com/SherlockNoMad	2025-09-16 06:56:06 +00:00
Shangdi Yu	1115749da7	Fix provenance tracking kernel name for fallback kernels (#162628 ) Summary: as title `kernel.cpp_kernel_name` is something like `at::_ops::_scaled_dot_product_efficient_attention::call`, but the actual kernel name we want is `aoti_torch_cuda__scaled_dot_product_efficient_attention` Differential Revision: D82142287 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162628 Approved by: https://github.com/angelayi, https://github.com/desertfire	2025-09-16 06:56:00 +00:00
Cui, Yifeng	9786243b64	Update torch-xpu-ops commit pin (#162804 ) Update the torch-xpu-ops commit to [intel/torch-xpu-ops@d8c3ee](`d8c3eefc29`), includes: - Optimize adaptive average pool for channel-last memory format - Add unregister wait_tensor - Replace deprecated `[[intel::reqd_sub_group_size(SgSize)]]` with `[[sycl::reqd_sub_group_size(SIMD)]]` and remove unnecessary attributes - Revert "Roll back to original usage of sycl::get_kernel_bundle" Pull Request resolved: https://github.com/pytorch/pytorch/pull/162804 Approved by: https://github.com/EikanWang	2025-09-16 06:30:48 +00:00
Animesh Jain	9009c4da39	[functional] Avoid duplicate custom get_device call in constructor (#162889 ) Trying to reduce the number of `__torch_dispatch__` calls of FakeTensorMode in the AOT metadata collection pass. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162889 Approved by: https://github.com/Lucaskabela, https://github.com/zou3519	2025-09-16 05:00:19 +00:00
Scott Rostrup	b68a5115a4	Workaround for mtia double init issue in has_triton (#162974 ) Summary: This change adds a new environment variable (`TORCHINDUCTOR_TRITON_DISABLE_DEVICE_DETECTION`) and configuration in `torch._inductor.config` which can be set to `"1"` to allow a user to disable triton's device detection logic in [torch/utils/_triton.py:has_triton()](`c9e57d7e9f/torch/utils/_triton.py (L128)`). This function is used at import scope in several places but the function has a side effect of initializing the mtia device if it is available which is causing some of our autotuning workflows to crash. Worth noting that when enabled this configuration disables all device detection not just mtia and this is because the logic in has_triton will initialize the mtia device as a side effect even when checking for a cuda or other device via the [get_interface_for_device()](`c9e57d7e9f/torch/_dynamo/device_interface.py (L570)`) function. I've tagged it `topic: not user facing` since I don't anticipate any outside of meta users making use of this, however this is my first PR here, so please indicate if it should be handled differently. Test Plan: This has been tested in the context of internal workflows. Differential Revision: D82347853 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162974 Approved by: https://github.com/xmfan	2025-09-16 04:46:11 +00:00
Phillip Liu	2c45628813	[Flight Recorder][WP] Added mismatch tail as an arg (#162991 ) Summary: Mismatch tail is used as a fixed variable, and there are cases where there are more than 10 mismatches and FR gives up producing results (e.g. https://fburl.com/ai_infra/7gjl5ucb). This diff adds the mismatch tail to the parsed args to make this configurable. Also, although the variable name is `mismatch_tail` (last 10), it is used as `mismatch_head` (the first 10). Updated it to be `num_mismatch_to_print` Test Plan: `buck2 run @//mode/opt //caffe2/fb/flight_recorder:fr_trace -- --mast_job_id aps-ctx_fm_pipeline_change-1c8ea38a94 --mast_job_version 0 --mast_job_attempt 2 --bucket tlcm_log_blob --world_size 128 --dump_file_name_offset 0 --allow-incomplete-ranks --num_mismatch_to_print 20 1>out 2>err` Confirm no error and output 20 mismatches. Rollback Plan: Differential Revision: D82335995 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162991 Approved by: https://github.com/fduwjj	2025-09-16 04:46:05 +00:00
PyTorch UpdateBot	6c0fd747af	[vllm hash update] update the pinned vllm hash (#162928 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162928 Approved by: https://github.com/pytorchbot	2025-09-16 04:25:04 +00:00
Nicolas Macchioni	d172d0231b	[pcache] Cache and AsyncCache implementations (#162777 ) Summary: Implemented caching abstractions: `Cache` and `AsyncCache`. `Cache` provides an abstraction for defining simple key -> value stores with get and put functionality. We propose using `Cache` for implementations with very low (microseconds) overhead, for example an in-memory cache. `AsyncCache` provides an abstraction for defining simple key -> value stores with asynchronous get and put functionality. We propose using `AsyncCache` for implementations with medium to high (> millisecond) overhead, for example an on-disk cache. We provide an initial extension of `Cache` in the form of `InMemoryCache`. `InMemoryCache` provides fast, in-memory caching that can be later used to memoize more expensive cache accesses. `InMemoryCache` also provides a custom constructor `InMemoryCache.from_env_var` that can be used to pre-populate the in-memory cache, which will be helpful for enabling determinism in the future. We also provide extensions of `AsyncCache`. `OnDiskCache` subclasses `AsyncCache` and serves as a generic on-disk caching implementation with atomic, write-once guarantees. `OnDiskCache` is semi-generic, allowing subclassing to alter the output directory. `InductorOnDiskCache` subclasses `OnDiskCache` to create an Inductor-specific on-disk cache that outputs to Inductor's default caching directory. Test Plan: `Cache` Tests: 1. Get -> Set -> Get - Checks that `get(key)` returns `None` when `key` is not cached, and that after calling `put(key, value)` subsequent `get(key)` calls return `value` 2. Set -> Set - Checks that with duplicated `set(key, value)` calls only the initial call is successful 3. From env var - Checks that constructing an `InMemoryCache` from an environment variable works. `AsyncCache` Tests: 1. Get -> Set -> Get - Same as `Cache` test, but checks both with synchronous and asynchronous execution 2. 
Set -> Set - Same as `Cache` test, but checks both with synchronous and asynchronous execution 3. Set -> Set Concurrent - Checks that of two concurrent `set(key, value)` operations, only one passes ``` cd ~/fbsource/fbcode && buck test mode/opt //caffe2/test/inductor:pcache ``` {F1981926248} Rollback Plan: Differential Revision: D82269762 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162777 Approved by: https://github.com/masnesral, https://github.com/aorenste	2025-09-16 04:07:12 +00:00
Justin Chu	fdf68fa5d7	[ONNX] Fix rotary_embedding_23 implementation (#162865 ) The implementation of rotary_embedding_23 when input is 3D was incorrect. ## Tested Locally with ```py import onnx_ir as ir import onnx import torch import os import numpy as np base_path = "/home/justinchu/dev/onnx/onnx/backend/test/data/node" test_names = [ "test_rotary_embedding", "test_rotary_embedding_3d_input", "test_rotary_embedding_interleaved", "test_rotary_embedding_no_position_ids", "test_rotary_embedding_no_position_ids_interleaved", "test_rotary_embedding_no_position_ids_rotary_dim", "test_rotary_embedding_with_interleaved_rotary_dim", "test_rotary_embedding_with_rotary_dim", ] model_paths = [os.path.join(base_path, name) for name in test_names] for path in model_paths: print(f"Checking {path} for issues...") model = onnx.load(os.path.join(path, "model.onnx")) input0 = ir.from_proto( onnx.load_tensor(os.path.join(path, "test_data_set_0", "input_0.pb")) ).numpy() input1 = ir.from_proto( onnx.load_tensor(os.path.join(path, "test_data_set_0", "input_1.pb")) ).numpy() input2 = ir.from_proto( onnx.load_tensor(os.path.join(path, "test_data_set_0", "input_2.pb")) ).numpy() if os.path.exists(os.path.join(path, "test_data_set_0", "input_3.pb")): input3 = ir.from_proto( onnx.load_tensor(os.path.join(path, "test_data_set_0", "input_3.pb")) ).numpy() else: input3 = None output0 = ir.from_proto( onnx.load_tensor(os.path.join(path, "test_data_set_0", "output_0.pb")) ).numpy() m = ir.from_proto(model) node = m.graph[-1] print(node) assert node.op_type == "RotaryEmbedding" interleaved = node.attributes.get_int("interleaved", 0) num_heads = node.attributes.get_int("num_heads", 0) rotary_embedding_dim = node.attributes.get_int("rotary_embedding_dim", 0) torch_out = torch.onnx.ops.rotary_embedding( torch.tensor(input0), torch.tensor(input1), torch.tensor(input2), position_ids=torch.tensor(input3) if input3 is not None else None, interleaved=bool(interleaved), num_heads=num_heads, 
rotary_embedding_dim=rotary_embedding_dim, ) torch_out = torch_out.detach().cpu().numpy() np.testing.assert_allclose(torch_out, output0) ``` Fix https://github.com/pytorch/pytorch/issues/162848 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162865 Approved by: https://github.com/kunal-vaishnavi, https://github.com/titaiwangms	2025-09-16 03:30:05 +00:00
Ke Wen	7924b083c1	[CI] disable rerun of distributed tests (#163025 ) #162978 identified an issue that distributed test failures were wrongly muted. Per discussion with @malfet, one solution is to disable rerun of distributed tests in `run_test.py`. The PR makes use of the `is_distributed_test` flag to identify those tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163025 Approved by: https://github.com/malfet	2025-09-16 03:11:50 +00:00
Kevin Tang	3ae31782cc	[DCP] Add timeout for checkpoint background process join (#162828 ) Summary: Cleaning up checkpoint background process can currently block trainer thread indefinitely if the process is hanging (notably due to Gloo pg init timeout). This diff adds a 5s grace period for normal termination and sends SIGTERM if unable to shut down in that period. Rollback Plan: Differential Revision: D82268979 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162828 Approved by: https://github.com/meetv18	2025-09-16 02:32:50 +00:00
Jeff Daily	c7fa16a05c	[ROCm][CI] update _rocm-test.yml based on _linux-test.yml (#163014 ) Fixes missing huggingface secrets and aligns _rocm-test.yml with other updates from _linux-test.yml that it was initially based on. Pull Request resolved: https://github.com/pytorch/pytorch/pull/163014 Approved by: https://github.com/huydhn	2025-09-16 02:14:38 +00:00
CaoE	1aa41eccc2	[Inductor][CPP] Reuse the pre-existing kernel for the same kernels (#158404 ) Reuse the pre-existing kernel to avoid defining redundant kernels. Inductor CPP will generate same kernels. For example: ``` # Example class Model(torch.nn.Module): def __init__(self, K, N): super().__init__() self.linear0 = torch.nn.Linear(K, N) self.linear1 = torch.nn.Linear(N, K) self.linear2 = torch.nn.Linear(K, N) def forward(self, input): out = self.linear0(input) out = self.linear1(out) out = self.linear2(out) return out ``` For the above example, linear2 is same as linear0, and Inductor CPP generates 2 same kernels: cpp_fused_addmm_0 and cpp_fused_addmm_2. ``` # Generated code: ... cpp_fused_addmm_0 = async_compile.cpp_pybinding(['const at::BFloat16', 'const at::BFloat16', 'const at::BFloat16', 'at::BFloat16'], ''' ... extern "C" void kernel(const at::BFloat16* X, const at::BFloat16* W, const at::BFloat16* inp, at::BFloat16* Y) { constexpr int64_t num_threads = 32; constexpr int64_t N = 1024; constexpr int64_t K = 2048; constexpr int64_t Mr = 32; constexpr int64_t Nr = 32; constexpr int64_t Kr = 32; ... cpp_fused_addmm_1 = async_compile.cpp_pybinding(['const at::BFloat16', 'const at::BFloat16', 'const at::BFloat16', 'at::BFloat16'], ''' ... extern "C" void kernel(const at::BFloat16* X, const at::BFloat16* W, const at::BFloat16* inp, at::BFloat16* Y) { constexpr int64_t num_threads = 32; constexpr int64_t N = 2048; constexpr int64_t K = 1024; constexpr int64_t Mr = 32; constexpr int64_t Nr = 32; constexpr int64_t Kr = 32; ... cpp_fused_addmm_2 = async_compile.cpp_pybinding(['const at::BFloat16', 'const at::BFloat16', 'const at::BFloat16', 'at::BFloat16'], ''' extern "C" void kernel(const at::BFloat16* X, const at::BFloat16* W, const at::BFloat16* inp, at::BFloat16* Y) { constexpr int64_t num_threads = 32; constexpr int64_t N = 1024; constexpr int64_t K = 2048; constexpr int64_t Mr = 32; constexpr int64_t Nr = 32; constexpr int64_t Kr = 32; ... 
def call(self, args): arg6_1, = args args.clear() buf0 = empty_strided_cpu((1024, 1024), (1024, 1), torch.bfloat16) cpp_fused_addmm_0(arg6_1, constant6, _frozen_param6, buf0) del arg6_1 buf1 = empty_strided_cpu((1024, 2048), (2048, 1), torch.bfloat16) cpp_fused_addmm_1(buf0, constant6_0, _frozen_param8, buf1) buf2 = buf0; del buf0 # reuse cpp_fused_addmm_2(buf1, constant6_1, _frozen_param10, buf2) return (buf2, ) ``` After reusing the pre-existing kernel, Inductor CPP will reuse cpp_fused_addmm_0. ``` cpp_fused_addmm_0 = async_compile.cpp_pybinding(['const at::BFloat16', 'const at::BFloat16', 'const at::BFloat16', 'at::BFloat16'], ''' ... extern "C" void kernel(const at::BFloat16* X, const at::BFloat16* W, const at::BFloat16* inp, at::BFloat16* Y) { constexpr int64_t num_threads = 32; constexpr int64_t N = 1024; constexpr int64_t K = 2048; constexpr int64_t Mr = 32; constexpr int64_t Nr = 32; constexpr int64_t Kr = 32; ... cpp_fused_addmm_1 = async_compile.cpp_pybinding(['const at::BFloat16', 'const at::BFloat16', 'const at::BFloat16', 'at::BFloat16'], ''' ... extern "C" void kernel(const at::BFloat16* X, const at::BFloat16* W, const at::BFloat16* inp, at::BFloat16* Y) { constexpr int64_t num_threads = 32; constexpr int64_t N = 2048; constexpr int64_t K = 1024; constexpr int64_t Mr = 32; constexpr int64_t Nr = 32; constexpr int64_t Kr = 32; ... def call(self, args): arg6_1, = args args.clear() buf0 = empty_strided_cpu((1024, 1024), (1024, 1), torch.bfloat16) cpp_fused_addmm_0(arg6_1, constant6, _frozen_param6, buf0) del arg6_1 buf1 = empty_strided_cpu((1024, 2048), (2048, 1), torch.bfloat16) cpp_fused_addmm_1(buf0, constant6_0, _frozen_param8, buf1) buf2 = buf0; del buf0 # reuse cpp_fused_addmm_0(buf1, constant6_1, _frozen_param10, buf2) return (buf2, ) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158404 Approved by: https://github.com/jansel, https://github.com/leslie-fang-intel	2025-09-16 01:54:24 +00:00
Huy Do	61be0f1c11	Set the credential to upload vLLM nightly wheels on schedule and workflow_dispatch (#163018 ) The build is ok, but uploading is failing at the moment https://github.com/pytorch/pytorch/actions/runs/17734972779/job/50416387786 Pull Request resolved: https://github.com/pytorch/pytorch/pull/163018 Approved by: https://github.com/wdvr, https://github.com/malfet	2025-09-16 01:46:59 +00:00
Laith Sakka	48dbd60df4	are_strides_like_channels_last_or_false (#162354 ) Note: this could change suggest_memory_format behaviour for unbacked: we used to return True for are_strides_like_channels_last sometimes even when the result was undecided; now, when it's not decided, we return False. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162354 Approved by: https://github.com/aorenste	2025-09-16 00:49:05 +00:00
Simon Fan	505458db80	[dynamo][hop] Introduce Local Map HOP (#161458 ) Can't actually deploy it because of: https://github.com/pytorch/pytorch/issues/161456 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161458 Approved by: https://github.com/ydwu4	2025-09-16 00:37:40 +00:00
PaliC	05ee8114f8	[BE] Make PyObjectSlot use a global PyInterpreter (#162659 ) This pr gets rid of the pyobj_interpreter_ variable from PyObjectSlot and saves a word in the process Gonna ask for review from @huydhn as there are some changes to CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162659 Approved by: https://github.com/albanD, https://github.com/huydhn	2025-09-16 00:37:09 +00:00
Michael Kelly	e900a274e5	Add `CUDA_KERNEL_ASSERT_PRINTF`, a more flexible `CUDA_KERNEL_ASSERT_MSG` (#160129 ) This new assertion helper bundles a printf call with the assertion. The goal is to make changes to instrument asserts with device-side information more intuitive and less error-prone. (See the printf call in ATen/native/cuda/Repeat.cu.) Parametrized error messages are a substantial improvement in debuggability because they show the mismatched device-side values. This lets us avoid a whole cycle of rebuilding + re-running failing training workflows. We include file, line number, function, and failing condition in the printf (along with the message provided by the user). The format matches the format of the message output by `__assert_fail`. There's also an easy-to-grep-for keyword `CUDA_KERNEL_ASSERT` in the message. I'm following the existing patterns of arch-specific macros - e.g., on ROCm, this is just a call to abort(), just like the other `CUDA_KERNEL_ASSERT` variations. I'd appreciate any thoughts on architecture-specific testing (most likely on the OSS side). # Alternatives We could just update `CUDA_KERNEL_ASSERT_MSG`. That would mean introducing `printf` calls from the kernel where there weren't any before, though. This seems like a bad idea because of the performance sensitivity. * If we want to move more slowly here, I could instrument more `CUDA_KERNEL_ASSERT` callsites without a macro, similar to https://github.com/pytorch/pytorch/pull/157996. But the main downside here is the performance hit, so let's have an organized way of doing it first. # Risks/Problems * We're shoving a lot of stuff into this printf. If a filename (at compile-time) contains `%s`, we will end up dereferencing whatever value was pushed in. On a CPU this can cause a segfault. I don't know how it behaves on a GPU. * Adding printf calls can have a performance impact because of increased register and stack usage. I did not see this play out in practice (see "benchmarks" below). 
However, there are changes to the generated PTX that could result in performance problems later (see "changes in generated PTX" below). # Benchmarks * I ran the following benchmarks a several times on a host with an A100: https://gist.github.com/mjkatmeta/e5494d949204a2afe2d43c452b99424f * Results are here -- I couldn't find a significant difference before or after https://gist.github.com/mjkatmeta/0f99ec27bb91214fb2cc7f612938d431 # Change in generated PTX This is the easiest way I found to run nvcc over just Repeat.cu (this is a buck2 target that includes just a copy of Repeat.cu): ``` buck2 build --show-output scripts/mjk/ai_training/cuda_benchmarks:repeat_cuda # then use the printed .so file like this: ~/fbsource/third-party/cuda/cuda_12.8.0/x64-linux/bin/cuobjdump -ptx ../buck-out/v2/gen/fbcode/028bde1acfaba823/scripts/mjk/ai_training/cuda_benchmarks/__repeat_cuda__/libscripts_mjk_ai_training_cuda_benchmarks_repeat_cuda.so ``` ## with printf This is the version of the code that appears in this diff: https://gist.github.com/mjkatmeta/5d18d48282d46b2240d946b335052b9a ## without printf I recompiled, replacing `CUDA_KERNEL_ASSERT_PRINTF(...)` in Repeat.cu with: ``` CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1]); ``` https://gist.github.com/mjkatmeta/480df4b3a122e7b326554dd15ebb7c9d (Both of these are annotated with `// CHAR ARRAY:` comments to make the string constants easier to read.) Test Plan: Running this minimal test case: ``` import torch def main(): x = torch.ones(10, dtype=torch.int64, device="cuda:0") torch.repeat_interleave(x, x, output_size=0) ``` Now we see the new message (from printf) alongside the assert failure: ``` $ buck2 run fbcode//scripts/darshanr/repeat_interleave_errors:repeat_interleave_errors [...] [CUDA_KERNEL_ASSERT] fbcode/caffe2/aten/src/ATen/native/cuda/Repeat.cu:25: compute_cuda_kernel: block: [0,0,0], thread: [31,0,0]: Assertion failed: `result_size == cumsum_ptr[size - 1]`: Invalid input! 
In `repeat_interleave`, the `output_size` argument (0) must be the same as the sum of the elements in the `repeats` tensor (10). fbcode/caffe2/aten/src/ATen/native/cuda/Repeat.cu:25: compute_cuda_kernel: block: [0,0,0], thread: [384,0,0] Assertion `result_size == cumsum_ptr[size - 1]` failed. [...[ ``` Rollback Plan: Reviewed By: mradmila Differential Revision: D79310684 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160129 Approved by: https://github.com/ngimel	2025-09-16 00:23:48 +00:00
drisspg	d08cabe314	[BC Breaking] Remove flex + njt code paths (#161734 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161734 Approved by: https://github.com/jbschlosser	2025-09-16 00:13:56 +00:00
Chien-Chin Huang	dac6a4bf6c	[CP] Fix the CP FlexAttention test (#162518 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162518 Approved by: https://github.com/XilunWu, https://github.com/drisspg	2025-09-16 00:12:26 +00:00
Kushagra Rastogi	cfc539fe15	Improved error lr last epoch (#162368 ) Fixes #160626 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162368 Approved by: https://github.com/janeyx99	2025-09-15 23:33:14 +00:00
Nick Riasanovsky	955e195c7d	[Triton] [Inductor] Add a Blackwell specific Template for persistent matmul (#162916 ) Summary: This adds the Triton Tutorial Matmul persistent matmul with device side TMA for Blackwell and adds it as a template option for blackwell. This uses newer Triton features such as automatic warp specialization and loop flattening, which while still containing flaws can improve performance on blackwell. This does not include the Epilogue subtiling section, as that will be a followup PR. This PR doesn't include any tuning. I am doing a larger benchmarking run to determine the best initial configs for tuning and will open a followup PR with better defaults soon. Test Plan: Tested on a Blackwell machine with test_max_autotune.py and confirmed the new tests pass. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162916 Approved by: https://github.com/NikhilAPatel	2025-09-15 23:23:04 +00:00
Isuru Fernando	c77726b1d7	[inductor] fix expand_shape when copy_shape is not a string (#162739 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162739 Approved by: https://github.com/eellison, https://github.com/mlazos	2025-09-15 23:22:07 +00:00
Scott Wolchok	6b608dfe81	Add DISABLE_JUSTKNOBS to torch/_utils_internal.py and use it for dynamo _maybe_set_eval_frame (#162298 ) If JustKnobs is disabled (as it always is in OSS), we can easily avoid an extra layer of Python function call. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162298 Approved by: https://github.com/ezyang	2025-09-15 23:00:39 +00:00
Mark Saroufim	090e6838a0	compile_kernel enable pch (#162972 ) Enabling automatic pre compiled headers per https://docs.nvidia.com/cuda/nvrtc/index.html#example-automatic-pch-cuda-12-8 I'm seeing large speedups in compilation times using PCH on average but the max compilation time with PCH is worst which is why I can't enable it by default. `load_inline()` also supports precompiled headers and does not enable them by default ``` Without PCH: 270.58 ms average With PCH: 115.27 ms average ``` ``` Without PCH: Max: 337.99 ms With PCH: Max: 383.82 ms ``` ```python source) [marksaroufim@devgpu005]~/pytorch% python simple_pch_benchmark.py ============================================================ Simple PCH Compilation Benchmark ============================================================ Device: NVIDIA B200 Iterations: 100 Testing WITHOUT PCH: ------------------------------ Compiling kernel 100 times WITHOUT PCH... Completed 10/100 compilations Completed 20/100 compilations Completed 30/100 compilations Completed 40/100 compilations Completed 50/100 compilations Completed 60/100 compilations Completed 70/100 compilations Completed 80/100 compilations Completed 90/100 compilations Completed 100/100 compilations Average: 270.58 ms (±6.99 ms) Min: 264.09 ms Max: 337.99 ms Testing WITH PCH: ------------------------------ Compiling kernel 100 times WITH PCH... 
Completed 10/100 compilations Completed 20/100 compilations Completed 30/100 compilations Completed 40/100 compilations Completed 50/100 compilations Completed 60/100 compilations Completed 70/100 compilations Completed 80/100 compilations Completed 90/100 compilations Completed 100/100 compilations Average: 115.27 ms (±27.32 ms) Min: 110.65 ms Max: 383.82 ms ``` ## Benchmarking script ```python #!/usr/bin/env python3 import argparse import os import sys import time from statistics import mean, stdev import torch from torch.cuda._utils import _nvrtc_compile def benchmark_compilation(use_pch, iterations=100): """Compile the same kernel many times with or without PCH.""" # CUB kernel that benefits from PCH kernel_source = """ #include <cub/block/block_reduce.cuh> #include <cub/block/block_scan.cuh> #include <cub/warp/warp_reduce.cuh> extern "C" __global__ void test_kernel(const float* input, float* output, int n) { using BlockReduce = cub::BlockReduce<float, 256>; using BlockScan = cub::BlockScan<float, 256>; using WarpReduce = cub::WarpReduce<float>; __shared__ union { typename BlockReduce::TempStorage reduce; typename BlockScan::TempStorage scan; typename WarpReduce::TempStorage warp[8]; } temp_storage; int idx = blockIdx.x * blockDim.x + threadIdx.x; float val = (idx < n) ? input[idx] : 0.0f; float sum = BlockReduce(temp_storage.reduce).Sum(val); __syncthreads(); float scan_result; BlockScan(temp_storage.scan).ExclusiveSum(val, scan_result); __syncthreads(); int warp_id = threadIdx.x / 32; float warp_sum = WarpReduce(temp_storage.warp[warp_id]).Sum(val); if (threadIdx.x == 0) { output[blockIdx.x] = sum + scan_result + warp_sum; } } """ device = torch.cuda.current_device() major, minor = torch.cuda.get_device_capability(device) compute_capability = f"{major}{minor}" compile_times = [] print( f"Compiling kernel {iterations} times {'WITH' if use_pch else 'WITHOUT'} PCH..." 
) for i in range(iterations): # Use unique kernel name to avoid caching between iterations kernel_name = f"test_kernel_{i}" unique_source = kernel_source.replace("test_kernel", kernel_name) start = time.perf_counter() ptx, mangled_name = _nvrtc_compile( unique_source, kernel_name, compute_capability, header_code="", nvcc_options=["-std=c++17"], auto_pch=use_pch, ) elapsed = time.perf_counter() - start compile_times.append(elapsed * 1000) # Convert to ms # Progress indicator if (i + 1) % 10 == 0: print(f" Completed {i + 1}/{iterations} compilations") return compile_times def main(): parser = argparse.ArgumentParser(description="Simple PCH Compilation Benchmark") parser.add_argument("--pch", action="store_true", help="Test with PCH only") parser.add_argument("--no-pch", action="store_true", help="Test without PCH only") parser.add_argument( "--iterations", type=int, default=100, help="Number of compilations" ) args = parser.parse_args() print("=" * 60) print("Simple PCH Compilation Benchmark") print("=" * 60) print(f"Device: {torch.cuda.get_device_name()}") print(f"Iterations: {args.iterations}") print() # Determine what to test test_both = not args.pch and not args.no_pch results = {} # Test without PCH if args.no_pch or test_both: print("Testing WITHOUT PCH:") print("-" * 30) times_no_pch = benchmark_compilation(use_pch=False, iterations=args.iterations) if times_no_pch: avg_no_pch = mean(times_no_pch) std_no_pch = stdev(times_no_pch) if len(times_no_pch) > 1 else 0 print(f"Average: {avg_no_pch:.2f} ms (±{std_no_pch:.2f} ms)") print(f"Min: {min(times_no_pch):.2f} ms") print(f"Max: {max(times_no_pch):.2f} ms") results["no_pch"] = avg_no_pch print() # Test with PCH if args.pch or test_both: print("Testing WITH PCH:") print("-" * 30) times_with_pch = benchmark_compilation( use_pch=True, iterations=args.iterations ) if times_with_pch: avg_with_pch = mean(times_with_pch) std_with_pch = stdev(times_with_pch) if len(times_with_pch) > 1 else 0 print(f"Average: 
{avg_with_pch:.2f} ms (±{std_with_pch:.2f} ms)") print(f"Min: {min(times_with_pch):.2f} ms") print(f"Max: {max(times_with_pch):.2f} ms") results["pch"] = avg_with_pch print() if __name__ == "__main__": main() ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162972 Approved by: https://github.com/albanD, https://github.com/janeyx99	2025-09-15 22:55:39 +00:00
Scott Wolchok	cf7873ea8b	Placement: make is_shard/is_replicate/is_partial more straightforward (#162619 ) We already have method dispatch based on actual type, so just provide appropriate base class and subclass method implementations. (This is not motivated by any particular performance profiling; it just seems more straightforward to me.) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162619 Approved by: https://github.com/ezyang, https://github.com/tianyu-l, https://github.com/zpcore	2025-09-15 22:54:06 +00:00
Jeff Daily	0def79fdd9	[ROCm] fix conv relu fusion (#162856 ) Fixes #162816. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162856 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-15 22:49:32 +00:00
Davide Italiano	8590c3a66b	[DTensor] Add _foreach_pow to sharding propagation list. (#162895 ) Fixes #152696 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162895 Approved by: https://github.com/ezyang	2025-09-15 21:14:06 +00:00
Shivam Raikundalia	dae5beae8e	[RecordFunction] Add Scope for Record Function Fast (#162661 ) Differential Revision: D82164587 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162661 Approved by: https://github.com/davidberard98	2025-09-15 21:01:47 +00:00
Jagadish Krishnamoorthy	01c3c891c1	[ROCm] Enable test_fixed_striding (#162787 ) Enable the distributed test test_fixed_striding on gfx arch which supports fp8. Test command: python test/distributed/test_c10d_functional_native.py -k test_fixed_striding Pull Request resolved: https://github.com/pytorch/pytorch/pull/162787 Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily	2025-09-15 20:23:43 +00:00
Edward Yang	1247dde1f2	[BE] Improve pytest summary display for OpInfo tests (#162961 ) pytest summarizes test failures by printing a truncated first line of the test of the OUTERMOST wrapped exception. Prior to this PR, it looked like this: ``` FAILED [0.0454s] test/distributed/tensor/test_dtensor_ops.py::TestLocalDTensorOpsCPU::test_dtensor_op_db_H_cpu_float32 - Exception: Caused by sample input at index 0: SampleInput(input=Tensor[size=(12, 12), device="cpu", dtype=torch.float32], args=(), kwargs={}, ... ``` I argue this is not so useful. If I have a lot of test failures, I look to the test summary to understand what /kind/ of errors I have, so I can assess which ones I should look at first. In other words, this is better: ``` FAILED [0.1387s] test/distributed/tensor/test_dtensor_ops.py::TestLocalDTensorOpsCPU::test_dtensor_op_db__softmax_backward_data_cpu_float32 - Exception: Tensor-likes are not close! ``` Now I know specifically this is a numerics problem! This PR does it by prepending the old exception text to the wrapped exception. This is slightly redundant, as we are exception chaining, but it does the job. Open to bikeshedding. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162961 Approved by: https://github.com/malfet	2025-09-15 19:58:19 +00:00
Arijit Mukhopadhyay	de3a863cd8	AMD CPU CI - Add freezing + fix label trigger (#162176 ) Added the following changes: 1. Added freezing by default for AMD CPU based CI (to follow pattern introduced by https://github.com/pytorch/pytorch/pull/152298 ) 2. Fixed issue with label based CI triggers Addresses code review comment in https://github.com/pytorch/pytorch/pull/161155 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162176 Approved by: https://github.com/malfet, https://github.com/jeffdaily	2025-09-15 19:29:35 +00:00
PyTorch MergeBot	fa919feab6	Revert "[lint][CI] Don't checkout submodules for lintrunner-noclang (#162844 )" This reverts commit 6b231af23d63ee543a81c32952138090bebcf61d. Reverted https://github.com/pytorch/pytorch/pull/162844 on behalf of https://github.com/wdvr due to seems to be needed after all - failing lint ([comment](https://github.com/pytorch/pytorch/pull/162844#issuecomment-3293465058))	2025-09-15 18:43:53 +00:00
PenXLa	8e05749d5c	Fix integer overflow bug in triu/tril for large diagonal values (#153240 ) This PR fixes a bug in the implementation of `apply_triu_tril_single` where using extremely large values for the diagonal argument (e.g. `diagonal=9223372036854775807`) could result in integer overflow and incorrect results. The masking logic is re-written to avoid this issue by always iterating over all columns, ensuring correctness even for large or extreme diagonal values. Example of the original incorrect behavior: ```python a = torch.ones(5,5) torch.triu(a, 9223372036854775807) # Before: # tensor([[0., 0., 0., 0., 0.], # [1., 1., 1., 1., 1.], # [1., 1., 1., 1., 1.], # [1., 1., 1., 1., 1.], # [1., 1., 1., 1., 1.]]) ``` The new implementation guards against overflow and produces correct results for all valid input values. Pull Request resolved: https://github.com/pytorch/pytorch/pull/153240 Approved by: https://github.com/albanD	2025-09-15 18:07:19 +00:00
Jeff Daily	b334a5a379	[ROCm][benchmark] Add HF LLM benchmark expected accuracy (#162965 ) PR #156967 added HF LLM benchmarks but did not add the ci expected accuracy files for ROCm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162965 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-15 18:04:39 +00:00
Catherine Lee	6b231af23d	[lint][CI] Don't checkout submodules for lintrunner-noclang (#162844 ) Shouldn't be needed? Pull Request resolved: https://github.com/pytorch/pytorch/pull/162844 Approved by: https://github.com/huydhn	2025-09-15 17:29:31 +00:00
fduwjj	19a4ef0256	[DeviceMesh] Make CuTe layout as mesh layout to be ready for using in DeviceMesh (#162414 ) We create a wrapper class named "_MeshLayout" acting as a layout for device mesh so that we can add new methods more specific to DeviceMesh and keep the core logic of CuTe manipulation inside the pycute module. This PR creates the main body of the code, and the next PR will come with the actual implementation and unit tests for the device mesh layout. (The actual implementation can be found in https://github.com/pytorch/pytorch/pull/161016) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162414 Approved by: https://github.com/ezyang, https://github.com/fegin ghstack dependencies: #162413, #162534	2025-09-15 17:04:41 +00:00
Justin Chu	9cd54d3443	Clean up 'torch.onnx' entries from public API allowlist (#162850 ) Clean up entries related to 'torch.onnx' from the allowlist as the apis in onnx are properly configured. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162850 Approved by: https://github.com/albanD	2025-09-15 16:14:43 +00:00
Aaryaman Vasishta	0826aafa04	[ROCm/Windows] Support aotriton for scaled_dot_product_attention on Windows. (#162330 ) Enables flash attention and/or memory efficient attention on Windows with scaled_dot_product_attention via aotriton. Already tested to be working on Windows with TheRock. Steps to enable: simply set `USE_FLASH_ATTENTION=1` and `USE_MEM_EFF_ATTENTION=1` as usual. See https://github.com/ROCm/TheRock/blob/main/external-builds/pytorch/build_prod_wheels.py#L578-L604 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162330 Approved by: https://github.com/jeffdaily Co-authored-by: Scott Todd <scott.todd0@gmail.com>	2025-09-15 16:13:03 +00:00
Scott Wolchok	5dc4e78047	Fix excess refcounting in ObjLoaderFunc (#161528 ) expectRef is preferred over expect because it doesn't copy a std::shared_ptr. Differential Revision: [D81053710](https://our.internmc.facebook.com/intern/diff/D81053710/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161528 Approved by: https://github.com/Skylion007	2025-09-15 16:05:50 +00:00
atalman	c9e57d7e9f	[CI] Move libtorch-cpu-shared-with-deps-release-build to python 3.10 (#162877 ) Related to https://github.com/pytorch/pytorch/pull/162862 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162877 Approved by: https://github.com/malfet	2025-09-15 15:27:25 +00:00
James Wu	70337a066f	[easy] Handle Autotuners in get_triton_source_codes_for_gm (#161914 ) Some triton kernels are autotuners, in that case, grab the function from the autotuner. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161914 Approved by: https://github.com/oulgen	2025-09-15 15:19:04 +00:00
James Wu	7d1bcd9aea	[easy] Fix unsigned long issue in static cuda launcher (#162920 ) Fixes https://github.com/pytorch/pytorch/issues/162430 It's a bit hard to come up with a unit test where the stream exceeds a C++ long, so just using existing unit tests for now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162920 Approved by: https://github.com/Skylion007, https://github.com/jansel	2025-09-15 15:00:32 +00:00
albanD	09cbf34e93	[BE] Preserve caller source location in the error message (#162808 ) Summary: Currently the C10_CUDA_CHECK only shows source location in CUDAException like below: ``` Exception raised from c10_cuda_check_implementation at fbcode/caffe2/c10/cuda/CUDAException.cpp:44 ``` which is not terribly useful. By checking the original diff D39619861 that introduced c10_cuda_check_implementation, it seems the original macro would show the source location correctly but c10_cuda_check_implementation broke it. This diff will propagate caller source location to c10_cuda_check_implementation to fix the issue. Test Plan: CI Observed desired error message after the change: ``` CUDA error: an illegal memory access was encountered Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Device-side assertion tracking was not enabled by user. Exception raised from operator() at fbcode/sigrid/predictor/aed/AedContainer.cpp:659 (most recent call first): ``` Note the last line reports actual caller location. Rollback Plan: Reviewed By: Raymo111 Differential Revision: D81880552 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162808 Approved by: https://github.com/janeyx99	2025-09-15 13:29:43 +00:00
PyTorch UpdateBot	456fbeaa6d	[xla hash update] update the pinned xla hash (#162947 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162947 Approved by: https://github.com/pytorchbot	2025-09-15 11:42:02 +00:00
PyTorch UpdateBot	a8c80f3fa9	Update slow tests (#162946 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162946 Approved by: https://github.com/pytorchbot	2025-09-15 11:31:37 +00:00
Natalia Gimelshein	bf6b40da3e	fix deterministic scatter_add path for multi-d tensors (#162866 ) Previously, for tensors with more than 2 dimensions, `select` didn't work correctly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162866 Approved by: https://github.com/valentinandrei	2025-09-15 06:50:00 +00:00
Zeng, Xiangdong	814ba34fa6	[2/N] Port 5 _composable distributed test to Intel GPU (#159241 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This is the second PR for _composable cases, the first is https://github.com/pytorch/pytorch/pull/159118. We could enable Intel GPU with following methods and try the best to keep the original code styles: - Use "torch.accelerator.current_accelerator()" to determine the accelerator backend - Enabled XPU for some test path - Skip some test cases which Intel GPU does not support - Added "cpu:gloo,xpu:xccl" for distributed backend Pull Request resolved: https://github.com/pytorch/pytorch/pull/159241 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-15 06:24:58 +00:00
Edward Yang	06bb32d55e	Skip empty tests, they don't make sense for numerics (#162932 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162932 Approved by: https://github.com/dcci	2025-09-15 06:20:26 +00:00
can-gaa-hou	b3ad8f4a9c	[BUG] Fix nonzero_static crash on CUDA when the input is a empty tensor (#162578 ) Fixes #162473 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162578 Approved by: https://github.com/ngimel	2025-09-15 05:44:15 +00:00
Edward Yang	755cf90672	Redirect all use of filesystem to c10/utils/FileSystem.h (#162914 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162914 Approved by: https://github.com/Skylion007, https://github.com/dcci, https://github.com/cyyever	2025-09-15 04:30:41 +00:00
Nikita Shulga	76e5df3866	[BE] Use `fmt::format` to define Conv key (#162925 ) Also use `getArrayRefString` instead of having separate cases for 2D and 3D Conv Pull Request resolved: https://github.com/pytorch/pytorch/pull/162925 Approved by: https://github.com/Skylion007 ghstack dependencies: #162921	2025-09-15 02:44:12 +00:00
Nikita Shulga	7fe1f5ea49	[BE] Delete [Ventura\|Sonoma]Ops header (#162921 ) Was a temp solution to make PyTorch+MPS buildable on MacOS-12, but it's no longer needed, as in 2.9+ MPS is only supported on MacOS Sonoma+ Pull Request resolved: https://github.com/pytorch/pytorch/pull/162921 Approved by: https://github.com/Skylion007, https://github.com/dcci	2025-09-15 02:44:12 +00:00
James Wu	e156a07171	[Precompile] [RFC] Implement aot_compile_module (#162171 ) This PR adds a new interface _aot_compile to `OptimizedModule`, so that the following is possible: ``` mod = SimpleLinearModule() inputs = [ ModelInput( args=(torch.randn(3, 3),), kwargs={}, contexts=[torch.no_grad(), eval_mode(model)], ), ModelInput( args=(torch.randn(3, 3),), kwargs={}, contexts=[train_mode(model)] ), ] assert isinstance(model, torch._dynamo.eval_frame.OptimizedModule) model._aot_compile( inputs, ) ``` After this PR, you can AOT precompile NanoGPT and use it to train directly. I'll share my fork of the repo to make this work. ## ModelInput The `ModelInput` API is a work in progress; for now it represents a set of inputs and contexts to instruct the compiler to compile. Most commonly, this is "compile an eval mode with no grad, and a training mode with grad", but also contains things like autocasting contexts, etc. ## Dispatch Dispatching is super simple here: we just iterate through all the precompiled fullgraphs and check guards for each one until there's one that passes. I'm a bit worried that having this in python code is going to be too expensive. The guard checks are happening in C++ anyway, though, so the only python bottlenecked step here is just the for loop, so perhaps the overhead will not be high. I'll work on measuring this, though. ## TODOs This PR does not support `mod.compile()`, only `torch.compile(mod)`. In order to support `mod.compile()`, we'll need to update torch.nn.Module with an updated implementation — I can add that frontend later. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162171 Approved by: https://github.com/zhxchen17	2025-09-14 23:32:28 +00:00
Isalia20	ba5ca31676	[MPS] sparse mps any (#162885 ) Add SparseMPS key for any op Pull Request resolved: https://github.com/pytorch/pytorch/pull/162885 Approved by: https://github.com/malfet, https://github.com/Skylion007	2025-09-14 18:57:53 +00:00
Isalia20	8e1db46493	[MPS] enable empty like and unsqueeze for SparseMPS (#162910 ) Enable empty like and unsqueeze for SparseMPS Pull Request resolved: https://github.com/pytorch/pytorch/pull/162910 Approved by: https://github.com/malfet, https://github.com/Skylion007	2025-09-14 17:47:06 +00:00
Edward Yang	aff2438554	QoL: add pip to requirements-build.txt (#162896 ) uv venvs by default don't come with pip, but for example setup.py assumes it is available. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162896 Approved by: https://github.com/Skylion007	2025-09-14 17:08:05 +00:00
Shen Zhang	3f8a2e62ea	Fix rebind_unbacked in torch.fx.experimental.symbolic_shapes (#162788 ) ## Description Fix float type handling in the `torch.fx.experimental.symbolic_shapes` function. [#162480](https://github.com/pytorch/pytorch/issues/162480) ## Issue When I use AOTInductor to compile the YOLOv10, I encounter the bug `'float' object has no attribute 'node'`. [Torch AOTInductor Ahead-Of-Time Compilation Fail](https://github.com/opendatalab/DocLayout-YOLO/issues/177) The problem is due to missing float type handling. https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/symbolic_shapes.py#L597 ``` if isinstance(u1, int): log.info( "rebind_unbacked: discard %s %s %s -> %s", n.target, raw_u0, path, u1, ) continue ``` ## Solution Change the code `if isinstance(u1, int)` to `if isinstance(u1, (int,float))` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162788 Approved by: https://github.com/ezyang	2025-09-14 17:07:14 +00:00
Clark Kang	6d64bc3990	[data foundation][vizard] Prevent checking the device type of numpy object in Tensorboard logger (#162888 ) Summary: The check is introduced in D82262053 - `scalar_value` could be a numpy object - Move the check of `device.type` into `make_np` method where it happens only when it's a `torch.Tensor`. Test Plan: ``` vizard launch -j 1x8 --launch=flow --config-path=pkg://vizard_projects.image_classification.configs --config-name=resnet50 ++flow.secure_group=ml_sensors ++flow.entitlement=ai_frameworks_pnb ++max_train_steps_per_epoch=10 ++max_epochs=5 ++log_every_n_steps=10 ++profiler=null ++max_eval_steps_per_epoch=10 ``` Rollback Plan: Differential Revision: D82383428 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162888 Approved by: https://github.com/xush6528	2025-09-14 08:09:08 +00:00
angelayi	972140b7e9	[benchmark] Add HF LLM benchmarks (#156967 ) Results in https://docs.google.com/spreadsheets/d/1xXOPg9JjEmPx0zc5QBNdyXQq8-K2_r4ybHaiS-q7pZ0/edit?gid=88695043#gid=88695043 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156967 Approved by: https://github.com/huydhn Co-authored-by: Huy Do <huydhn@gmail.com>	2025-09-14 07:41:06 +00:00
Thien Tran	84186c39ed	[NVRTC] Enable compiling templated kernels (#162875 ) Per NVRTC doc - https://docs.nvidia.com/cuda/nvrtc/index.html#accessing-lowered-names, we can compile a templated kernel (e.g. `kernel<float>`) with the following steps NVRTC side - (new) `nvrtcAddNameExpression` -> C++ template e.g. `f<float>` - `nvrtcCompileProgram` - (new) `nvrtcGetLoweredName` -> get mangled name. need to do a copy since later this string is freed after NVRTC program is destroyed - `nvrtcDestroyProgram` CUDA side - use mangled name instead of normal name -> profit - `extern "C"` is not even needed Pull Request resolved: https://github.com/pytorch/pytorch/pull/162875 Approved by: https://github.com/msaroufim	2025-09-14 06:17:36 +00:00
Nick Riasanovsky	74a35c6344	[Triton] [Inductor] Enable TMA store for TMA mm templates (#160480 ) Summary: Adds support for TMA store in all TMA matmul templates (notably persistent_tma including addmm and scaled_mm). This works by requiring a template be registered with `tma_store=True` and when met constructs indices/range_trees to hook into the existing code base's TMA store support. This also includes a couple notable changes: - Adds support in the TMA template support for checking the output layout. - Adds support for "hoisting" the tensor descriptor to the top of the kernel. This will currently only be used by template code right now, but in principle it can be generalized to other implementation. - Supports considering multiple indices as the "contiguous" index. This is handled with support for transposing the input data when the alignment is no longer consistent. In general since the TMA support is derived from the index it doesn't seems reasonable that the 1D index math forces a certain alignment depending on index ordering so long as the layout matches. Test Plan: Tested with test_max_autotune.py unit tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160480 Approved by: https://github.com/NikhilAPatel	2025-09-14 04:56:49 +00:00
PyTorch UpdateBot	d2f6daf6a7	[audio hash update] update the pinned audio hash (#162892 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162892 Approved by: https://github.com/pytorchbot	2025-09-14 04:27:37 +00:00
PyTorch UpdateBot	e74b21d66a	[vllm hash update] update the pinned vllm hash (#162891 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162891 Approved by: https://github.com/pytorchbot	2025-09-14 04:27:35 +00:00
Laith Sakka	f01bf0f64b	Do not use // but use CleanDiv or FloorDiv instead (#162869 ) Summary: When rewriting sympy expressions in the compiler codebase we want to generate FloorDiv(a, b) CleanDiv(a, b) directly and not a//b. since the latter becomes floor(a*pow(b, -1)) For symnodes we automatically handle that conversion in the symnode op dispatch. I will follow up with an issue to track all other usages of //. Block internal Model. Test Plan: add test run existing tests. dakechen1993 testing on the model. Rollback Plan: Differential Revision: D82362241 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162869 Approved by: https://github.com/ezyang	2025-09-14 01:30:33 +00:00
Ben Niu	886699bc5c	Port shared_ptr optimization in std::shared_ptr to intrusive_ptr (#162784 ) Summary: Please see D21021645 for details about the optimization and why it's beneficial. A similar change has been added to libstdc++ as well, see `dbf8bd3c2f` Rollback Plan: Reviewed By: yfeldblum Differential Revision: D81960754 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162784 Approved by: https://github.com/swolchok	2025-09-13 21:01:00 +00:00
Varun Patil	72b5159782	[flatbuffer] Fix compile error due to discarded result (#162767 ) Summary: One of our builds fails because the return value of fread is discarded. Explicit cast to void fixes the build. ```log In file included from fbcode/caffe2/torch/csrc/jit/mobile/import.cpp:15: fbcode/caffe2/torch/csrc/jit/mobile/file_format.h:156:3: error: ignoring return value of function declared with 'warn_unused_result' attribute [-Werror,-Wunused-result] 156 \| fread(data.get(), size, 1, f); \| ^~~~~ ~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. ... BUILD FAILED Failed to build 'fbcode//caffe2:libtorch (cfg:opt-linux-x86_64-clang19-no-san-opt-by-default#fef256f7ee896871)' ``` Test Plan: No runtime behavior change. CI. Rollback Plan: Differential Revision: D82265002 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162767 Approved by: https://github.com/Skylion007	2025-09-13 20:24:43 +00:00
Nyakku Shigure	f37eaebed1	Add missing `tags` parameter to `custom_op` overload signatures (#162047 ) It appears to be an omission in #149782. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162047 Approved by: https://github.com/zou3519, https://github.com/BoyuanFeng Co-authored-by: Boyuan Feng <fby.1994@gmail.com>	2025-09-13 19:57:23 +00:00
PyTorch MergeBot	5b9114bf19	Revert "[ROCm/Windows] Support aotriton for scaled_dot_product_attention on Windows. (#162330 )" This reverts commit 62843c14bbf694f5722fd6e1075da4792507fe42. Reverted https://github.com/pytorch/pytorch/pull/162330 on behalf of https://github.com/atalman due to Sorry reverting looks like broke windows nightlies see https://github.com/pytorch/pytorch/issues/162881 ([comment](https://github.com/pytorch/pytorch/pull/162330#issuecomment-3288544921))	2025-09-13 15:43:50 +00:00
PyTorch MergeBot	deb7ebe0a3	Revert "[Reland] Use std::string_view in torchgen (#158625 )" This reverts commit 972e409829343cc2062aeee0994a9c1c735d216a. Reverted https://github.com/pytorch/pytorch/pull/158625 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break a couple of ExecuTorch tests for Vulkan backend ([comment](https://github.com/pytorch/pytorch/pull/158625#issuecomment-3287754275))	2025-09-13 07:52:50 +00:00
PyTorch MergeBot	9c93dc8123	Revert "Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#160532 )" This reverts commit a956c4ab1cb13079203a8f07eb26218724f54dc8. Reverted https://github.com/pytorch/pytorch/pull/160532 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160532#issuecomment-3287745165))	2025-09-13 07:42:12 +00:00
PyTorch MergeBot	31040b6357	Revert "port some distributed tensor test files for Intel GPU (#161703 )" This reverts commit 179f10621b418427fc6e92f58ea2b0bbe4cc9c52. Reverted https://github.com/pytorch/pytorch/pull/161703 on behalf of https://github.com/huydhn due to Sorry for reverting your change but these tests are failing internally ([comment](https://github.com/pytorch/pytorch/pull/161703#issuecomment-3287720713))	2025-09-13 07:22:14 +00:00
Edward Yang	aa41d3e49c	Claude loves making these files in top level, ignore them for sanity. (#162806 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162806 Approved by: https://github.com/albanD	2025-09-13 04:59:00 +00:00
PyTorch UpdateBot	f0fcf436c5	[audio hash update] update the pinned audio hash (#162864 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162864 Approved by: https://github.com/pytorchbot	2025-09-13 04:17:21 +00:00
PyTorch UpdateBot	5663910472	[vllm hash update] update the pinned vllm hash (#162751 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162751 Approved by: https://github.com/pytorchbot	2025-09-13 04:16:51 +00:00
Xuan Zhang	da669d51bf	fusion of large accumulated reads only at ir level (#161978 ) This is to revert some of the changes in https://github.com/pytorch/pytorch/pull/158667 In particular, we only disallow fusion of large accumulate read at IR level and not at scheduler level, as users can create their own custom fusion logics for the scheduler level. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161978 Approved by: https://github.com/yf225	2025-09-13 04:07:25 +00:00
Georgia Phillips	783985e9fe	kjt pytree registration (#161114 ) Differential Revision: D80656182 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161114 Approved by: https://github.com/henryoier	2025-09-13 03:57:43 +00:00
Jimmy Lu	49d30f9a23	Fix boxcox to return same result for same input in one batch (#162772 ) Summary: The SIMD path is using SLEEF version of `pow` which is slightly different from `std::pow`. The fix is to use the same vectorized code (with partial load and store) for the trailing data as well to ensure consistency between results. Rollback Plan: Differential Revision: D82265247 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162772 Approved by: https://github.com/swolchok	2025-09-13 03:57:35 +00:00
Huy Do	66133b1ab7	Build vLLM aarch64 nightly wheels (#162664 ) PyTorch has published its aarch64 nightly wheels for all CUDA versions after https://github.com/pytorch/pytorch/pull/162364 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162664 Approved by: https://github.com/atalman	2025-09-13 03:43:55 +00:00
Chen	543d50db2b	Fix torch export with dict input nested in args (#162618 ) Investigated together with @pyemma and @taotaohuang001 ## Problem when calling exported module with dict nested in the args tuple, it will make following complaits ``` Traceback (most recent call last): File "/home/chzhu/infinitrain/test_torch_export.py", line 32, in <module> print(exported_model({"a2": torch.randn(10), "a1": torch.randn(10)})) File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 848, in call_wrapped return self._wrapped_call(self, args, kwargs) File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 424, in __call__ raise e File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/fx/graph_module.py", line 411, in __call__ return super(self.cls, obj).__call__(args, *kwargs) # type: ignore[misc] File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(args, *kwargs) File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1879, in _call_impl return inner() File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1806, in inner args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc] File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 929, in _fn return fn(args, *kwargs) File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/export/_unlift.py", line 81, in _check_input_constraints_pre_hook 
flat_args_with_path = _check_inputs_match(args, kwargs, self._in_spec) File "/home/chzhu/infinitrain/build/infinitrain/environments/development-venv/lib/python3.10/site-packages/torch/export/_unlift.py", line 64, in _check_inputs_match raise ValueError( # noqa: B904 ValueError: Trying to flatten user inputs with exported input tree spec: TreeSpec(tuple, None, [TreeSpec(tuple, None, [TreeSpec(dict, ['a1', 'a2'], [, ])]), TreeSpec(dict, [], [])]) but actually got inputs with tree spec of: TreeSpec(tuple, None, [TreeSpec(tuple, None, [TreeSpec(dict, ['a2', 'a1'], [, ])]), TreeSpec(dict, [], [])]). Please check that the inputs have the same number and type of args and kwargs as the ones you used when tracing. ``` ## How to reproduce the issue ```python import torch # create a nn.Module with data_batch as input and output as output class MyModel(torch.nn.Module): def __init__(self): super(MyModel, self).__init__() self.linear = torch.nn.Linear(10, 1) def forward(self, data_batch): h1 = self.linear(data_batch["a1"]) h2 = self.linear(data_batch["a2"]) return h1 + h2 # torch export this module model = MyModel() example_args_forward = ( { "a1": torch.randn(10), "a2": torch.randn(10), }, ) exported_model = torch.export.export(model, example_args_forward, strict=True) # save the exported model torch.export.save(exported_model, "exported_model.pt2") # load the exported model exported_model = torch.export.load("exported_model.pt2").module() # run the exported model print(exported_model({"a2": torch.randn(10), "a1": torch.randn(10)})) ``` ## Root Cause Input spec is encoded as [TreeSpec](`582d278983/torch/utils/_pytree.py (L1059)`) in torch export. With (args, kwargs) at the top level. When we call the exported model, it has a pre-execution [hook](`582d278983/torch/export/_unlift.py (L66)`) to check the input TreeSpec matches the received TreeSpec, where in Treespec, the dict key order is preserved. 
Something like TreeSpec(dict, ['a2', 'a1'], [,*]) To workaround this, the input check reorders [kwargs](`582d278983/torch/export/_unlift.py (L67)`), that is why kwargs can be out of order. But the dict nested in the args is not re-ordered, so any re-ordering of the keys will throw errors. ## Solution Update eq_spec to handle the dict case, where we only guarantee that key set is the same without ordering constraints. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162618 Approved by: https://github.com/angelayi	2025-09-13 03:24:30 +00:00
PyTorch MergeBot	7dd5f7b125	Revert "python fastpath for DTensor detach(), confirm that aliasing DTensorSpec is ok (#160580 )" This reverts commit 4b2d297eec425475a82934a52e0edd96805524a1. Reverted https://github.com/pytorch/pytorch/pull/160580 on behalf of https://github.com/bdhirsh due to this broke shampoo, yanking ([comment](https://github.com/pytorch/pytorch/pull/160580#issuecomment-3287372891))	2025-09-13 02:04:36 +00:00
Sherlock Huang	a956c4ab1c	Return NoOpDeviceGuardImpl in replace of CudaDeviceGuard when device is not available, or cpu-only build (#160532 ) Summary: To support exporting a cuda model on a CPU-only machine under fake tensor mode. User commonly need to move sample inputs to the cuda device with .to("cuda:0") or .to("cuda") call. This diff supports this. I expect the following pattern to work ``` with FakeTensorMode(allow_non_fake_inputs=True): cuda_module = module.to("cuda:0") cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs]) with torch.no_grad(): ep = torch.export.export(cuda_module, cuda_sample_inputs) ``` Test Plan: CI Rollback Plan: Differential Revision: D80181887 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160532 Approved by: https://github.com/henryoier, https://github.com/ezyang	2025-09-13 01:50:51 +00:00
Kevin Tang	0925c644ed	[DCP] Decrease checkpoint background process Gloo pg init timeout (#162760 ) Summary: Sometimes checkpoint background process creation times out during gloo pg init. Attempting to destroy the process during that time can block the trainer thread until the timeout completes. This diff reduces the pg init timeout from 30m -> 10m to reduce the cleanup time. Test Plan: CI Rollback Plan: Differential Revision: D81724668 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162760 Approved by: https://github.com/meetv18	2025-09-13 01:50:40 +00:00
Xu Han	b2553a6ec4	[AOTI] raise PyTorchStreamWriter open failed error code on windows (#162799 ) When I debugged the AOTI UT `TestAOTInductorPackage_cpu::test_add`, I found it didn't output the verbose error code when opening PyTorchStreamWriter failed. This PR adds the verbose error code output for debugging. The local test shows the following: <img width="1124" height="653" alt="image" src="https://github.com/user-attachments/assets/01cb1a51-2982-4106-8b5b-c608ac26a075" /> The error code is 32; we can look up Windows error code 32 at https://learn.microsoft.com/en-us/windows/win32/debug/system-error-codes--0-499- ``` ERROR_SHARING_VIOLATION 32 (0x20) The process cannot access the file because it is being used by another process. ``` This issue is caused by the file being opened by another process. I fixed the same issue in zip open in PR: https://github.com/pytorch/pytorch/pull/162617 But I still have no idea how to open a file with shared access in `std::ofstream`. I will continue researching it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162799 Approved by: https://github.com/jansel	2025-09-13 01:41:14 +00:00
Parshant Sharma	a749c40342	[Bilinear] move check to reset_parameters (#160952 ) Fixes #160407 ### Summary: Moved the check to reset_parameters to make `Bilinear` module lazy. Lazy modules have in_features initialized to 0 and a pre forward hook that initializes these to the appropriate shape, then calls reset parameters, ### Impact: module: nn, linear.py ### Test: <img width="903" height="182" alt="Screenshot From 2025-08-19 13-27-12" src="https://github.com/user-attachments/assets/bc04b0d6-5174-4dc9-8b21-9e019b3822a5" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160952 Approved by: https://github.com/mikaylagawarecki	2025-09-13 01:17:10 +00:00
Nick Riasanovsky	595e13feb7	[BE] [Inductor] Update NoValidChoicesError logic (#162814 ) Summary: Updates the NoValidChoicesError logic to include some additional context for when no choices exist or no choices compiled. Test Plan: NFC. Depending on CI. Rollback Plan: Differential Revision: D82312035 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162814 Approved by: https://github.com/mlazos	2025-09-13 00:45:50 +00:00
Xuan Zhang	ddc5107601	An improved heuristic for operator reordering for peak memory + debugging logs (#161810 ) Revisiting the idea in https://github.com/pytorch/pytorch/pull/140195 For the lpmf algorithm in the memory reorder pass, in some cases, when all the nodes that can be scheduled are quite large, it is beneficial to switch the scheduling strategy. So instead of using size as the criterion, we choose a node that can unlock more nodes to become schedulable by analyzing their successor nodes. For an internal use case, we observe up to 20 GiB memory difference and here are the before and after memory snapshot. More information can be found in [D81270682](https://www.internalfb.com/diff/D81270682) (internal only). <img width="348" height="227" alt="image" src="https://github.com/user-attachments/assets/fb71e840-1508-44ed-bc9d-5eb4d364607d" /> In addition, add the functionality to upload the graph to tlparse for offline debugging. The format of the json is in consistency with the simulator [here](https://fburl.com/code/3l3d3qi4) (internal only). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161810 Approved by: https://github.com/yf225	2025-09-13 00:42:32 +00:00
FFFrog	a94ddd9b00	[OpenReg] Fix the docs of Accelerator Integration (#162826 ) ---- - Fixed the redirect link about step 1 - Formatted the autoload and added necessary links Pull Request resolved: https://github.com/pytorch/pytorch/pull/162826 Approved by: https://github.com/albanD ghstack dependencies: #161917, #161918, #160101	2025-09-12 23:53:17 +00:00
FFFrog	29f84b0f61	[OpenReg] Improve the Event and Stream capabilities of DeviceGuardImplInterface (#160101 ) Changes: - Based on `OpenRegStream` and `OpenRegEvent`, we improve the implementation of Device Guard for `OpenReg` - Add some related testcases Pull Request resolved: https://github.com/pytorch/pytorch/pull/160101 Approved by: https://github.com/albanD ghstack dependencies: #161917, #161918	2025-09-12 23:53:17 +00:00
FFFrog	27daa6af6a	[OpenReg] Strengthen Openreg's execution limits to minimize the waste of computing resources (#161918 ) Currently, OpenReg supports Linux, Windows, and OS X, ensuring stability and ease of integration with third-party devices across all three platforms. It also doesn't rely on any other accelerators (such as CUDA or MPS). Therefore, to minimize computational resource usage, `test_openreg` can be added to certain BLOCKLISTS to prevent its execution, limiting OpenReg's execution to only necessary scenarios. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161918 Approved by: https://github.com/albanD ghstack dependencies: #161917	2025-09-12 23:53:17 +00:00
FFFrog	9b429846e8	[OpenReg] Migrate OpenReg Tests from tests/test_openreg.py into torch_openreg/tests (#161917 ) Background: Almost all the tests in `test/test_openreg.py` are designed for `torch_openreg`, so placing these testcases in the test directory is not a good idea. Instead, they should be moved to the `tests` directory under `torch_openreg`, coordinating these tests with their corresponding functional logic. How to do: So how do we verify the quality of the third-party device integration mechanism? We will maintain a `test_openreg` entrypoint in `test/run_test.py`. This entrypoint will install `torch_openreg` and run all the testcases located in `torch_openreg`. As long as all testcases pass, we can guarantee that the out-of-tree backend integration mechanism is available. Next: We will also improve `torch_openreg's` test coverage in the future. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161917 Approved by: https://github.com/albanD	2025-09-12 23:53:17 +00:00
PyTorch MergeBot	cdfa298a3b	Revert "[MTIA Runtime] Add foreach_div ops to native_functions.yaml (#162732 )" This reverts commit a3f01f6418667f791f36d928f7e912eb89be2e67. Reverted https://github.com/pytorch/pytorch/pull/162732 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162732#issuecomment-3287163750))	2025-09-12 23:52:43 +00:00
Nikita Shulga	d25c35d2b2	[MPS] Fix `[nan]median` output for empty tensors (#162846 ) It should be `NaN` rather than 0 Added respective checks to `test_empty_tensor` Fixes https://github.com/pytorch/pytorch/issues/162798 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162846 Approved by: https://github.com/dcci	2025-09-12 22:26:29 +00:00
Dmitry Rogozhkin	ee53ad2dd0	xpu: test py_limited_api with SyclExtension (#162546 ) Commit extends existing CUDA test to cover XPU SyclExtension case for the same feature - `py_limited_api`. Commit required a fix for xpu to install some Aten header files (#145902) which got resolved after the merge of #159621. See: https://github.com/pytorch/pytorch/issues/145902 Requires: https://github.com/pytorch/pytorch/pull/159621 Requires: https://github.com/intel/torch-xpu-ops/pull/1743 CC: @guangyey, @EikanWang Pull Request resolved: https://github.com/pytorch/pytorch/pull/162546 Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/janeyx99	2025-09-12 21:57:01 +00:00
Haifeng Jin	0dcd9304aa	fix high=0 bug in nll_loss test (#162763 ) Minor bug fix for the `nll_loss` test. Before this PR, it runs `torch.randint(high=0)`, which will fail because it would try to generate a number that is >= low and < high, i.e. x>=0 and x<0. The test did not fail because that line is not run when testing on CPU: it failed earlier because of an unsupported dtype. However, as we support TPUs at Google, this line is reached first before the dtype check, which triggers the bug. To my understanding, these OpInfos should be general enough to support different hardware. Fixing this obvious bug would make it more general across different hardware. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162763 Approved by: https://github.com/soulitzer	2025-09-12 21:48:18 +00:00
Ruben Rodriguez Buchillon	25f1a5d8d1	[inductor][ez] add src_hash property for Templates (#161468 ) # why enable caching/overriding/filtering based on src hash later # what - KernelTemplate has a src_hash that is None by default - sha256 on TritonTemplate of the template src code - None on ExternKernelChoice to have same API # testing n/a (not in use in this change) Differential Revision: [](https://our.internmc.facebook.com/intern/diff/) Differential Revision: [D81821149](https://our.internmc.facebook.com/intern/diff/D81821149) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161468 Approved by: https://github.com/eellison ghstack dependencies: #161351, #161350, #162293	2025-09-12 21:10:45 +00:00
Ruben Rodriguez Buchillon	269c9907a0	[inductor][choices] rename get_mm_configs to get_template_configs (#162293 ) # why - eventually we want all templates to go through this - we're exposing this through diode as a sort of interface/API - avoid later renaming # what - rename get_mm_configs to get_template_configs - rename _finalize_mm_configs to _finalize_template_configs # testing - lintrunner - ci Differential Revision: [D81820641](https://our.internmc.facebook.com/intern/diff/D81820641) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162293 Approved by: https://github.com/eellison ghstack dependencies: #161351, #161350	2025-09-12 21:10:45 +00:00
Ruben Rodriguez Buchillon	a326ef37e6	[inductor] leverage template stacking in V.choices.get_mm_configs (#161350 ) # why - now everything is in place to just gather templates and run the V.choices.get_mm_configs once per op - enables any overrides inside V.choices.get_mm_configs to have a full view of the options for an op, not just for one template # what - replace multiple calls to V.choices.get_mm_configs with calls to gather the active templates, and then using those in a single call # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520571](https://our.internmc.facebook.com/intern/diff/D81520571) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161350 Approved by: https://github.com/eellison, https://github.com/jansel ghstack dependencies: #161351	2025-09-12 21:10:38 +00:00
Ruben Rodriguez Buchillon	cdb2d1838a	[inductor] FlexibleLayout for ExternKernelChoice for mms (#161351 ) # why - if we only use ExternKernelChoice we're not doing any codegen - if we're not doing any codegen, we can use a FlexibleLayout here, and provide deeper passes more chances to change it # what - if all the kernel template choices (KTC) are with a ExternKernelChoice template, we switch to a FlexibleLayout before generating the choice - add a test to make sure that works as intended (FlexibleLayout for only extern, and FixedLayout if Triton is involved) - caveats: - because CPP, CUTLASS, and CK are not using V.choices.get_mm_configs yet, we turn off the optimization if either of those backends are in use. This will be relaxed once they support this too - because Triton templates are still using their own calls (not a single call) to get_mm_configs, it's also turned off there. The next diff unifies Triton + ATEN to a single call to get_mm_configs and that in turn allows the optimization there too # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520584](https://our.internmc.facebook.com/intern/diff/D81520584) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161351 Approved by: https://github.com/eellison, https://github.com/jansel	2025-09-12 21:10:31 +00:00
LifengWang	f7ea4975ab	update the baseline data for the operator benchmark (#162693 ) According to the results of the last four operator benchmark runs, we found that five models achieved more than a 30% improvement compared to the baseline. Therefore, we will update the operator benchmark baseline data. We use the average results from the four runs as the new baseline for the five models. And add a pull request trigger for the operator benchmark workflow Benchmarking Framework \| Benchmarking Module Name \| Case Name \| tag \| run_backward \| baseline old \| r1 \| r2 \| r3 \| r4 \| avg \| speedup -- \| -- \| -- \| -- \| -- \| -- \| -- \| -- \| -- \| -- \| -- \| -- PyTorch \| add \| add_M1_N1_K1_cpu \| short \| FALSE \| 3.9497 \| 2.57 \| 2.54 \| 2.38 \| 2.31 \| 2.45 \| 1.61 PyTorch \| functional.hardtanh \| functional.hardtanh_dims(512 512)_contigFalse_inplaceFalse_dtypetorch.quint8 \| short \| FALSE \| 67.118 \| 50.02 \| 49.80 \| 46.78 \| 48.94 \| 48.88 \| 1.37 PyTorch \| relu6 \| relu6_dims(512 512)_contigFalse_inplaceFalse_dtypetorch.quint8 \| short \| FALSE \| 68.739 \| 51.17 \| 51.19 \| 48.07 \| 50.42 \| 50.21 \| 1.37 PyTorch \| relu6 \| relu6_dims(256 1024)_contigFalse_inplaceFalse_dtypetorch.quint8 \| short \| FALSE \| 69.1875 \| 51.97 \| 52.77 \| 50.00 \| 51.24 \| 51.50 \| 1.34 PyTorch \| functional.hardtanh \| functional.hardtanh_dims(256 1024)_contigFalse_inplaceFalse_dtypetorch.quint8 \| short \| FALSE \| 67.436 \| 50.98 \| 51.69 \| 49.06 \| 49.87 \| 50.40 \| 1.34 @chuanqi129 @huydhn @desertfire @jainapurva Pull Request resolved: https://github.com/pytorch/pytorch/pull/162693 Approved by: https://github.com/huydhn	2025-09-12 20:53:29 +00:00
Jeff Daily	65d642d6db	[ROCm] enable aoti tests, forward fix 162353 (#162827 ) Forward fix for tests added by #162353. Enables aoti tests on rocm. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162827 Approved by: https://github.com/dolpm, https://github.com/huydhn	2025-09-12 20:05:50 +00:00
karthickai	fa4d5e76ea	[Inductor] Fix ComboKernels failing due to missing helper functions (#162759 ) Fixes: #162756 Differential Revision: [D82257359](https://our.internmc.facebook.com/intern/diff/D82257359) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162759 Approved by: https://github.com/eellison, https://github.com/mlazos	2025-09-12 20:01:06 +00:00
William Wen	38afeb2ba2	Fix markdown link syntax in graph breaks index (#162400 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162400 Approved by: https://github.com/Skylion007	2025-09-12 19:29:49 +00:00
Isalia20	53b8bdb977	[MPS] enable cat op for sparse (#162007 ) Enable cat op for sparse on MPS Pull Request resolved: https://github.com/pytorch/pytorch/pull/162007 Approved by: https://github.com/malfet	2025-09-12 19:07:39 +00:00
David Berard	cad052423b	[triton] Update 3.5 pin to 5ae38bdb0dc066c5823e34dc9797afb9de42c866 (#162821 ) Include @aakhundov's sam_fast patch, plus NVIDIA's sm88/sm110 patches (thanks @nWEIdia) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162821 Approved by: https://github.com/atalman	2025-09-12 18:34:22 +00:00
PyTorch MergeBot	b5f4a7dc14	Revert "[DeviceMesh] Make CuTe layout as mesh layout to be ready for using in DeviceMesh (#162414 )" This reverts commit 195ac549d7d6538c4212ca73f69488e990b9527d. Reverted https://github.com/pytorch/pytorch/pull/162414 on behalf of https://github.com/malfet due to Looks like it broke test_circular_deps on Windows, see `d89189f289/1` ([comment](https://github.com/pytorch/pytorch/pull/162414#issuecomment-3286070938))	2025-09-12 16:57:09 +00:00
Jeffro	d89189f289	Fix inconsistent clock types in `ProcessGroupNCCL::runHookLoop` (#162543 ) ## Summary This PR fixes an inconsistency in `ProcessGroupNCCL::runHookLoop` when computing `timeStarted`. Both `timeFinished` and `timeStarted` in `WorkInfo` are expected to use `std::chrono::system_clock`, but previously the code was casting a duration from `steady_clock`. Reviewers suggested using `steady_clock` consistently for time measurement since it is appropriate for durations (see #153135 ). This PR updates both `timeStarted` and `timeFinished` in `WorkInfo`, and corresponding code in `runHookLoop`, to use `std::chrono::steady_clock`. ## Error message: ``` libcxx/include/__memory/allocator_traits.h:302:5: error: no matching function for call to '__construct_at' 302 \| std::__construct_at(__p, std::forward<_Args>(__args)...); \| ^~~~~~~~~~~~~~~~~~~ libcxx/include/__memory/shared_ptr.h:162:33: note: in instantiation of function template specialization 'std::allocator_traits<std::allocator<c10d::WorkInfo>>::construct<c10d::WorkInfo, c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, 0>' requested here 162 \| allocator_traits<_TpAlloc>::construct(__tmp, __get_elem(), std::forward<_Args>(__args)...); \| ^ libcxx/include/__memory/shared_ptr.h:736:51: note: in instantiation of function template specialization 'std::__shared_ptr_emplace<c10d::WorkInfo, std::allocator<c10d::WorkInfo>>::__shared_ptr_emplace<c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, std::allocator<c10d::WorkInfo>, 0>' requested here 736 \| ::new ((void)std::addressof(__guard.__get())) _ControlBlock(__a, 
std::forward<_Args>(__args)...); \| ^ libcxx/include/__memory/shared_ptr.h:744:15: note: in instantiation of function template specialization 'std::allocate_shared<c10d::WorkInfo, std::allocator<c10d::WorkInfo>, c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, 0>' requested here 744 \| return std::allocate_shared<_Tp>(allocator<__remove_cv_t<_Tp> >(), std::forward<_Args>(__args)...); \| ^ torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:2674:32: note: in instantiation of function template specialization 'std::make_shared<c10d::WorkInfo, c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>, 0>' requested here 2674 \| onCompletionHook_(std::make_shared<WorkInfo>( \| ^ libcxx/include/__memory/construct_at.h:44:58: note: candidate template ignored: substitution failure [with _Tp = c10d::WorkInfo, _Args = <c10d::OpType, unsigned long, std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<long long, std::ratio<1, 1000000000>>> &, std::chrono::time_point<std::chrono::system_clock> &, std::chrono::duration<float, std::ratio<1, 1000>>>]: no matching constructor for initialization of 'c10d::WorkInfo' 43 \| template <class _Tp, class... _Args, class = decltype(::new(std::declval<void>()) _Tp(std::declval<_Args>()...))> \| ~~~ 44 \| _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __construct_at(_Tp* __location, _Args&&... __args) { \| ^ 1 error generated. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162543 Approved by: https://github.com/cyyever, https://github.com/Skylion007	2025-09-12 16:50:42 +00:00
Justin Chu	d71a6497b7	Fix typo in ONNX export error message (#162819 ) Fix another "summit" 😅 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162819 Approved by: https://github.com/cyyever, https://github.com/titaiwangms	2025-09-12 16:34:49 +00:00
Jeffro	a0dca0fc60	Fix protobuf test comparison by parsing proto instead of raw strings (#162644 ) The tests were comparing raw exported strings for protobuf comparison, which is not backward/forward compatible with different versions of protobuf. This PR parses the strings into protobuf and compares the protobufs directly, similar to what we did in assertImageProto. Our test failed because we used a different version of protobuf, which output 44100.0 instead of 44100, which resulted in an error. However, they are equal, but only different in the exported strings. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162644 Approved by: https://github.com/justinchuby, https://github.com/Skylion007	2025-09-12 16:26:54 +00:00
Svetlana Karslioglu	e15686b40d	Remove actionable label from docathon label sync script (#155713 ) Make sure we don't propagate actionable label in docathon sync label script. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155713 Approved by: https://github.com/clee2000	2025-09-12 15:36:50 +00:00
Jeff Daily	1e9ddf510f	[ROCm] fix hardsigmoid op (#162758 ) Currently std::min -> ::min did not work as expected on ROCm when input values >= 2147483648 It can be fixed by explicit typing std::min<opmath_t> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162758 Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-12 15:07:13 +00:00
Jeff Daily	7357eb66c5	[ROCm][CI] unskip some test_memory_format tests (#162766 ) Fixes #70125. Much of the work was done by #161687. This PR is additional test cleanup. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162766 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-12 15:02:40 +00:00
Mwiza Kunda	03798b0f91	[inductor] Fix removal of constexpr args from the launcher signature (#161924 ) Fixes the case described below which occurs when: - A user `torch.compile`s a function that uses a triton kernel. - `TORCHINDUCTOR_DUMP_LAUNCH_PARAMS=1`. Problem: If the user defined triton kernel is not autotuned: ```python import os os.environ["TORCHINDUCTOR_DUMP_LAUNCH_PARAMS"] = "1" @triton.jit def kernel(..., BLOCK_SIZE: tl.constexpr): ... @torch.compile def fn(..) kernel[..](..., 128) fn(..) ``` Then in `triton_heuristics._interpret_args_grid`, `filtered_signature` function: ```python def filtered_signature() -> list[str]: # constexprs are not passed in as args return [ x for x in self.triton_meta["signature"].keys() if x not in cfg.kwargs.keys() ] ``` because `triton.autotune` is not used on the `triton.jit` function, `cfg` above will be empty, and so `BLOCK_SIZE` will not be removed from the signature even though it is constexpr, even though it is removed from the arguments that are passed in to `interpret_args_grid`. This results in a mismatch between the number of parameters in the signature and the number of arguments, which leads to the error `NameError: name '_grid_2' is not defined`. Fix: Use the triton jit kernel `constexprs` for args to remove. Not sure if this is a good fix so suggestions are welcome. Test plan: Added a parameter to an existing triton kernel to test for this edge case Pull Request resolved: https://github.com/pytorch/pytorch/pull/161924 Approved by: https://github.com/davidberard98	2025-09-12 13:58:09 +00:00
Edward Yang	6c334885d4	[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 ) Summary: Original: D81957844 and D81957923 Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well #buildall Test Plan: sandcastle and oss ci Rollback Plan: Reviewed By: H-Huang Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594 Approved by: https://github.com/H-Huang, https://github.com/dcci	2025-09-12 10:54:42 +00:00
Blaine Burton Rister	a7bbc5fea7	[Inductor-FX] Support ScatterFallback (#162686 ) # Problem Inductor has a `ScatterFallback` op with custom Python and C++ wrapper codegen macros. This is used in certain situations where the default Triton codegen doesn't apply, and especially for reductions which need to be deterministic. Since this op used direct Python/C++ codegen, it wasn't compatible with the FX backend. # Feature This PR refactors the associated wrapper codegen to support `ScatterFallback`. This follows the same basic steps that were used for other fallback ops including `MultiOutput` and `ExternKernel`: 1. Create a new wrapper IR op called `ScatterFallbackLine`. Move the logic in `ScatterFallback.codegen` to `ScatterFallbackLine.codegen`, to prevent it from affecting the FX backend. This logic is unsafe for FX because it may generate Python or C++ strings with methods like `codegen_reference()`. 2. To eliminate the dependence on `V.graph`, move language-specific logic to the respective wrapper codegen subclasses. In this case, C++ codegen has some special logic, which is moved to `CppWrapperCpu`. 3. Create a new method in `FXWrapperCodegen` to handle `ScatterFallbackLine`. # Test plan Added a couple of CI tests for the FX backend with scatter fallbacks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162686 Approved by: https://github.com/jansel	2025-09-12 08:41:50 +00:00
Zeng, Xiangdong	98e9440f30	[1/N] Port 5 _composable/fsdp distributed test cases to Intel GPU (#159118 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. We could enable Intel GPU with following methods and try the best to keep the original code styles: - use "torch.accelerator.current_accelerator()" to determine the accelerator backend - enabled XPU for some test path - skip some test cases which Intel GPU does not support Pull Request resolved: https://github.com/pytorch/pytorch/pull/159118 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-12 08:36:20 +00:00
Miroslaw Oksiucik	66c0f14ecc	Support XPU in --nproc-per-node option to torchrun (#159474 ) Support both --nproc-per-node=xpu and autodetection of XPU device in case of --nproc-per-node=auto Pull Request resolved: https://github.com/pytorch/pytorch/pull/159474 Approved by: https://github.com/tsocha, https://github.com/guangyey, https://github.com/d4l3k Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>	2025-09-12 08:32:04 +00:00
Yuanyuan Chen	972e409829	[Reland] Use std::string_view in torchgen (#158625 ) Reland of #157050, which is incidentally closed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158625 Approved by: https://github.com/albanD	2025-09-12 08:31:54 +00:00
Aaryaman Vasishta	52af91e4c1	[ROCm/Windows] Support load_inline on windows (#162577 ) Supports `torch.utils.cpp_extension.load_inline` on Windows with ROCm. Tested on Windows with gfx1201. Note that it currently only works when CC and CXX are set to `clang-cl`. This is also needed when building extensions via. `setuptools` due to linker errors when using `cl` directly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162577 Approved by: https://github.com/ezyang	2025-09-12 08:10:07 +00:00
Liao, Wei	179f10621b	port some distributed tensor test files for Intel GPU (#161703 ) it's another pr to port distributed tensor test for Intel GPU, while the other pr is https://github.com/pytorch/pytorch/pull/161604 We could enable Intel GPU with following methods and try the best to keep the original code styles: Use torch.accelerator for general gpu Skip the case if running on xpu which has known issues Pull Request resolved: https://github.com/pytorch/pytorch/pull/161703 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-12 07:57:32 +00:00
fduwjj	195ac549d7	[DeviceMesh] Make CuTe layout as mesh layout to be ready for using in DeviceMesh (#162414 ) We create a wrapper class acting as a layout for device mesh so that we can add new methods more specific to DeviceMesh and keep the core logic of CuTe manipulation inside pycute module. This PR create the main body of the code and then next PR will come with actual implementation and unit test for device mesh layout. (Actual implementation can be found in https://github.com/pytorch/pytorch/pull/161016) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162414 Approved by: https://github.com/ezyang ghstack dependencies: #162413, #162534	2025-09-12 07:32:56 +00:00
Shangdi Yu	636a511084	[aoti] add config for libtorch free so (#162655 ) Users can specify the following to get a libtorch_free `.so`. "aot_inductor.use_libtorch": False, The following config is only used for torchnative (see https://github.com/meta-pytorch/torchnative/pull/110). It's not intended to be used by executorch. The reason we need it for torchnative is because a lot of the symbol definitions in torchnative repo is only in header files. "aot_inductor.libtorch_free_header": "/data/users/shangdiy/torchnative/standalone,/data/users/shangdiy/torchnative/" (or their custom headers) The main motivating use case is for executorch to produce a libtorch free `.so`. TODO for follow-up PR: this flag should be consolidated with the `compile_standalone` flag. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162655 Approved by: https://github.com/angelayi	2025-09-12 07:31:04 +00:00
Michael Lazos	75de5b65b4	[Dynamo] Don't guard data ptrs by default with mark_static_address (#162208 ) Fixes https://github.com/pytorch/pytorch/issues/156377 Since we now re-record cudagraphs, it's not necessary to guard by default anymore and induce a full recompile. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162208 Approved by: https://github.com/anijain2305	2025-09-12 07:15:10 +00:00
PyTorch MergeBot	6b59a19242	Revert "[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 )" This reverts commit 6e8f17c58029e5fa6bc222b2445ebbc0cbdc17c7. Reverted https://github.com/pytorch/pytorch/pull/162594 on behalf of https://github.com/huydhn due to Reverted internally ([comment](https://github.com/pytorch/pytorch/pull/162594#issuecomment-3283985880))	2025-09-12 06:52:03 +00:00
jainapurva	5f66902ecf	Fix operator benchmark issue#162708 (#162744 ) This PR skips memory metric calculation for ops which don't take tensor input, fixing the operator_benchmark bug Fixes https://github.com/pytorch/pytorch/issues/162708 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162744 Approved by: https://github.com/huydhn	2025-09-12 06:51:14 +00:00
PyTorch MergeBot	00e9ba75cd	Revert "[indexing] Prevent integer overflow from large step values in C++ (#161707 )" This reverts commit c140bf217f5ca5071ab9dbc1bcf9d4006242f44a. Reverted https://github.com/pytorch/pytorch/pull/161707 on behalf of https://github.com/huydhn due to Look like there is a land race as lots of jobs are failing after this lands ([comment](https://github.com/pytorch/pytorch/pull/161707#issuecomment-3283980465))	2025-09-12 06:49:36 +00:00
Boyuan Feng	333e546c02	[CUDAGraph][UX] warn many times for rerecording from dynamic shapes (#162696 ) Excessive re-recording of CUDAGraphs leads to bad performance. We previously warned once if this happened. However, the limit (=50) is too high and users may just observe bad performance before actually seeing the warning message. Even worse, users may not see the warning message when there are many other logs. @anijain2305 reported that he never saw this warning message when using transformer library, but he DOES observe slowdown due to cudagraph re-recording & needs to turn off cudagraph. #162663 attempts to hard error when re-recording too many times due to dynamic shapes. But it is a bc-breaking change. Actually, hf-t5-generate model in torchbench failed due to 256 re-recordings. This PR a) reduces the limit to a smaller value (=8); and b) makes the warning more frequent, i.e., warns once for every distinct shape once the limit is reached. Fixes #162299 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162696 Approved by: https://github.com/mlazos	2025-09-12 06:38:32 +00:00
Mark Saroufim	f7e8321961	fix cpp extension distributed warning spew (#162764 ) With the new change we only log the warning if we're running non distributed code or if we're in rank 0. Unit testing that certain messages get printed on certain ranks only feels kinda jank so test plan is below instead Test plan ```python # torchrun --nproc_per_node=2 demo_fix.py import os import logging logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG) import torch if 'RANK' in os.environ: torch.distributed.init_process_group('nccl') from torch.utils.cpp_extension import _get_cuda_arch_flags _get_cuda_arch_flags() print(f"Rank {os.environ.get('RANK', '0')} done") ``` Logs showing how `TORCH_CUDA_ARCH_LIST` only shows up once if we explicitly set the logging level to `logging.DEBUG`. It also improves the debug message to explain what the actual behavior will be ``` (source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *************************************** W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. W0911 18:30:16.594000 1315439 /home/marksaroufim/pytorch/torch/distributed/run.py:814] ************************************* [rank0]:V0911 18:30:18.921000 1316753 pytorch/torch/utils/cpp_extension.py:2444] TORCH_CUDA_ARCH_LIST is not set, using TORCH_CUDA_ARCH_LIST='10.0+PTX' for visible GPU architectures. Set os.environ['TORCH_CUDA_ARCH_LIST'] to override. 
Rank 0 done Rank 1 done ``` But if we just use the default and comment out `logging.getLogger('torch.utils.cpp_extension').setLevel(logging.DEBUG)` Then we get ``` (source) [marksaroufim@devgpu005]~% torchrun --nproc_per_node=2 demo_fix.py W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] ************************************* W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. W0911 18:14:33.926000 690759 /home/marksaroufim/pytorch/torch/distributed/run.py:814] *************************************** Rank 0 done Rank 1 done (source) [marksaroufim@devgpu005]~% ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162764 Approved by: https://github.com/ezyang, https://github.com/zou3519	2025-09-12 06:12:46 +00:00
dolpm	30e16d6389	[nativert] aoti (#162353 ) Summary: att Test Plan: ci Rollback Plan: Differential Revision: D81731425 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162353 Approved by: https://github.com/yiming0416	2025-09-12 05:56:25 +00:00
orangeH25	28e8531032	Add api info for torch._C._nn.pyi (#162361 ) Fix part of #148404 APIs involved are as follows: - im2col - l1_loss - mish - mish_ - mse_loss Pull Request resolved: https://github.com/pytorch/pytorch/pull/162361 Approved by: https://github.com/ezyang	2025-09-12 05:56:22 +00:00
Zeng, Xiangdong	0babdfad63	[1/N] Port 6 fsdp distributed test cases to Intel GPU (#160158 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. We could enable Intel GPU with following methods and try the best to keep the original code styles: - Instantiate_device_type_tests() - Use "torch.accelerator.current_accelerator()" to determine the accelerator backend - Enabled XPU for some test path - Added allow_xpu=True for supported test class Pull Request resolved: https://github.com/pytorch/pytorch/pull/160158 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-12 05:52:08 +00:00
fduwjj	561430edcd	[CuTe] Add type for CuTe layout via claude (#162534 ) This PR mostly is a cosmetic change using Claude to add types for copied PyCute code. We removed all suppressions of linters and add type checker, type alias and mypy ignore(if needed) so that the pycute code will be checked by linter. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162534 Approved by: https://github.com/ezyang, https://github.com/Skylion007 ghstack dependencies: #162413	2025-09-12 04:59:21 +00:00
Isuru Fernando	79d2418b5a	[inductor] Add FLOAT_IS_NAN and COMPLEX_IS_NAN guards (#162537 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162537 Approved by: https://github.com/anijain2305, https://github.com/mlazos ghstack dependencies: #162528	2025-09-12 04:32:46 +00:00
Isuru Fernando	5dd84559a5	[dynamo] Add DUAL_LEVEL_MATCH C++ guard (#162528 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162528 Approved by: https://github.com/anijain2305	2025-09-12 04:32:46 +00:00
fduwjj	5dd14f0b65	[CuTe] Copy code from pycute for device mesh bookkeeping (#162413 ) We copied the whole module and its unit test into pytorch codebase. (https://github.com/NVIDIA/cutlass/blob/main/python%2Fpycute%2Flayout.py). We did change the indentation of code from 2 spaces to 4 spaces. And add lint suppressor to make mypy happy. Also we need to make changes to unit test to include ownership and use `run_tests, TestCase` so that the test gets picked up by CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162413 Approved by: https://github.com/ezyang, https://github.com/Skylion007	2025-09-12 04:28:03 +00:00
can-gaa-hou	95191522e0	[OpenReg] Implement device autoload mechanism (#158555 ) # Implement OpenReg device autoload mechanism ## Overview The Autoload mechanism in PyTorch simplifies the integration of third-party device backends by enabling automatic discovery and initialization at runtime. Traditionally, integrating a new backend required explicit imports or manual initialization, which could be cumbersome and error-prone. With Autoload, PyTorch dynamically detects and initializes device backends, providing a seamless user experience. This mechanism leverages Python entry points (e.g., `torch.backends`) and dynamic module loading. When PyTorch starts, it scans for registered entry points and invokes their initialization hooks, ensuring that all available backends are ready for use without requiring explicit imports. ## Motivation This PR aims to apply [device autoload mechanism](https://github.com/pytorch/pytorch/issues/122468) to the OpenReg module with some simple changes. ## Change ### Before ```python import torch import torch_openreg x = torch.tensor([1, 2, 3], device="openreg") print(x) ``` ### After ```python import torch # No need to import torch_openreg manually! x = torch.tensor([1, 2, 3], device="openreg") print(x) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158555 Approved by: https://github.com/FFFrog, https://github.com/albanD Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>	2025-09-12 04:24:11 +00:00
dependabot[bot]	da954f10d6	Bump protobuf from 5.29.4 to 5.29.5 in /.github/requirements (#160844 ) Bumps [protobuf](https://github.com/protocolbuffers/protobuf) from 5.29.4 to 5.29.5. <details> <summary>Commits</summary> <ul> <li><a href="`f5de0a0495`"><code>f5de0a0</code></a> Updating version.json and repo version numbers to: 29.5</li> <li><a href="`85637662f7`"><code>8563766</code></a> Merge pull request <a href="https://redirect.github.com/protocolbuffers/protobuf/issues/21858">#21858</a> from shaod2/py-cp-29</li> <li><a href="`05ba1a8104`"><code>05ba1a8</code></a> Add recursion depth limits to pure python</li> <li><a href="`1ef3f01c46`"><code>1ef3f01</code></a> Internal pure python fixes</li> <li><a href="`69cca9b7f5`"><code>69cca9b</code></a> Remove fast-path check for non-clang compilers in MessageCreator. (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/21612">#21612</a>)</li> <li><a href="`21fdb7acdb`"><code>21fdb7a</code></a> fix: contains check segfaults on empty map (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20446">#20446</a>) (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20904">#20904</a>)</li> <li><a href="`03c50e3874`"><code>03c50e3</code></a> Re-enable aarch64 tests. 
(<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20853">#20853</a>)</li> <li><a href="`128f0aafd9`"><code>128f0aa</code></a> Add volatile to featuresResolved (<a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20767">#20767</a>)</li> <li><a href="`bdd49bb141`"><code>bdd49bb</code></a> Merge pull request <a href="https://redirect.github.com/protocolbuffers/protobuf/issues/20755">#20755</a> from protocolbuffers/29.x-202503192110</li> <li><a href="`c65946848f`"><code>c659468</code></a> Updating version.json and repo version numbers to: 29.5-dev</li> <li>See full diff in <a href="https://github.com/protocolbuffers/protobuf/compare/v5.29.4...v5.29.5">compare view</a></li> </ul> </details> <br /> [![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=protobuf&package-manager=pip&previous-version=5.29.4&new-version=5.29.5)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) --- <details> <summary>Dependabot commands and options</summary> <br /> You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. 
You can achieve the same result by closing it manually - `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/pytorch/pytorch/network/alerts). </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160844 Approved by: https://github.com/msaroufim Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-09-12 04:23:03 +00:00
PyTorch UpdateBot	d959eb02cb	[audio hash update] update the pinned audio hash (#162752 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162752 Approved by: https://github.com/pytorchbot	2025-09-12 04:18:54 +00:00
dependabot[bot]	62f044e260	Bump setuptools from 72.1.0 to 78.1.1 in /.github/requirements (#162701 ) Bumps [setuptools](https://github.com/pypa/setuptools) from 72.1.0 to 78.1.1. - [Release notes](https://github.com/pypa/setuptools/releases) - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) - [Commits](https://github.com/pypa/setuptools/compare/v72.1.0...v78.1.1) --- updated-dependencies: - dependency-name: setuptools dependency-version: 78.1.1 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-09-11 21:03:27 -07:00
Ti-Tai Wang	2335f90414	[ONNX] Support enable_gqa when dropout is non-zero (#162771 ) Fixes #162258 Related to https://github.com/microsoft/onnxscript/pull/2558 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162771 Approved by: https://github.com/justinchuby	2025-09-12 04:00:57 +00:00
Edward Yang	6e8f17c580	[RELAND] Always build USE_DISTRIBUTED (#160449 ) and Make distributed modules importable even when backend not built (#159889 ) (#162594 ) Summary: Original: D81957844 and D81957923 Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well #buildall Test Plan: sandcastle and oss ci Rollback Plan: Reviewed By: H-Huang Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594 Approved by: https://github.com/H-Huang, https://github.com/dcci	2025-09-12 03:56:18 +00:00
Klaus Zimmermann	31345fb4f7	Make functorch notebook symlinks PEP 517 valid (#157813 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157813 Approved by: https://github.com/zou3519, https://github.com/atalman	2025-09-12 03:52:08 +00:00
Daniel Vega-Myhre	872ed60679	[mxfp8 torch._scaled_grouped_mm] fix meta registration for 3d tensor (#162765 ) Meta registration checks for torch._scaled_grouped_mm has a bug for 3d "B" tensors. Namely, the scale shape for such a tensor should be 2d with shape (G, blocked_K * blocked_N), but it currently enforces an expected 3d shape of (G, blocked_K, blocked_N). See Blas.cpp for correct validation logic [here](`8e217a9f6d/aten/src/ATen/native/cuda/Blas.cpp (L1622)`). Pull Request resolved: https://github.com/pytorch/pytorch/pull/162765 Approved by: https://github.com/ngimel	2025-09-12 03:51:52 +00:00
atalman	e8eeb06034	Move inductor jobs 3.9->3.10 (#162323 ) Related to: https://github.com/pytorch/pytorch/issues/161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162323 Approved by: https://github.com/huydhn, https://github.com/Skylion007 Co-authored-by: Huy Do <huydhn@gmail.com>	2025-09-12 03:43:06 +00:00
Yang Wang	3cd734584d	bring back the old vllm's use_existing_torch.py (#162747 ) vllm's pr will override our dependencies for torch. quick fix to add the use_existing_torch.py. syncing with vllm now regarding the uv approach they have Pull Request resolved: https://github.com/pytorch/pytorch/pull/162747 Approved by: https://github.com/huydhn	2025-09-12 03:41:39 +00:00
PyTorch MergeBot	222ec8d28e	Revert "AMD CPU CI - Add freezing + fix label trigger (#162176 )" This reverts commit 9cac1b92595ec7836101d51dbe1415081042c7a0. Reverted https://github.com/pytorch/pytorch/pull/162176 on behalf of https://github.com/huydhn due to Sorry for reverting this but hardcoding the input online 122 does not make sense ([comment](https://github.com/pytorch/pytorch/pull/162176#issuecomment-3283532452))	2025-09-12 03:39:13 +00:00
thenumberouscode	c140bf217f	[indexing] Prevent integer overflow from large step values in C++ (#161707 ) Fixes https://github.com/pytorch/pytorch/issues/160868 hmmm, I found an existing fix PR after I've finished this one. For reference, the old PR was https://github.com/pytorch/pytorch/pull/147433/files. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161707 Approved by: https://github.com/leslie-fang-intel, https://github.com/CaoE, https://github.com/mlazos	2025-09-12 03:16:23 +00:00
Janani Sriram	7eb92b076f	[Inductor][FP8] Validate exhaustive autotuning for FP8 Inductor templates (#162678 ) Summary: Validate exhaustive autotuning for FP8 Inductor templates: scaled MM templates require `block_k >= 32`. Before, exhaustive autotuning defaulted to a limited set of autotuning configs, as limitations for exhaustively autotuning on FP8 shapes had not been tested. Test Plan: ``` CUDA_VISIBLE_DEVICES=0 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=DEFAULT buck2 run mode/{opt,inplace} pytorch/tritonbench:run -- --op fp8_gemm --only torch_fp8_gemm,pt2_fp8_gemm --metrics tflops,accuracy --input-loader=/home/jananisriram/personal/exhaustive_autotune_rowwise_persistent_tma/json_files/rowwise_ptma_0.json --output="/home/jananisriram/personal/exhaustive_autotune_rowwise_persistent_tma/autotune/gpu0_bench.csv" --atol=1e-2 --rtol=0.5 2>&1 \| tee ~/personal/exhaustive_autotune_rowwise_persistent_tma/autotune/gpu0.log ``` Rollback Plan: Differential Revision: D82174075 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162678 Approved by: https://github.com/coconutruben	2025-09-12 02:12:33 +00:00
Shangdi Yu	ccb450b190	[pre_compile] Add check for cuda and hardware version (#162438 ) if we detect compiled model is using cuda in meaningful way, we should store information about cuda + hardware Example: `SystemInfo(python_version='3.12.9', torch_version='2.9.0a0+gite02b0e6', cuda_version='12.6', triton_version=(3, 4), gpu_name='NVIDIA PG509-210')` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162438 Approved by: https://github.com/zhxchen17	2025-09-12 01:42:07 +00:00
Gabriel Ferns	ae97eb86f7	Reland "Fix conv exhaustive autotuning and expand Exhaustive test coverage" (#161957 ) reland https://github.com/pytorch/pytorch/pull/159387 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161957 Approved by: https://github.com/coconutruben	2025-09-12 01:36:43 +00:00
mengph	7a9c4d794c	[BUG]Fixed handle cannot be hit in the cache in the IPC ExpandableSegment (#161885 ) Fixed the bug that handle cannot be hit in the ipcMemHandle_to_devptr cache in the IPC scenario of ExpandableSegment. Fixes #161884 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161885 Approved by: https://github.com/albanD	2025-09-12 01:09:17 +00:00
Avik Chaudhuri	501e19137a	fix var args for shape guards (#162633 ) Summary: Fixes #162599 Test Plan: added test based on repro Rollback Plan: Differential Revision: D82144520 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162633 Approved by: https://github.com/tugsbayasgalan	2025-09-12 00:33:35 +00:00
Aaryaman Vasishta	4a757e1e17	[ROCm] Support torch.cuda._compile_kernel (#162510 ) Supports `torch.cuda._compile_kernel` on ROCm. Related to https://github.com/pytorch/pytorch/pull/151484 Tested on Windows with gfx1201. Testing on Linux pending. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162510 Approved by: https://github.com/mycpuorg, https://github.com/msaroufim	2025-09-12 00:18:47 +00:00
Yuxingwang-intel	563921619b	Fix the regression issue caused by non-aarch64 platforms not hitting the MKLDNN path. (#162168 ) This issue was introduced by the commit in issue #161065. Added an extra check to provide a proper path for other platforms. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162168 Approved by: https://github.com/mingfeima, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-12 00:17:08 +00:00
Nikita Shulga	84d8ec73f1	[CD] Build Mac wheels using `setup-python` action (#162136 ) Biggest difference between both conda and homebrew CPython builds and one from python.org, is that later are universal binaries and they are always trying to build universal extension... Workaround lots of universal binary build attempts by explicitly specifying both `_PYTHON_PLATFORM` and `--plat-name` as well as `ARCH_FLAGS` Suppressed actionlint warning on use of `freethreaded` flag which is document in https://github.com/actions/setup-python/tree/v5 TODO: Remove lots of temporary workarounds when `3.14` is out in October 2025 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162136 Approved by: https://github.com/atalman, https://github.com/huydhn ghstack dependencies: #162297, #162265	2025-09-12 00:16:31 +00:00
Ramya Ramineni	a956066b4e	[ROCm] Define uint32 t when ROCM_VERSION >= 70000 (#160587 ) This PR fixes the errors like below: ``` [rank3]: RuntimeError: The following operation failed in the TorchScript interpreter. [rank3]: Traceback of TorchScript (most recent call last): [rank3]: RuntimeError: /tmp/comgr-28f951/input/CompileSourceACC062:67:7: error: unknown type name 'uint32_t'; did you mean '__hip_internal::uint32_t'? [rank3]: 67 \| uint32_t int32; [rank3]: \| ^~~~~~~~ [rank3]: \| __hip_internal::uint32_t ``` Earlier uint32_t was defined in HIP headers in std namespace. Now it is moved to __hip_internal namespace in hip headers. This change is made in ROCm 7.0. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160587 Approved by: https://github.com/jeffdaily	2025-09-12 00:13:26 +00:00
David Berard	ff6870d134	[BE][flex attention] compute RMSE in float64 (#162088 ) I saw a failure where the reference error was 0.0, and the compiled error was 0.035. Although the failure still occurs with or without this change, it was confusing to see RMSE of 0.0. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162088 Approved by: https://github.com/drisspg	2025-09-11 23:53:31 +00:00
PyTorch MergeBot	92f9ed7ac3	Revert "[2/N]Port several test files under test/distributed to Intel GPU (#159473 )" This reverts commit fa1d409e83af93425a2672d62e134e8f20c5ccc0. Reverted https://github.com/pytorch/pytorch/pull/159473 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break an distributed tests ([comment](https://github.com/pytorch/pytorch/pull/159473#issuecomment-3282999084))	2025-09-11 23:51:21 +00:00
Zhengxu Chen	8e217a9f6d	[precompile] Fix issues with guard serialization on distributed types. (#162418 ) Summary: Add more support for torch internal distributed data structures. Test Plan: test_guard_serialization.py Rollback Plan: Differential Revision: D81927732 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162418 Approved by: https://github.com/dolpm	2025-09-11 23:09:55 +00:00
hanchchch	429052f151	fix: raise value error on init ParametrizationList if original.device != new.device (#162717 ) raise value error on init `ParametrizationList`, if `original.device != new.device`. currently `_maybe_set` will throw below error in such situations, which I think it's not convenient to debug. ``` [rank1]: RuntimeError: Attempted to set the storage of a tensor on device "cuda:1" to a storage on different device "cpu". This is no longer allowed; the devices must match. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162717 Approved by: https://github.com/lezcano	2025-09-11 23:07:58 +00:00
Nakul Iyer	a3f01f6418	[MTIA Runtime] Add foreach_div ops to native_functions.yaml (#162732 ) Summary: Quick fix for runtime support on foreach_div, see D81274963. Fixed an issue that I created in that diff so that the CIs pass. Test Plan: CIs created in D81274963 and D81286593 pass. Added some logs in [aten_mtia_ops.py](https://www.internalfb.com/code/fbsource/[c56272ba042c43c65517dcac254364cf732fcfa9]/fbcode/mtia/host_runtime/torch_mtia/aten_mtia_ops.cpp?lines=3676) to all the foreach_div ops. We can see that the correct MTIA kernels are being invoked in the tests. https://www.internalfb.com/intern/testinfra/testrun/15481123829281588 Rollback Plan: Differential Revision: D82161434 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162732 Approved by: https://github.com/danielhou0515	2025-09-11 22:47:03 +00:00
Aaryaman Vasishta	62843c14bb	[ROCm/Windows] Support aotriton for scaled_dot_product_attention on Windows. (#162330 ) Enables flash attention and/or memory efficient attention on Windows with scaled_dot_product_attention via. aotriton. Already tested to be working on Windows with TheRock. Steps to enable: simply set `USE_FLASH_ATTENTION=1` and `USE_MEM_EFF_ATTENTION=1` as usual. See https://github.com/ROCm/TheRock/blob/main/external-builds/pytorch/build_prod_wheels.py#L578-L604 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162330 Approved by: https://github.com/xinyazhang, https://github.com/ScottTodd, https://github.com/jeffdaily Co-authored-by: Scott Todd <scott.todd0@gmail.com>	2025-09-11 22:35:09 +00:00
Nick Riasanovsky	082d3dd9d5	[Triton] [Inductor] Restrict subprocess autotuning to just Triton (#162688 ) Summary: Restricts subprocess benchmarking to only `TritonTemplateCaller`, which is expected by the underlying `target` method. This triggered a bug with large K shapes because the decompose k is `SubgraphChoiceCaller`. Test Plan: mm autotuning with a large k and `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1` Rollback Plan: Differential Revision: D82181924 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162688 Approved by: https://github.com/PaulZhang12, https://github.com/eellison, https://github.com/mlazos	2025-09-11 22:17:57 +00:00
PyTorch MergeBot	468c1f9e9d	Revert "[nn] Assert parsed iterable arguments are an appropriate length (#162340 )" This reverts commit b5e6e58050bd2a15f4173cfffa00c7e32e382b49. Reverted https://github.com/pytorch/pytorch/pull/162340 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it seems to break an MPS tests on ExecuTorch ([comment](https://github.com/pytorch/pytorch/pull/162340#issuecomment-3282676242))	2025-09-11 21:22:57 +00:00
Nick Riasanovsky	9614c2eb14	[Triton] [Inductor] Pruned failed compilations from Autotuning candidates (#162673 ) Summary: When exhaustively autotuning a new template you may hit situations that lead to compilation failures. This template will still attempt to autotune because nothing was marking this as failed and in my experiments led to a crash/segfault if I didn't set `TORCHINDUCTOR_AUTOTUNE_IN_SUBPROC=1`. To help eliminate this issue this PR marks any template that fails to compile as "failed" and then removes all of the failed templates from the choice candidates. In the case where it would have just failed to compile twice, this should at least reduce compilation time. Test Plan: Tested locally when experimenting with the new blackwell templates and a Triton version that contains a bug related to `num_warps < 4`. Rollback Plan: Differential Revision: D82172207 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162673 Approved by: https://github.com/PaulZhang12, https://github.com/mlazos	2025-09-11 21:22:36 +00:00
Janani Sriram	4c6a6c2db9	[Inductor][FP8] Add new scaled_mm and scaled_persistent_mm configs to Inductor FP8 Triton templates (#162699 ) Summary: Add new `scaled_mm` and `scaled_persistent_mm` configs to `template_heuristics.py` for Inductor FP8 Triton templates. These configs are a representative subset of the most performant configs generated from exhaustively autotuning FP8 Triton kernels with per-tensor and per-row scaling. See this [spreadsheet](https://docs.google.com/spreadsheets/d/1Fal1vhFUJIUcLpM2kJect6IkgeUFvCY-nUr3RTupM_4/edit?gid=1732602731#gid=1732602731) for benchmarks and performance metrics. Test Plan: Verify that configs do not error, i.e. ``` CUDA_VISIBLE_DEVICES=0 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 buck2 run mode/{opt,inplace} pytorch/tritonbench:run -- --op fp8_gemm --only pt2_fp8_gemm --metrics tflops,accuracy --input-loader={input_path} --output="{output_csv}" --atol=1e-2 --rtol=0.5 2>&1 \| tee {log_file} ``` Rollback Plan: Reviewed By: NikhilAPatel, PaulZhang12 Differential Revision: D81651226 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162699 Approved by: https://github.com/PaulZhang12	2025-09-11 21:21:06 +00:00
Rohit Manav	3ad3bfe11d	added example for torch.is_storage (#162614 ) Fixes #162613 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162614 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-11 20:25:26 +00:00
PyTorch MergeBot	1c6dfbe557	Revert "[inductor] FlexibleLayout for ExternKernelChoice for mms (#161351 )" This reverts commit f08487aa8692751c36e608e338204490b0955583. Reverted https://github.com/pytorch/pytorch/pull/161351 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))	2025-09-11 20:24:15 +00:00
PyTorch MergeBot	934f878883	Revert "[inductor] leverage template stacking in V.choices.get_mm_configs (#161350 )" This reverts commit 623e623c821f639559248e9acd6084311c8fd3d5. Reverted https://github.com/pytorch/pytorch/pull/161350 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))	2025-09-11 20:24:15 +00:00
PyTorch MergeBot	cef05b1202	Revert "[inductor][choices] rename get_mm_configs to get_template_configs (#162293 )" This reverts commit 30191fcf03ddd6a09381a490096c4bb721874316. Reverted https://github.com/pytorch/pytorch/pull/162293 on behalf of https://github.com/huydhn due to Check with @coconutruben and the internal failures look real ([comment](https://github.com/pytorch/pytorch/pull/161351#issuecomment-3282511692))	2025-09-11 20:24:15 +00:00
Boyuan Feng	b500c166ef	[FlexAttention][Easy] turn off TMA when cannot use it (#162569 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162569 Approved by: https://github.com/drisspg	2025-09-11 19:51:19 +00:00
Jeff Daily	d65ffdef3d	[ROCm] fix miopen batchnorm changing output format (#162112 ) It was found that the integration of miopen batchnorm was causing the output to always be in default contig memory format even when the input was channels last. This also unskips a number of related unit tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162112 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com> Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>	2025-09-11 19:37:48 +00:00
Pian Pawakapan	ac72f81c12	[dynamic shapes] unbacked-safe should_swap (#160473 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/160473 Approved by: https://github.com/laithsakka	2025-09-11 18:51:25 +00:00
Arijit Mukhopadhyay	9cac1b9259	AMD CPU CI - Add freezing + fix label trigger (#162176 ) Added the following changes: 1. Added freezing by default for AMD CPU based CI 2. Fixed issue with label based CI triggers Addresses code review comment in https://github.com/pytorch/pytorch/pull/161155 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162176 Approved by: https://github.com/malfet, https://github.com/jeffdaily	2025-09-11 18:41:29 +00:00
Isalia20	9bc648235d	[MPS] mps sparse mul op implementation (#162349 ) Implements mps sparse mul operation as well as enables other operations such as: 1. copy_ 2. div 3. sum 4. floor 5. power 6. sub 7. floor_divide Pull Request resolved: https://github.com/pytorch/pytorch/pull/162349 Approved by: https://github.com/pearu, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-11 18:36:24 +00:00
David Berard	799471d92b	[triton] Update 3.5 pin (AMD compilation fix + warp spec) (#162733 ) Fixes #162390 Also adds warp spec (thanks @manman-ren!) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162733 Approved by: https://github.com/atalman	2025-09-11 18:19:16 +00:00
justinchuby	43d9b5ecaa	[ONNX] Set fallback=False by default (#162726 ) This change addresses confusing error messages users encounter when using the ONNX exporter with default settings. Previously, `fallback=True` was the default, which would attempt to fall back to the TorchScript exporter when the dynamo path failed, leading to mixed error messages that obscured the actual issues. ## Problem When `fallback=True` by default: - Users get confusing error messages mixing dynamo and TorchScript export failures - Error messages tell users to provide the `f` argument unnecessarily - Dynamo error messages get flushed with TorchScript errors when both paths fail - Users expecting the dynamo path get unexpected fallback behavior ## Solution Changed the default from `fallback=True` to `fallback=False` in both: - `torch.onnx.export()` function - `torch.onnx._internal.exporter._compat.export_compat()` function ## Impact Before: ```python # Would fallback to TorchScript on dynamo failure, causing mixed error messages torch.onnx.export(model, args) ``` After: ```python # Clean dynamo-only errors by default torch.onnx.export(model, args) # Advanced users can still opt-in to fallback behavior torch.onnx.export(model, args, fallback=True) ``` Fixes #162697 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162726 Approved by: https://github.com/titaiwangms, https://github.com/xadupre	2025-09-11 18:09:58 +00:00
Tugsbayasgalan Manlaibaatar	463fbc8ca0	Support vmap + custom autograd function/improve DTensor constructor inefficiency (#162240 ) This makes gemma3 exportable on transformers=4.55.4 In HF, there is a torch function mode called TransformGetItemToIndex which internally calls custom autograd function. When this custom autograd function is called under vmap, it triggers CustomFunctionHigherOrderOP which errored because there was no pre-dispatch proxy mode implementation. Since there are a number of requests lately to add various operators in pre-dispatch IR, I introduce a decorator in export that works similar to `allow_in_graph`. Basically: 1) We intercept custom_autograd_function.apply at pre-dispatch mode when this decorator is applied 2) We apply `flat_apply` HOP to hide the pytree spec for this autograd function. Note that this adds restriction that this custom autograd function needs to take in fx-able types. 3) subclass constructor decorator is implemented similarly, so we just refactor it to use similar implementation as this new decorator. eventually we should delete the subclass constructor decorator. 4) Move some code in subclass constructor decorator to exit early in non-export environment which should shave off some inefficiency (around 1% according to @swolchok 's benchmark) Fixes: https://github.com/pytorch/pytorch/issues/161563#issuecomment-3246309758 Differential Revision: [D82141316](https://our.internmc.facebook.com/intern/diff/D82141316) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162240 Approved by: https://github.com/ydwu4	2025-09-11 17:42:41 +00:00
Catherine Lee	2f53395943	[ez][CI] Fix docs push in nightly workflow (#162657 ) HUD metrics page says docs push hasn't happened in 21 days <img width="293" height="142" alt="image" src="https://github.com/user-attachments/assets/f930aab8-0503-4bf2-b962-8c375dec6b78" /> I guess main branch docs just haven't been updated? Did anyone notice? Do we care? Either way I think this should fix it Likely started after https://github.com/pytorch/pytorch/pull/161182 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162657 Approved by: https://github.com/huydhn	2025-09-11 16:45:41 +00:00
Avik Chaudhuri	fccddf02b6	repro 161902 (#162416 ) Summary: Sometimes `ShapeEnv.create_symbol` can return a `sympy.Integer`. This messes up our phantom symbol infra for derived dims. Fixes #161902 Test Plan: added test based on repro Rollback Plan: Differential Revision: D81960709 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162416 Approved by: https://github.com/tugsbayasgalan	2025-09-11 16:35:23 +00:00
Nikita Shulga	8be8b94793	Update SECURITY.md with reporting guidelines (#162608 ) Added clarification that all reports will be disclosed within 90 days Pull Request resolved: https://github.com/pytorch/pytorch/pull/162608 Approved by: https://github.com/seemethere, https://github.com/albanD	2025-09-11 16:30:29 +00:00
suo	fe8cc619b8	[torch][c10d] fix split_group in mixed backend case (#162424 ) Today we can initialize a mixed-backend process group (e.g. "cpu:gloo,cuda:nccl") but we can only pass one set of process group options. However, when we call `split_group`, we retrieve that set of options from the parent PG and pass it to the ProcessGroup::groupSplit C++ API, which then attempts to propagate that set of options to all backends. This leads to an assert on some user code, where ProcessGroupGloo::split is expecting gloo options but receives nccl options instead. Arguably the APIs as currently designed are just broken; we should not ever expect a single set of backend options to apply across multiple backends. However, fixing this would require changing quite a few public APIs. As a quick fix, since user-provided options really only exist for NCCL, just warn and fall-back to defaulted options for Gloo if non-gloo options are detected. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162424 Approved by: https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/H-Huang	2025-09-11 16:29:32 +00:00
atalman	2f5a24c2a2	Smoke tests don't run nvshmem on Windows (#162646 ) Only available for linux x86 and aarch64 : https://pypi.org/project/nvidia-nvshmem-cu13/#files nvshmem is available only on linux: `` "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' \| " `` https://github.com/pytorch/pytorch/blob/main/.github/scripts/generate_binary_build_matrix.py#L57 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162646 Approved by: https://github.com/kwen2501	2025-09-11 16:09:20 +00:00
Nikita Shulga	24492cbab2	[BE] Cleanup stale comments/copy from `gemm` (#162001 ) Followup after https://github.com/pytorch/pytorch/pull/154012 Since the introduction of `gemm_no_downcast_stub` it's no longer necessary to allocate temporary array and then manually implement the `beta` logic in the codebase Pull Request resolved: https://github.com/pytorch/pytorch/pull/162001 Approved by: https://github.com/drisspg ghstack dependencies: #161999	2025-09-11 15:48:43 +00:00
Avik Chaudhuri	3f6d88f04c	paths to exclude shape guards (#162684 ) Summary: Easier to land than https://www.internalfb.com/diff/D82030581 Test Plan: everything blamed by https://www.internalfb.com/diff/D80713603 (except some old exir tests) Rollback Plan: Differential Revision: D82180349 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162684 Approved by: https://github.com/tugsbayasgalan	2025-09-11 15:34:06 +00:00
PyTorch MergeBot	94db2ad51d	Revert "Move prioritized text linker optimization code from setup.py to cmake (#160078 )" This reverts commit 26b3ae58908becbb03b28636f7384d2972a8c9a5. Reverted https://github.com/pytorch/pytorch/pull/160078 on behalf of https://github.com/atalman due to Sorry reverting this broke linux aarch64 CUDA nightlies [pytorch/pytorch/actions/runs/17637486681/job/50146967503](https://github.com/pytorch/pytorch/actions/runs/17637486681/job/50146967503) ([comment](https://github.com/pytorch/pytorch/pull/160078#issuecomment-3281426631))	2025-09-11 15:29:29 +00:00
PyTorch MergeBot	9f783e172d	Revert "Build and Install Arm Compute Library in manylinux docker image (#159737 )" This reverts commit 582d278983b28a91ac0cedd035183f2495bb6887. Reverted https://github.com/pytorch/pytorch/pull/159737 on behalf of https://github.com/atalman due to Sorry reverting this broke linux aarch64 CUDA nightlies [pytorch/pytorch/actions/runs/17637486681/job/50146967503](https://github.com/pytorch/pytorch/actions/runs/17637486681/job/50146967503) ([comment](https://github.com/pytorch/pytorch/pull/159737#issuecomment-3281398272))	2025-09-11 15:25:24 +00:00
Animesh Jain	a8432bcaad	[dynamo][guards] Fail on an unknown framelocals to dict conversion (#162695 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162695 Approved by: https://github.com/williamwen42 ghstack dependencies: #162694	2025-09-11 15:01:00 +00:00
Animesh Jain	a3a40cb741	[dynamo][guards] Do not construct framelocals to dict on GlobalsGuardAccessor (#162694 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162694 Approved by: https://github.com/williamwen42	2025-09-11 15:01:00 +00:00
Tugsbayasgalan Manlaibaatar	c924c675d0	Fix persistent buffer bug (#162190 ) For non-persistent buffers, we should properly register them. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162190 Approved by: https://github.com/zhxchen17	2025-09-11 14:56:26 +00:00
Jithun Nair	c3f30eca9e	Remove tests-to-include from rocm-mi300 workflow (#162721 ) Accidentally introduced by https://github.com/pytorch/pytorch/pull/162288 (was meant to be a temporary change) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162721 Approved by: https://github.com/jeffdaily	2025-09-11 14:36:07 +00:00
Jeff Daily	1e710552c1	[ROCm][CI] benchmark must patch fbgemm_gpu with tbb dep (#162649 ) fbgemm adds tbb as a dep only for rocm to avoid missing tbb symbols at import. But the way it was done was in setup.py to add the linker flag to CMAKE_CXX_FLAGS and it wasn't working for reasons unknown to me. But what did work was to add tbb as a dep in the cmake file. [We have a PR against upstream fbgemm](https://github.com/pytorch/FBGEMM/pull/4859) for that. Meanwhile, a much smaller patch is applied here in this PR until the fbgemm rocm ci commit hash is moved forward to include the tbb patch from upstream. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162649 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-11 14:10:51 +00:00
Sun,Jiabin	7c39b2ecbe	use torch.accelerator and device_module instead of cuda to make DataParallel more device agnostic. (#162573 ) use torch.accelerator and `_get_device_module` instead of cuda to make DataParallel more device agnostic. Fixes #162152 recently, I've done some works to support my own privateuse1 backend in DataParallel module, but I found some cuda related APIs exist in parallel_apply.py file, that makes me have to monkey patch DataParallel module to support DP on my own backend. so I make some small changes to replace cuda.xxx to accelerator.xxx, and acquire device module by `_get_device_module`. this is my first time to contribute to pytorch, please let me know if there is any problem about the change. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162573 Approved by: https://github.com/ezyang, https://github.com/guangyey Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com> Co-authored-by: Edward Z. Yang <ezyang@mit.edu>	2025-09-11 10:04:27 +00:00
Naveen Suda	afdd4247a2	[torchao][pt2e] Make prepare and convert faster by caching (#162550 ) Summary: D79674759 tried to fix the expensive prepare and convert steps, as `assert_and_get_unique_device` was called multiple times. This change fixes that issue by using `functools.cache` decorator. Test Plan: Verified on llm export to QNN. LLM Quantization prepare time of ~20min reduced to ~3min. Rollback Plan: Differential Revision: D82073679 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162550 Approved by: https://github.com/andrewor14	2025-09-11 07:59:22 +00:00
Lucy Qiu	22df9332da	[serialization] Add pte file to archive (#162520 ) Summary: Add _package_executorch_files to archive apis. Allow us to package a PTE file into the archive. I don't think there's a use-case to have more than one PTE file at the moment, but left it as `EXECUTORCH_FILES` just in case. Test Plan: Tested in D81992612 Rollback Plan: Differential Revision: D81977483 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162520 Approved by: https://github.com/angelayi	2025-09-11 07:59:11 +00:00
Sun, Jiayi	6b9b7ce6fe	fix torch.sparse.log_softmax on CPU (#161959 ) Fix https://github.com/pytorch/pytorch/issues/152293. Example: ``` import torch from torch.sparse import log_softmax as sparse_log_softmax def test_bug(): a = torch.rand(4, 3) b = a - 10000000.0 b_sparse = b.to_sparse() cpu_out_sparse = sparse_log_softmax(b_sparse, dim=1).to_dense() print('cpu_out_sparse =', cpu_out_sparse) b_sparse_double = b.double().to_sparse() cpu_out_sparse_double = sparse_log_softmax(b_sparse_double, dim=1).to_dense() print('cpu_out_sparse_double =', cpu_out_sparse_double) if __name__ == '__main__': test_bug() ``` Output: - before ``` cpu_out_sparse = tensor([[-2., -1., -2.], [-1., -1., -1.], [-1., -2., -2.], [-1., -1., -2.]]) cpu_out_sparse_double = tensor([[-1.5514, -0.5514, -1.5514], [-1.0986, -1.0986, -1.0986], [-0.5514, -1.5514, -1.5514], [-0.8620, -0.8620, -1.8620]], dtype=torch.float64) ``` - after ``` cpu_out_sparse = tensor([[-0.8620, -1.8620, -0.8620], [-1.0986, -1.0986, -1.0986], [-1.8620, -0.8620, -0.8620], [-1.0986, -1.0986, -1.0986]]) cpu_out_sparse_double = tensor([[-0.8620, -1.8620, -0.8620], [-1.0986, -1.0986, -1.0986], [-1.8620, -0.8620, -0.8620], [-1.0986, -1.0986, -1.0986]], dtype=torch.float64) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161959 Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/mingfeima	2025-09-11 07:52:05 +00:00
Scott Wolchok	1274297e06	Remove __torch_dispatch__ check in THPVariable_make_dtensor (#162337 ) We control DTensor, so we can just guarantee there isn't a programming error with __torch_dispatch__. (The guard is already less-than-perfect; see the note that the deleted comment refers to.) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162337 Approved by: https://github.com/Skylion007 ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220, #162218, #161596	2025-09-11 06:58:35 +00:00
Scott Wolchok	f68f76d8c7	Remove logger.debug statements in DTensor dispatch (#161596 ) These seem to have been costing us 5-10 usec per detach (out of ~~95 usec total). If they need to ship let's talk about requirements and how we can make this more efficient given that we would prefer if an entire DTensor op could finish in 10 usec. Differential Revision: [D81530106](https://our.internmc.facebook.com/intern/diff/D81530106) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161596 Approved by: https://github.com/ezyang, https://github.com/Skylion007 ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220, #162218	2025-09-11 06:58:35 +00:00
Deng, Daisy	fa1d409e83	[2/N]Port several test files under test/distributed to Intel GPU (#159473 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR will work on some test files under test/distributed. We could enable Intel GPU with following methods and try the best to keep the original code styles: - instantiate_device_type_tests() - use "torch.accelerator.current_accelerator()" to determine the accelerator backend - use requires_accelerator_dist_backend to allow both nccl and xccl test - enabled XPU for some test path - Change the hardcoded world_size according to device_count. - Unify some common code under torch/testing/_internal for multiple backend, for example: Added xpu for Backend.backend_capability and dist.Backend.register_backend() Pull Request resolved: https://github.com/pytorch/pytorch/pull/159473 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-11 06:44:26 +00:00
Xu Han	52d4660ae9	[AOTI] Fix Windows fail to zip opened file. (#162617 ) Original issue: <img width="1767" height="544" alt="Image" src="https://github.com/user-attachments/assets/9de90d50-217f-4049-8f19-77ff1660c8b0" /> reproducer: ```cmd pytest test\inductor\test_aot_inductor.py -v -k test_weight_on_disk_legacy_cpu ``` Fixed list: 1. `WritableTempFile`'s `__exit__` function auto unlink opened file, when the file was opened, it should raise error. Ignore it on Windows. 2. When open zip file, if the file is opened, it would be failed. Switch to `_wfsopen` with shared access flag, which can open file with shared access. Local test passed: <img width="1101" height="233" alt="image" src="https://github.com/user-attachments/assets/935cbf2e-52db-41f1-80fa-617569b92a96" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162617 Approved by: https://github.com/jansel	2025-09-11 06:22:21 +00:00
Mark Saroufim	7345454e2e	compile_kernel: Handle python floats as c double (#162626 ) This was an open todo in the code and probably a footgun in waiting Pull Request resolved: https://github.com/pytorch/pytorch/pull/162626 Approved by: https://github.com/malfet	2025-09-11 06:03:25 +00:00
PyTorch MergeBot	23170dfebc	Revert "Move inductor jobs 3.9->3.10 (#162323 )" This reverts commit 0663bdb12383b9717af49d58aed9d88de0dd0ecc. Reverted https://github.com/pytorch/pytorch/pull/162323 on behalf of https://github.com/huydhn due to Not sure what had happened, but some inductor unit tests start failing after this lands ([comment](https://github.com/pytorch/pytorch/pull/162323#issuecomment-3278125192))	2025-09-11 05:57:13 +00:00
Mark Saroufim	12e993f533	compile_kernel large shared memory fix (#162647 ) Alternate solution to https://github.com/pytorch/pytorch/pull/162328 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162647 Approved by: https://github.com/eqy	2025-09-11 05:52:46 +00:00
PyTorch UpdateBot	07d2531672	[vllm hash update] update the pinned vllm hash (#162551 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162551 Approved by: https://github.com/pytorchbot	2025-09-11 04:56:04 +00:00
Jagadish Krishnamoorthy	6944d4b639	[ROCm] rocblas Aten GEMM overload for FP32 output from FP16/BF16 inputs (#162600 ) Fix ROCm GEMM helper to set output type (C/D) based on C_Dtype template parameter. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162600 Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony	2025-09-11 03:34:07 +00:00
Isuru Fernando	f654cff566	[inductor] Add shape to load_input in matmul templates (#162513 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162513 Approved by: https://github.com/eellison ghstack dependencies: #162426	2025-09-11 01:51:15 +00:00
Isuru Fernando	f17c5e0789	[inductor] Add shape for store_output in matmul templates (#162426 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162426 Approved by: https://github.com/eellison	2025-09-11 01:51:15 +00:00
Tianyu Liu	435c18fb4a	[DTensor] add op support for aten.unbind.int (#162560 ) As titled. It seems unbind returns views of the original tensor. E.g. see https://stackoverflow.com/questions/78910951/does-unbind-return-the-views-of-tensors-in-pytorch So we error out when `shard_dim == unbind_dim`. This is similar to why we error out in view ops. https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/_ops/_view_ops.py#L544-L546 This PR also refactors some other tensor ops code, by creating two utils function `shift_shard_dims_after_insert`, `shift_shard_dims_after_remove`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162560 Approved by: https://github.com/zpcore	2025-09-11 00:58:23 +00:00
dolpm	612cdc8f48	-ldl for nativert tests (#162643 ) Fixes #162640 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162643 Approved by: https://github.com/yiming0416, https://github.com/robert-hardwick	2025-09-11 00:35:57 +00:00
Edward Yang	da5069f289	Don't include cuh header when USE_NVSHMEM is off (#162635 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162635 Approved by: https://github.com/kwen2501	2025-09-11 00:24:50 +00:00
Mark Saroufim	4fd2a2b273	Add cuda headers automatically for compile_kernel (#162634 ) Issue was pointed out before by @ngimel and more recently by https://gau-nernst.github.io/nvrtc-matmul/#missing-cuda-and-c-headers- by @gau-nernst Benefit is now we can add `#include <cuda_fp16.h>` without crapping out Pull Request resolved: https://github.com/pytorch/pytorch/pull/162634 Approved by: https://github.com/ngimel	2025-09-11 00:20:33 +00:00
Ting Lu	bb1d53bc47	[CD] CUDA 13 specific followup changes (#162455 ) Follow up for CUDA 13 bring up https://github.com/pytorch/pytorch/issues/159779 sm50-70 should not be added to sbsa build arch list, as previous archs had no support for arm. remove platform_machine from PYTORCH_EXTRA_INSTALL_REQUIREMENTS Pull Request resolved: https://github.com/pytorch/pytorch/pull/162455 Approved by: https://github.com/atalman	2025-09-11 00:03:47 +00:00
Ben Niu	36338fc7f2	Relax fences for intrusive ptr's refcnt (#162072 ) Summary: Relax fences for intrusive ptr's refcnt dec op for performance testing. lock needs acquire when the op succeeds and relaxed when it does not. In addition, the expire call and the following refcnt reads were merged to remove one extra read. incref does not need any fences because the caller should already have a valid reference. use_count follows the same reasoning. decref only needs a release fence to make sure every write op prior to it has finished. When the refcnt goes to zero, there should be an acquire fence to make sure no read op reads stale data before the object is destructed. However, microbenchmark showed that the optimal fence for decref is not performing noticeably better than the current decref with acq-rel, so we keep decref as-is. This change should have no material impact on x86, but for Arm64 (and other CPUs with weak memory models), it should boost performance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162072 Approved by: https://github.com/swolchok, https://github.com/yfeldblum	2025-09-10 23:17:01 +00:00
Daniel Vega-Myhre	e0c910149c	Build fbgemm_gpu for TORCH_CUDA_ARCH_LIST=10.0 and CUDA 12.8 and 12.9 (#162544 ) ## Summary - pytorch is not built for a variants of SM architectures, due to non-portability. However, we need fbgemm_gpu kernels built for sm100a (see #162209) ## Changes - Setting USE_FBGEMM_GENAI for CUDA builds: fbgemm_gpu builds for sm100a if using CUDA 12.8 or 12.9 ([source](`2033a0a08f/.github/scripts/nova_dir.bash (L29-L32)`)), so I follow the same rule here. - Extra nvcc flags*: if USE_FBGEMM_GENAI and USE_CUDA are set, we add extra nvcc flags for sm100a ## Test plan Test build: ``` echo $CUDA_HOME /usr/local/cuda-12.9 export TORCH_CUDA_ARCH_LIST=10.0 python -m pip install --no-build-isolation -v -e . ``` Check build logs: ``` CMake Warning at CMakeLists.txt:901 (message): Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a ``` Run unit tests: - `pytest test/test_matmul_cuda.py -k test_mxfp8_scaled_grouped_mm` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162544 Approved by: https://github.com/drisspg	2025-09-10 22:59:41 +00:00
eellison	f4aeceaa9d	Use upper bound for persistent rblock (#162441 ) Previously, we were using 128 and increasing to upper bound. We should be setting at the upper bound and raising to next power of 2. Differential Revision: [D81984103](https://our.internmc.facebook.com/intern/diff/D81984103) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162441 Approved by: https://github.com/PaulZhang12	2025-09-10 22:29:02 +00:00
Michael Lazos	d8e6b2fddc	[Cutlass] Add exp and sigmoid activations (#162536 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162536 Approved by: https://github.com/henrylhtsang, https://github.com/eellison ghstack dependencies: #162535	2025-09-10 21:44:26 +00:00
Michael Lazos	31c25c7d01	[Cutlass] Add tanh activation and test case for activations (#162535 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162535 Approved by: https://github.com/henrylhtsang	2025-09-10 21:44:26 +00:00
eqy	5dbee5691c	[cuDNN][Convolution][TF32][64bit] Add `tf32_on_and_off` decorator to conv3d 64bit test (#161004 ) cuDNN has new generated kernels that can use TF32. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161004 Approved by: https://github.com/janeyx99, https://github.com/Skylion007	2025-09-10 21:39:35 +00:00
drisspg	864ffe12d7	Fix some edge cases (#162295 ) ``` Summary 🔝 Top 5 Performance Differences (by absolute %): shape: (5, 7) ┌────────────────┬────────────────┬─────────────────────────────┬───────────────────┬──────────────────────┬───────────────────────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops BWD (base) ┆ TFlops BWD (no_peel) ┆ no_peel_speedup_over_base ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═════════════════════════════╪═══════════════════╪══════════════════════╪═══════════════════════════╪═══════════╡ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64) ┆ 56.937931 ┆ 58.960459 ┆ 1.035522 ┆ 3.552163 │ │ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 128) ┆ 89.221306 ┆ 86.295642 ┆ 0.967209 ┆ -3.27911 │ │ causal ┆ torch.bfloat16 ┆ (2, 16, 4096, 4, 4096, 128) ┆ 111.552594 ┆ 114.380841 ┆ 1.025353 ┆ 2.535349 │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, 1024, 64) ┆ 74.830149 ┆ 76.685445 ┆ 1.024793 ┆ 2.479344 │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64) ┆ 55.279932 ┆ 56.369312 ┆ 1.019707 ┆ 1.97066 │ └────────────────┴────────────────┴─────────────────────────────┴───────────────────┴──────────────────────┴───────────────────────────┴───────────┘ 🔺 Top 5 Cases Where no_peel (change) is Faster than base (baseline): shape: (5, 7) ┌────────────────┬────────────────┬─────────────────────────────┬───────────────────┬──────────────────────┬───────────────────────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops BWD (base) ┆ TFlops BWD (no_peel) ┆ no_peel_speedup_over_base ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═════════════════════════════╪═══════════════════╪══════════════════════╪═══════════════════════════╪═══════════╡ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64) ┆ 56.937931 ┆ 58.960459 ┆ 
1.035522 ┆ 3.552163 │ │ causal ┆ torch.bfloat16 ┆ (2, 16, 4096, 4, 4096, 128) ┆ 111.552594 ┆ 114.380841 ┆ 1.025353 ┆ 2.535349 │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, 1024, 64) ┆ 74.830149 ┆ 76.685445 ┆ 1.024793 ┆ 2.479344 │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 64) ┆ 55.279932 ┆ 56.369312 ┆ 1.019707 ┆ 1.97066 │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 4096, 4, 4096, 64) ┆ 111.08814 ┆ 112.447047 ┆ 1.012233 ┆ 1.22327 │ └────────────────┴────────────────┴─────────────────────────────┴───────────────────┴──────────────────────┴───────────────────────────┴───────────┘ 🔻 Top 5 Cases Where no_peel (change) is Slower than base (baseline): shape: (5, 7) ┌────────────────┬────────────────┬─────────────────────────────┬───────────────────┬──────────────────────┬───────────────────────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops BWD (base) ┆ TFlops BWD (no_peel) ┆ no_peel_speedup_over_base ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═════════════════════════════╪═══════════════════╪══════════════════════╪═══════════════════════════╪═══════════╡ │ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, 1024, 128) ┆ 89.221306 ┆ 86.295642 ┆ 0.967209 ┆ -3.27911 │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 1024, 4, 1024, 64) ┆ 78.23082 ┆ 76.693169 ┆ 0.980345 ┆ -1.965531 │ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 2048, 4, 2048, 128) ┆ 96.95663 ┆ 95.573333 ┆ 0.985733 ┆ -1.426717 │ │ alibi ┆ torch.bfloat16 ┆ (4, 16, 2048, 4, 2048, 64) ┆ 93.373473 ┆ 92.294147 ┆ 0.988441 ┆ -1.155924 │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 2048, 4, 2048, 128) ┆ 96.95147 ┆ 96.105389 ┆ 0.991273 ┆ -0.872685 │ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162295 Approved by: https://github.com/mlazos, https://github.com/v0i0	2025-09-10 21:33:45 +00:00
Yuhui Shi	4e35594674	[Lowering] Fix the edge case of empty subgraph split due to dataclass node (#161716 ) Summary: Fix the edge case by allowing `call_function` nodes with no deps as graph entry (starter_nodes) in the splitter. Test Plan: The test shall pass in the current diff (after fix), and fail in the parent diff (before fix) ``` buck test mode/opt //glow/fb/fx/lowering:split_tests -- test_dataclass_as_graph_entry ``` Rollback Plan: Differential Revision: D81232435 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161716 Approved by: https://github.com/ezyang	2025-09-10 21:23:42 +00:00
Gabriel Ferns	35d7b32159	Improve device info with new flops and bandwidth formula based on hardware libraries (#162245 ) Previously, DeviceInfo provided theoretical hardware information based on a hardcoded list manually created from various datasheets. This update: - Attempts to gather the information from a hardware library like `pynvml`, improving accuracy and expanding support to devices that don't have entries in the datasheet list. - Adjusts flops and bw calculation based on these hardware values. For example, if the memory or SMs are underclocked, it adjusts the theoretical max flops/bw accordingly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162245 Approved by: https://github.com/v0i0, https://github.com/shunting314	2025-09-10 21:19:13 +00:00
atalman	0663bdb123	Move inductor jobs 3.9->3.10 (#162323 ) Related to: https://github.com/pytorch/pytorch/issues/161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162323 Approved by: https://github.com/huydhn, https://github.com/Skylion007	2025-09-10 20:58:41 +00:00
PyTorch MergeBot	40ea6e418a	Revert "Fix decorators skipping NCCL tests (#158846 )" This reverts commit c2388201fc85b0748173212de5a17514c7a71f21. Reverted https://github.com/pytorch/pytorch/pull/158846 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing some inductor tests ([comment](https://github.com/pytorch/pytorch/pull/158846#issuecomment-3276471387))	2025-09-10 20:51:31 +00:00
Colin Peppler	348303ebd2	[ez] add docstring/typing for codegen_kernel_benchmark (#162609 ) ``` lintrunner init && lintrunner -m origin/main ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162609 Approved by: https://github.com/coconutruben ghstack dependencies: #162442	2025-09-10 20:49:38 +00:00
Colin Peppler	94755e81c4	[inductor] Enable combo kernels with unbacked inputs (#162442 ) Internal user tried enabling combo kernels, but ran into "Cannot convert symbols to int". This PR is to enable combo kernels on inputs with data-dependent shapes. ### Example exception ``` File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton.py", line 4997, in benchmark_combo_kernel kernel_code_list = self.generate_combo_kernel_code( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/simd.py", line 1849, in generate_combo_kernel_code src_code = kernel.codegen_kernel() ^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton_combo_kernel.py", line 802, in codegen_kernel code.splice(self.codegen_kernel_benchmark(num_gb=0)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton_combo_kernel.py", line 852, in codegen_kernel_benchmark var_names.extend(self.kernel_benchmark_extra_args()) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/colinpeppler/pytorch/torch/_inductor/codegen/triton_combo_kernel.py", line 733, in kernel_benchmark_extra_args extra_args.append(str(V.graph.sizevars.size_hint(tree.numel))) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/colinpeppler/pytorch/torch/_inductor/sizevars.py", line 584, in size_hint return int(out) ^^^^^^^^ File "/home/colinpeppler/.conda/envs/pytorch/lib/python3.12/site-packages/sympy/core/expr.py", line 307, in __int__ raise TypeError("Cannot convert symbols to int") torch._inductor.exc.InductorError: TypeError: Cannot convert symbols to int ``` Differential Revision: [D82042230](https://our.internmc.facebook.com/intern/diff/D82042230) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162442 Approved by: https://github.com/jansel	2025-09-10 20:49:38 +00:00
Tugsbayasgalan Manlaibaatar	6d65737aee	testing infra and some fixes (#162183 ) This PR is quite large in that it covers most of rough edges in the new strict export flow: 1. Handle nn_module_stack correctly now that we are tracing wrapper module 2. module_call_spec needs to get queried from source directly because we are not running the bytecode anymore. 3. Correct input and output handling. @diff-train-skip-merge Pull Request resolved: https://github.com/pytorch/pytorch/pull/162183 Approved by: https://github.com/zhxchen17	2025-09-10 20:48:12 +00:00
PyTorch MergeBot	053251b98d	Revert "Make functorch notebook symlinks PEP 517 valid (#157813 )" This reverts commit b494547f0bd6cb1ce5d8d104cb419802434c9c08. Reverted https://github.com/pytorch/pytorch/pull/157813 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but this surfaces a weird discrepancy between GitHub and Mercurial used internally ([comment](https://github.com/pytorch/pytorch/pull/157813#issuecomment-3276442242))	2025-09-10 20:45:48 +00:00
Justin Chu	7e2e83cdbe	[ONNX] Update export docstring (#162622 ) Update export docstring to reflect the latest configuration. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162622 Approved by: https://github.com/titaiwangms	2025-09-10 20:29:46 +00:00
PyTorch MergeBot	d033d11d26	Revert "[torch][c10d] fix split_group in mixed backend case (#162424 )" This reverts commit 2dc26131801a430e030a773c4fbfe874e263259d. Reverted https://github.com/pytorch/pytorch/pull/162424 on behalf of https://github.com/clee2000 due to failure seems related, maybe a hang/timeout distributed/test_distributed_spawn.py::TestDistBackendWithSpawn::test_ddp_model_diff_shape_across_ranks log classifier is pointing at the wrong line ([comment](https://github.com/pytorch/pytorch/pull/162424#issuecomment-3276360494))	2025-09-10 20:13:44 +00:00
PyTorch MergeBot	80d4da893c	Revert "Put torchao (0.13.0) back to benchmark workflow (#162227 )" This reverts commit 00985970e312c3c5e674e8e14d39fe77c226600e. Reverted https://github.com/pytorch/pytorch/pull/162227 on behalf of https://github.com/huydhn due to Crashing some inductor jobs in trunk ([comment](https://github.com/pytorch/pytorch/pull/162227#issuecomment-3276355034))	2025-09-10 20:11:37 +00:00
Parshant Sharma	bf7f481144	Update misleading torch.sparse_coo_tensor error check (#161900 ) Fixes #160622 ### Summary Updated the misleading torch.sparse_coo_tensor error check to provide clear context. earlier: `RuntimeError: number of dimensions must be sparse_dim (3) + dense_dim (0), but got 1` Updated: `RuntimeError: 'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = 1, sparse_dim = 3, dense_dim = 0` Impacts: - Comprehensive error message that will improve developer experience. - module: sparse Pull Request resolved: https://github.com/pytorch/pytorch/pull/161900 Approved by: https://github.com/nikitaved, https://github.com/pearu	2025-09-10 19:57:11 +00:00
Max Podkorytov	ab0694f1c6	[ROCm][Inductor][CK backend] Install rocm-composable-kernel python package on ROCm Linux CI docker images (#162288 ) Reopened from #158747 which got reverted since without setuptools-scm in pytorch index URL the wheel cannot be built We reconsider the original PR idea of introducing CK as a pytorch dependency on ROCm Linux and install the CK python package in CI only -- since (1) rocm-composable-kernel depends on setuptools-scm which depends on tomli and the existing index URLs need to be modified to host the new packages and (2) there also is a packaging [bug](https://github.com/pypa/setuptools/issues/3269#issuecomment-1254507377) in Ubuntu 22.04 which prevents correct dynamic version calculation with default system pip. Extras: -> this PR reconsiders how TORCHINDUCTOR_CK_DIR env variable is used; previously, this var was used to point to rocm-composable-kernel package installation path on the filesystem; now, the path is inferred by trying to import ck4inductor -> the tests are updated to reflect this change -> since in CI clang points to a bash script which invokes sccache, we cannot patch PATH to not contain sccache, this logic is removed from the testing code -> scaled_mm test crashes during the benchmarking when the benchmarking happens in the main process, and times out benchmarking when it happens in a subprocess, on gfx942, so it is disabled TBD: roll back rocm-mi300 workflow before merging Pull Request resolved: https://github.com/pytorch/pytorch/pull/162288 Approved by: https://github.com/jeffdaily	2025-09-10 19:33:40 +00:00
Animesh Jain	5f630d28d7	[dynamo][guards] Do not construct entire framelocals dict for LAMBDA_GUARD (#162525 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162525 Approved by: https://github.com/williamwen42 ghstack dependencies: #162509	2025-09-10 18:52:15 +00:00
Animesh Jain	a67e798cb7	[dynamo][guards] Prevent framelocals to dict conversion for not required LAMBDA_GUARD (#162509 ) This is a smaller PR to reduce framelocals to dict conversion. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162509 Approved by: https://github.com/williamwen42	2025-09-10 18:52:15 +00:00
Ruben Rodriguez Buchillon	30191fcf03	[inductor][choices] rename get_mm_configs to get_template_configs (#162293 ) # why - eventually we want all templates to go through this - we're exposing this through diode as a sort of interface/API - avoid later renaming # what - rename get_mm_configs to get_template_configs - rename _finalize_mm_configs to _finalize_template_configs # testing - lintrunner - ci Differential Revision: [D81820641](https://our.internmc.facebook.com/intern/diff/D81820641) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162293 Approved by: https://github.com/eellison ghstack dependencies: #161351, #161350	2025-09-10 18:47:44 +00:00
Ruben Rodriguez Buchillon	623e623c82	[inductor] leverage template stacking in V.choices.get_mm_configs (#161350 ) # why - now everything is in place to just gather templates and run the V.choices.get_mm_configs once per op - enables any overrides inside V.choices.get_mm_configs to have a full view of the options for an op, not just for one template # what - replace multiple calls to V.choices.get_mm_configs with calls to gather the active templates, and then using those in a single call # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520571](https://our.internmc.facebook.com/intern/diff/D81520571) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161350 Approved by: https://github.com/eellison, https://github.com/jansel ghstack dependencies: #161351	2025-09-10 18:47:44 +00:00
Ruben Rodriguez Buchillon	f08487aa86	[inductor] FlexibleLayout for ExternKernelChoice for mms (#161351 ) # why - if we only use ExternKernelChoice we're not doing any codegen - if we're not doing any codegen, we can use a FlexibleLayout here, and provide deeper passes more chances to change it # what - if all the kernel template choices (KTC) are with a ExternKernelChoice template, we switch to a FlexibleLayout before generating the choice - add a test to make sure that works as intended (FlexibleLayout for only extern, and FixedLayout if Triton is involved) - caveats: - because CPP, CUTLASS, and CK are not using V.choices.get_mm_configs yet, we turn off the optimization if either of those backends are in use. This will be relaxed once they support this too - because Triton templates are still using their own calls (not a single call) to get_mm_configs, it's also turned off there. The next diff unifies Triton + ATEN to a single call to get_mm_configs and that in turn allows the optimization there too # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520584](https://our.internmc.facebook.com/intern/diff/D81520584) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161351 Approved by: https://github.com/eellison, https://github.com/jansel	2025-09-10 18:47:34 +00:00
Edward Yang	1051c7dbc2	Don't unconditionally import torch._dynamo, it's slow (#162595 ) A trivial test on OS X. Before: ``` real 0m6.550s user 0m2.532s sys 0m3.359s ``` After: ``` real 0m2.607s user 0m1.898s sys 0m3.344s ``` Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162595 Approved by: https://github.com/albanD	2025-09-10 17:21:03 +00:00
suo	2dc2613180	[torch][c10d] fix split_group in mixed backend case (#162424 ) Today we can initialize a mixed-backend process group (e.g. "cpu:gloo,cuda:nccl") but we can only pass one set of process group options. However, when we call `split_group`, we retrieve that set of options from the parent PG and pass it to the ProcessGroup::groupSplit C++ API, which then attempts to propagate that set of options to all backends. This leads to an assert on some user code, where ProcessGroupGloo::split is expecting gloo options but receives nccl options instead. Arguably the APIs as currently designed are just broken; we should not ever expect a single set of backend options to apply across multiple backends. However, fixing this would require changing quite a few public APIs. As a quick fix, since user-provided options really only exist for NCCL, just warn and fall-back to defaulted options for Gloo if non-gloo options are detected. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162424 Approved by: https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/H-Huang	2025-09-10 16:59:18 +00:00
Robert Hardwick	582d278983	Build and Install Arm Compute Library in manylinux docker image (#159737 ) ---- This PR will be part of a series of PRs that aim to remove `.ci/aarch64_linux` folder entirely, such that Aarch64 manylinux build happens as part of `.ci/manywheel/build.sh`, the same as other platforms. In this PR: - We prebuild + install Arm Compute Library in the manylinux docker image ( at /acl ), instead of a build time for every pytorch build. Also updated jammy install path to be /acl too. - We can therefore remove build_ArmComputeLibrary functions from the ci build scripts. - There is also some refactoring of install_openblas.sh and install_acl.sh to align them together ( similar formatting, similar variable names, same place for version number update ) - We had 2 places to define openblas version, this has been reduced to 1 now ( install_openblas.sh ). - ACL_VERSION and OPENBLAS_VERSION are now able to be overridden at build.sh level for developers, but there is only 1 version of each hardcoded for ci. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159737 Approved by: https://github.com/seemethere ghstack dependencies: #160078	2025-09-10 15:39:38 +00:00
Benjamin Glass	b5e6e58050	[nn] Assert parsed iterable arguments are an appropriate length (#162340 ) Fixes #162327 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162340 Approved by: https://github.com/Skylion007	2025-09-10 15:15:49 +00:00
Masaki Kozuki	fefc406a3d	fix typo: summit -> submit (#162587 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162587 Approved by: https://github.com/justinchuby	2025-09-10 14:43:53 +00:00
atalman	3d32bb114b	[CD] Aarch64 Fix packaging ``libarm_compute.so`` and other libraries to the aarch64 CUDA wheels (#162566 ) Fixes aarch64 linux packaging, following error: https://github.com/pytorch/vision/actions/runs/17612462583/job/50037380487#step:15:62 ``` Traceback (most recent call last): File "/__w/vision/vision/pytorch/vision/setup.py", line 13, in <module> import torch File "/__w/_temp/conda_environment_17612462583/lib/python3.11/site-packages/torch/__init__.py", line 415, in <module> from torch._C import * # noqa: F403 ^^^^^^^^^^^^^^^^^^^^^^ ImportError: libarm_compute.so: cannot open shared object file: No such file or directory ``` Due to missing dependencies. Current Error: File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl is extracted File is repackaged as torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl renamed as torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl Hence the repackaging does not take any effect. This PR does following File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl is extracted File torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl deleted File is repackaged as torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl Looks like after migrating from zipping the wheel to wheel pack renaming the wheel is no longer necessary. Hence removing renaming and deleting old file. 
``` 2025-09-10T10:10:05.9652454Z Using nvidia libs from pypi - skipping CUDA library bundling 2025-09-10T10:10:05.9656595Z Copying to /pytorch/dist/tmp/torch/lib/libgomp.so.1 2025-09-10T10:10:05.9873843Z Copying to /pytorch/dist/tmp/torch/lib/libgfortran.so.5 2025-09-10T10:10:06.0410041Z Copying to /pytorch/dist/tmp/torch/lib/libarm_compute.so 2025-09-10T10:10:06.2869242Z Copying to /pytorch/dist/tmp/torch/lib/libarm_compute_graph.so 2025-09-10T10:10:06.4385740Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_lapack_lp64_gomp.so.0 2025-09-10T10:10:06.5461372Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_blas_lp64_gomp.so.0 2025-09-10T10:10:06.5728970Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_lapack_core.so.0 2025-09-10T10:10:06.6231872Z Copying to /pytorch/dist/tmp/torch/lib/libnvpl_blas_core.so.0 2025-09-10T10:10:14.1503110Z Updated tag from Tag: cp310-cp310-linux_aarch64 2025-09-10T10:10:14.1503482Z to Tag: cp310-cp310-manylinux_2_28_aarch64 2025-09-10T10:10:14.1503682Z 2025-09-10T10:10:41.6498892Z Repacking wheel as /pytorch/dist/torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl...OK 2025-09-10T10:10:41.9394460Z Renaming torch-2.10.0.dev20250910+cu130-cp310-cp310-linux_aarch64.whl wheel to torch-2.10.0.dev20250910+cu130-cp310-cp310-manylinux_2_28_aarch64.whl ``` Test Plan, Executed on local file: ``` inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/WHEEL inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/entry_points.txt inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/top_level.txt inflating: ubuntu/dist/tmp/torch-2.9.0.dev20250909+cu130.dist-info/RECORD Bundling CUDA libraries with wheel Updated tag from Tag: cp310-cp310-manylinux_2_28_aarch64 to Tag: cp310-cp310-manylinux_2_28_aarch64 Repacking wheel as ubuntu/dist/torch-2.9.0.dev20250909+cu130-cp310-cp310-manylinux_2_28_aarch64.whl...OK Copying torch-2.9.0.dev20250909+cu130-cp310-cp310-manylinux_2_28_aarch64.whl to artifacts Build 
Complete. Created torch-2.9.0.dev20250909+cu130-cp310-cp310-manylinux_2_28_aarch64.whl.. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162566 Approved by: https://github.com/jeanschmidt, https://github.com/NicolasHug	2025-09-10 14:22:41 +00:00
Tugsbayasgalan (Tugsuu) Manlaibaatar	de05dbc39c	Replace export_for_training with export (#162396 ) Summary: replace export_for_training with export Test Plan: CI Rollback Plan: Differential Revision: D81935792 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162396 Approved by: https://github.com/angelayi, https://github.com/jerryzh168	2025-09-10 14:19:34 +00:00
PyTorch MergeBot	fc1b09a52a	Revert "Fix DCE eliminating in-place operations by improving Node.is_impure() (#162267 )" This reverts commit b9a7d0e13b4a34be83c778734dbad437c7c5117b. Reverted https://github.com/pytorch/pytorch/pull/162267 on behalf of https://github.com/malfet due to Not sure how it happened, but looks like it broke everything, see `c2388201fc/1` ([comment](https://github.com/pytorch/pytorch/pull/162267#issuecomment-3275164109))	2025-09-10 14:12:22 +00:00
Alexander Grund	c2388201fc	Fix decorators skipping NCCL tests (#158846 ) Avoid failures caused by tests exiting via sys.exit instead of `unittest.skip` In particular it will not try to start the test (causing forks into subprocess) just to stop them (killing the subprocess) which is done in the test setup Using `unittest.skip` decorators avoids the starting of the test in the first place. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158846 Approved by: https://github.com/Skylion007	2025-09-10 12:25:42 +00:00
Tan Hoang	a6f9e0e62a	[c10d][nvshmem] fix override function modifier (#162515 ) Summary: Fix compilation error in fbsource by missing override modifier Differential Revision: D82038876 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162515 Approved by: https://github.com/Skylion007, https://github.com/kwen2501	2025-09-10 11:35:49 +00:00
Yiming Zhou	337fe1079d	[nativert] AOTI delegate with flat inputs and outputs (#162538 ) Summary: `executorch_call_delegate` should have flattened inputs and outputs. So that it can be correctly serialized and the input/output specs are consistent with runtime. Test Plan: CI Rollback Plan: Differential Revision: D82064354 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162538 Approved by: https://github.com/dolpm	2025-09-10 11:35:44 +00:00
Klaus Zimmermann	b494547f0b	Make functorch notebook symlinks PEP 517 valid (#157813 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157813 Approved by: https://github.com/zou3519, https://github.com/atalman	2025-09-10 10:13:24 +00:00
dolpm	d9832d8425	[triton][export] serialization in internal path + unit tests (#162200 ) Summary: will package triton artifacts to be runnable in nativert if wrappers exist. Test Plan: unit tests Rollback Plan: Differential Revision: D81368559 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162200 Approved by: https://github.com/angelayi	2025-09-10 09:49:10 +00:00
Menglu Yu	f0ae3a57f6	[Optimus] Add batch dropout pattern (#162443 ) Summary: We observe a dropout pattern in AFOC, so we add a new pattern to Optimus Test Plan: ``` buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/inductor:group_batch_fusion -- test_batch_dropout_pre_grad_fusion ``` Buck UI: https://www.internalfb.com/buck2/2c899fb5-6e8b-43eb-8fb3-b53abfbfa6d9 Test UI: https://www.internalfb.com/intern/testinfra/testrun/15762598805248688 Network: Up: 0B Down: 0B (reSessionID-bfbb9e6a-7e2a-425a-a027-b44282cef419) Executing actions. Remaining 0/3 1.3s exec time total Command: test. Finished 2 local Time elapsed: 1:22.3s Tests finished: Pass 2. Fail 0. Fatal 0. Skip 0. Build failure 0 ### E2E baseline f791163796 proposal f793225207 Rollback Plan: Differential Revision: D81981264 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162443 Approved by: https://github.com/Yuzhen11, https://github.com/mlazos	2025-09-10 09:49:01 +00:00
Robert Hardwick	26b3ae5890	Move prioritized text linker optimization code from setup.py to cmake (#160078 ) Note. This is a replica PR of #155901 which will be closed. I had to create a new PR in order to add it into my ghstack as there are some later commits which depend on it. ### Summary 🚀 This PR moves the prioritized text linker optimization from setup.py to cmake ( and enables by default on Linux aarch64 systems ) This change consolidates what was previously manual CI logic into a single location (cmake), ensuring consistent behavior across local builds, CI pipelines, and developer environments. ### Motivation Prioritized text layout has measurable performance benefits on Arm systems by reducing code padding and improving cache utilization. This optimization was previously triggered manually via CI scripts (.ci/aarch64_linux/aarch64_ci_build.sh) or user-set environment variables. By detecting the target architecture within setup.py, this change enables the optimization automatically where applicable, improving maintainability and usability. Note: Due to ninja/cmake graph generation issues we cannot apply the linker file globally to all targets, so the targets must be manually defined. See CMakeLists.txt the main libraries torch_python, torch, torch_cpu, torch_cuda, torch_xpu have been targeted which should be enough to maintain the performance benefits outlined above. Co-authored-by: Usamah Zaheer <usamah.zaheer@arm.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160078 Approved by: https://github.com/seemethere	2025-09-10 09:21:53 +00:00
fduwjj	be8095b07f	[DeviceMesh] Clarifying flatten use case (#161311 ) Since we are in the middle of big refactoring and simplifying the bookkeeping for device mesh. We found an interesting bug inside DeviceMesh flatten implementation. Here is the finding: 1. In unit test, we assume users can call `dp_cp_mesh._flatten()` many times but no backend will be created (aka cached). 2. From the implementation of slicing, we actually throw exception erroring out doing the `_flatten` more than once. But there is a bug which was partially fixed in https://github.com/pytorch/pytorch/pull/160709 but it does not fix the check for the case when we call the `_flatten` twice. What's more important question to ask is, what behavior we want for `_flatten`? Do we allow calling `_flatten` multiple times (with same mesh_name)? I think we should, why? 1. We allow slicing for the same mesh_name or name_list multiple times, and we cache the PG behind it. Although we will return a new device mesh object every time, when we compare them they are all the same (according to __eq__). 2. We actually cached the flattened mesh today inside `root_to_flatten_mapping` and actually do the early return but that line will never be reached if we error out before that. Also we should allow a no-op for flatten a 1D mesh into itself's mesh_dim_name, I added a unit test for it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161311 Approved by: https://github.com/fegin	2025-09-10 07:46:51 +00:00
FFFrog	b2d8f6a6af	[OpenReg] Update the docs about Accelerator Integration (#162046 ) Fix the issue described by this [comment](https://github.com/pytorch/pytorch/pull/161845#discussion_r2317299390) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162046 Approved by: https://github.com/albanD	2025-09-10 07:45:07 +00:00
Huy Do	98e22c8a69	Skip test_ind_worker_queue on Windows and macOS (flaky) (#162555 ) Fixes https://github.com/pytorch/pytorch/issues/68643 It was closed by the bot yesterday and the issue was still there https://github.com/pytorch/pytorch/actions/runs/17595694816/job/49989589647. It's better to just skip it directly in the code as this test has been disabled on Windows and MacOS since 2021 O_o Pull Request resolved: https://github.com/pytorch/pytorch/pull/162555 Approved by: https://github.com/clee2000	2025-09-10 07:05:14 +00:00
PyTorch MergeBot	e1f0a69943	Revert "test fixing benchmarks (#162503 )" This reverts commit 484c4093a87a3e6767e55ed553f95db8fc137442. Reverted https://github.com/pytorch/pytorch/pull/162503 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it regresses CPU perf smoke test ([comment](https://github.com/pytorch/pytorch/pull/162503#issuecomment-3273554680))	2025-09-10 06:55:35 +00:00
Xingyuan Li	833997a6fd	[Inductor][UT] Fix flex attention related inductor cases (#162450 ) ## Motivation Fixes #162435, Fixes #162436 UT failures: * https://github.com/pytorch/pytorch/actions/runs/17523991468/job/49772651636 * https://github.com/pytorch/pytorch/actions/runs/17523991468/job/49772651637 To fix flex attention related cases. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162450 Approved by: https://github.com/drisspg	2025-09-10 06:48:00 +00:00
Benjamin Girard	b9a7d0e13b	Fix DCE eliminating in-place operations by improving Node.is_impure() (#162267 ) Change is_impure to check in-place operations on Node to prevent eliminate_dead_code from eliminating in-place operations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162267 Approved by: https://github.com/ezyang	2025-09-10 06:02:15 +00:00
dolpm	1c16c18a53	[nativert][triton] improve hardware registration (#162499 ) Summary: att Test Plan: ci Rollback Plan: Differential Revision: D82031814 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162499 Approved by: https://github.com/angelayi	2025-09-10 04:52:57 +00:00
PyTorch MergeBot	96ef26f71a	Revert "[ROCm] Integrate AITER Fav3 fwd kernels (#160105 )" This reverts commit d2393c2d7da03a1523a12e6f80edb6bd7b464ec5. Reverted https://github.com/pytorch/pytorch/pull/160105 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing internal ROCm build ([comment](https://github.com/pytorch/pytorch/pull/160105#issuecomment-3273297183))	2025-09-10 04:42:28 +00:00
Rob Timpe	5ac112b569	[dynamo] Graph break on on user-defined class in compiled region (#161670 ) Currently, user-defined classes inside of a compiled frame will cause the whole frame to be skipped by dynamo. This change defers the Unsupported exception until the __build_class__ builtin is actually called, which allows a graph break to be inserted. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161670 Approved by: https://github.com/williamwen42, https://github.com/guilhermeleobas	2025-09-10 04:39:20 +00:00
Edward Yang	dda071587f	Revert "Make distributed modules importable even when backend not built (#159889 )" (#162568 ) This reverts commit a0d026688cd69583d5a4e0c6f3e5fda141a7f4a9. Revert "Always build USE_DISTRIBUTED. (#160449)" This reverts commit d80297a6846f1f2c36fd4f19e22919f2abe8fcea. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162568 Approved by: https://github.com/huydhn	2025-09-10 04:29:42 +00:00
PyTorch UpdateBot	11acfed3ce	[audio hash update] update the pinned audio hash (#162552 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162552 Approved by: https://github.com/pytorchbot	2025-09-10 04:24:39 +00:00
Nikita Shulga	5f40a8a9a3	[BE] Fix `'_WIN32' is not defined` warning (#162516 ) Summary: As indeed it is defined neither on Linux nor on MacOS platforms Test Plan: CI Rollback Plan: Differential Revision: D82044853 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162516 Approved by: https://github.com/Skylion007	2025-09-10 04:21:38 +00:00
Huy Do	e64965300a	Repackage vLLM nightlies (#162371 ) I suspected that I would need to repack vLLM wheels from https://github.com/pytorch/pytorch/pull/162000 because I renamed the wheel, and it turns out to be true. The error is as follows: ``` $ uv pip install --pre xformers --index-url https://download.pytorch.org/whl/nightly/cu129 Using Python 3.12.11+meta environment at: venv/py3.12 Resolved 28 packages in 759ms error: Failed to install: xformers-0.0.33.dev20250901+cu129-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (xformers==0.0.33.dev20250901+cu129) Caused by: Wheel version does not match filename: 0.0.33+5d4b92a5.d20250907 != 0.0.33.dev20250901+cu129 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162371 Approved by: https://github.com/atalman	2025-09-10 04:02:34 +00:00
Huy Do	00985970e3	Put torchao (0.13.0) back to benchmark workflow (#162227 ) 0.13.0 was released on Sep 3rd https://pypi.org/project/torchao/#history, which should have fixed the crashing issue on transformers now Pull Request resolved: https://github.com/pytorch/pytorch/pull/162227 Approved by: https://github.com/malfet	2025-09-10 03:56:25 +00:00
angelayi	484c4093a8	test fixing benchmarks (#162503 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162503 Approved by: https://github.com/huydhn ghstack dependencies: #160741	2025-09-10 03:15:49 +00:00
Boyuan Feng	760c478a14	[FlexAttn][Minor] Update FlexConfig doc (#162533 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162533 Approved by: https://github.com/drisspg	2025-09-10 02:03:48 +00:00
Yu Guo	dc4f97e9c1	[triton] enable int64 indexing in convolution and mm template (#162506 ) Summary: hitting illegal memory access issue when compiling conv and addmm kernels with the change in https://github.com/pytorch/pytorch/pull/157767 Differential Revision: D81995664 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162506 Approved by: https://github.com/iseeyuan	2025-09-10 01:53:26 +00:00
Justin Chu	c66e58b7d0	[ONNX] Expose the testing module (#162495 ) * Created a new module `torch/onnx/testing.py` that exposes the `assert_onnx_program` function for testing exported ONNX models. * Updated the ONNX documentation (`docs/source/onnx.md`) to include `onnx_testing` in the list of relevant modules. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162495 Approved by: https://github.com/titaiwangms, https://github.com/xadupre	2025-09-10 01:40:24 +00:00
Tristan Rice	878f59ef75	DeviceMesh: support _rank for use with non-global PGs (#162439 ) Summary: This adds a `_rank` field to DeviceMesh init that allows for instantiating a DeviceMesh without depending on `dist.get_rank()` which requires a global PG to be instantiated. Test Plan: ``` buck2 test mode/opt -c fbcode.enable_gpu_sections=true //caffe2/test/distributed:device_mesh -- init_backend ``` Rollback Plan: Differential Revision: D81981777 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162439 Approved by: https://github.com/kwen2501, https://github.com/fduwjj	2025-09-10 01:18:28 +00:00
Tianyu Liu	e60ad4f628	[DTensor] fix copy_ strategy to support linearity (#162460 ) Fixing issue introduced in https://github.com/pytorch/pytorch/pull/158538 where `aten.copy_.default` is registered as a pointwise op, but without linearity. In particular, when both `src` and `dst` tensors have same `Partial` placements, direct copy should happen without redistribute, instead of redistributing both to `Replicate` before making the copy. This was discovered from silent incorrect results e.g. on `torch.einsum` backward. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162460 Approved by: https://github.com/zpcore	2025-09-10 00:47:14 +00:00
PyTorch MergeBot	2281d009e5	Revert "[ROCm] Add specific compile options for CK SDPA (#161759 )" This reverts commit d22d916719eb7daff8455a01d216d65f81899a9e. Reverted https://github.com/pytorch/pytorch/pull/161759 on behalf of https://github.com/huydhn due to Sorry for reverting your change but this seems to break internal ROCm jobs ([comment](https://github.com/pytorch/pytorch/pull/161759#issuecomment-3272807726))	2025-09-10 00:44:30 +00:00
Saurabh Mishra	33589374b6	[DCP] Avoid multiple storage writer resets in async save (#159448 ) Summary: Avoid multiple storage writer resets in async save. Currently the reset gets called by the async_save method and then again in the save method. In the async path, async_save should only do the staging and the reset should only happen in the synchronous save path. Test Plan: ``` buck test 'fbcode//mode/opt' //aiplatform/modelstore/experimental/DCP/tests:checkpoint_dist_client_test ``` https://www.internalfb.com/intern/testinfra/testrun/15199648841705052 Rollback Plan: Differential Revision: D79230339 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159448 Approved by: https://github.com/meetv18	2025-09-10 00:43:03 +00:00
Animesh Jain	5539916fe1	[dynamo][refactor] Move get_framelocals_idx to a helper (#162519 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162519 Approved by: https://github.com/williamwen42	2025-09-10 00:35:09 +00:00
Laith Sakka	e4174b1fd7	remove gso from collapse_view_helper (#162212 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162212 Approved by: https://github.com/aorenste Co-authored-by: Aaron Orenstein <aorenste@fb.com>	2025-09-10 00:17:15 +00:00
Scott Wolchok	0e7ccc09db	[easy] Don't force copy result of getAllOperatorsFor in init.cpp (#162218 ) It returns a const reference to a vector. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162218 Approved by: https://github.com/Skylion007 ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219, #162220	2025-09-10 00:08:15 +00:00
Thomas Bohnstingl	87cc126457	[associative_scan] partial gradient support (#162388 ) This PR tests the partial gradient support of the `associative_scan` operation. It replaces https://github.com/bohnstingl/pytorch/pull/6 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162388 Approved by: https://github.com/ydwu4	2025-09-09 23:52:29 +00:00
PyTorch MergeBot	a3e26d1727	Revert "[dynamo] Graph break on on user-defined class in compiled region (#161670 )" This reverts commit e2545487de3dbbe663e3f0adb699547a14da0f6a. Reverted https://github.com/pytorch/pytorch/pull/161670 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing a trunk test ([comment](https://github.com/pytorch/pytorch/pull/161670#issuecomment-3272626391))	2025-09-09 23:40:26 +00:00
Andy Lugo	d2393c2d7d	[ROCm] Integrate AITER Fav3 fwd kernels (#160105 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/160105 Approved by: https://github.com/jeffdaily	2025-09-09 22:30:12 +00:00
SandishKumarHN	b498299953	154849 Add support to handle SIGUSR1 and SIGUSR2 in multiprocessing (#160690 ) Fixes #154849 This change addresses the request to add support for SIGUSR1 and SIGUSR2 signals in torchrun for SLURM environments. This change supports these signals through the configurable `TORCHELASTIC_SIGNALS_TO_HANDLE` environment variable and signals_to_handle parameter from the launcher API Tests: For validation purposes: test_signal_handling.py, simple_test_api_signal_handling.py, Unit Tests: for launcher changes:launcher/test_api.py for api changes: multiprocessing/test_api.py E2E: test_run.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/160690 Approved by: https://github.com/fduwjj	2025-09-09 22:23:06 +00:00
Howard Huang	4d66a3b894	fix Dtensor doc link (#162494 ) Small fix for https://docs.pytorch.org/docs/main/distributed.tensor.parallel.html <img width="890" height="274" alt="image" src="https://github.com/user-attachments/assets/6ee7fc7c-e0fe-4f5e-ab7e-a895bb3fa79f" /> now it is: <img width="909" height="320" alt="image" src="https://github.com/user-attachments/assets/8b2c41ef-1684-4597-8dae-144b49723796" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162494 Approved by: https://github.com/XilunWu	2025-09-09 22:10:37 +00:00
Rob Timpe	e2545487de	[dynamo] Graph break on on user-defined class in compiled region (#161670 ) Currently, user-defined classes inside of a compiled frame will cause the whole frame to be skipped by dynamo. This change defers the Unsupported exception until the __build_class__ builtin is actually called, which allows a graph break to be inserted. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161670 Approved by: https://github.com/williamwen42, https://github.com/guilhermeleobas	2025-09-09 21:07:49 +00:00
Ke Wen	8922bbcaab	Use same NVSHMEM version across CUDA builds (#162206 ) #161321 bumped NVSHMEM version to 3.3.24 for CUDA 13, leaving CUDA 12 with 3.3.20. This PR bumps the NVSHMEM version to 3.3.24 for CUDA 12 as well. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162206 Approved by: https://github.com/tinglvv, https://github.com/Skylion007	2025-09-09 20:59:50 +00:00
atalman	14744e1ab2	[Release 2.9] Add compatibility matrix, Version Bump (#162526 ) Release 2.9 1. Add release compatibility matrix 2. Add version bump for 2.10 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162526 Approved by: https://github.com/malfet	2025-09-09 20:38:15 +00:00
Jeff Daily	b477fb106f	[ROCm] enable grouped gemm fallback (#162419 ) Enables bf16 group gemm alternative path as described in #161366 Fast path will be enabled in future through CK integration. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162419 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-09 20:04:56 +00:00
Andy Lugo	d22d916719	[ROCm] Add specific compile options for CK SDPA (#161759 ) Updates CK version and adds CK specific compilation options Pull Request resolved: https://github.com/pytorch/pytorch/pull/161759 Approved by: https://github.com/jeffdaily	2025-09-09 20:04:19 +00:00
morrison-turnansky	86d34a43f5	NamedTuple: Allow side effects for dynamic attributes (#161645 ) I confirmed that the tracing was correct i.e. NamedTupleVariable had the correct dynamic attribute added to it. The problem was that NamedTupleVariable was always marked as immutable. This does not reflect the behavior of namedtuple. Subclasses of namedtuple may be mutable, so when a NamedTupleVariable is derived from a subclass that is mutable, I made NamedTupleVariable mutable as well. Then side_effects correctly updates the returned object. Fixes #161610 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161645 Approved by: https://github.com/anijain2305, https://github.com/StrongerXi	2025-09-09 19:42:02 +00:00
Huy Do	8508651477	Fix flaky AOTFxirTestCase (#162472 ) Fixes https://github.com/pytorch/pytorch/issues/162357 Fixes https://github.com/pytorch/pytorch/issues/160970 Fixes https://github.com/pytorch/pytorch/issues/161038 Fixes https://github.com/pytorch/pytorch/issues/160951 Fixes https://github.com/pytorch/pytorch/issues/161698 These tests were introduced in https://github.com/pytorch/pytorch/pull/160765 and they are all flaky when `torch._inductor.aot_compile` uses multiple threads (the default option). The issue could be reproduced by running them locally multiple times. For example, ``` pytest --flake-runs 10 --flake-finder -v inductor/test_fxir_backend.py -k test_aoti_fx_add (output logs at P1938386961) ... --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- 
Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call 
--------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 1)] graph_break [] --------------------------------------------------------------------------------------------------------------------------------------------------- Captured stdout call --------------------------------------------------------------------------------------------------------------------------------------------------- inductor [('async_compile_cache_miss', 2), ('async_compile_cache_hit', 1)] graph_break [] ================================================================================================================================================= short test summary info ================================================================================================================================================== FAILED [0.4834s] inductor/test_fxir_backend.py::AOTFxirTestCase::test_aoti_fx_add - AttributeError: 'NoneType' object has no attribute '__code__' FAILED [0.4576s] inductor/test_fxir_backend.py::AOTFxirTestCase::test_aoti_fx_add - AttributeError: 'NoneType' object has no attribute '__code__' FAILED [0.4613s] inductor/test_fxir_backend.py::AOTFxirTestCase::test_aoti_fx_add - AttributeError: 'NoneType' object has no attribute '__code__' =============================================================================================================================================== 3 failed, 7 passed in 12.89s =============================================================================================================================================== ``` Setting `compile_threads` to 1 will get rid of the test flakiness, but there might be underlying issues from https://github.com/pytorch/pytorch/pull/160765. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162472 Approved by: https://github.com/angelayi, https://github.com/Skylion007	2025-09-09 19:39:24 +00:00
rzou	723c27ed78	[standalone_compile] binary format write should be atomic (#162432 ) We update it to call write_atomic instead of file.write Pull Request resolved: https://github.com/pytorch/pytorch/pull/162432 Approved by: https://github.com/oulgen	2025-09-09 18:43:13 +00:00
Benjamin Glass	bdbe931d58	[build] Add LeakSanitizer option to CMake (#158686 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158686 Approved by: https://github.com/eellison	2025-09-09 18:41:20 +00:00
jainapurva	af60398c3a	Update the operator benchmarking, to benchmark using torch.compile (#161394 ) This pull request enhances the PyTorch operator benchmarking suite by introducing support for benchmarking with `torch.compile` mode, in addition to existing Eager and JIT. It also adds peak memory measurement (fwd/bwd pass); improves the output format in JSON to be used by dashboard for reporting; and introduces some more CLI options. The new CLI flags introduced are: - Added `--use-compile` CLI argument and corresponding logic to run benchmarks using `torch.compile`, including mutual exclusivity with `--use-jit` - Added `--benchmark-name` argument for customizing the benchmark name in output - Updated default value for `--output-json-for-dashboard` to `benchmark-results.json` for more predictable output file name Sample command to run a single operator: `python -m pt.mm_test --use-compile` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161394 Approved by: https://github.com/jbschlosser	2025-09-09 18:17:37 +00:00
PyTorch MergeBot	82f1eb9b03	Revert "[MPS] mps sparse mul op implementation (#162349 )" This reverts commit 3ea686804925f1291de57ffdb3394da0b46deb54. Reverted https://github.com/pytorch/pytorch/pull/162349 on behalf of https://github.com/malfet due to Fails trunk tests, with uint8 sum ([comment](https://github.com/pytorch/pytorch/pull/162349#issuecomment-3271783442))	2025-09-09 18:14:16 +00:00
Brian Hirsh	4b2d297eec	python fastpath for DTensor detach(), confirm that aliasing DTensorSpec is ok (#160580 ) My goal right now is to try to make the "vanilla" AccumulateGrad path for DTensor (that just calls detach) fast. I'm doing this in two steps: (1) [this PR]: hardcode aten.detach in DTensor to re-use the input tensor's DTensorSpec, instead of running "real" sharding prop. (2) [assuming success of 1]: move the detach() call into C++, try adding a DTensor dispatch key, and avoid dispatching back to python entirely (except for some code that probably needs to allocate a pyobject for the output DTensor, from C++) I'm pushing this PR first to confirm that I don't break anything with my detach fastpath. I did some manual local testing to confirm that for normal usages of detach, the input and output DTensor have equal DTensorSpec objects. Technically, we previously would allocate a fresh DTensorSpec, and with this change we are just re-using the input tensor's DTensorSpec. So I'm mostly hoping that DTensorSpecs don't generally get mutated This by itself does seem to speed up `alias` by quite a bit (roughly 2.5x speedup, from ~336us -> 133us): aten.detach(plain_tensor) ``` <torch.utils.benchmark.utils.common.Measurement object at 0x7f8da2921790> _ = x.detach() 4.80 us 1 measurement, 100000 runs , 1 thread ``` aten.detach(DTensor) [before this PR] ``` <torch.utils.benchmark.utils.common.Measurement object at 0x7f47cd68e750> _ = x_dt.detach() 336.40 us 1 measurement, 1000 runs , 1 thread ``` aten.detach(DTensor) [after this PR] ``` <torch.utils.benchmark.utils.common.Measurement object at 0x7f0a34c05520> _ = x_dt.detach() Median: 133.45 us 2 measurements, 1000 runs per measurement, 1 thread ``` benchmark script: ``` import torch import torch.distributed as dist from torch.distributed.tensor import DeviceMesh, DTensor, Partial, Replicate, Shard from torch.testing._internal.distributed.fake_pg import FakeStore import torch.utils.benchmark as benchmark fake_store = 
FakeStore() dist.init_process_group("fake", store=fake_store, rank=0, world_size=2) mesh = torch.distributed.device_mesh.init_device_mesh('cuda', (2,)) x = torch.randn(4, 4, requires_grad=True) x_dt = DTensor.from_local(x, mesh, [Shard(0)], run_check=False) t0 = benchmark.Timer( stmt='_ = x_dt.detach()', globals={'x_dt': x_dt}, ) print(t0.blocked_autorange()) dist.destroy_process_group() ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160580 Approved by: https://github.com/ezyang	2025-09-09 18:04:56 +00:00
Jane Xu	0ec723acd0	Update docs for quantile to be clearer for nearest (#162423 ) Correct the rounding scheme for nearest in quantile. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162423 Approved by: https://github.com/soulitzer	2025-09-09 18:04:12 +00:00
Howard Huang	e1be887870	[PP] Add spacing to visualizer (#160474 ) When visualizing the schedules using `_PipelineScheduleExecution`, we don't provide any spacing between dependencies, so when visualizing `DualPipeV` it looks like this: <img width="3168" height="486" alt="image" src="https://github.com/user-attachments/assets/d2c881ad-4ee0-46b6-ac03-13e5600b5a55" /> While it has the correct order of operations, it does not show the dependencies correctly. As shown in the original implementation, it should look something like this: <img width="3542" height="384" alt="image" src="https://github.com/user-attachments/assets/c930fa98-848e-4951-a58b-c81f41092d14" /> This allows an option to add spacing to the visualizer, so it is easier to see dependencies. After change: <img width="3633" height="486" alt="image" src="https://github.com/user-attachments/assets/7708367e-bdb4-46e8-a7c4-f19e18047f59" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160474 Approved by: https://github.com/fegin	2025-09-09 17:52:52 +00:00
Ruben Rodriguez Buchillon	d91eecc9a5	[inductor][template heuristics] don't take layout to generate choices (#162238 ) # why - unnecessary as we only ever need to know the dtype and maybe the device - we already take in the kernel inputs which have the device - enable us to specify the layout after finding all the configs but before generating the ChoiceCallers # what - replace all calls in template_heuristics that used to take Layout with now just taking out_dtype # testing ci Differential Revision: [D81820115](https://our.internmc.facebook.com/intern/diff/D81820115) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162238 Approved by: https://github.com/eellison ghstack dependencies: #161347, #161348, #161349	2025-09-09 17:17:04 +00:00
Ruben Rodriguez Buchillon	24a4dae85b	[inductor] V.choices.get_mm_configs override point (#161349 ) # why - enable us to override the default configs, or fall back to them through subclassing InductorChoices # what - override (private) function - default implementation takes the kernel template choice (ktc) generator for every template and just executes the generator - future overrides can decide to replace those generators, or filter out choices - the 2nd expensive step (maybe_append_choices, choice_or_none) is handled outside this function, in the main V.choices.get_mm_configs this means that any overriding benefits from not generating expensive templates that aren't going to be used # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520570](https://our.internmc.facebook.com/intern/diff/D81520570) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161349 Approved by: https://github.com/eellison ghstack dependencies: #161347, #161348	2025-09-09 17:17:04 +00:00
Ruben Rodriguez Buchillon	d3c4cf838e	[inductor][ez] V.choices.get_mm_configs returns list of ChoiceCallers (#161348 ) \# why - every callsite just executes the generator on the spot - previous pr adds the ability to add an override before expensive generators are executed, so we don't need this generator anymore \# what - rather than yielding the ChoiceCaller, just return the list of all valid ChoiceCallers \# testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520574](https://our.internmc.facebook.com/intern/diff/D81520574) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161348 Approved by: https://github.com/eellison ghstack dependencies: #161347	2025-09-09 17:16:57 +00:00
Ruben Rodriguez Buchillon	b1e99c8c7a	[inductor] add kernel template choice (ktc) (#161347 ) # why - gather everything up to make choices, without running potentially expensive generators - enables overrides where we toss the entire list of configs from inductor, without having to enumerate it (expensive) # what - add a holding class that just gets all the components necessary to generate a ChoiceCaller - use that class to generate ChoiceCallers - this does not (yet) add the override function, but just prepares the scene ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520569](https://our.internmc.facebook.com/intern/diff/D81520569) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161347 Approved by: https://github.com/eellison	2025-09-09 17:16:50 +00:00
Eddie Yan	5eb35d2ab8	[CUDA][float8][TF32] Disable tf32 for vs. emulated rowwise comparison (#162387 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162387 Approved by: https://github.com/Skylion007	2025-09-09 17:04:06 +00:00
Jeff Daily	f03d635dc6	[ROCm][CI] skip test_max_autotune until resolved (#162496 ) many tests taking >30 min and causing timeouts Pull Request resolved: https://github.com/pytorch/pytorch/pull/162496 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-09 16:34:01 +00:00
Hashem Hashemi	1f0b01d4b6	[ROCm] OffsetCalc Unroll Optimization (#161700 ) Our compiler is generating inefficient code for the offsetCalc in certain situations. The root-cause for this needs to be identified. For now specialized unrolling based on 'dims' notably helps perf. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161700 Approved by: https://github.com/jeffdaily	2025-09-09 16:11:48 +00:00
Prachi Gupta	c0142f5c06	[ROCm] Enabling several UTs (#161715 ) All these UTs are working as is, just removing the skip - test_p2p_ipc - test_repros.py: working, added fp8 support - test_activation_checkpointing.py - test_content_store.py - test_cuda_multigpu.py - test_compute_comm_reordering.py - test_segment_reductions.py - test_dataloader.py - test_math_ops.py - test_loop_ordering.py - test_control_flow.py - distributed_test.py - test_mem_tracker.py - test_fsdp_optim_state.py - test_fully_shard_mixed_precision.py: skipped for < ROCm7.0 - test_aot_inductor_custom_ops.py - test_c10d_ops_nccl.py - test_eager_transforms.py - test_sparse_csr.py - test_inductor_collectives.py - test_fake_tensor.py - test_cupy_as_tensor.py - test_cuda.py: enable UTs that are working - test_matmul_cuda.py: enable UTs that are working Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/161715 Approved by: https://github.com/msaroufim Co-authored-by: Mark Saroufim <marksaroufim@fb.com>	2025-09-09 15:49:21 +00:00
Isalia20	3ea6868049	[MPS] mps sparse mul op implementation (#162349 ) Implements mps sparse mul operation as well as enables other operations such as: 1. copy_ 2. div 3. sum 4. floor 5. power 6. sub 7. floor_divide Pull Request resolved: https://github.com/pytorch/pytorch/pull/162349 Approved by: https://github.com/pearu, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-09-09 15:45:37 +00:00
Jack Taylor	be3b8d2ec9	[ROCm][CI] update fbgemm nightly benchmark hash (#162385 ) fbgemm_gpu was failing to clone due to missing submodule commit. ``` + pushd fbgemm/fbgemm_gpu ~/pytorch/fbgemm/fbgemm_gpu ~/pytorch + git checkout 7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8 --recurse-submodules fatal: failed to unpack tree object b1281b8b08d973a7064f864f47eeb30f3e2596e9 error: Submodule 'external/composable_kernel' could not be updated. error: Cannot update submodule: external/composable_kernel ``` Log File [inductor-periodic · pytorch/pytorch@5babb4d](https://github.com/pytorch/pytorch/actions/runs/17536630806/job/49802458834) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162385 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-09 15:44:39 +00:00
PyTorch MergeBot	5ccf3ca3ec	Revert "Use same NVSHMEM version across CUDA builds (#162206 )" This reverts commit 0d9c95cd7ee299e2e8c09df26d395be8775b506b. Reverted https://github.com/pytorch/pytorch/pull/162206 on behalf of https://github.com/malfet due to Broke lint, see `4dd73e659a/1` ([comment](https://github.com/pytorch/pytorch/pull/162206#issuecomment-3271040521))	2025-09-09 14:40:45 +00:00
atalman	e38e953432	CUDA 13.0 Windows Nvidia Driver Update to 580.88 (#162425 ) Related to https://github.com/pytorch/pytorch/issues/162333 https://github.com/pytorch/pytorch/issues/159779 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162425 Approved by: https://github.com/tinglvv, https://github.com/malfet	2025-09-09 14:40:34 +00:00
PyTorch MergeBot	4dd73e659a	Revert "fix torch.sparse.log_softmax on CPU (#161959 )" This reverts commit 002e59440afe8711019e68df500f5e18b9a43f3c. Reverted https://github.com/pytorch/pytorch/pull/161959 on behalf of https://github.com/davidberard98 due to test failure: test_sparse.py::TestSparseMPS::test_log_softmax_float_mps_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/17573794461/job/49915138287) [HUD commit link](`002e59440a`) ([comment](https://github.com/pytorch/pytorch/pull/161959#issuecomment-3270509418))	2025-09-09 12:33:25 +00:00
Ke Wen	0d9c95cd7e	Use same NVSHMEM version across CUDA builds (#162206 ) #161321 bumped NVSHMEM version to 3.3.24 for CUDA 13, leaving CUDA 12 with 3.3.20. This PR bumps the NVSHMEM version to 3.3.24 for CUDA 12 as well. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162206 Approved by: https://github.com/tinglvv, https://github.com/Skylion007	2025-09-09 08:52:27 +00:00
Scott Wolchok	dcc42e95f4	Fix missing moves in initJITBindings (#162428 ) Per @Skylion007 on #162219 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162428 Approved by: https://github.com/Skylion007	2025-09-09 08:47:33 +00:00
Sun, Jiayi	002e59440a	fix torch.sparse.log_softmax on CPU (#161959 ) Fix https://github.com/pytorch/pytorch/issues/152293. Example: ``` import torch from torch.sparse import log_softmax as sparse_log_softmax def test_bug(): a = torch.rand(4, 3) b = a - 10000000.0 b_sparse = b.to_sparse() cpu_out_sparse = sparse_log_softmax(b_sparse, dim=1).to_dense() print('cpu_out_sparse =', cpu_out_sparse) b_sparse_double = b.double().to_sparse() cpu_out_sparse_double = sparse_log_softmax(b_sparse_double, dim=1).to_dense() print('cpu_out_sparse_double =', cpu_out_sparse_double) if __name__ == '__main__': test_bug() ``` Output: - before ``` cpu_out_sparse = tensor([[-2., -1., -2.], [-1., -1., -1.], [-1., -2., -2.], [-1., -1., -2.]]) cpu_out_sparse_double = tensor([[-1.5514, -0.5514, -1.5514], [-1.0986, -1.0986, -1.0986], [-0.5514, -1.5514, -1.5514], [-0.8620, -0.8620, -1.8620]], dtype=torch.float64) ``` - after ``` cpu_out_sparse = tensor([[-0.8620, -1.8620, -0.8620], [-1.0986, -1.0986, -1.0986], [-1.8620, -0.8620, -0.8620], [-1.0986, -1.0986, -1.0986]]) cpu_out_sparse_double = tensor([[-0.8620, -1.8620, -0.8620], [-1.0986, -1.0986, -1.0986], [-1.8620, -0.8620, -0.8620], [-1.0986, -1.0986, -1.0986]], dtype=torch.float64) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161959 Approved by: https://github.com/Skylion007	2025-09-09 06:25:16 +00:00
Huy Do	4840a1a591	Run vLLM tests on all trunk commits before 2.9 branch cut (#161797 ) This makes it easier to bisect issue now given that we don't have lots of time. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161797 Approved by: https://github.com/yangw-dev	2025-09-09 05:56:41 +00:00
Yang Wang	d49205fe1f	Add more tests for vllm and clean out the old vllm test (#162292 ) Test failure coverage from pytorch 2.8 release issues [internal access only](https://docs.google.com/document/d/1zvK1eUAHubHGGHg9jKxd-QlP89fzgfqOBvE2m9mUs90/edit?tab=t.0 ) See coverage mapping \| Given test / pattern \| Suite ID (from config) \| \|---\|---\| \| pytest -v -s basic_correctness/test_cumem.py \| vllm_basic_correctness_test \| \| pytest -v -s entrypoints/openai/test_sleep.py \| vllm_entrypoints_test \| \| pytest -v -s entrypoints/openai/test_translation_validation.py::test_long_audio_request \| vllm_entrypoints_test \| \| pytest -v -s lora/test_quant_model.py \| vllm_lora_28_failure_test \| \| pytest -v -s -x tests/lora/test_llama_tp.py \| vllm_lora_tp_test_distributed \| \| pytest -v -s distributed/test_sequence_parallel.py -k test_tp_sp_generation \|vllm_distributed_test_28_failure_test \| \| pytest -v -s distributed/test_sequence_parallel.py::test_tp_sp_generation[...] \| vllm_distributed_test_28_failure_test \| \| pytest models/language/generation/test_mistral.py::test_models[...] \| vllm_languagde_model_test_extended_generation_28_failure_test \| \| pytest models/multimodal/pooling/test_jinavl_reranker.py::test_model_text_image[...] 
\| vllm_multi_model_test_28_failure_test \| \| tests/lora/test_qwen2vl.py::test_qwen2vl_lora \| vllm_lora_test \| \| tests/lora/test_qwen2vl.py::test_qwen25vl_lora \| vllm_lora_test \| \| tests/lora/test_qwen2vl.py::test_qwen2vl_lora_beam_search \| vllm_lora_test \| \| tests/lora/test_phi.py::test_phi2_lora \| DIDN'T FIND IT IN VLLM \| \| models/multimodal/generation/test_voxtral.py::test_models_with_multiple_audios[5-128-half] \| vllm_multi_model_test_28_failure_test \| \| models/test_initialization.py::test_can_initialize[VoxtralForConditionalGeneration] \| vllm_basic_models_test \| \| pytest -v -s -x lora/test_chatglm3_tp.py -k test_chatglm3_lora_tp4_fully_sharded_loras \| vllm_lora_tp_test_distributed \| Pull Request resolved: https://github.com/pytorch/pytorch/pull/162292 Approved by: https://github.com/atalman, https://github.com/huydhn	2025-09-09 05:53:46 +00:00
James Wu	d85392a88e	Add BundledAOTAutogradSerializableCallable (#162170 ) This PR hooks up the python wrapper inductor backend to aot_compile. This is not the best way for us to grab the output of AOTAutograd; that involves a refactor to make AOTAutograd itself return a serializable callable. I'll do that refactor soon, but I want a basic interface to test with for now. In the medium term, we'll want aot_compile to call AOTAutograd directly, instead of using the TorchInductorWrapper's callback through compile_fx. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162170 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162169	2025-09-09 05:42:19 +00:00
Chien-Chin Huang	7feb8fc589	[SymmMEM] Allow to import _SymmetricMemory when NVSHMEM is not available (#162142 ) Summary: As we have multiple backends, _SymmetricMemory should not be imported together with NVSHMEM related modules Pull Request resolved: https://github.com/pytorch/pytorch/pull/162142 Approved by: https://github.com/dcci, https://github.com/kwen2501	2025-09-09 05:37:43 +00:00
PyTorch MergeBot	60d009267e	Revert "testing infra and some fixes (#162183 )" This reverts commit d8b6622bb6a3879d3832ab6cdc26ff4188ea4a2d. Reverted https://github.com/pytorch/pytorch/pull/162183 on behalf of https://github.com/huydhn due to Failing a test on macos ([comment](https://github.com/pytorch/pytorch/pull/162183#issuecomment-3268922096))	2025-09-09 05:26:32 +00:00
Isuru Fernando	4590438329	[fx] fix qualified name for methods of torch.Tensor (#162407 ) This fixes an error in the previous PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162407 Approved by: https://github.com/ezyang, https://github.com/XuehaiPan	2025-09-09 05:14:43 +00:00
Jeffro	8494afb837	Add missing fstream include to fix std::ofstream compilation error (#162421 ) ## Summary This PR adds a missing `#include <fstream>` to fix a compilation error that occurred with the clang compiler on the standard Google internal compile setup (built with bazel). ## Details The `std::ofstream` type was implicitly instantiated, which can cause compilation to fail with certain compilers. In this case, the clang compiler within the Google internal compile setup failed with an implicit instantiation error of `std::basic_ofstream<char>`. By explicitly including the `<fstream>` header, this PR resolves the error and ensures proper compilation in a wider range of setups and compilers. ## Error message: ``` torch/csrc/distributed/c10d/FlightRecorder.cpp:8:17: error: implicit instantiation of undefined template 'std::basic_ofstream<char>' 8 \| std::ofstream file(filename_, std::ios::binary); \| ^ libcxx/include/__fwd/fstream.h:26:7: note: template is declared here 26 \| class basic_ofstream; \| ^ 1 error generated. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162421 Approved by: https://github.com/ezyang	2025-09-09 05:14:32 +00:00
PyTorch UpdateBot	7ad40de60e	[audio hash update] update the pinned audio hash (#162437 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162437 Approved by: https://github.com/pytorchbot	2025-09-09 04:41:34 +00:00
PyTorch UpdateBot	607327beae	[vllm hash update] update the pinned vllm hash (#162356 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162356 Approved by: https://github.com/pytorchbot	2025-09-09 04:40:25 +00:00
Ke Wen	f216d64bfe	[SymmMem] Better tuning of A2AV based on accurate node boundary (#162003 ) Use `world_within_direct_access()` to distinguish intra- vs inter- node. Previously we assumed a fixed node size of 8, which is not true for NVL72. Also added env var `TORCH_SYMMMEM_NBLOCKS` for control. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162003 Approved by: https://github.com/ngimel, https://github.com/fduwjj	2025-09-09 04:18:17 +00:00
Nikita Shulga	847d7f21af	[CUDA-13] Implement workaround for cudaErrorNotSupported (#162412 ) See https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162412 Approved by: https://github.com/eqy, https://github.com/atalman	2025-09-09 04:12:10 +00:00
Ke Wen	065c446193	[SymmMem] Use global pe for put and get (#162394 ) NVSHMEM put/get APIs take global PE instead of local counterpart. So we'd need to do a translation within the kernel. Also added a sub-group test for dispatch and combine mimic'ing the Expert Parallel cases. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162394 Approved by: https://github.com/ngimel, https://github.com/fegin ghstack dependencies: #162320	2025-09-09 03:58:48 +00:00
Ke Wen	98ecc0f374	[SymmMem] Add team pool to hold duplicated teams for the same rank group (#162320 ) When multiple threadblocks call device-side collectives concurrently, NVSHMEM requires each call being made on a separate team struct, see [Collective operations scopes and active sets](https://docs.nvidia.com/nvshmem/api/gen/api/collectives.html?highlight=nvshmem_barrier_all#collective-operations-scopes-and-active-sets). This PR adds a util `get_n_teams` for creating duplicated nvshmem teams for the same rank group, i.e. team pool. So that we can use them on device side. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162320 Approved by: https://github.com/ngimel	2025-09-09 03:58:48 +00:00
Arsh Zahed	4c45090cf7	[DTensor] Check if tracing for sharding propagation to handle unhashable keys (#160798 ) Fixes #159590 This is similar to the reverted commit #156868, except it resolves an issue with two caches becoming misaligned, leading to incorrect objects for stateful placements (i.e. `_MaskPartial`) as in issue #159601. This adds little to no overhead in eager ([see past benchmarks](https://github.com/pytorch/pytorch/pull/156868#issuecomment-3047831149)). This also handles cases such as #159590 where dynamo is disabled during tracing by entering the Python Dispatcher ahead of the sharding propagation during compile. Tests are added/modified to handle these, and the list/tuple inputs with the cat op. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160798 Approved by: https://github.com/bdhirsh	2025-09-09 03:52:05 +00:00
PyTorch MergeBot	1641606aa4	Revert "Add BundledAOTAutogradSerializableCallable (#162170 )" This reverts commit 5babb4d5c04b1ff7ed5f96f7aea1898cd4faef5a. Reverted https://github.com/pytorch/pytorch/pull/162170 on behalf of https://github.com/huydhn due to This PR has a merge conflict with D81793200 on aot_compile.py where PRs and diffs are landed in reverted order ([comment](https://github.com/pytorch/pytorch/pull/162170#issuecomment-3268735428))	2025-09-09 03:33:36 +00:00
Shunting Zhang	7b8a64557d	[inductor] fix 3d tiled online softmax (#162341 ) The online_softmax_reduce runtime helper previously assumes the input tl.Tensor's are 2d tensors. But with tiled reduction, they can be 3d (y, x, r). Pull Request resolved: https://github.com/pytorch/pytorch/pull/162341 Approved by: https://github.com/jansel, https://github.com/eellison ghstack dependencies: #162311	2025-09-09 02:59:52 +00:00
Tugsbayasgalan Manlaibaatar	d8b6622bb6	testing infra and some fixes (#162183 ) This PR is quite large in that it covers most of the rough edges in the new strict export flow: 1. Handle nn_module_stack correctly now that we are tracing wrapper module 2. module_call_spec needs to get queried from source directly because we are not running the bytecode anymore. 3. Correct input and output handling. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162183 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162167	2025-09-09 02:42:11 +00:00
Yiming Zhou	a965f09793	[export] Update PT2 archive docs (#162308 ) Summary: Minor updates based on the recent refactoring for weight saving and loading Test Plan: doc change only Rollback Plan: Differential Revision: D81821994 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162308 Approved by: https://github.com/angelayi	2025-09-09 02:08:13 +00:00
Kurt Mohler	583bbf7761	[MPS] Add `native_dropout` and `native_dropout_backward` (#162108 ) Fixes #162002 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162108 Approved by: https://github.com/malfet	2025-09-09 01:44:06 +00:00
Scott Wolchok	e025c0f459	Dynamo: set_eval_frame microoptimization (#162220 ) Optimize for common case and remove a pair of refcount operations (see new comments.) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162220 Approved by: https://github.com/jansel, https://github.com/williamwen42 ghstack dependencies: #161591, #161595, #161633, #161634, #161692, #162219	2025-09-09 01:10:06 +00:00
Scott Wolchok	a8a187b2cf	Overload _get_operation_for_overload_or_packet & friends to accept ArrayRef (#162219 ) Avoids requiring vector allocation to call this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162219 Approved by: https://github.com/Skylion007 ghstack dependencies: #161591, #161595, #161633, #161634, #161692	2025-09-09 01:10:06 +00:00
Scott Wolchok	12db2a7889	Call checkLong in is_int_or_symint, completing TODO (#161692 ) Calling this first minimizes overhead for plain old ints, making cheap things cheap. Differential Revision: [D81530098](https://our.internmc.facebook.com/intern/diff/D81530098) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161692 Approved by: https://github.com/ezyang, https://github.com/Skylion007 ghstack dependencies: #161591, #161595, #161633, #161634	2025-09-09 01:10:06 +00:00
Scott Wolchok	eab2afeff7	fastpath type Tensor in THPVariable_NewWithVar (#161634 ) It is cheap to do an exact check against Tensor and much faster when it works (PyType_IsSubtype does not have this fastpath, I checked [source](`9ee0214b5d/Objects/typeobject.c (L2889)`)). Spot-checked in perf on detach-DTensor-in-a-loop benchmark; small win but clear. Differential Revision: [D81530101](https://our.internmc.facebook.com/intern/diff/D81530101) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161634 Approved by: https://github.com/Skylion007, https://github.com/albanD ghstack dependencies: #161591, #161595, #161633	2025-09-09 01:10:06 +00:00
Scott Wolchok	a951f435fd	Avoid redundant PyTuple_GetSize call in _maybe_handle_torch_function (#161633 ) py::args::size() calls PyTuple_GetSize. Compiler can't know the two calls will always return the same result, so we have to consolidate them ourselves. Differential Revision: [D81530096](https://our.internmc.facebook.com/intern/diff/D81530096) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161633 Approved by: https://github.com/ezyang, https://github.com/Skylion007 ghstack dependencies: #161591, #161595	2025-09-09 01:10:06 +00:00
karthickai	6eb14ac60f	[Inductor] Fix cross-device scalar lowering - cpu scalar with cuda tensor fails in torch.compile (#161447 ) This PR fixes bug in TorchInductor where cross-device scalar indexing fails during compilation, causing discrepancies from eager mode behavior. Fixes: #140457 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161447 Approved by: https://github.com/mlazos	2025-09-09 01:07:02 +00:00
PyTorch MergeBot	ed77e23b68	Revert "[dynamo] Constant fold torch.autograd._profiler_enabled (#158482 )" This reverts commit d7e1b8b11d7430c7633dcad6f6596b5df8fa02f7. Reverted https://github.com/pytorch/pytorch/pull/158482 on behalf of https://github.com/borgstrom due to NCCL hangs in S560336 ([comment](https://github.com/pytorch/pytorch/pull/158482#issuecomment-3268426781))	2025-09-09 00:21:05 +00:00
Ting Lu	897c4e70a7	Move to small wheel approach for CUDA SBSA wheel (#160720 ) https://github.com/pytorch/pytorch/issues/160673 Use download.pytorch.org's dependencies like x86 build instead of bundling libs into the wheel Pull Request resolved: https://github.com/pytorch/pytorch/pull/160720 Approved by: https://github.com/atalman	2025-09-09 00:18:43 +00:00
Zhengxu Chen	8485aac873	[precompile] Fix inlined source tracking with generators. (#162389 ) Summary: When compiled code has generator, code.co_firstlineno will be inconsistent with the result from inspect.getsource, which returns the toplevel enclosing code source rather than the inner code location. In this case, it seems simpler to just use the toplevel enclosing code location rather than the co_firstlineno field. Test Plan: test_package.py -k test_code_with_generator Rollback Plan: Differential Revision: D81929751 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162389 Approved by: https://github.com/dolpm, https://github.com/hrithick-codes	2025-09-09 00:13:54 +00:00
atalman	c0fc86b511	Fix aarch64 wheel pack (#159481 ) PR that introduced the change: https://github.com/pytorch/builder/pull/1775 Use wheel pack instead of zip to repack the wheel. It should regenerate the RECORD file and update all the hashes correctly. TODO: Apply wheel pack instead of zip to Rest of builds Add validation test to make sure wheel contents matches RECORD file Pull Request resolved: https://github.com/pytorch/pytorch/pull/159481 Approved by: https://github.com/malfet	2025-09-08 23:36:50 +00:00
Thomas Bohnstingl	07f07309c6	[associative_scan] Autograd separated (#139939 ) This PR implements the Autograd feature of the associative_scan. Pull Request resolved: https://github.com/pytorch/pytorch/pull/139939 Approved by: https://github.com/huydhn	2025-09-08 23:30:11 +00:00
Laith Sakka	189a054cfb	Remove guard_size_oblivious from default contiguity python check, and add aten.sym_is_contiguous. [attempt2] (#160869 ) [relanding again after fixing internal build] Summary: This might cause some new DDEs on call sites that do not use is_contiguous_or_false() or sym_is_contiguous() but want to find those call sites to handle this properly by calling is_contiguous_or_false() and not is_contiguous() explitly when appropriate. I had to fix one issue after removing the implicit size oblivious reasoning. here is context we defined in this https://github.com/pytorch/pytorch/pull/157472 sym_is_contiguous to be the function computing contiguity for dynamic shapes in c++. It returns a symbolic expression that represents contiguity and guaranteed not to throw a DDE. when people call is_contiguous we do sym_is_contiguous().guard_bool() when people call is_contiguous_or_false we do sym_is_contiguous().guard_or_false() one issue not handled well was this path ``` c10::SymBool TensorImpl::sym_is_contiguous_custom( at::MemoryFormat memory_format) const { if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) { return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( this, memory_format); } return sym_is_contiguous_default(memory_format); } ``` namely if we call sym_is_contiguous_custom but we have matches_python_custom(SizesStridesPolicy::CustomStrides) return true , then we used to call is_contiguous(this, memory_format); This used to go through the load_pyobj_interpreter and end up calling the python is_contiguous call which used implicit size oblivious reasoning. once we removed that implicit size oblivious reasoning, the right thing we want is to call return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous(this, memory_format); otherwise we would get DDE even if the caller is doing sym_is_contiguous. so I had to define it for pyinterpreter, and then I had to override it for nested tensors. 
Approved by: https://github.com/ezyang Test Plan: contbuild & OSS CI, see `e444cd24d4` Rollback Plan: Differential Revision: D80435179 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160869 Approved by: https://github.com/ezyang	2025-09-08 22:59:13 +00:00
Colin Peppler	5fd6b6a2db	[refactor] add helper sizevars function, is_size_one, for size==1 checks (#162189 ) ## Summary - document guard behavior in `SizeVarAllocator.is_size_one` - use `is_size_one` for broadcast/expand checks. - This diff is a no-op since we'd use `shape_env.evaluate_expr(... fallback_value=False)` `a4f9132a17/torch/_inductor/sizevars.py (L450-L453)` ------ https://chatgpt.com/codex/tasks/task_e_68b8d0d1f2c48328b2d38c00e738bc8c Pull Request resolved: https://github.com/pytorch/pytorch/pull/162189 Approved by: https://github.com/laithsakka	2025-09-08 22:48:16 +00:00
drisspg	ac9ccd0dc2	Add return-max-scores to flex-attention (#161667 ) # Summary ### Update API ```Py class AuxRequest(NamedTuple): """Request which auxiliary outputs to compute from flex_attention. Each field is a boolean indicating whether that auxiliary output should be computed. """ lse: bool = False max_scores: bool = False class AuxOutput(NamedTuple): """Auxiliary outputs from flex_attention operation. Fields will be None if not requested, or contain the tensor if requested. """ lse: Optional[Tensor] = None max_scores: Optional[Tensor] = None out_only = flex_attention(query, key, value, score_mod) out_max, aux_max = flex_attention( query, key, value, score_mod, return_aux=FlexAttentionAuxRequest(max_scores=True), ) out_both, aux_both = flex_attention( query, key, value, score_mod, return_aux=FlexAttentionAuxRequest(lse=True, max_scores=True), ) ``` Returns the max post mod scores from flex attention. Not being able to break BC is kinda of annoying here since we end up with a combinatorial problem where if we need to add any more return vals we need to new kwargs that gate if they get returned by the function and need to support the 2**N additional args possible return groups. Ideally there isn't much more we need to return, but we might want to think about how best to set this up for expansion in the future. I added kwarg only now Maybe we make a `ExtraReturns` type kwarg that can grow and we don't need to keep adding new top level args. We could also return a Struct that holds all the extra tensors and start deprecation cycle for logsumexp eventually returning just 1 `ExtraReturns` like struct with the tensors. ### Req Grad I currently dont return a max_scores that supports backproping grads. 
I think this might be feasible but since max is essentially 1 hot on the inputs and a reduction we would either need to save another `max_location` from the forward or find the max_score but also only apply to first occurence if there is multiple equivalent scores (need to check if thats we define for vanilla max op in torch). For now no grad, we can re-visit if needed. ## Perf I am going to disable for flex_decode. Since at least initially the motivation is for training. I also more hard than it should be to have ops return nuns or optional tensors, If return max is at the false, we should probably just create a tensor of size zero so that we don't slow down the hot path. ```Shell 🔝 Top 5 TFlops Deltas (by absolute %): shape: (5, 7) ┌────────────────┬────────────────┬───────────────────────┬───────────────┬──────────────┬───────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═══════════════════════╪═══════════════╪══════════════╪═══════════╪═══════════╡ │ causal ┆ torch.bfloat16 ┆ (4, 16, 2048, 16, ┆ 249.514658 ┆ 243.078974 ┆ 6.435684 ┆ 2.647569 │ │ ┆ ┆ 2048, 64) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 57.971274 ┆ 56.633641 ┆ 1.337633 ┆ 2.361905 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ noop ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 244.052884 ┆ 248.65129 ┆ -4.598406 ┆ -1.849339 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 280.71254 ┆ 275.686991 ┆ 5.025549 ┆ 1.822918 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 16384, 16, ┆ 152.970031 ┆ 150.489109 ┆ 2.480923 ┆ 1.648573 │ │ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │ └────────────────┴────────────────┴───────────────────────┴───────────────┴──────────────┴───────────┴───────────┘ 🔺 Top 5 Positive TFlops Deltas (highest +%): shape: (5, 7) 
┌────────────────┬────────────────┬────────────────────────┬───────────────┬──────────────┬──────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪════════════════════════╪═══════════════╪══════════════╪══════════╪═══════════╡ │ causal ┆ torch.bfloat16 ┆ (4, 16, 2048, 16, ┆ 249.514658 ┆ 243.078974 ┆ 6.435684 ┆ 2.647569 │ │ ┆ ┆ 2048, 64) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 57.971274 ┆ 56.633641 ┆ 1.337633 ┆ 2.361905 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 280.71254 ┆ 275.686991 ┆ 5.025549 ┆ 1.822918 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 16384, 16, ┆ 152.970031 ┆ 150.489109 ┆ 2.480923 ┆ 1.648573 │ │ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 161.031318 ┆ 158.597808 ┆ 2.43351 ┆ 1.534391 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ └────────────────┴────────────────┴────────────────────────┴───────────────┴──────────────┴──────────┴───────────┘ 🔻 Top 5 Negative TFlops Deltas (lowest -%): shape: (5, 7) ┌────────────────┬────────────────┬───────────────────────┬───────────────┬──────────────┬───────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═══════════════════════╪═══════════════╪══════════════╪═══════════╪═══════════╡ │ noop ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 244.052884 ┆ 248.65129 ┆ -4.598406 ┆ -1.849339 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, ┆ 175.546923 ┆ 177.81205 ┆ -2.265127 ┆ -1.273888 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ │ sliding_window ┆ torch.bfloat16 ┆ (4, 16, 16384, 4, ┆ 156.282597 ┆ 158.209134 ┆ -1.926537 ┆ -1.217715 │ │ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │ │ 
sliding_window ┆ torch.bfloat16 ┆ (2, 16, 2048, 16, ┆ 232.542929 ┆ 235.140136 ┆ -2.597207 ┆ -1.104536 │ │ ┆ ┆ 2048, 128) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 169.652791 ┆ 171.475986 ┆ -1.823195 ┆ -1.063236 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ └────────────────┴────────────────┴───────────────────────┴───────────────┴──────────────┴───────────┴───────────┘ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161667 Approved by: https://github.com/Chillee, https://github.com/BoyuanFeng	2025-09-08 22:44:48 +00:00
Avik Chaudhuri	711c8c821e	shape guards (#161178 ) Summary: This PR introduces shape guards to export. Previously only value ranges, equalities, and specializations would be tracked for symbolic expressions, and we had a forward hook to check them. Instead now we create a function to check shape guards and call it in the exported program. Test Plan: updated several tests Rollback Plan: Differential Revision: D80713603 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161178 Approved by: https://github.com/tugsbayasgalan	2025-09-08 22:44:09 +00:00
Laith Sakka	2c538c9acf	rewrite __maybe_broadcast should_expand check for unbacked (#162109 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162109 Approved by: https://github.com/aorenste ghstack dependencies: #162084, #162099	2025-09-08 22:41:18 +00:00
Laith Sakka	85fe94e933	make should_swap more dde friendly (#162099 ) Unblock customers for common cases with DDE, until @pianpwk lands the change to should_swap https://github.com/pytorch/pytorch/pull/160473. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162099 Approved by: https://github.com/aorenste ghstack dependencies: #162084	2025-09-08 22:41:18 +00:00
Hao Wu	fecd9686f5	Graph split event tracker (#159795 ) Summary: A tool to track events in graph split, specifically on how nodes end up in acc or cpu subgraphs. Usage: use env var to specify a mode and necessary arguments. FX_NET_ACC_SPLITTER_TRACKER_MODE: Tracker mode. ``` Different modes of the event tracker: "0": Tracker not enabled (by default) "1": Tracker enabled but no dumps. Information available by setting breakpoints and visually inspecting in pdb. "2": Tracker enabled and dumps all events to DUMP_PREFIX_all.txt "3": In addition to events dump, track nodes specified by ENV_FX_NET_ACC_SPLITTER_TRACKER_TRACKED_NODES recursively and dump to DUMP_PREFIX_nodex.txt "4": In addition to events dump, track all nodes with more than 1 event recursively and dump to DUMP_PREFIX_nodex.txt ``` FX_NET_ACC_SPLITTER_TRACKER_DUMP_PATH: overriding dump path. Leave empty for `~`. FX_NET_ACC_SPLITTER_TRACKER_TRACKED_NODES: Nodes to track for mode "3". Test Plan: New unit test Reviewed By: georgiaphillips Differential Revision: D79203595 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159795 Approved by: https://github.com/ezyang	2025-09-08 21:30:17 +00:00
PyTorch MergeBot	dd44faa9d9	Revert "Modify ROCm MI2xx-based workflows to run on cron schedule (#162103 )" This reverts commit 0af70e2353e1dcda83175fd4834ecb7b63e009e0. Reverted https://github.com/pytorch/pytorch/pull/162103 on behalf of https://github.com/jithunnair-amd due to Cirrascale network outage resolved. Reverting back to running per commit to aid in triage and CI health ([comment](https://github.com/pytorch/pytorch/pull/162103#issuecomment-3267977825))	2025-09-08 20:53:05 +00:00
PyTorch MergeBot	5d819f3faf	Revert "[associative_scan] Autograd separated (#139939 )" This reverts commit 103f725afa8dbf0204a1be6a042ab93aa16d85d8. Reverted https://github.com/pytorch/pytorch/pull/139939 on behalf of https://github.com/huydhn due to Sorry for reverting your change but I am seeing a weird failure after this lands in trunk ([comment](https://github.com/pytorch/pytorch/pull/139939#issuecomment-3267945657))	2025-09-08 20:42:47 +00:00
Nikita Shulga	015423bef8	Add fp16-overflow regression test (#162401 ) Discovered while debugging https://github.com/pytorch/pytorch/issues/160841 where sdpa returned NaNs, because during the computation intermediate values were cast back to fp16 before normalization, which was fixed by https://github.com/pytorch/pytorch/pull/161999 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162401 Approved by: https://github.com/Skylion007, https://github.com/drisspg	2025-09-08 20:33:23 +00:00
William Wen	26a1b9cce2	[dynamo] fix resume_execution.py KeyError in Python 3.11+ (#162318 ) Fixes https://github.com/pytorch/pytorch/issues/162313 Differential Revision: [D81938289](https://our.internmc.facebook.com/intern/diff/D81938289) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162318 Approved by: https://github.com/Lucaskabela, https://github.com/mlazos, https://github.com/anijain2305	2025-09-08 20:26:24 +00:00
Benjamin Glass	8f114650eb	Add std::any_of to ConvParams struct (#162334 ) Removes some for-loops that didn't short-circuit in favor of std::any_of. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162334 Approved by: https://github.com/Skylion007	2025-09-08 20:12:20 +00:00
Aaron Gokaslan	ec2c1371af	[BE]: Update cudnn frontend submodule to 1.14.1 (#162347 ) Fixes a few bugs introduced to CUDNN 1.11 which affects all our CUDA13 builds. Also adds support for new CUDNN features whenever we choose to update. @eqy pretty sure this addresses the concern you had over the previous upgrade since that bugfix is now merged. This is a simple header only update. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162347 Approved by: https://github.com/eqy, https://github.com/atalman	2025-09-08 20:03:23 +00:00
IvanKobzarev	8ec01f34e9	[bucketing] custom_ops mode to hide inductor copies overhead (#161499 ) Adding "_custom_ops" bucketing to temporary fallback to eager execution of for_each, to workaround too many generated kernels on inductor side. This PR also reverts parts of bucketing changes for cycles detection that resulted in accuracy problems. Differential Revision: [D81152293](https://our.internmc.facebook.com/intern/diff/D81152293) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161499 Approved by: https://github.com/eellison	2025-09-08 20:03:08 +00:00
Ting Lu	9c991b63ff	[CD] [aarch64] Add CUDA 12.6 and 12.8 to build matrix, remove 12.9 build (#162364 ) https://github.com/pytorch/pytorch/issues/159779 Add the full CUDA support matrix to sbsa build (12.6, 12.8) Same arch support as x86 build Remove 12.9 sbsa build Pull Request resolved: https://github.com/pytorch/pytorch/pull/162364 Approved by: https://github.com/atalman	2025-09-08 20:00:25 +00:00
rzou	4e50651c5f	[DTensor] fix F.one_hot (#162307 ) F.one_hot(dtensor) used to run into a mixed DTensor-Tensor operation due to an arange call creating a new Tensor (not DTensor). This PR fixes it by allowing implicit replication of Tensors for the arange call and the one consumer of the arange call (the at::eq call). Test Plan: - new test. Also, F.one_hot(num_classes=-1) is broken so we skip that. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162307 Approved by: https://github.com/ezyang ghstack dependencies: #162117	2025-09-08 19:37:08 +00:00
Edward Z. Yang	a0d026688c	Make distributed modules importable even when backend not built (#159889 ) This PR is greatly simplified now that it stacked on top of a PR that builds with distributed always. We only need to stub functions that may not be defined due to a backend not being enabled. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159889 Approved by: https://github.com/wconstab ghstack dependencies: #160449	2025-09-08 19:10:36 +00:00
Edward Yang	d80297a684	Always build USE_DISTRIBUTED. (#160449 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160449 Approved by: https://github.com/wconstab, https://github.com/albanD, https://github.com/dcci	2025-09-08 19:10:36 +00:00
angelayi	fbcabb4fbd	Handle f([]) vs. f() in fake tensor caching (#162284 ) Fixes https://github.com/pytorch/pytorch/issues/162279 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162284 Approved by: https://github.com/manuelcandales, https://github.com/aorenste	2025-09-08 18:28:05 +00:00
PyTorch UpdateBot	314d47a210	[audio hash update] update the pinned audio hash (#162315 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162315 Approved by: https://github.com/pytorchbot	2025-09-08 18:26:33 +00:00
atalman	bc4176c92a	CD Windows CUDA 13.0 build - fix packaging of cuda dlls (#162383 ) Trying to fix https://github.com/pytorch/pytorch/issues/162333 CUDA 13.0 file structure changed. Instead of keeping most of the DLLs in the bin folder, they are now in ``bin\x64``, except for the cuDNN DLL. See attached pictures: <img width="511" height="361" alt="Screenshot 2025-09-08 at 9 46 26 AM" src="https://github.com/user-attachments/assets/d2e630ee-930f-4da6-9b81-f9ef48fde7ce" /> <img width="490" height="333" alt="Screenshot 2025-09-08 at 9 46 34 AM" src="https://github.com/user-attachments/assets/194cbf43-b6ef-4218-b516-db37b91302be" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162383 Approved by: https://github.com/seemethere, https://github.com/ZainRizvi, https://github.com/malfet	2025-09-08 17:57:22 +00:00
eqy	de5dc1f038	[cuDNN][SDPA][Nested Tensor] add forward/backward caching support for cuDNN SDPA Nested tensor/varlen (#161434 ) Don't recompile every time Pull Request resolved: https://github.com/pytorch/pytorch/pull/161434 Approved by: https://github.com/drisspg	2025-09-08 17:51:13 +00:00
morrison-turnansky	72e6717d00	Avoid crash with release_available_cached_blocks (#162269 ) updated release behavior for cached blocks Fixes #159567 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162269 Approved by: https://github.com/eqy, https://github.com/Skylion007	2025-09-08 17:46:43 +00:00
Shunting Zhang	ebd29a13fe	[inductor] fuse for scalar shared data (#162311 ) LOAF previously could skip these fusion opportunities and cause some tests to fail. Test: - TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION=1 python test/inductor/test_torchinductor_strided_blocks.py TritonBlockPointerTestGPU.test_2d_reduction_odd_shapes_view_size4_num_block_pointers_1_num_triton_kernels_1_reduction_op4_cuda Pull Request resolved: https://github.com/pytorch/pytorch/pull/162311 Approved by: https://github.com/jansel	2025-09-08 17:20:46 +00:00
fengqing.lu	5793dd7875	[Intel GPU] Integrate OneDNN SDPA training forward and backward (#161058 ) This PR is the first split PR of https://github.com/pytorch/pytorch/pull/156272, only contains the OneDNN code. Please help review. Pending on OneDNN v3.9 commit update. Don't merge. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161058 Approved by: https://github.com/guangyey, https://github.com/EikanWang	2025-09-08 17:07:31 +00:00
Scott Wolchok	49c446c617	Add C++ function for torch.distributed.tensor._op_schema.is_view_op (#161595 ) This seems to have been an especially slow one because of the repeated pybind access (schema is a pybind, as is arguments, and then we hit each argument). It's still ~~1% of total benchmark runtime because of the repeated single pybind function call, but that's a lot better. Differential Revision: [D81530095](https://our.internmc.facebook.com/intern/diff/D81530095) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161595 Approved by: https://github.com/ezyang, https://github.com/bdhirsh ghstack dependencies: #161466, #161586, #161590, #161591	2025-09-08 16:28:08 +00:00
Scott Wolchok	8e076d889c	Don't call check_has_torch_dispatch in THPVariable_NewWithVar if we already know (#161591 ) We already know when we're called from make_wrapper_subclass or make_dtensor. The check isn't particularly cheap. Differential Revision: [D81530099](https://our.internmc.facebook.com/intern/diff/D81530099) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161591 Approved by: https://github.com/ezyang ghstack dependencies: #161466, #161586, #161590	2025-09-08 16:28:08 +00:00
Chien-Chin Huang	f044fa2902	[AsyncTP] Use assertEqual instead of allClose for bf16 tests (#162041 ) The async tp result and regular MM result are very close. If we adjust the allclose threshold, the test succeeds. This seems to indicate that the error is from numerical error of low precision. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162041 Approved by: https://github.com/danielvegamyhre, https://github.com/ngimel ghstack dependencies: #162040	2025-09-08 16:12:52 +00:00
PyTorch MergeBot	a92773eeb1	Revert "Use vectorized stores for all dtypes in cat (#161649 )" This reverts commit 377033757ae5ca524ea842f1b0a5f446ed3d8fe0. Reverted https://github.com/pytorch/pytorch/pull/161649 on behalf of https://github.com/ngimel due to reverted internally ([comment](https://github.com/pytorch/pytorch/pull/161649#issuecomment-3266963044))	2025-09-08 15:58:58 +00:00
PyTorch MergeBot	53297f6ad0	Revert "[audio hash update] update the pinned audio hash (#162315 )" This reverts commit c9ac8c25ef9ad020542898ab569910a9d0cd1f7e. Reverted https://github.com/pytorch/pytorch/pull/162315 on behalf of https://github.com/jeanschmidt due to Reverting in order to see if this introduced the failure https://github.com/pytorch/pytorch/actions/runs/17539536914/job/49810513700 ([comment](https://github.com/pytorch/pytorch/pull/162315#issuecomment-3266932718))	2025-09-08 15:52:30 +00:00
IvanKobzarev	25c170b72e	[inductor] Runtime estimations: use nccl estimator; mm only benchmark mode (#161405 ) During comms reordering, sink wait iterative observed that previous runtime estimations were pretty off for collectives and mms. Adding optional usage of: - c10d.time_estimator for collectives, which is based on NCCL estimator Benchmark mode only for matmuls, as they are highly dependent on mm backend - The logic mostly copied from Ruisi's PRs for inductor simple_fsdp https://github.com/pytorch/pytorch/pull/157572 These estimation corrections are in the default `BaseSchedulerNode.estimate_runtime()` Differential Revision: [D81152294](https://our.internmc.facebook.com/intern/diff/D81152294) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161405 Approved by: https://github.com/eellison	2025-09-08 14:33:19 +00:00
David Berard	3f5993316e	[upstream triton] update triton pin to triton 3.5 (#162278 ) Update PyTorch to the latest Triton release candidate branch (release/3.5.x in triton-lang/triton) Notably: * this does not include the version number bump from 3.4 -> 3.5 (we'll do that in a follow-up PR) * sam_fast is still failing, so we've disabled it temporarily https://github.com/pytorch/pytorch/issues/162282 and we are committed to fixing it, ideally before the branch cut but possibly as a cherry-pick into the release branch. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162278 Approved by: https://github.com/atalman ghstack dependencies: #162244, #162309	2025-09-08 14:29:24 +00:00
PyTorch UpdateBot	e101411b9f	Update slow tests (#161395 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161395 Approved by: https://github.com/pytorchbot	2025-09-08 13:33:32 +00:00
PyTorch UpdateBot	32911ff541	[xla hash update] update the pinned xla hash (#162372 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162372 Approved by: https://github.com/pytorchbot	2025-09-08 11:31:16 +00:00
Chien-Chin Huang	5b90e85112	[AsyncTP] Fixes AsyncMM (#162040 ) The original implementation set beta to 1, which causes the out (C) to be added to the output. Thus if the output is not initialized as zero beforehand, the output can be incorrect. Removing the alpha and beta fixes the issue. Thanks to @ngimel for figuring out the root cause. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162040 Approved by: https://github.com/danielvegamyhre	2025-09-08 10:53:59 +00:00
David Berard	31d5c67539	[inductor][triton] support static cuda launcher after triton # 7866 (#162309 ) Fixes static cuda launcher after https://github.com/triton-lang/triton/pull/7866. Static cuda launcher checks to make sure that no hook knobs are set (and if they are, it throws an error). But Triton has changed the semantics of hooks so that "empty hooks" are now represented by empty `HookChain`s instead of being represented by `None`. This PR changes the way we define "empty hooks" to account for HookChains. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162309 Approved by: https://github.com/aakhundov ghstack dependencies: #162244	2025-09-08 07:57:48 +00:00
David Berard	fb0afa853e	[inductor][triton] more JITCallable._hash_lock support (#162244 ) Follow-up to #161768. Context: ProcessPool pickles the outputs before sending them back to the main process. Triton kernels have some un-pickleable fields, so `prepare_for_pickle()` is used to strip out those fields. Previously, in the standard case (without triton_bundler.py), `prepare_for_pickle()` would strip out the un-pickleable fields and they would never be added back after unpickling, because the un-pickleable fields were not actually needed after compilation finished. #161768 updated `prepare_for_pickle` to also strip out the `fn._hash_lock` field, a newly added field in JITCallable instances which is a `threading.RLock()`, which is not pickleable. It turns out that we do need to restore the `fn._hash_lock` field, even in the non-triton_bundler case - the MultiKernel case uses the hash lock. To do this, we add `restore_after_unpickle()` which will restore fields (or if the old fields are not provided, initialize just the hash_lock). Compile time benchmarks look good, maybe a very minor regression (see the comment below on the PR) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162244 Approved by: https://github.com/atalman	2025-09-08 07:57:48 +00:00
PyTorch MergeBot	1e0656f063	Revert "Always build USE_DISTRIBUTED. (#160449 )" This reverts commit de893e96c775023aa3be895060848fac3296772c. Reverted https://github.com/pytorch/pytorch/pull/160449 on behalf of https://github.com/jeanschmidt due to internal changes breaks import checks, see [D81845053](https://www.internalfb.com/diff/D81845053) ([comment](https://github.com/pytorch/pytorch/pull/160449#issuecomment-3264887002))	2025-09-08 07:04:36 +00:00
PyTorch MergeBot	29e09a6545	Revert "Make distributed modules importable even when backend not built (#159889 )" This reverts commit 01edcd4df8bf0c7b4cc2d3ec868bd2059eeea83b. Reverted https://github.com/pytorch/pytorch/pull/159889 on behalf of https://github.com/jeanschmidt due to internal changes breaks import checks, see [D81845053](https://www.internalfb.com/diff/D81845053) ([comment](https://github.com/pytorch/pytorch/pull/160449#issuecomment-3264887002))	2025-09-08 07:04:36 +00:00
PyTorch UpdateBot	c9ac8c25ef	[audio hash update] update the pinned audio hash (#162315 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162315 Approved by: https://github.com/pytorchbot	2025-09-08 04:17:23 +00:00
Thomas Bohnstingl	103f725afa	[associative_scan] Autograd separated (#139939 ) This PR implements the Autograd feature of the associative_scan. Pull Request resolved: https://github.com/pytorch/pytorch/pull/139939 Approved by: https://github.com/ydwu4	2025-09-08 03:21:17 +00:00
James Wu	5babb4d5c0	Add BundledAOTAutogradSerializableCallable (#162170 ) This PR hooks up the python wrapper inductor backend to aot_compile. This is not the best way for us to grab the output of AOTAutograd; that involves a refactor to make AOTAutograd itself return a serializable callable. I'll do that refactor soon, but I want a basic interface to test with for now. In the medium term, we'll want aot_compile to call AOTAutograd directly, instead of using the TorchInductorWrapper's callback through compile_fx. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162170 Approved by: https://github.com/zhxchen17 ghstack dependencies: #162169	2025-09-07 23:37:31 +00:00
James Wu	eb9073a6b7	[easy] [precompile] Convert CompileArtifacts to callable (#162169 ) The goal of this PR stack is to be able to implement `aot_compile_module`, which AOT precompiles a torch.nn.Module. Step 1 is a simple refactor to make CompileArtifacts itself the callable, which makes it easier to use directly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162169 Approved by: https://github.com/zhxchen17	2025-09-07 23:37:31 +00:00
Yidi Wu	ec2e3687c7	[while_loop][autograd] support autograd_key of while_loop (#160483 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160483 Approved by: https://github.com/zou3519	2025-09-07 21:55:29 +00:00
PyTorch MergeBot	ff2de5d522	Revert "[2/N]Port several test files under test/distributed to Intel GPU (#159473 )" This reverts commit 040d00af048967dde7938d358d7f5988cbd18388. Reverted https://github.com/pytorch/pytorch/pull/159473 on behalf of https://github.com/jeanschmidt due to Seems to be breaking internal signals, @d4l3k please help the author to have this change landed. [D81718444](https://www.internalfb.com/diff/D81718444) ([comment](https://github.com/pytorch/pytorch/pull/159473#issuecomment-3264046983))	2025-09-07 21:06:38 +00:00
PyTorch MergeBot	8235c4f65d	Revert "[ROCm] Enabling several UTs (#161715 )" This reverts commit b9ba612f7a968f7b27e121ca8f4d0a4d954f5354. Reverted https://github.com/pytorch/pytorch/pull/161715 on behalf of https://github.com/jeanschmidt due to Need to revert in order to revert https://github.com/pytorch/pytorch/pull/159473, feel free to merge it back once conflicts are cleared ([comment](https://github.com/pytorch/pytorch/pull/161715#issuecomment-3264040604))	2025-09-07 21:03:17 +00:00
PyTorch MergeBot	e246a85b76	Revert "[1/N] Port 5 _composable/fsdp distributed test cases to Intel GPU (#159118 )" This reverts commit 5c473e9f5ee0ef0fc38e6cf34a95b547f8cdc8d5. Reverted https://github.com/pytorch/pytorch/pull/159118 on behalf of https://github.com/jeanschmidt due to Need to revert in order to revert https://github.com/pytorch/pytorch/pull/159473 ([comment](https://github.com/pytorch/pytorch/pull/159118#issuecomment-3264037799))	2025-09-07 21:00:29 +00:00
PyTorch MergeBot	df59c21768	Revert "[BE] Cleanup stale comments/copy from `gemm` (#162001 )" This reverts commit 6087ef41e54c2494b117ffd923faf20f515a6806. Reverted https://github.com/pytorch/pytorch/pull/162001 on behalf of https://github.com/jeanschmidt due to breaks internal ads signal, see [D81845017](https://www.internalfb.com/diff/D81845017) ([comment](https://github.com/pytorch/pytorch/pull/162001#issuecomment-3264034312))	2025-09-07 20:53:16 +00:00
PyTorch MergeBot	093ab5f477	Revert "[inductor] add kernel template choice (ktc) (#161347 )" This reverts commit 9a8d454c464c0b811fc4586ff104424bccf1da0c. Reverted https://github.com/pytorch/pytorch/pull/161347 on behalf of https://github.com/jeanschmidt due to Seems to have broken internal builds, see [D81520569](https://www.internalfb.com/diff/D81520569) ([comment](https://github.com/pytorch/pytorch/pull/161347#issuecomment-3264027436))	2025-09-07 20:39:39 +00:00
PyTorch MergeBot	4348db0b92	Revert "[inductor][ez] V.choices.get_mm_configs returns list of ChoiceCallers (#161348 )" This reverts commit c32111149921b48bfef909293f1049e21619ed76. Reverted https://github.com/pytorch/pytorch/pull/161348 on behalf of https://github.com/jeanschmidt due to Seems to have broken internal builds, see [D81520569](https://www.internalfb.com/diff/D81520569) ([comment](https://github.com/pytorch/pytorch/pull/161347#issuecomment-3264027436))	2025-09-07 20:39:39 +00:00
Vinayak Pawar	9ad5e8edb1	Improve typing of ONNX decorators with ParamSpec (#162332 ) ## Summary This PR improves typing in ONNX-related modules by replacing TypeVar bound to Callable[..., Any] with ParamSpec to preserve parameter types and avoid type erasure in decorator functions. ## Changes - `torch/onnx/_internal/exporter/_flags.py`: Replace TCallable TypeVar with ParamSpec - `torch/onnx/ops/_impl.py`: Replace _T TypeVar with ParamSpec for _onnx_op decorator - `torch/onnx/_internal/exporter/_torchlib/_torchlib_registry.py`: Replace _T TypeVar with ParamSpec ## Motivation The previous implementation used TypeVar bound to Callable which erased parameter type information to Any. ParamSpec preserves the exact parameter types and return types, providing better type safety and IDE support. ## Testing - Verified all changes compile and import correctly - Created comprehensive test suite to validate ParamSpec functionality - No linting errors introduced - Maintains backward compatibility Fixes #142306 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162332 Approved by: https://github.com/Skylion007	2025-09-07 18:06:03 +00:00
PyTorch MergeBot	7a83cf430e	Revert " [while_loop][autograd] support autograd_key of while_loop (#160483 )" This reverts commit 2b8a83901c58a0858ea9e4ce00055f48e6ed164c. Reverted https://github.com/pytorch/pytorch/pull/160483 on behalf of https://github.com/huydhn due to Sorry for reverting your PR, but some trunk tests are failing either from this PR or the previous one in the stack ([comment](https://github.com/pytorch/pytorch/pull/160483#issuecomment-3263597325))	2025-09-07 08:50:49 +00:00
PyTorch MergeBot	ada43ed39c	Revert "[inductor] pdl inductor option (disabled by default) (#160928 )" This reverts commit 9458d1ac3bd70c2af316a8ba95d2c6c9c1199c9c. Reverted https://github.com/pytorch/pytorch/pull/160928 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160928#issuecomment-3263560378))	2025-09-07 07:37:37 +00:00
Huy Do	93fb23d6fa	Build vLLM nightly wheels (#162000 ) This uses the same approach as building triton wheel where we publish a nightly wheel for vLLM whenever its pinned commit is updated. The key change is to use `pytorch/manylinux2_28-builder` as the base image to build vLLM, so there are a couple of changes on the vLLM Dockerfile used by lumen_cli 1. `pytorch/manylinux2_28-builder` is RedHat instead of Debian-based, so no apt-get 2. Fix a bug in `.github/actions/build-external-packages/action.yml` where `CUDA_VERSION` is not set correctly, preventing CUDA 12.9 build 3. Fix a bug in `.github/actions/build-external-packages/action.yml` where `TORCH_WHEELS_PATH` is not set correctly and always defaulted to `dist` 4. In vLLM Dockerfile, use the correct index for the selected CUDA version, i.e. https://download.pytorch.org/whl/nightly/cu12[89] for CUDA 12.[89] 5. Install torch, vision, audio in one command. Unlike the CI image `pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm`, `pytorch/manylinux2_28-builder` doesn't have any torch dependencies preinstalled 6. Bump xformers version to 0.0.32.post2 now that PyTorch 2.8.0 has been landed on vLLM We need to prepare 3 wheels for vLLM, xformers, and flashinfer-python. And I rename them in the same convention as PyTorch nightlies `MAJOR.MINOR.PATCH.devYYYYMMDD` so that vLLM nightlies will work with torch nightlies on the same date. ### Usage * Install latest nightlies ``` pip install --pre torch torchvision torchaudio vllm xformers flashinfer_python \ --index-url https://download.pytorch.org/whl/nightly/cu129 ``` * Install a specific version ``` pip install --pre torch==2.9.0.dev20250903 torchvision torchaudio \ vllm==1.0.0.dev20250903 \ xformers==0.0.33.dev20250903 \ flashinfer_python==0.2.14.dev20250903 \ --index-url https://download.pytorch.org/whl/nightly/cu129 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162000 Approved by: https://github.com/atalman	2025-09-07 06:09:17 +00:00
PyTorch MergeBot	104f2680e0	Revert "Add return-max-scores to flex-attention (#161667 )" This reverts commit 486b20b73cfcf32a773a4301b1b97f91c157ce76. Reverted https://github.com/pytorch/pytorch/pull/161667 on behalf of https://github.com/huydhn due to Sorry for reverting your change but reverting https://github.com/pytorch/pytorch/pull/161730 does not seem to fix all trunk failures ([comment](https://github.com/pytorch/pytorch/pull/161667#issuecomment-3263512642))	2025-09-07 06:00:55 +00:00
PyTorch MergeBot	eac3d6f04c	Revert "[inductor] fuse for scalar shared data (#162311 )" This reverts commit 2a45837e98c63cae9d1a2e2133a727b829e549d5. Reverted https://github.com/pytorch/pytorch/pull/162311 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but it is breaking lint ([comment](https://github.com/pytorch/pytorch/pull/162311#issuecomment-3263511162))	2025-09-07 05:57:43 +00:00
PyTorch UpdateBot	fea20775ad	[vllm hash update] update the pinned vllm hash (#162314 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162314 Approved by: https://github.com/pytorchbot	2025-09-07 04:29:23 +00:00
Shunting Zhang	2a45837e98	[inductor] fuse for scalar shared data (#162311 ) LOAF previously may skip these fusion opportunities and cause some tests fail. Test: - TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION=1 python test/inductor/test_torchinductor_strided_blocks.py TritonBlockPointerTestGPU.test_2d_reduction_odd_shapes_view_size4_num_block_pointers_1_num_triton_kernels_1_reduction_op4_cuda Pull Request resolved: https://github.com/pytorch/pytorch/pull/162311 Approved by: https://github.com/jansel ghstack dependencies: #162028, #162221, #162303	2025-09-07 01:48:45 +00:00
Yiming Zhou	b919560c4a	[nativert] AOTI lowering and packaging as NativeRT delegate (#162285 ) Summary: A demo for creating AOTI delegate for NativeRT in OSS. - It supports full graph lowering only. - It leverages `executorch_call_delegate` HOP but doesn't rely on `executorch`. - The delegate graph is obtained by tracing a `LoweredBackendModule` whose forward function calls `executorch_call_delegate`. - The main difference between `executorch_call_delegate` and `aoti_call_delegate` is that the delegate graph from `executorch_call_delegate` doesn't have weights lifted as inputs. - original_ep and delegate_ep are treated as flat EP dictionary and there is no nested structure. - The naming contract is enforced by `model_name` and `backend_id` Test Plan: CI Rollback Plan: Differential Revision: D81641157 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162285 Approved by: https://github.com/dolpm	2025-09-07 01:29:54 +00:00
Animesh Jain	e3068cdb44	[dynamo] Use relaxed CLOSURE_MATCH guard then ID_MATCH (#162247 ) I am unable to write a test that would fail here. The reason is that when we do _dynamo.disable(fn) in the compiled frame, the id of disabled function changes but currently we guard on the original function - `fn` whose id is not changing. This PR still guards on the `fn.__code__` just to be more precise. Thanks to @thenumberouscode for pointing this out. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162247 Approved by: https://github.com/StrongerXi, https://github.com/jansel	2025-09-07 01:25:52 +00:00
Yiming Zhou	5211f1f908	[export] Move example inputs in move_to_device_pass (#162301 ) Summary: If I have an EP that's exported on CPU and want to AOTI compile it for CUDA, I need to use `move_to_device_pass`. But `torch._inductor.aoti_compile_and_package()` directly uses the `example_inputs` attached to the EP, so we should move the example inputs as well if applicable. Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_move_device_example_inputs Rollback Plan: Differential Revision: D81812366 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162301 Approved by: https://github.com/angelayi	2025-09-06 23:54:54 +00:00
Yidi Wu	2b8a83901c	[while_loop][autograd] support autograd_key of while_loop (#160483 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160483 Approved by: https://github.com/zou3519 ghstack dependencies: #160548, #160467	2025-09-06 21:26:33 +00:00
Yidi Wu	48e3be3ab6	[while_loop][autograd] add hop while_loop_stack_output (#160467 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160467 Approved by: https://github.com/zou3519 ghstack dependencies: #160548	2025-09-06 21:26:33 +00:00
mansiag05	5927a70934	NLLLoss: validate target is 0D when input is 1D (#161412 ) Add a shape check in nll_loss_forward to error out when both input and target are 1D. Added a unit test to cover the incompatible 1D/1D case. Fixes #157420 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161412 Approved by: https://github.com/ngimel	2025-09-06 20:58:42 +00:00
Shunting Zhang	1a588ace46	[inductor] rename deps during refreshing (#162303 ) Skipping the renaming causes wrong dependencies when mutations are involved. Test: CUDA_VISIBLE_DEVICES=4,5,6 TORCHINDUCTOR_LOOP_ORDERING_AFTER_FUSION=1 python test/distributed/test_compute_comm_reordering.py TestComputeCommReorderingMultiProc.test_reorder_compute_for_overlap Both the all-reduce and wait-tensor IR nodes contain a MutationBuffer for this test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162303 Approved by: https://github.com/eellison, https://github.com/jansel ghstack dependencies: #162028, #162221	2025-09-06 20:38:28 +00:00
Shunting Zhang	541aa23de5	[inductor] fix TemplateBuffer.extract_read_writes (#162221 ) Make sure TemplateBuffer & ComputedBuffer have the same dependencies prefix. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162221 Approved by: https://github.com/jansel, https://github.com/eellison ghstack dependencies: #162028	2025-09-06 20:38:28 +00:00
Tugsbayasgalan Manlaibaatar	047603d35b	New export implementation with flat inp/out (#162167 ) This is my first attempt at building the new export API. The main thing it addresses is correctly getting input and output relations. Subsequent diffs will add functionality for dynamic shapes, nn_module_stack, etc. Differential Revision: [D81793205](https://our.internmc.facebook.com/intern/diff/D81793205) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162167 Approved by: https://github.com/zhxchen17, https://github.com/avikchaudhuri	2025-09-06 20:03:52 +00:00
Deng, Daisy	ae0edc133e	[3/N] Enable 6 fsdp test on Intel GPU (#161601 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR is created based on PR https://github.com/pytorch/pytorch/pull/158533 and https://github.com/pytorch/pytorch/pull/159473 and works on some test files under test/distributed/fsdp. We enable Intel GPU with the following methods and try our best to keep the original code style in this PR: 1. add allow_xpu=True in instantiate_device_type_tests() if needed. 2. use "torch.accelerator.current_accelerator()" to determine the accelerator backend 3. enable XPU for some test paths Pull Request resolved: https://github.com/pytorch/pytorch/pull/161601 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-06 16:47:13 +00:00
Daniel Vega-Myhre	b6d0a9ea90	MXFP8 grouped GEMM support for torch._scaled_grouped_mm + submodule bump (#162209 ) ## Summary - We just landed 2d-2d support for mxfp8 grouped gemm in FBGEMM: https://github.com/pytorch/FBGEMM/pull/4816 - This is needed for backward pass of mxfp8 MoE training with grouped gemms - Changes: - Add dispatching + input validation for mxfp8 grouped gemm in `torch._scaled_grouped_mm` - Add meta registration input validation for mxfp8 grouped gemm, for composability with compile - Add unit tests exercising torch._scaled_grouped_mm with mxfp8 inputs - Bump FBGEMM third party submodule to include: - https://github.com/pytorch/FBGEMM/pull/4816 - https://github.com/pytorch/FBGEMM/pull/4820 - https://github.com/pytorch/FBGEMM/pull/4821 - https://github.com/pytorch/FBGEMM/pull/4823 #### How fbgemm dependency was bumped Documenting this since I haven't found it documented elsewhere: - `cd ~/pytorch/third_party/fbgemm` - `git fetch` - `git checkout <hash>` - `cd ~/pytorch` - `git add third_party/fbgemm` ## Test plan #### Test build ``` USE_FBGEMM_GENAI=1 python -m pip install --no-build-isolation -v -e . ... Successfully installed torch-2.9.0a0+gitf5070f3 ``` [full build log](https://www.internalfb.com/phabricator/paste/view/P1933787581) #### Unit tests ``` pytest test/test_matmul_cuda.py -k test_mxfp8_scaled_grouped_mm_ ... test/test_matmul_cuda.py ......... [100%] ============================================================== 9 passed, 1668 deselected in 5.34s =============================================================== ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162209 Approved by: https://github.com/ngimel	2025-09-06 15:25:30 +00:00
eqy	5985e28912	[CUDA 13][cuDNN][Windows] Roll back cuDNN upgrade from 9.13 to 9.12 on Windows (#162322 ) Forward fix for #162268 CC @atalman Pull Request resolved: https://github.com/pytorch/pytorch/pull/162322 Approved by: https://github.com/atalman, https://github.com/nWEIdia	2025-09-06 13:32:07 +00:00
Blaine Burton Rister	9aedb3cd87	[AOTI-FX] Support registering custom FX backends (#162317 ) # Feature Currently, `torch._inductor.compile_aot` always uses the `WrapperFxCodegen` class. In contrast, Python and C++ codegen allow users to register custom backends. This PR brings that feature to FX codegen. # Test plan Added a CI test registering a custom FX backend. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162317 Approved by: https://github.com/jansel	2025-09-06 07:32:03 +00:00
PyTorch MergeBot	0ff8eabf13	Revert "[dynamo] Graph break on on user-defined class in compiled region (#161670 )" This reverts commit 146371483318e17929daefd37c8e459d9d6d47bb. Reverted https://github.com/pytorch/pytorch/pull/161670 on behalf of https://github.com/jeanschmidt due to seems to have introduced https://github.com/pytorch/pytorch/actions/runs/17507127561/job/49733379267 and https://github.com/pytorch/pytorch/actions/runs/17507127561/job/49733379271 ([comment](https://github.com/pytorch/pytorch/pull/161670#issuecomment-3261241229))	2025-09-06 06:18:57 +00:00
Jeffro	28f4ab0737	Add -Wno-ctad-maybe-unsupported compiler flag (#162223 ) When running bazel build, we (Google) run into the following error. The `-Wctad-maybe-unsupported` warning would be raised to an error and break the build in certain cases. So, we propose to suppress the warning to make the build with bazel more smooth. This is the error message we got: ``` c10/util/IntrusiveList.h:166:12: error: 'std::reverse_iterator' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported] 166 \| return std::reverse_iterator{end()}; \| ^ c10/test/util/IntrusiveList_test.cpp:24:18: note: in instantiation of member function 'c10::IntrusiveList<(anonymous namespace)::ListItem>::rbegin' requested here 24 \| auto it = c1.rbegin(); \| ^ c10/test/util/IntrusiveList_test.cpp:43:5: note: in instantiation of function template specialization '(anonymous namespace)::check_containers_equal<(anonymous namespace)::ListItem>' requested here 43 \| check_containers_equal(l, v); \| ^ libcxx/include/__iterator/reverse_iterator.h:51:7: note: add a deduction guide to suppress this warning 51 \| class reverse_iterator \| ^ 1 error generated. ``` @haifeng-jin Pull Request resolved: https://github.com/pytorch/pytorch/pull/162223 Approved by: https://github.com/ezyang	2025-09-06 06:11:37 +00:00
Codeboi007	c98ddaca6d	Fixed comment to match logic in distributed_c10d.py (#162158 ) The comment was inconsistent with the logic introduced in #162157 and modified in #142216. This update ensures the documentation matches the actual behavior of the code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162158 Approved by: https://github.com/wconstab	2025-09-06 05:37:49 +00:00
morrison-turnansky	bc505977fb	torch.zeros bound checks for symint (#161976 ) Fixes #161490 I added a bounds check for negative symints to create a better error message. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161976 Approved by: https://github.com/ezyang	2025-09-06 05:37:42 +00:00
orangeH25	aac1a50a19	Add api info for torch._C._nn.pyi (#162148 ) Fix part of #148404 APIs involved are as follows: - cross_entropy_loss - hardsigmoid_ - hardswish - hardswish_ - huber_loss Pull Request resolved: https://github.com/pytorch/pytorch/pull/162148 Approved by: https://github.com/FFFrog, https://github.com/ezyang	2025-09-06 05:21:40 +00:00
Isuru Fernando	20b47acef8	[fx] fix qualified name for methods of torch.Tensor (#162224 ) Fixes #160077, #154721 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162224 Approved by: https://github.com/ezyang	2025-09-06 05:16:19 +00:00
Mario Šaško	da4db4b33d	Fix `DeviceMesh._flatten` docstring example (#162277 ) Fix the `DeviceMesh._flatten` docstring example of use. Alternative fix would be to replace `mesh_3d["dp", "cp"]` with `mesh_3d["cp", "tp"]`. (I verified the fix using the `gloo` backend) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162277 Approved by: https://github.com/ezyang	2025-09-06 05:00:00 +00:00
PyTorch MergeBot	a3e5466002	Revert "Resize to 0 if not going to be used (#161730 )" This reverts commit 081cab045472ce045634548cc6c14a4870641e23. Reverted https://github.com/pytorch/pytorch/pull/161730 on behalf of https://github.com/davidberard98 due to functorch/test_aotdispatch.py::TestAOTModuleSimplified::test_flex_attn_noncontiguous_tangents [GH job link](https://github.com/pytorch/pytorch/actions/runs/17506617662/job/49731934012) [HUD commit link](`081cab0454`) ([comment](https://github.com/pytorch/pytorch/pull/161730#issuecomment-3260492575))	2025-09-06 04:17:08 +00:00
Boyuan Feng	c0983e6cc0	[Graph Partition] interface for custom cg wrapper (#162207 ) This PR adds an interface to allow users to specify custom cudagraph wrapper. User example: [vllm](https://github.com/vllm-project/vllm/pull/24281) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162207 Approved by: https://github.com/zou3519, https://github.com/eellison, https://github.com/ProExpertProg	2025-09-06 03:13:01 +00:00
Edward Z. Yang	b2b4add0e7	Docs on export joint with descriptors (#159006 ) Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159006 Approved by: https://github.com/SherlockNoMad	2025-09-06 03:02:58 +00:00
Gabriel Ferns	20629b1619	Add contiguous subgraph transformation threshold (#162192 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162192 Approved by: https://github.com/coconutruben	2025-09-06 02:48:00 +00:00
Raman Kumar	c3ceca2995	codebase structure documentation to include torchgen (#162261 ) 📚 The doc update adding description about torchgen folder in code structure guide Pull Request resolved: https://github.com/pytorch/pytorch/pull/162261 Approved by: https://github.com/ezyang	2025-09-06 02:10:57 +00:00
Eddie Yan	145a3a7bda	[CUDA 13][cuDNN] Bump CUDA 13 to cuDNN 9.13.0 (#162268 ) Fixes some `d_qk` != `d_v` cases on Hopper that are broken by cuDNN 9.11-9.12 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162268 Approved by: https://github.com/drisspg, https://github.com/Skylion007	2025-09-06 01:59:03 +00:00
ruisizhang123	291cd11f2d	[inductor] estimate peak memory in codegen only when buffer reuse (#162300 ) As titled, this PR ensures peak memory is estimated only when buffer reuse is enabled. Without this config, some nodes' successor nodes are eliminated from memory estimation after inductor bucketing, which can cause errors. The original codegen peak memory estimation code is from this PR: https://github.com/pytorch/pytorch/pull/159530 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162300 Approved by: https://github.com/eellison, https://github.com/v0i0	2025-09-06 01:30:38 +00:00
Yang Wang	7f4ff79210	remove deprecated vllm test (#162306 ) Fixes https://github.com/pytorch/pytorch/issues/162274 the test is removed from vllm side Pull Request resolved: https://github.com/pytorch/pytorch/pull/162306 Approved by: https://github.com/malfet	2025-09-06 01:27:13 +00:00
Will Feng	0f45aaf441	Disable autocast when running joint graph passes (#162304 ) Fixes #159469. See https://github.com/pytorch/pytorch/issues/159469#issuecomment-3221474027 for root-cause analysis. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162304 Approved by: https://github.com/bdhirsh, https://github.com/zou3519, https://github.com/eellison	2025-09-06 00:57:58 +00:00
dolpm	4f72d932fe	re-land triton runtime implementation" (#162217 ) Summary: original pr - https://github.com/pytorch/pytorch/pull/161798 Test Plan: ci Rollback Plan: Differential Revision: D81724234 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162217 Approved by: https://github.com/SherlockNoMad	2025-09-06 00:52:29 +00:00
Rob Timpe	1463714833	[dynamo] Graph break on on user-defined class in compiled region (#161670 ) Currently, user-defined classes inside of a compiled frame will cause the whole frame to be skipped by dynamo. This change defers the Unsupported exception until the __build_class__ builtin is actually called, which allows a graph break to be inserted. Fixes #161562 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161670 Approved by: https://github.com/williamwen42, https://github.com/guilhermeleobas	2025-09-06 00:04:57 +00:00
drisspg	081cab0454	Resize to 0 if not going to be used (#161730 ) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #161730 * #161667 ```Py with torch.cuda._DeviceGuard(0): torch.cuda.set_device(0) buf0 = empty_strided_cuda((2, 32, 1024), (32768, 1024, 1), torch.float32) buf1 = empty_strided_cuda((2, 32, 1024), (32768, 1024, 1), torch.float32) buf2 = empty_strided_cuda((2, 32, 1024, 64), (2097152, 65536, 64, 1), torch.float32) # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] stream0 = get_raw_stream(0) triton_tem_fused_0.run(arg0_1, arg1_1, arg2_1, buf0, buf1, arg4_1, arg3_1, arg5_1, arg6_1, buf2, 8, 2, 32, stream=stream0) del arg0_1 del arg1_1 del arg2_1 del arg3_1 del arg4_1 del arg5_1 del arg6_1 del buf0 del buf1 return (buf2, ) ``` Vs ```Py with torch.cuda._DeviceGuard(0): torch.cuda.set_device(0) buf0 = empty_strided_cuda((2, 32, 1024), (32768, 1024, 1), torch.float32) buf1 = empty_strided_cuda((0, ), (1, ), torch.float32) buf2 = empty_strided_cuda((2, 32, 1024, 64), (2097152, 65536, 64, 1), torch.float32) # Topologically Sorted Source Nodes: [flex_attention], Original ATen: [] stream0 = get_raw_stream(0) triton_tem_fused_0.run(arg0_1, arg1_1, arg2_1, buf0, buf1, arg4_1, arg3_1, arg5_1, arg6_1, buf2, 8, 2, 32, stream=stream0) del arg0_1 del arg1_1 del arg2_1 del arg3_1 del arg4_1 del arg5_1 del arg6_1 del buf0 del buf1 return (buf2, ) ``` <img width="428" height="145" alt="Screenshot 2025-08-28 at 12 37 11 PM" src="https://github.com/user-attachments/assets/240a7bca-97e1-40c4-bf93-f075fdc1a40d" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161730 Approved by: https://github.com/Skylion007, https://github.com/BoyuanFeng ghstack dependencies: #161667	2025-09-05 23:21:46 +00:00
drisspg	486b20b73c	Add return-max-scores to flex-attention (#161667 ) # Summary ### Update API ```Py class AuxRequest(NamedTuple): """Request which auxiliary outputs to compute from flex_attention. Each field is a boolean indicating whether that auxiliary output should be computed. """ lse: bool = False max_scores: bool = False class AuxOutput(NamedTuple): """Auxiliary outputs from flex_attention operation. Fields will be None if not requested, or contain the tensor if requested. """ lse: Optional[Tensor] = None max_scores: Optional[Tensor] = None out_only = flex_attention(query, key, value, score_mod) out_max, aux_max = flex_attention( query, key, value, score_mod, return_aux=FlexAttentionAuxRequest(max_scores=True), ) out_both, aux_both = flex_attention( query, key, value, score_mod, return_aux=FlexAttentionAuxRequest(lse=True, max_scores=True), ) ``` Returns the max post mod scores from flex attention. Not being able to break BC is kinda of annoying here since we end up with a combinatorial problem where if we need to add any more return vals we need to new kwargs that gate if they get returned by the function and need to support the 2**N additional args possible return groups. Ideally there isn't much more we need to return, but we might want to think about how best to set this up for expansion in the future. I added kwarg only now Maybe we make a `ExtraReturns` type kwarg that can grow and we don't need to keep adding new top level args. We could also return a Struct that holds all the extra tensors and start deprecation cycle for logsumexp eventually returning just 1 `ExtraReturns` like struct with the tensors. ### Req Grad I currently dont return a max_scores that supports backproping grads. 
I think this might be feasible but since max is essentially 1 hot on the inputs and a reduction we would either need to save another `max_location` from the forward or find the max_score but also only apply to first occurence if there is multiple equivalent scores (need to check if thats we define for vanilla max op in torch). For now no grad, we can re-visit if needed. ## Perf I am going to disable for flex_decode. Since at least initially the motivation is for training. I also more hard than it should be to have ops return nuns or optional tensors, If return max is at the false, we should probably just create a tensor of size zero so that we don't slow down the hot path. ```Shell 🔝 Top 5 TFlops Deltas (by absolute %): shape: (5, 7) ┌────────────────┬────────────────┬───────────────────────┬───────────────┬──────────────┬───────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═══════════════════════╪═══════════════╪══════════════╪═══════════╪═══════════╡ │ causal ┆ torch.bfloat16 ┆ (4, 16, 2048, 16, ┆ 249.514658 ┆ 243.078974 ┆ 6.435684 ┆ 2.647569 │ │ ┆ ┆ 2048, 64) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 57.971274 ┆ 56.633641 ┆ 1.337633 ┆ 2.361905 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ noop ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 244.052884 ┆ 248.65129 ┆ -4.598406 ┆ -1.849339 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 280.71254 ┆ 275.686991 ┆ 5.025549 ┆ 1.822918 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 16384, 16, ┆ 152.970031 ┆ 150.489109 ┆ 2.480923 ┆ 1.648573 │ │ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │ └────────────────┴────────────────┴───────────────────────┴───────────────┴──────────────┴───────────┴───────────┘ 🔺 Top 5 Positive TFlops Deltas (highest +%): shape: (5, 7) 
┌────────────────┬────────────────┬────────────────────────┬───────────────┬──────────────┬──────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪════════════════════════╪═══════════════╪══════════════╪══════════╪═══════════╡ │ causal ┆ torch.bfloat16 ┆ (4, 16, 2048, 16, ┆ 249.514658 ┆ 243.078974 ┆ 6.435684 ┆ 2.647569 │ │ ┆ ┆ 2048, 64) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 57.971274 ┆ 56.633641 ┆ 1.337633 ┆ 2.361905 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ noop ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 280.71254 ┆ 275.686991 ┆ 5.025549 ┆ 1.822918 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ │ sliding_window ┆ torch.bfloat16 ┆ (2, 16, 16384, 16, ┆ 152.970031 ┆ 150.489109 ┆ 2.480923 ┆ 1.648573 │ │ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │ │ causal ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 161.031318 ┆ 158.597808 ┆ 2.43351 ┆ 1.534391 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ └────────────────┴────────────────┴────────────────────────┴───────────────┴──────────────┴──────────┴───────────┘ 🔻 Top 5 Negative TFlops Deltas (lowest -%): shape: (5, 7) ┌────────────────┬────────────────┬───────────────────────┬───────────────┬──────────────┬───────────┬───────────┐ │ attn_type ┆ dtype ┆ shape(B,Hq,M,Hkv,N,D) ┆ TFlops (base) ┆ TFlops (max) ┆ delta ┆ pct_delta │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │ ╞════════════════╪════════════════╪═══════════════════════╪═══════════════╪══════════════╪═══════════╪═══════════╡ │ noop ┆ torch.bfloat16 ┆ (4, 16, 1024, 16, ┆ 244.052884 ┆ 248.65129 ┆ -4.598406 ┆ -1.849339 │ │ ┆ ┆ 1024, 64) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 4, ┆ 175.546923 ┆ 177.81205 ┆ -2.265127 ┆ -1.273888 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ │ sliding_window ┆ torch.bfloat16 ┆ (4, 16, 16384, 4, ┆ 156.282597 ┆ 158.209134 ┆ -1.926537 ┆ -1.217715 │ │ ┆ ┆ 16384, 64) ┆ ┆ ┆ ┆ │ │ 
sliding_window ┆ torch.bfloat16 ┆ (2, 16, 2048, 16, ┆ 232.542929 ┆ 235.140136 ┆ -2.597207 ┆ -1.104536 │ │ ┆ ┆ 2048, 128) ┆ ┆ ┆ ┆ │ │ alibi ┆ torch.bfloat16 ┆ (2, 16, 1024, 16, ┆ 169.652791 ┆ 171.475986 ┆ -1.823195 ┆ -1.063236 │ │ ┆ ┆ 1024, 128) ┆ ┆ ┆ ┆ │ └────────────────┴────────────────┴───────────────────────┴───────────────┴──────────────┴───────────┴───────────┘ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161667 Approved by: https://github.com/Chillee, https://github.com/BoyuanFeng	2025-09-05 23:21:46 +00:00
Xuan Zhang	4d4abec80f	allow user to pass in custom partitioner function (#157580 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157580 Approved by: https://github.com/bdhirsh	2025-09-05 22:49:39 +00:00
Nikita Shulga	9c03d6be87	[CD][BE] Delete Python-3.9 case (#162265 ) And raise error when building for an unsupported version Pull Request resolved: https://github.com/pytorch/pytorch/pull/162265 Approved by: https://github.com/clee2000, https://github.com/Skylion007, https://github.com/ZainRizvi ghstack dependencies: #162297	2025-09-05 22:46:36 +00:00
Nikita Shulga	8d50355d97	[CD][EZ] Update libtorch python version to 3.10 (#162297 ) Not sure why it was at 3.9 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162297 Approved by: https://github.com/clee2000, https://github.com/atalman	2025-09-05 22:46:36 +00:00
dolpm	e0a62b266c	[aot-precompile] default-filter global guards (#162090 ) if the user doesn't provide their own guard filter fn, we should by default filter global guards. pytest test/dynamo/test_aot_compile.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/162090 Approved by: https://github.com/zhxchen17	2025-09-05 22:44:55 +00:00
Saurabh Mishra	01ab325cc2	[DCP][Quantization] Fix the issue when scale vector is in a different SafeTensors file (#162214 ) Summary: The current dequantization implementation assumes that the weight and scale tensors are in the same SafeTensors file. This diff fixes the issue to support the case when these could be in different files. Test Plan: buck test fbcode//caffe2/test/distributed/checkpoint\:test_quantized_hf_storage Buck UI: https://www.internalfb.com/buck2/532bf151-bb40-41fd-b080-ff898675afe2 Test UI: https://www.internalfb.com/intern/testinfra/testrun/15199648851011082 Rollback Plan: Differential Revision: D81718598 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162214 Approved by: https://github.com/wwwjn	2025-09-05 22:43:58 +00:00
Laith Sakka	79fcd5247a	symbolic cpp channels_last_contiguous (#160402 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160402 Approved by: https://github.com/aorenste	2025-09-05 21:40:32 +00:00
rzou	70d36e047d	Making batching rule for F.embedding DTensor-aware (#162117 ) `vmap(F.embedding)(DTensor, DTensor)` was failing because F.embedding's batching rule generates a new tensor via at::arange, at::arange generates a regular tensor, and DTensor rightfully errors on mixed DTensor-regular Tensor operations. This PR fixes the problem by activating DTensor implicit replication on just the at::arange and the subsequent add operation. In order to accomplish this I move the DTensor implicit replication flag to C++ (most batching rules are in C++). Test Plan: - new test Pull Request resolved: https://github.com/pytorch/pytorch/pull/162117 Approved by: https://github.com/bdhirsh	2025-09-05 21:40:14 +00:00
Nikita Shulga	a00cdc1e41	[CD][BE] Get rid of SETUPTOOLS and PYYAML extra pins (#162266 ) As those weren't really pins to begin with, and requirements.txt already has those Pull Request resolved: https://github.com/pytorch/pytorch/pull/162266 Approved by: https://github.com/clee2000, https://github.com/Skylion007, https://github.com/ZainRizvi ghstack dependencies: #162263, #162264	2025-09-05 21:32:52 +00:00
Shunzhi Wen	c10195e723	[C10d][Gloo] Enable complex datatype support in ProcessGroupGloo (#156633 ) - Enable communication of tensors with Complex datatype in ProcessGroupGloo, similar to how ProcessGroupNCCL handles it. - Move a function, which checks if Complex datatype is supported by a reduce operation, from ProcessGroupNCCL.cpp into a new file to be shared with ProcessGroupGloo. Fixes #156632 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156633 Approved by: https://github.com/d4l3k	2025-09-05 21:24:36 +00:00
Boyuan Feng	771f369448	[Inductor] Improve RoPE (#161420 ) This PR fuses ROPE from 2 kernels into 1 kernel. Shape: ``` q: [B, Hq, S, D] k: [B, Hkv, S, D] ``` `Hq=32, Hkv=8, D=128` following Llama3 setting. <img width="980" height="624" alt="image" src="https://github.com/user-attachments/assets/652a8227-6f1d-465c-97fd-2b0af41f8ed9" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161420 Approved by: https://github.com/shunting314	2025-09-05 20:55:20 +00:00
henrylhtsang	92a43025e0	[cutlass backend] Add FP8 tests for multiple linears (#160782 ) Adding a test that is closer to real use case. Thanks @mlazos for fixing a few issues so this test works for most cases. We still have to skip the AOTI and dynamic case due to accuracy issues. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160782 Approved by: https://github.com/mlazos	2025-09-05 20:23:25 +00:00
Xuehai Pan	2fa0520a64	[BE][pytree] cleanup parameterized pytree tests (#160842 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160842 Approved by: https://github.com/Skylion007	2025-09-05 20:15:29 +00:00
Edward Z. Yang	01edcd4df8	Make distributed modules importable even when backend not built (#159889 ) This PR is greatly simplified now that it is stacked on top of a PR that builds with distributed always. We only need to stub functions that may not be defined due to a backend not being enabled. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159889 Approved by: https://github.com/wconstab ghstack dependencies: #160449	2025-09-05 20:15:11 +00:00
Edward Yang	de893e96c7	Always build USE_DISTRIBUTED. (#160449 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160449 Approved by: https://github.com/wconstab, https://github.com/albanD, https://github.com/dcci	2025-09-05 20:15:11 +00:00
Nikita Shulga	6087ef41e5	[BE] Cleanup stale comments/copy from `gemm` (#162001 ) Followup after https://github.com/pytorch/pytorch/pull/154012 Since the introduction of `gemm_no_downcast_stub` it's no longer necessary to allocate temporary array and then manually implement the `beta` logic in the codebase Pull Request resolved: https://github.com/pytorch/pytorch/pull/162001 Approved by: https://github.com/drisspg ghstack dependencies: #161999	2025-09-05 19:59:51 +00:00
Nikita Shulga	a3c7f77e50	[EZ][CD] Update MacOS deployment platform to 11.0 (#162264 ) Fixes following warning ``` MACOSX_DEPLOYMENT_TARGET is set to a lower value (10.15) than the version on which the Python interpreter was compiled (11.0) ``` Update deployment platform in `README.MD` as well Pull Request resolved: https://github.com/pytorch/pytorch/pull/162264 Approved by: https://github.com/clee2000, https://github.com/Skylion007, https://github.com/ZainRizvi ghstack dependencies: #162263	2025-09-05 19:58:04 +00:00
Justin Chu	3771380f83	[ONNX] Hide draft export under a flag (#162225 ) Use `TORCH_ONNX_ENABLE_DRAFT_EXPORT` to control whether draft_export should be used as a strategy in onnx export. Follow up of https://github.com/pytorch/pytorch/pull/161454 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162225 Approved by: https://github.com/xadupre, https://github.com/titaiwangms	2025-09-05 19:54:50 +00:00
PyTorch MergeBot	adae7f66aa	Revert "Always build USE_DISTRIBUTED. (#160449 )" This reverts commit c37103234afc832dcad307e9016230810957c9d5. Reverted https://github.com/pytorch/pytorch/pull/160449 on behalf of https://github.com/jeanschmidt due to Breaking internal build rules, see D81756619 ([comment](https://github.com/pytorch/pytorch/pull/160449#issuecomment-3259430011))	2025-09-05 18:58:47 +00:00
PyTorch MergeBot	70f865ac9b	Revert "Make distributed modules importable even when backend not built (#159889 )" This reverts commit ef3be6726f7ff4b77c22db10cec5b686f9107ea9. Reverted https://github.com/pytorch/pytorch/pull/159889 on behalf of https://github.com/jeanschmidt due to Breaking internal build rules, see D81756619 ([comment](https://github.com/pytorch/pytorch/pull/160449#issuecomment-3259430011))	2025-09-05 18:58:47 +00:00
Scott Wolchok	88d94d17e8	Add torch.Tensor._make_dtensor to accelerate DTensor.__new__ further (#161590 ) This seems to be a (very very roughly) ~8% improvement on DTensor benchmark very similar to the benchmark from #160580 (120ish usec -> 110ish usec) Differential Revision: [D81530105](https://our.internmc.facebook.com/intern/diff/D81530105) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161590 Approved by: https://github.com/albanD ghstack dependencies: #161466, #161586	2025-09-05 18:43:41 +00:00
Ruben Rodriguez Buchillon	c321111499	[inductor][ez] V.choices.get_mm_configs returns list of ChoiceCallers (#161348 ) \# why - every callsite just executes the generator on the spot - previous pr adds the ability to add an override before expensive generators are executed, so we don't need this generator anymore \# what - rather than yielding the ChoiceCaller, just return the list of all valid ChoiceCallers \# testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520574](https://our.internmc.facebook.com/intern/diff/D81520574) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161348 Approved by: https://github.com/eellison ghstack dependencies: #162075, #161340, #161341, #161342, #161343, #161344, #161345, #161346, #161347	2025-09-05 18:02:53 +00:00
Ruben Rodriguez Buchillon	9a8d454c46	[inductor] add kernel template choice (ktc) (#161347 ) # why - gather everything up to make choices, without running potentially expensive generators - enables overrides where we toss the entire list of configs from inductor, without having to enumerate it (expensive) # what - add a holding class that just gets all the components necessary to generate a ChoiceCaller - use that class to generate ChoiceCallers - this does not (yet) add the override function, but just prepares the scene ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520569](https://our.internmc.facebook.com/intern/diff/D81520569) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161347 Approved by: https://github.com/eellison ghstack dependencies: #162075, #161340, #161341, #161342, #161343, #161344, #161345, #161346	2025-09-05 18:02:53 +00:00
Ruben Rodriguez Buchillon	e02e9edb55	[inductor] V.choice.get_mm_configs takes a stack of templates (#161346 ) # why - enables us to just gather relevant templates and get all choices at once - that in turns allows us to make op wide override decisions # what - V.choice.get_mm_configs takes a stack of templates - all callsites just provide a stack of size 1 right now but do not merge everything yet (other features pending) # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520583](https://our.internmc.facebook.com/intern/diff/D81520583) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161346 Approved by: https://github.com/eellison ghstack dependencies: #162075, #161340, #161341, #161342, #161343, #161344, #161345	2025-09-05 18:02:46 +00:00
Ruben Rodriguez Buchillon	d63ad53a99	[inductor][ez] return choicecallers directly (#161345 ) # why - remove repeat patterns - we have everything to make the choicecallers - templates - input_nodes - layouts - all the kwargs # what - yield a choicecaller directly from V.choices.get_mm_configs # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520577](https://our.internmc.facebook.com/intern/diff/D81520577) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161345 Approved by: https://github.com/jansel ghstack dependencies: #162075, #161340, #161341, #161342, #161343, #161344	2025-09-05 18:02:38 +00:00
Ruben Rodriguez Buchillon	031d79cb51	[inductor] move max-autotune logic inside V.choices.get_mm_configs (#161344 ) # why - heuristics providers now decide whether to (or which choices to add) in the max-autotune case - enables an eventual override point to gracefully fallback to the standard behavior # what - max-autotune is determined inside V.choices.get_mm_configs because it's mm only right now, we can just do `config.max_autotune or config.max_autotune_gemm` a TODO indicates that this can change in the future when this expands to more templates # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520573](https://our.internmc.facebook.com/intern/diff/D81520573) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161344 Approved by: https://github.com/jansel ghstack dependencies: #162075, #161340, #161341, #161342, #161343	2025-09-05 18:02:30 +00:00
Ruben Rodriguez Buchillon	a301dc3b60	[inductor][ez] pass template rather than template.uid (#161343 ) # why - simpler interface - enables future of extracting more things out of the template e.g. a hash # what V.choices.get_mm_configs now takes the whole template rather than just the template.uid # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520576](https://our.internmc.facebook.com/intern/diff/D81520576) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161343 Approved by: https://github.com/jansel ghstack dependencies: #162075, #161340, #161341, #161342	2025-09-05 18:02:22 +00:00
Ruben Rodriguez Buchillon	af590cb729	[inductor][aten] treat like a template in GEMMs (#161342 ) # why - central point to analyze and override all generated choices # what - add a pseudo heuristic for aten that just yields a single, empty kwargs - add a pseudo heuristic with the bias_addmm logic for it - add an addmm specific heuristic that yields a single choice, but also expands it with alpha and beta kwargs - replace all the aten.bind calls with V.choices.get_mm_configs using the now matching API for aten # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520580](https://our.internmc.facebook.com/intern/diff/D81520580) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161342 Approved by: https://github.com/jansel ghstack dependencies: #162075, #161340, #161341	2025-09-05 18:02:10 +00:00
Ruben Rodriguez Buchillon	4902c76c65	[inductor][ez] add template/externchoice uid (#161341 ) # why - to have a central registry of templates/externkernelchoice to match them to heuristics etc, they need unique names - mm is both the triton template name and the aten_mm name # what - add a uid() to KernelTemplate/ExternKernelChoice that returns name - override in ExternKernel to prepend "aten::" - override in TritonTemplate to prepend "triton::" This id is just used to find template heuristics, so it has no other impact # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520579](https://our.internmc.facebook.com/intern/diff/D81520579) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161341 Approved by: https://github.com/jansel, https://github.com/eellison ghstack dependencies: #162075, #161340	2025-09-05 18:01:58 +00:00
Ruben Rodriguez Buchillon	9602590b15	[inductor] move scaled_mm input nodes logic (#161340 ) # why - a step towards a unified interface for all choices, where any adjustment to nodes (e.g. unsqueezing) happens as part of choice specific preprocessing, behind a common point # what - move the unsqueeze logic for triton nodes for scaled_mm inside the new hookup for adjusting the kernel inputs for template heuristics # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k "scale" ``` Differential Revision: [D81520582](https://our.internmc.facebook.com/intern/diff/D81520582) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161340 Approved by: https://github.com/jansel, https://github.com/eellison ghstack dependencies: #162075	2025-09-05 18:01:44 +00:00
Ruben Rodriguez Buchillon	2ef665ae19	[inductor][contigous mm] mild refactor (#162075 ) # why - use the new heuristics logic better to handle kwargs # what - move all checks into the heuristics to yield a single choice or no choices if the decomposition should not be used - fix `hip` device type, which should be `cuda` - let heuristics handle the kwarg passing # testing in ci Differential Revision: [D81706776](https://our.internmc.facebook.com/intern/diff/D81706776) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162075 Approved by: https://github.com/exclamaforte, https://github.com/jansel	2025-09-05 18:01:07 +00:00
Mikayla Gawarecki	b18bb6796f	Add const to stable amax (#162082 ) Fixes https://github.com/pytorch/pytorch/issues/161826 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162082 Approved by: https://github.com/soulitzer	2025-09-05 17:37:49 +00:00
PyTorch MergeBot	d711f27845	Revert "[ROCm] [CK] Composable Kernel integration for inductor backend (#158747 )" This reverts commit 019fed39aa6b2dd8c69347378d53423e5efae8d4. Reverted https://github.com/pytorch/pytorch/pull/158747 on behalf of https://github.com/jithunnair-amd due to Broke linux-binary-manywheel-rocm / manywheel-py3_9-rocm6_4-test: `019fed39aa/1` ... PR didn't have this job run successfully due to CI outage ([comment](https://github.com/pytorch/pytorch/pull/158747#issuecomment-3259212343))	2025-09-05 17:27:45 +00:00
Nikita Shulga	261a84a176	[CD][BE] Remove unnecessary checks for XCode version (#162263 ) None of them have worked for a while, PyTorch for Mac is built with XCode-15.4 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162263 Approved by: https://github.com/clee2000, https://github.com/Skylion007, https://github.com/ZainRizvi	2025-09-05 17:02:36 +00:00
xinan.lin	98374612fc	[Intel GPU] Update Intel triton commit pin to Triton 3.5.x (#161777 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161777 Approved by: https://github.com/EikanWang	2025-09-05 16:55:47 +00:00
Eddie Yan	c2a3024617	[cuBLASLt][FP8] `cuBLASLt` appears to support float8 rowwise-scaling on H100 (#161305 ) Following #157905 I think the macro around ``` TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); ``` was never updated and this would cause `float8` tests to fail. Also it appears the `Lt` accepts two inputs with `e4m3` and `e5m2` dtypes simultaneously, so removing that check here as well... CC @lw Pull Request resolved: https://github.com/pytorch/pytorch/pull/161305 Approved by: https://github.com/Skylion007, https://github.com/drisspg, https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-05 16:55:09 +00:00
Xingyuan Li	b2c7b9ad2d	[Intel GPU][FlexAttention] Enable TMA path on Intel GPU (#162138 ) The existing `can_use_tma` has some conditions that are unnecessary for Intel GPUs. We have removed these useless conditions on the Intel GPU path. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162138 Approved by: https://github.com/liangan1, https://github.com/EikanWang, https://github.com/jansel, https://github.com/etaf	2025-09-05 16:54:51 +00:00
PyTorch MergeBot	f3cebec39e	Revert "Rename propagate_tensor_meta to make private again (#161744 )" This reverts commit 734ce8eba9c69381f187359bf0fef1d71d84cd20. Reverted https://github.com/pytorch/pytorch/pull/161744 on behalf of https://github.com/jeanschmidt due to seems to break internal tests, see D81657000 for more details ([comment](https://github.com/pytorch/pytorch/pull/161744#issuecomment-3258934519))	2025-09-05 16:20:29 +00:00
Saurabh Mishra	06da7c0730	[DCP][Quantization] Fix for FP8 multiplication during dequantization (#162202 ) Summary: Weight vector needs to be upcasted since some FP8 formats (like Float8_e4m3fn) don't have CPU implementations in PyTorch. Reference: https://docs.pytorch.org/docs/stable/tensors.html#id13 We will use FP32 for the scale vector multiplication and convert to the target dtype. Upcasting helps with the following: 1. Full CPU support: `float32` has complete CPU kernel implementations for all operations 2. Numerical stability: `float32` provides more precision during intermediate calculations 3. Compatibility: Works across all devices (CPU/GPU) and PyTorch versions Test Plan: UTs Rollback Plan: Differential Revision: D81711093 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162202 Approved by: https://github.com/wwwjn	2025-09-05 16:06:21 +00:00
Edward Yang	2dd529df00	A basic CLAUDE.md based on bad things I see claude code doing (#162163 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162163 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-09-05 14:52:36 +00:00
Shunting Zhang	a714437093	[ez][inductor] add a few outer dimension reduction cases for LOAF (#162028 ) For the not able to fuse issue reported here: https://github.com/pytorch/pytorch/issues/93718 , LOAF can fuse the outer dimension softmax into a single kernel and brings 1.87x speedup for the example shape mentioned in the issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162028 Approved by: https://github.com/jansel, https://github.com/eellison	2025-09-05 09:30:13 +00:00
atalman	bffc7dd1f3	[CD] Add cuda 13.0 libtorch builds, remove CUDA 12.9 builds (#161916 ) Related to https://github.com/pytorch/pytorch/issues/159779 Adding CUDA 13.0 libtorch builds, followup after https://github.com/pytorch/pytorch/pull/160956 Removing CUDA 12.9 builds, See https://github.com/pytorch/pytorch/issues/159980 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161916 Approved by: https://github.com/jeanschmidt, https://github.com/Skylion007 Co-authored-by: Ting Lu <tingl@nvidia.com>	2025-09-05 07:47:54 +00:00
Zeng, Xiangdong	5c473e9f5e	[1/N] Port 5 _composable/fsdp distributed test cases to Intel GPU (#159118 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. We could enable Intel GPU with following methods and try the best to keep the original code styles: - use "torch.accelerator.current_accelerator()" to determine the accelerator backend - enabled XPU for some test path - skip some test cases which Intel GPU does not support Pull Request resolved: https://github.com/pytorch/pytorch/pull/159118 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-05 05:52:15 +00:00
Pian Pawakapan	5da573c42c	[PGO] handle PGO profile merges (#162097 ) Avoid merges from extra PGO key, if same source has different rank. Unlikely to happen (needs code hash match & source variable type to change), but being safe. Differential Revision: D81299840 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162097 Approved by: https://github.com/bobrenjc93	2025-09-05 04:58:15 +00:00
PyTorch UpdateBot	494878a11b	[audio hash update] update the pinned audio hash (#162114 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162114 Approved by: https://github.com/pytorchbot	2025-09-05 04:32:16 +00:00
PyTorch UpdateBot	3bbc2e3e4f	[vllm hash update] update the pinned vllm hash (#162226 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162226 Approved by: https://github.com/pytorchbot	2025-09-05 04:32:08 +00:00
Nick Riasanovsky	b67c410398	[BE] [Inductor] Add Kernel name to all coor-desc tuning (#161409 ) Summary: When running coordinate descent tuning the logging is difficult to parse if the results are parallelized at all. This includes the kernel name in each step so post-processing can unify the results, even if run in parallel. Test Plan: NFC. Just a logging change. Rollback Plan: Differential Revision: D80942794 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161409 Approved by: https://github.com/PaulZhang12	2025-09-05 02:53:13 +00:00
Colin L Reliability Rice	be5b03dde9	Allow for using a dedicated binary for the torch subproc pool. (#162093 ) Summary: The binary torch is running inside of can be larger than needed and in certain situations, this can cause a loss of memory. Test Plan: We've manually run tests via ``` TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_WORKER_SUPPRESS_LOGGING=0 make mc8-train-publish-cint-datafm-toy -C minimal_viable_ai/models/ifr_mtml/main_v1/ 2>&1 \| tee ~/run_out ``` and overriding the binary used to be the built fbpkg in /packages. We've also kicked off manual runs at ``` fire-feid-20250903-1051-ae8c6827 ``` Which do show the binary running - https://fburl.com/scuba/procprint/e6lwv32m Rollback Plan: steps: - jk.update: jk: pytorch/compiler:subproc_worker_binary constant_bool: null consistent_pass_rate: null fractional_host_rollout: null sampling_rate: null - manual.note: content: '' Differential Revision: D81616624 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162093 Approved by: https://github.com/masnesral	2025-09-05 01:43:46 +00:00
Eddie Yan	73eb4511fb	[B200][NVFP4] Fix argument passing in `test_blockwise_mxfp8_nvfp4_mxfp4_numerics_` (#162185 ) to unblock https://github.com/pytorch/pytorch/pull/159494 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162185 Approved by: https://github.com/Skylion007, https://github.com/drisspg	2025-09-05 01:24:59 +00:00
Jeffro	29280864d9	Add new parameter for gen_pyi.py to make it more configurable. (#161772 ) This is a reposting of PR #128519. This change is important to how we maintain PyTorch at Google. From the previous PR: " This will make the script more flexible for the directory where it is executed. ... We plan to use the deprecated_yaml from a blaze genrule that invokes pyi.py. As the input to the pyi.py, genrule requires the input file to be explicitly listed out. When we feed the value of tools/autograd/deprecated.yaml to genrule, it failed to resolve since tools/autograd is a package from blaze perspective. Any file under a blaze package will a proper blaze target to be access. " Pull Request resolved: https://github.com/pytorch/pytorch/pull/161772 Approved by: https://github.com/albanD Co-authored-by: Haifeng Jin <haifeng-jin@users.noreply.github.com>	2025-09-05 00:48:15 +00:00
angelayi	5c67426d68	[dynamo] Add support for const prop on .item (#162204 ) Fixes some of the errors in https://fb.workplace.com/groups/1028545332188949/permalink/1303030824740397/ Pull Request resolved: https://github.com/pytorch/pytorch/pull/162204 Approved by: https://github.com/williamwen42	2025-09-05 00:28:49 +00:00
Nikita Shulga	d2d4c8e9b2	[BLAS] Avoid downcasts for fp16fp16->fp32 BLAS (#161999 ) Followup after https://github.com/pytorch/pytorch/pull/154012 Fixes CPU part of https://github.com/pytorch/pytorch/issues/160841 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161999 Approved by: https://github.com/drisspg	2025-09-04 23:35:27 +00:00
Eddie Yan	c7e41071a0	[B200][MXFP8] Fix regex in `test_blockwise_mxfp8_nvfp4_error_messages_recipe_mxfp8_cuda` (#162180 ) to unblock https://github.com/pytorch/pytorch/pull/159494 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162180 Approved by: https://github.com/Skylion007, https://github.com/drisspg, https://github.com/nWEIdia	2025-09-04 23:29:10 +00:00
xinan.lin	9499c8761c	[Inductor][Intel GPU] Register triton template heuristic for addmm tma. (#162132 ) Fixes #162048 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162132 Approved by: https://github.com/jansel	2025-09-04 23:01:57 +00:00
Nan Zhang	3a207816cc	Forward fix for user defined triton kernel grid calc (#162162 ) Summary: This change fixes the test: inductor:fxir_backend - test_custom_triton_autotune_dynamic which was broken by https://github.com/pytorch/pytorch/pull/160997 Test Plan: inductor:fxir_backend - test_custom_triton_autotune_dynamic Rollback Plan: Differential Revision: D81679217 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162162 Approved by: https://github.com/eellison, https://github.com/jansel	2025-09-04 22:51:23 +00:00
Yiming Zhou	09be1890d7	[export] Fix torch.export.load with storage offset (#162172 ) Summary: As titled Test Plan: CI Rollback Plan: Differential Revision: D81687701 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162172 Approved by: https://github.com/angelayi	2025-09-04 22:50:33 +00:00
Pian Pawakapan	0d84ff3b78	[PGO] log add_extra_remote PGO to tlparse (#161751 ) Summary: log when additional PGO profile is merged in, from added read key Test Plan: test_pgo Rollback Plan: Differential Revision: D81284190 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161751 Approved by: https://github.com/bobrenjc93	2025-09-04 22:47:03 +00:00
PyTorch MergeBot	1ec2c15914	Revert "Fix Arm64 OSS pytorch build with FBGEMM (#161527 )" This reverts commit dbec08729fb9848bebed6048c63831b87170d061. Reverted https://github.com/pytorch/pytorch/pull/161527 on behalf of https://github.com/malfet due to This breaks all Mac builds, see `b04e922712/1` ([comment](https://github.com/pytorch/pytorch/pull/161527#issuecomment-3256034443))	2025-09-04 22:29:38 +00:00
Shangdi Yu	b04e922712	Fix memory leak in AOTI when calling `aoti_torch_as_strided` (#162118 ) Summary: Fix memory leak in AOTI when calling `aoti_torch_as_strided` If you have something like `AtenTensorHandle buf_handle`; and you allocated memory to it, you have to make it a `RAIIAtenTensorHandle` to release the ownership. Otherwise you have leaked the memory because even when the program ends, there's still a pointer pointing to the underlying storage of `buf_handle_restrided`, and the storage is never freed. Test Plan: ``` buck run fbcode//mode/dev-nosan fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_pad_non_zero_memory_leak ``` Also verified by looking at `print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB")` Differential Revision: D81640339 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162118 Approved by: https://github.com/angelayi	2025-09-04 22:17:06 +00:00
Brian Hirsh	0d71a9dd5b	fix incorrect interaction between DDPOptimizer and donated buffers (#160745 ) This should fix https://x.com/wightmanr/status/1953147089518772254?t=ng_R4t0-tRhO_qQE8NqOhw&s=19. Still working on adding a reasonable test. You can see more of a description of the problem in the code comments. But the TLDR is that: * When using DDPOptimizer, we partition the graph and compile several subgraphs. So 1 dynamo graph becomes N AOT/inductor artifacts * We have some existing logic to stash graph metadata (`fw_metadata`) in dynamo's TracingContext. When using DDPOptimizer, we generate one `fw_metadata` per AOT graph, and we stash it on the 1 TracingContext from dynamo. So we end up clobbering the `fw_metadata` for graph i-1 when AOT and inductor start compiling graph i * This is normally ok, but it becomes a problem if inductor ever wants to read from this `fw_metadata` during backward compilation. Why? We (by default) compile the backwards lazily. So when using DDPOptimizer, we will compile backward graph N, then bw graph N-1, etc. But... at the time that we have started compiling bw graph N-1, its corresponding fw_metadata has already been clobbered! So we end up reusing graph N's metadata for all of our backward graph compilations. With donated buffer metadata, that means we end up donating and writing into incorrect input buffers The fix that I added was to add more dedicated DDPOptimizer metadata into the TracingContext, so we can properly switch between these N different `fw_metadata` objects in the backward. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160745 Approved by: https://github.com/ezyang, https://github.com/zou3519	2025-09-04 21:57:27 +00:00
Ke Wen	89d41d3f61	[SymmMem] Feed tensor.data_ptr instead of handle.buffer_ptr into kernels (#162193 ) After MemPool support, `get_buffer_ptrs` points to base address of allocation segment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162193 Approved by: https://github.com/ngimel	2025-09-04 21:26:05 +00:00
Ke Wen	9bdcee01f8	[SymmMem] Add root argument to broadcast op (#161090 ) It was missing earlier. Also added range check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161090 Approved by: https://github.com/fegin	2025-09-04 21:09:54 +00:00
Prachi Gupta	b9ba612f7a	[ROCm] Enabling several UTs (#161715 ) All these UTs are working as is, just removing the skip - test_p2p_ipc - test_repros.py: working, added fp8 support - test_activation_checkpointing.py - test_content_store.py - test_cuda_multigpu.py - test_compute_comm_reordering.py - test_segment_reductions.py - test_dataloader.py - test_math_ops.py - test_loop_ordering.py - test_control_flow.py - distributed_test.py - test_mem_tracker.py - test_fsdp_optim_state.py - test_fully_shard_mixed_precision.py: skipped for < ROCm7.0 - test_aot_inductor_custom_ops.py - test_c10d_ops_nccl.py - test_eager_transforms.py - test_sparse_csr.py - test_inductor_collectives.py - test_fake_tensor.py - test_cupy_as_tensor.py - test_cuda.py: enable UTs that are working - test_matmul_cuda.py: enable UTs that are working Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/161715 Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily	2025-09-04 20:43:03 +00:00
PyTorch MergeBot	d5b38410b5	Revert "[SymmMem] Add root argument to broadcast op (#161090 )" This reverts commit 3c0ff1b569c45cfa6935ad8031a9d4cf1551aa3f. Reverted https://github.com/pytorch/pytorch/pull/161090 on behalf of https://github.com/jeanschmidt due to breaks internal builds ([comment](https://github.com/pytorch/pytorch/pull/161090#issuecomment-3255574093))	2025-09-04 20:42:31 +00:00
PyTorch MergeBot	48bedd753d	Revert "Fix usage of forwarding references (#161094 )" This reverts commit 1ebd70d0c0d562d3be9abdee2a21906584af7d99. Reverted https://github.com/pytorch/pytorch/pull/161094 on behalf of https://github.com/jeanschmidt due to checking if revert will fix https://github.com/pytorch/pytorch/actions/runs/17470601839/job/49621447581 ([comment](https://github.com/pytorch/pytorch/pull/161094#issuecomment-3255541480))	2025-09-04 20:35:41 +00:00
Wang, Eikan	a3d72b09ae	Apply Triton tensor descriptor for flex-decoding for performance (#161643 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161643 Approved by: https://github.com/drisspg	2025-09-04 20:10:41 +00:00
Edward Z. Yang	ef3be6726f	Make distributed modules importable even when backend not built (#159889 ) This PR is greatly simplified now that it stacked on top of a PR that builds with distributed always. We only need to stub functions that may not be defined due to a backend not being enabled. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159889 Approved by: https://github.com/wconstab ghstack dependencies: #160449	2025-09-04 20:05:50 +00:00
PyTorch MergeBot	95ee0bfea9	Revert "[nativert] triton runtime implementation (#161798 )" This reverts commit 3dde5d7f9bf80dd6623a712bc429e9e4302464b5. Reverted https://github.com/pytorch/pytorch/pull/161798 on behalf of https://github.com/jeanschmidt due to introducing linting failures ([comment](https://github.com/pytorch/pytorch/pull/161798#issuecomment-3255412085))	2025-09-04 20:05:24 +00:00
Ben Niu	dbec08729f	Fix Arm64 OSS pytorch build with FBGEMM (#161527 ) Summary: X-link: https://github.com/pytorch/FBGEMM/pull/4775 Without this change, Arm64 OSS pytorch build with FBGEMM failed with the following error. Undefined symbols for architecture arm64: "fbgemm::FindMinMax(float const, float, float*, long long)", referenced from: at::native::fbgemm_linear_int8_weight_fp32_activation(at::Tensor const&, at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&, at::Tensor const&) in QuantizedLinear.cpp.o at::native::fbgemm_linear_quantize_weight(at::Tensor const&) in QuantizedLinear.cpp.o PackedConvWeight<2>::apply_dynamic(at::Tensor const&, bool) in qconv_dynamic.cpp.o PackedConvWeight<3>::apply_dynamic(at::Tensor const&, bool) in qconv_dynamic.cpp.o at::Tensor PackedLinearWeight::apply_dynamic_impl<false>(at::Tensor, bool) in qlinear_dynamic.cpp.o at::Tensor PackedLinearWeight::apply_dynamic_impl<true>(at::Tensor, bool) in qlinear_dynamic.cpp.o ld: symbol(s) not found for architecture arm64 This change fixed the issue by moving FindMinMax's implementation from QuantUtilsAvx2.cc to QuantUtils.cc. FindMinMax is a platform-agnostic function with AVX2-specific optimizations so conceptually it can be put in QuantUtils.cc. Test Plan: With this change, Arm64 OSS pytorch built successfully with FBGEMM enabled. Rollback Plan: Reviewed By: q10 Differential Revision: D81052327 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161527 Approved by: https://github.com/q10	2025-09-04 20:01:13 +00:00
PyTorch MergeBot	c3d54dea9f	Revert "[BLAS] Avoid downcasts for fp16fp16->fp32 BLAS (#161999 )" This reverts commit 02c83f13348631d80aa23f57aaff6b7d1223bbdd. Reverted https://github.com/pytorch/pytorch/pull/161999 on behalf of https://github.com/jeanschmidt due to break a few internal tests ([comment](https://github.com/pytorch/pytorch/pull/161999#issuecomment-3255381925))	2025-09-04 19:56:48 +00:00
PyTorch MergeBot	afa6e5604d	Revert "[BE] Cleanup stale comments/copy from `gemm` (#162001 )" This reverts commit b40d9432be44a6b5974ee62e7d19c3c61c5ece37. Reverted https://github.com/pytorch/pytorch/pull/162001 on behalf of https://github.com/jeanschmidt due to break a few internal tests ([comment](https://github.com/pytorch/pytorch/pull/161999#issuecomment-3255381925))	2025-09-04 19:56:48 +00:00
PyTorch MergeBot	9e5247f51d	Revert "[MPS] enable cat op for sparse (#162007 )" This reverts commit 2c03f0acc53ed13fe8ebfe809129f25996e009a0. Reverted https://github.com/pytorch/pytorch/pull/162007 on behalf of https://github.com/jeanschmidt due to Breaks internal builds see [D81588372](https://www.internalfb.com/diff/D81588372), @malfet may you help the author? ([comment](https://github.com/pytorch/pytorch/pull/162007#issuecomment-3255357336))	2025-09-04 19:49:44 +00:00
Edward Yang	c37103234a	Always build USE_DISTRIBUTED. (#160449 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160449 Approved by: https://github.com/wconstab, https://github.com/albanD, https://github.com/dcci	2025-09-04 19:43:17 +00:00
dolpm	3dde5d7f9b	[nativert] triton runtime implementation (#161798 ) Summary: att Test Plan: ci Rollback Plan: Reviewed By: minjang Differential Revision: D80828148 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161798 Approved by: https://github.com/minjang, https://github.com/SherlockNoMad	2025-09-04 19:00:15 +00:00
Aaron Gokaslan	1f51056bd6	[BE]: Update cpp-httplib submodule to 0.26.0 (#162181 ) Update cpp-httplib with better error handling, bugfixes, and performance. Header only library update. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162181 Approved by: https://github.com/jansel	2025-09-04 18:59:32 +00:00
Animesh Jain	6b1900c22f	[dynamo][hops] Remove const outputs from the speculated subgraph (#161355 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161355 Approved by: https://github.com/zou3519	2025-09-04 18:52:01 +00:00
mansiag05	9480cdc0b6	Modified the docs to add example for torch.is_floating_point and torc… (#161951 ) …h.is_complex. The PR proposes adding a simple, self-explanatory example to the documentation page. The example demonstrates the function's output for tensors with various data types, showing both True and False return values. Fixes #161859 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161951 Approved by: https://github.com/zou3519	2025-09-04 18:50:19 +00:00
eqy	6f7608d603	[cuDNN][SDPA] Enable cuDNN SDPA by default for SM 9.0, SM 10.0 (#162073 ) for 2.9 🙏 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162073 Approved by: https://github.com/drisspg	2025-09-04 18:46:28 +00:00
Albert W	d1a15abfdc	export: add explicit decomposition for aten.expand_copy and unit test (#161688 ) Fixes #161080 torch.export.export fails with TypeError: expand() got an unexpected keyword argument 'implicit' when calling torch.expand_copy(..., implicit=True). This happened because expand_copy = _make_copy_from_view(aten.expand) registers aten.expand as the decomposition path for aten.expand_copy, which doesn’t accept the implicit argument. I have added an explicit decomposition for aten.expand_copy in torch/_decomp/decompositions.py to ignore the implicit argument, and a simple unit test to demonstrate the bug being fixed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161688 Approved by: https://github.com/angelayi, https://github.com/can-gaa-hou	2025-09-04 18:16:56 +00:00
Animesh Jain	33028597bf	[dynamo] Make the MRO walk more narrow (#162105 ) I don't have a failing test case, but I saw an extra guard somewhere. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162105 Approved by: https://github.com/williamwen42, https://github.com/StrongerXi, https://github.com/jansel	2025-09-04 17:54:33 +00:00
vasiliy	9eadb37cdd	enable float32 and float16 in `torch._grouped_mm` fallback (#162059 ) Summary: Enables `torch.float32` and `torch.float16` options in `torch._grouped_mm`. Note that the fast path is only enabled if `mat_a`, `mat_b`, and `out_dtype` are `torch.bfloat16`. Saving for future PRs: 1. enabling testing on more platforms 2. supporting out_dtype != mat_a.dtype 3. opinfo 4. better compile support Test Plan: ```bash // on A100 and H100 pytest test/test_matmul_cuda.py -s -k test_grouped_gemm -x // on H100 pytest test/test_matmul_cuda.py -s -k test_scaled_grouped_gemm -x ``` Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/162059 Approved by: https://github.com/ngimel, https://github.com/eqy ghstack dependencies: #161407, #161717	2025-09-04 17:48:52 +00:00
vasiliy	61fb632cfb	move `_grouped_mm` fallback to composite explicit autograd (#161717 ) Summary: Moves the `torch._grouped_mm` fallback from cuda-only code to a place where it can be used by multiple backends. Specifically: 1. make the fallback path and util functions reusable and move them to `ATen/native/GroupedMMUtils.h` 2. register a backend-agnostic kernel to composite explicit autograd key 3. refactor the grouped_mm tests to their own test case and enable CPU At the end of this PR, here is the support matrix: * CUDA SM90+: fast path with test coverage (no change) * CUDA SM80+: fallback with test coverage (no change) * CPU: fallback works, but without test coverage (new in this PR) * other SM versions and other backends: will probably already work, but let's leave this to future PRs * float32/float16: will probably already work, but let's leave this to future PRs Test Plan: ```bash pytest test/test_matmul_cuda.py -s -k test_grouped_gemm -x ``` Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/161717 Approved by: https://github.com/ngimel, https://github.com/drisspg ghstack dependencies: #161407	2025-09-04 17:48:52 +00:00
vasiliy	8a736fa1ea	create torch._grouped_mm fallback path with for loops / bmm (#161407 ) Summary: Creates a fallback path for `torch._grouped_mm`, using the naive for loop implementation (or bmm). For the sake of keeping the PR small, this PR only enables SM80+ (CUDA capability 8.0 and up), since I am testing this on an A100 machine. In future PRs, we can increase the coverage of the fallback to: 1. float32 and float16, which will extend the GPU coverage 2. cpu Test Plan: ```bash pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_2d_3d -x pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_3d_2d -x pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_2d_2d -x pytest test/test_matmul_cuda.py -s -k test_grouped_gemm_3d_3d -x ``` Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/161407 Approved by: https://github.com/drisspg, https://github.com/eqy	2025-09-04 17:48:44 +00:00
Ke Wen	8bb213b6d5	[SymmMem] Increase signal pad size for NVL72 (#162026 ) so that the signal calls do not step on each other's foot. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162026 Approved by: https://github.com/ngimel	2025-09-04 17:41:38 +00:00
Ke Wen	869cbcc16e	[SymmMem] Add a helper API to distinguish intra- and inter- node (#161984 ) Added a helper API to tell if the world is entirely within a P2P domain or crosses network. This is mainly for nblocks tuning purpose. (In later PRs) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161984 Approved by: https://github.com/ngimel ghstack dependencies: #161983	2025-09-04 17:37:59 +00:00
Frank Lin	0c0e056a9e	[CUDA] Reuse blocks with record_stream during CUDA Graph capture in the CUDACachingAllocator (#158352 ) ## Introduction During CUDA Graph capture, the CUDA caching allocator currently defers reclaiming blocks until capture ends. This is because CUDA forbids querying events recorded during capture (the CUDA operation is not executed during the capture stage), so the allocator cannot use its normal event-based logic. However, capture records a DAG (we call it the capturing graph) of work. We can use the capturing graph to determine when a block’s old lifetime is fully before future work, and safely reuse it within the same capture. This PR adds an experimental flag `graph_capture_record_stream_reuse: True\|False (default: False)`. When enabled, the allocator inserts lightweight free markers and uses capture ordering to decide if a freed block is safe to reuse during capture. If the proof cannot be established, we fall back to the existing post-capture path. ## Terms * Free marker: A capture-legal no-op (created with `cudaGraphAddEmptyNode`) inserted after the last captured use of the block on each stream that used it. * Terminal: The set of the latest operations of the stream (or the capturing graph). Any newly captured op on that stream will attach after all nodes in this set. For a stream currently capturing, it is the set of nodes returned in `dependencies_out` by `cudaStreamGetCaptureInfo`. ## When can we reuse a block during capture? ### Strong Rule (Graph-Wide Safety) This rule provides a universal guarantee that a block is safe for reuse by any stream in the graph. > A block is safe to reuse if every free marker is a predecessor of every terminal of all active streams in the graph. Why it's safe: This rule establishes a strict global ordering. 
Since any new operation on any stream must be appended after that stream's terminals, this condition guarantees that the block's new lifetime begins only after its old lifetime has completely ended everywhere. This prevents lifetime overlaps when the graph is replayed, ensuring correctness. ### Per-stream Rule (A Practical Optimization) The strong rule, while safe, is often unnecessarily restrictive. The `DeviceCachingAllocator` introduces a crucial constraint that allows for a simpler check. In `DeviceCachingAllocator`, `get_free_block` only returns blocks whose `block->stream == p.stream()`. In other words, we never reuse a block on a stream different from the allocation stream. This means we don't need to verify safety across the entire graph. We only need to confirm that the block is safe to reuse from the perspective of its own allocation stream. > Reuse a block for allocations on stream S if every free marker is a predecessor of every node in the terminal set of S. In short, a block is considered reusable on stream S as long as all marker marking it "free" are guaranteed to complete before any new work that might need it on stream S begins. ## Implementation * On `free(block)` during capture * For each stream in `block->stream_uses` and the allocation stream, insert a free marker (empty node) and make it that stream’s tail. * If we cannot place markers for all such streams (for example, a stream is not in capture), defer to the post-capture path. * Otherwise, store the marker handles and keep the block in the capture-private structures. * On `allocate(stream)` during capture (attempt per-stream reclaim) * Query the allocation stream S’s terminal via `cudaStreamGetCaptureInfo`. * For each deferred block, check whether it is allocated on this stream, and each of its free markers is a predecessor of the terminal. * If yes, hand the block to S for immediate reuse within the same capture. 
* If no, keep it deferred; it will be reconsidered as capture progresses and S’s terminal advances. * On capture end * Any still-deferred blocks follow the existing post-capture reclamation (event insertion/polling). External behavior remains unchanged if we cannot prove safety during capture. ## Examples (2 streams) <img width="641" height="801" alt="pytorch-remove-cudagraph-defer-reclaiming (6)" src="https://github.com/user-attachments/assets/41adc835-d448-483b-99ba-b4341cb7d2a2" /> * Case 0 — Unsafe The two frees are not ordered with respect to each other. For stream 1, the other stream’s free marker does not precede this stream’s terminal, so the per-stream condition fails. Counterexample intuition for the unsafe setups: imagine `f2(x)` runs for a long time. If DeviceCachingAllocator reused block `x` on a stream whose terminal is not ordered after the free markers, the new lifetime could overlap the old one on replay, risking use-after-free or data corruption. The per-stream rule prevents exactly this. * Case 1 — Reusable on stream 1 Stream 1’s terminal is after both frees, so every free marker precedes stream 1’s terminal. The block is reusable for allocations on stream 1. * Case 2 — Not reusable on stream 2, but this cannot occur in `DeviceCachingAllocator` This depicts reusing the block on stream 2 while stream 1’s free is not yet ordered before stream 2’s terminal. Though the block is not safe to reuse on stream 2, DeviceCachingAllocator will not choose that block for stream 2 anyway: `get_free_block` rejects blocks whose `stream != p.stream()`. So this case is unreachable. * Case 3 — Safe (strong rule holds) In this scenario, the terminal nodes of all streams are positioned after the block's free markers, satisfying the strong rule. This guarantees the block is safe for reuse by any stream in the capturing graph. However, since `DeviceCachingAllocator ` only reuses a block on its original allocation stream, verifying this strong condition is unnecessary. 
We only need to ensure the per-stream rule is met for the specific stream requesting the block. * Case 4 — Freeing after a join See the note below. ## Edge Case: Freeing after a join Our current dependency tracking has a limitation in scenarios where a block is freed after a stream join (see @galv's [comments here](https://github.com/pytorch/pytorch/pull/158352#pullrequestreview-3112565198)). In case 4, we have a missed opportunity. Because the block's usage is not explicitly marked, we cannot determine that the block's actual last use may have occurred much earlier, long before the join. As a result, we must wait for the subsequent join before the block can be reused. ## Thanks Thanks to @galv for his great idea around graph parsing and empty nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158352 Approved by: https://github.com/ngimel, https://github.com/eqy Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-04 17:21:26 +00:00
William Wen	f36f285953	[dynamo] change error_on_graph_break/fullgraph semantics (#161747 ) This PR implements the semantics change to `torch._dynamo.error_on_graph_break`: - ~`torch.compile` now has a new `error_on_graph_break` kwarg that serves as a lower-priority toggle for erroring/continuing on graph breaks~ - `error_on_graph_break` is a new internal `torch.compile `setting that is lower-priority than `fullgraph`. It allows the user to toggle erroring/continuing on graph breaks. - `error_on_graph_break` does nothing when `fullgraph=True` - `error_on_graph_break` does NOT guarantee a single graph Followup [DONE]: need to change the programming model docs to reflect the 3 graph break modes for compilation: - `fullgraph=True`: enforce one graph, no graph breaks, cannot be toggled - `fullgraph=False, error_on_graph_break=True`: errors on graph breaks, latter can be toggled during compile time - `fullgraph=False, error_on_graph_break=False`: resumes tracing on graph breaks, latter can be toggled during compile time Pull Request resolved: https://github.com/pytorch/pytorch/pull/161747 Approved by: https://github.com/mlazos ghstack dependencies: #161739	2025-09-04 17:10:17 +00:00
Cui, Yifeng	ba7f546ccc	Update torch-xpu-ops commit pin (#162062 ) Update the torch-xpu-ops commit to [intel/torch-xpu-ops@83c5a5](`83c5a5a551`), includes: - Revert "Disable xccl timer avoid drlm hang" because XPU time event issue has been fixed - Fallback lu_factor kernel to CPU for single batch - Enable aten::linalg_inv and aten::linalg_inv_ex on XPU Pull Request resolved: https://github.com/pytorch/pytorch/pull/162062 Approved by: https://github.com/EikanWang	2025-09-04 17:05:33 +00:00
Lakshay Garg	43b7c86a2c	Add dependency-groups.dev to pyproject.toml (#161216 ) [PEP 735](https://peps.python.org/pep-0735) introduces the [dependency-groups] table for a number of use-cases one of which includes specifying development dependencies for projects. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161216 Approved by: https://github.com/seemethere	2025-09-04 16:51:36 +00:00
iupaikov-amd	019fed39aa	[ROCm] [CK] Composable Kernel integration for inductor backend (#158747 ) This is a part of our effort for integrating Composable Kernel library for Inductor backend. Currently we have a submodule, but would prefer to have commit pin control over the library as with Triton. We intentionally avoid putting all installation logic in CI scripts to allow locally built versions to have this functionality. The idea is to have CK as a pytorch dependency in pytorch 2.9 release to allow people to use it with inductor and AOT inductor and then gradually step away from submodule usage. Right now CK usage in SDPA/Gemm is tied to submodule files. This PR is a remake of https://github.com/pytorch/pytorch/pull/156192, recreated due to a branch error. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158747 Approved by: https://github.com/jeffdaily Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Co-authored-by: Jack Taylor <108682042+jataylo@users.noreply.github.com> Co-authored-by: Max Podkorytov <4273004+tenpercent@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>	2025-09-04 16:51:06 +00:00
Oguz Ulgen	81aeefa657	Add torch.compile support for triton.constexpr_function (#162106 ) Fixes #161868 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162106 Approved by: https://github.com/jansel, https://github.com/zou3519	2025-09-04 16:46:55 +00:00
Edward Yang	248355faf5	Don't require FakeStore to be passed into fake backend (#162164 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162164 Approved by: https://github.com/bdhirsh, https://github.com/albanD, https://github.com/wconstab	2025-09-04 16:43:49 +00:00
Lakshay Garg	1ebd70d0c0	Fix usage of forwarding references (#161094 ) I found a number of places that seem to want forwarding references but the type signature does not reflect that Pull Request resolved: https://github.com/pytorch/pytorch/pull/161094 Approved by: https://github.com/malfet	2025-09-04 16:34:39 +00:00
Alexander Grund	cc5bdd1240	Keep default `CMAKE_PREFIX_PATH` in test_aot_inductor_package (#161907 ) `CMAKE_PREFIX_PATH` is a list of paths used to find dependencies. The test overwrites that with a single path causing dependencies such as protobuf or Abseil not being found. Instead prepend the path to the existing value. This fixes a test failure: > pytorch-v2.7.1/test/inductor/test_aot_inductor_package.py", line 242, in test_compile_after_package > self.assertTrue(so_path.exists()) > AssertionError: False is not true Caused by: ``` /software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::utility: No such file or directory /software/binutils/2.42-GCCcore-13.3.0/bin/ld: cannot find -labsl::variant: No such file or directory collect2: error: ld returned 1 exit status ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161907 Approved by: https://github.com/Skylion007	2025-09-04 16:27:57 +00:00
Yu, Guangye	3a20a20e70	Fix largeTensorTest malfunction on XPU (#161988 ) # Motivation https://github.com/pytorch/pytorch/pull/143553/files#diff-6492991193449e118ff0c8d42ca544cc38a73604e505ff246a3c711aeab91748R1345 makes `largeTensorTest` malfunction on XPU. This PR aims to fix it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161988 Approved by: https://github.com/EikanWang, https://github.com/albanD	2025-09-04 16:10:03 +00:00
PyTorch MergeBot	6b8b3ac440	Revert "[ROCm] Use MI325 (gfx942) runners for binary smoke testing (#162044 )" This reverts commit cd529b686d54bbaa443f5b310140de48422d96c7. Reverted https://github.com/pytorch/pytorch/pull/162044 on behalf of https://github.com/jeffdaily due to mi200 backlog is purged, and mi300 runners are failing in GHA download ([comment](https://github.com/pytorch/pytorch/pull/162044#issuecomment-3254427869))	2025-09-04 16:06:30 +00:00
Boyuan Feng	601ae8e483	[CUDAGraph] add config to error on skipping cudagraph (#161862 ) Many users want a config to force all cuda ops captured by cudagraph. When not possible, pt2 should error. This PR adds `torch._inductor.triton.cudagraph_or_error` for that (default as False). Also added an environment variable `TORCHINDUCTOR_CUDAGRAPH_OR_ERROR` to control. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161862 Approved by: https://github.com/ezyang, https://github.com/mlazos	2025-09-04 15:52:39 +00:00
PyTorch MergeBot	b7dad7dd49	Revert "Always build USE_DISTRIBUTED. (#160449 )" This reverts commit 90b08643c3a6eb1f3265b7d1388bd76660759f46. Reverted https://github.com/pytorch/pytorch/pull/160449 on behalf of https://github.com/jeanschmidt due to Already discussed with @ezyang about the internal quirks and errors ([comment](https://github.com/pytorch/pytorch/pull/160449#issuecomment-3254219358))	2025-09-04 15:25:07 +00:00
Alexander Grund	e532c9d4f1	Relax tolerance for test_quick_baddbmm_cpu_complex64 (#152424 ) On Zen 2 (AMD EPYC) and Intel Sapphire Rapids this fails with small differences when compiled with native targeted optimizations. I.e. it fails with `-march=znver2` but succeeds with `-march=znver1`. I assume some operator fusing is being used by GCC. Small differences like using `vmovdqa` can be seen in the minimized code of the baddbmm kernel: https://godbolt.org/z/jsxMa91Wb The greatest differences are consistent and the same on both CPU architectures: ``` Greatest absolute difference: 3.43852152582258e-05 at index (1, 2, 1) (up to 1e-05 allowed) Greatest relative difference: 3.6034286949870875e-06 at index (1, 2, 1) (up to 1.3e-06 allowed) ``` Hence I assume this is in the expected tolerances especially as `complex128` and all other types pass. Pull Request resolved: https://github.com/pytorch/pytorch/pull/152424 Approved by: https://github.com/malfet	2025-09-04 13:26:42 +00:00
PyTorch MergeBot	34aa78274d	Revert "Make distributed modules importable even when backend not built (#159889 )" This reverts commit 4ae57d448c0a7d37e4cfd5c27d977fad2cef4051. Reverted https://github.com/pytorch/pytorch/pull/159889 on behalf of https://github.com/jeanschmidt due to Failing internal tests, probably typechecks. See D81588399 ([comment](https://github.com/pytorch/pytorch/pull/159889#issuecomment-3253651785))	2025-09-04 13:13:52 +00:00
Deng, Daisy	040d00af04	[2/N]Port several test files under test/distributed to Intel GPU (#159473 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. This PR will work on some test files under test/distributed. We could enable Intel GPU with following methods and try the best to keep the original code styles: - instantiate_device_type_tests() - use "torch.accelerator.current_accelerator()" to determine the accelerator backend - use requires_accelerator_dist_backend to allow both nccl and xccl test - enabled XPU for some test path - Change the hardcoded world_size according to device_count. - Unify some common code under torch/testing/_internal for multiple backend, for example: Added xpu for Backend.backend_capability and dist.Backend.register_backend() Pull Request resolved: https://github.com/pytorch/pytorch/pull/159473 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-04 12:53:17 +00:00
Klaus Zimmermann	9c957723a0	Replace setup.py develop with pip install -e (#156710 ) #156027 already replaced most use of `python setup.py develop`. This PR only adds a few more occurrences. Pull Request resolved: https://github.com/pytorch/pytorch/pull/156710 Approved by: https://github.com/atalman	2025-09-04 11:07:44 +00:00
fengqing.lu	acece97c3a	[Intel GPU] Upgrade OneDNN XPU Tag to v3.9.1 (#161932 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161932 Approved by: https://github.com/EikanWang, https://github.com/Skylion007, https://github.com/guangyey	2025-09-04 11:05:10 +00:00
kbabiuchx	ea1883dfd3	Fixes #154982 : add missing to_result_dtype in vector_norm (#155111 ) Fixes #154982 Pull Request resolved: https://github.com/pytorch/pytorch/pull/155111 Approved by: https://github.com/isuruf, https://github.com/eellison	2025-09-04 10:49:08 +00:00
Shangdi Yu	d67c29ad22	[inductor] Fix int64 from MutationOutput Buffer (#162020 ) Summary: When we have a user defined triton kernel, it marks the mutated outputs as `MutationOutput` with a NoneLayout. This MutationOutput may later be used as input to another inductor-generated triton kernel. When we determine whether to use int32 or int64 for the inductor generated triton kernel, we need to look at the number of elements for all buffers involved. If one of the buffers is a MutationOutput, we should still consider its number of elements, instead of skipping it. To get a hint on the MutationOutput size, we look at the buffers corresponding to `mutation_names` in MutationOutput. Test Plan: ``` buck run mode/opt fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_autotune_int64_user_defined_triton_kernel ``` Differential Revision: D81530083 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162020 Approved by: https://github.com/davidberard98, https://github.com/eellison	2025-09-04 09:47:57 +00:00
vishalgoyal316	09587daf8c	Adding missing example of torch.full_like Issue#161899 (#162051 ) Fixes #161899 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162051 Approved by: https://github.com/zou3519	2025-09-04 08:45:49 +00:00
Chong Gu	c024b1f5a1	[AMD] [Reland] Fix AMD User Defined Kernel Autotune (#161521 ) Summary: This is a reland of D80285441, fixed the unit test. Test Plan: ``` buck2 run mode/opt-amd-gpu -m rocm641 -c fbcode.split-dwarf=true -c fbcode.use_link_groups=true -c fbcode.enable_gpu_sections=true //hpc/new/models/feed/benchmark:feed_lower_benchmark -- --load=manifold://ads_storage_fblearner/tree/user/facebook/fblearner/predictor/894698382/0/gpu_lowering/new_input8 --skip-eager --skip-flop-estimation --sync-mode=0 --lower-backend=AOT_INDUCTOR ``` will succeed after this diff. Rollback Plan: Differential Revision: D80971224 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161521 Approved by: https://github.com/frank-wei	2025-09-04 08:41:18 +00:00
zeshengzong	8fd3c9ce91	Optimize AMP custom_backend_name error message (#162037 ) Print out amp target dtype and let custom backend easier find out expected dtype while integration. ## Test Result ### Before ```python In [1]: import torch ...: import torch_openreg ...: ...: a = torch.randn(3, 4) ...: b = torch.randn(4, 2) ...: with torch.autocast("openreg", dtype=torch.float16): ...: torch.mm(a, b) ...: /home/coder/code/pytorch/torch/amp/autocast_mode.py:332: UserWarning: In openreg autocast, but the target dtype is not supported. Disabling autocast. openreg Autocast only supports dtypes of torch.float32 currently. warnings.warn(error_message ``` ### After ```python In [1]: import torch ...: import torch_openreg ...: ...: a = torch.randn(3, 4) ...: b = torch.randn(4, 2) ...: with torch.autocast("openreg", dtype=torch.float16): ...: torch.mm(a, b) ...: /home/coder/code/pytorch/torch/amp/autocast_mode.py:332: UserWarning: In openreg autocast, but the target dtype torch.float16 is not supported. Disabling autocast. openreg Autocast only supports dtypes of torch.float32 currently. warnings.warn(error_message) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162037 Approved by: https://github.com/zou3519	2025-09-04 08:27:56 +00:00
Liao, Wei	e19e02c84c	port distributed tensor test files for Intel GPU (#161604 ) In this PR, we port the test/distributed/tensor test files to Intel GPU. We enable Intel GPU with the following methods, trying our best to keep the original code style: use torch.accelerator for general GPU support; skip cases that have known issues when running on XPU. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161604 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-04 07:49:25 +00:00
Chris Thi	69a25f6888	[ROCm] Enable USE_FBGEMM_GENAI (#160676 ) Summary: X-link: https://github.com/pytorch/FBGEMM/pull/4703 X-link: https://github.com/facebookresearch/FBGEMM/pull/1728 In this diff we enable the support for the new FBGEMM backed FP8 _scaled_grouped_mm on ROCm. For now we only enable support for `gfx942` as that is what we have thoroughly tested performance and correctness on. Rollback Plan: Differential Revision: D79564024 Test Plan: Ensure builds with: - `USE_FBGEMM_GENAI=1` and without gfx942 - `USE_FBGEMM_GENAI=1` and with gfx942 - `USE_FBGEMM_GENAI=1` and all current [`PYTORCH_ROCM_ARCH`](`9491d289b3/.ci/docker/libtorch/build.sh (L48)`) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160676 Approved by: https://github.com/drisspg	2025-09-04 07:13:17 +00:00
tqchen	890626632d	[DLPACK] Optimize toDLPack Conversion Speed (#162111 ) Previously, in gh-83069, the toDLPack converter introduced a normalization step that changes the strides to 1 when shape[i] == 1. This step, however, calls as_strided during toDLPack, and can slow down toDLPack by about 3x. This causes PyTorch's DLPack conversion to have around 0.6 us of overhead per call, up from < 0.2 us. This PR updates the logic by adding a need_normalize_strides check, to first confirm whether the strides normalization is necessary. In most common cases, when the tensor is contiguous, such normalization is not necessary. We confirmed that having this additional step recovers the speed of toDLPack to below 0.2 us and can help significantly speed up eager mode integration of DLPack with PyTorch. If we detect that normalization is needed, the older path will be invoked. Fixes #162113 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162111 Approved by: https://github.com/msaroufim	2025-09-04 05:27:05 +00:00
Guilherme Leobas	480c739112	Capture TypeError in `CONTAINS_OP` (#161069 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161069 Approved by: https://github.com/anijain2305	2025-09-04 04:49:09 +00:00
Gabriel Ferns	66f3b4a682	Contiguous subgraph decomposition (#161241 ) ## Summary Adds a subgraph decomposition for addmm and mm that performs well on large `K` compared to `M` and `N`, and functions well as an alternative to `split-k` on AMD (transposed only), which does not support AMD currently. ## Background On AMD (MI300x), for a matmul A * B, if B is non-contiguous, the resulting matmul is quite a bit slower. For example: ``` args[0]: TensorBox(StorageBox( InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1])) )) args[1]: TensorBox(StorageBox( InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[1, 178176])) )) ``` is a lot slower than: ``` args[0]: TensorBox(StorageBox( InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1])) )) args[1]: TensorBox(StorageBox( InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[6144, 1])) )) ``` This PR adds a subgraph decomposition to test out whether making B contiguous is faster than just using the normal kernels. 
## Data I ran this on unique non-contiguous shapes from torchbench/huggingface and got these speedups: ``` Parsed 420 unique shapes from benchmark output addmm improvements when best: addmm_16448x512x2048: +0.14% addmm_128x2048x2048: +0.01% addmm_128x768x1000: +0.75% addmm_12672x3072x768: +1.08% addmm_512x768x32000: +0.62% addmm_12608x384x384: +0.00% addmm_4160x1024x4096: +0.90% addmm_16x768x2: +0.56% addmm_12608x3072x768: +0.09% addmm_64x4096x1000: +2.77% addmm_256x1024x512: +1.99% addmm_30x256x256: +1.12% addmm_100480x128x384: +0.91% addmm_6400x2048x512: +0.25% addmm_61568x1024x256: +0.08% addmm_1x768x768: +0.93% addmm_12544x384x384: +0.19% addmm_128x512x1000: +0.77% addmm_2048x128x128: +1.32% addmm_128x3072x1000: +0.24% addmm_7936x512x2048: +0.07% addmm_8192x512x2048: +0.33% addmm_64x1024x1000: +1.43% addmm_128x2304x1000: +0.01% addmm_32768x256x2: +0.75% addmm_64x384x1152: +0.79% addmm_64x640x1000: +0.01% addmm_100480x128x128: +0.87% addmm_1152x3072x768: +1.13% addmm_8192x256x2048: +1.40% addmm_4096x128x768: +0.01% addmm_128x2560x1000: +0.01% addmm_12544x2048x512: +0.43% addmm_200704x24x96: +0.14% addmm_8448x512x2048: +0.96% addmm_50176x256x1024: +0.62% addmm_4160x4096x1024: +0.22% addmm_4096x768x768: +0.32% addmm_220x2048x512: +0.56% addmm_8x2048x1000: +1.12% addmm_256x197951x512: +26.99% addmm_401536x64x192: +0.60% addmm_2040x2048x512: +0.47% addmm_512x1024x256: +1.32% addmm_128x4096x1000: +1.67% addmm_12672x768x768: +0.34% addmm_128x368x1000: +0.77% addmm_96x1280x1000: +0.01% addmm_12544x512x2048: +0.41% addmm_6272x320x1280: +0.76% addmm_12544x3072x768: +0.09% addmm_64x384x1000: +0.39% mm improvements when best: mm_200704x128x512: +1.29% mm_663552x16x16: +0.80% mm_4096x768x768: +0.51% mm_131072x64x31: +0.24% mm_12544x1152x384: +0.11% mm_128x2048x2: +0.46% mm_262144x16x23: +0.62% mm_50176x576x192: +0.37% mm_131072x16x31: +0.26% ================================================================================ BENCHMARK ANALYSIS RESULTS 
================================================================================ Operation: addmm ---------------------------------------- Total shapes analyzed: 247 Average Subgraph placement: 3.38 Median Subgraph placement: 2.0 Subgraph is best choice: 52/247 shapes (21.1%) Average improvement when best: 1.15% Median improvement when best: 0.58% Largest improvement when best: +26.99% Operation: bmm ---------------------------------------- Total shapes analyzed: 85 Average Subgraph placement: 24.00 Median Subgraph placement: 21.0 Subgraph is best choice: 0/85 shapes (0.0%) Average improvement when best: N/A (never best) Median improvement when best: N/A (never best) Largest improvement when best: N/A (never best) Operation: mm ---------------------------------------- Total shapes analyzed: 88 Average Subgraph placement: 15.08 Median Subgraph placement: 4.0 Subgraph is best choice: 9/88 shapes (10.2%) Average improvement when best: 0.52% Median improvement when best: 0.46% Largest improvement when best: +1.29% ``` ## Results The largest shape gain, `256,197951,512`, seemed to be driven by a case where the extern kernel is way faster than the best triton configs on the recursive autotune: ``` addmm,Extern,extern_kernels.addmm,256,197951,512,0.38024500012397766 addmm,Triton,256,197951,512,32,256,16,2,2,4,2.005444049835205 addmm,Triton,256,197951,512,32,128,32,2,4,8,2.04189395904541 addmm,Triton,256,197951,512,64,128,16,2,4,8,2.1911399364471436 addmm,Triton,256,197951,512,64,128,32,2,4,8,2.496040105819702 addmm,Triton,256,197951,512,64,128,64,2,8,16,2.9306790828704834 addmm,Triton,256,197951,512,64,64,32,2,4,8,3.0347819328308105 ... 
``` Compared to the non-transposed autotune: ``` addmm,Subgraph,contiguous_addmm_1384,256,197951,512,0.5024129748344421 addmm,Extern,extern_kernels.addmm,256,197951,512,0.6881489753723145 addmm,Triton,256,197951,512,32,256,16,2,2,4,2.5115010738372803 addmm,Triton,256,197951,512,32,128,32,2,4,8,2.5167479515075684 addmm,Triton,256,197951,512,64,128,16,2,4,8,2.9507460594177246 addmm,Triton,256,197951,512,64,256,64,2,8,4,2.9673290252685547 addmm,Triton,256,197951,512,64,128,64,2,8,16,3.3906331062316895 addmm,Triton,256,197951,512,64,128,32,2,4,8,3.496859073638916 ``` It seems to perform really well for high values of `K` vs `N` and `M`. Testing this hypothesis with some custom shapes: ``` Parsed 64 unique shapes from benchmark output addmm improvements when best: addmm_128x16384x128: +0.18% addmm_128x262144x256: +38.24% addmm_128x200000x512: +14.76% addmm_256x800000x128: +0.06% addmm_131072x128x256: +0.27% addmm_128x256x131072: +0.25% addmm_2048x200000x64: +12.45% mm improvements when best: mm_128x16384x128: +0.18% mm_128x262144x256: +38.05% mm_128x200000x512: +9.47% mm_256x800000x128: +0.99% mm_512x6400000x256: +3.17% mm_524288x64x64: +0.29% mm_2048x200000x64: +11.19% mm_8192x1000000x256: +34.14% mm_128x4096x100000: +0.40% mm_128x3072x150000: +0.27% ================================================================================ BENCHMARK ANALYSIS RESULTS ================================================================================ Operation: addmm ---------------------------------------- Total shapes analyzed: 33 Average Subgraph placement: 4.39 Median Subgraph placement: 2.0 Subgraph is best choice: 7/33 shapes (21.2%) Average improvement when best: 9.46% Median improvement when best: 0.27% Largest improvement when best: +38.24% Operation: mm ---------------------------------------- Total shapes analyzed: 30 Average Subgraph placement: 7.63 Median Subgraph placement: 2.0 Subgraph is best choice: 10/30 shapes (33.3%) Average improvement when best: 9.81% Median 
improvement when best: 2.08% Largest improvement when best: +38.05% ``` ## Conclusion Contiguous Subgraph Decomposition seems worthwhile for `mm` and `addmm`, but not `bmm`, and has a very large improvement on low `M`, low `N`, and high `K` shapes. Data gathering scripts: https://gist.github.com/exclamaforte/4a896c064d301b27bf5ca0a4f8fc3866 ## Test Plan: New unit tests. Differential Revision: D80771648 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161241 Approved by: https://github.com/eellison	2025-09-04 04:43:58 +00:00
PyTorch UpdateBot	302df2ac5d	[vllm hash update] update the pinned vllm hash (#162115 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162115 Approved by: https://github.com/pytorchbot	2025-09-04 04:26:34 +00:00
Shangdi Yu	dec72ea4b0	[reland] Add inductor provenance mapping for cpp extern kernel (#161656 ) (#162069 ) Summary: Add inductor provenance mapping for cpp extern kernel Test Plan: ``` buck run fbcode//caffe2/test/inductor:provenance_tracing -- -r test_cpu_extern_kernel ``` Differential Revision: D81598857 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162069 Approved by: https://github.com/angelayi	2025-09-04 04:18:43 +00:00
Richard Howell	8975cda252	[pt] strip error messages in profile builds (#162076 ) Summary: Profile builds should match production builds, and error messages result in large static initializers running. Omit them for profile builds too. Test Plan: Before: ``` $ buck build //xplat/caffe2:aten_native_cpuApple -c user.sandcastle_build_mode=profile --show-output $ llvm-nm buck-out/v2/gen/fbsource/31fc3668aa0b4012/xplat/caffe2/__aten_native_cpuApple__/libaten_native_cpuApple.pic.a \| grep ZN3c106detail12_str_wrapperIJPKcRKiS3_RKxS3_RKS3_S3_EE4callES9_S5_S9_S7_S9_S9_S9 0000000000003234 T __ZN3c106detail12_str_wrapperIJPKcRKiS3_RKxS3_RKS3_S3_EE4callES9_S5_S9_S7_S9_S9_S9_ ``` After: ``` $ buck build //xplat/caffe2:aten_native_cpuApple -c user.sandcastle_build_mode=profile --show-output $ llvm-nm buck-out/v2/gen/fbsource/31fc3668aa0b4012/xplat/caffe2/__aten_native_cpuApple__/libaten_native_cpuApple.pic.a \| grep ZN3c106detail12_str_wrapperIJPKcRKiS3_RKxS3_RKS3_S3_EE4callES9_S5_S9_S7_S9_S9_S9 ``` Rollback Plan: Reviewed By: yury-dymov, abashyam Differential Revision: D81599582 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162076 Approved by: https://github.com/swolchok	2025-09-04 04:18:27 +00:00
Guilherme Leobas	d636c181f9	Fix `range.__getitem__()` (#161804 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161804 Approved by: https://github.com/anijain2305 ghstack dependencies: #161801, #161802, #161803	2025-09-04 02:33:03 +00:00
Guilherme Leobas	c8255c67cd	redirect `iter(range)` to `range.__iter__()` (#161803 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161803 Approved by: https://github.com/anijain2305 ghstack dependencies: #161801, #161802	2025-09-04 02:33:03 +00:00
Guilherme Leobas	485a7bd82e	Add `range_count` and `range.__contains__` (#161802 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161802 Approved by: https://github.com/anijain2305 ghstack dependencies: #161801	2025-09-04 02:33:03 +00:00
Guilherme Leobas	1ef7efa592	Add `range_equals` (#161801 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161801 Approved by: https://github.com/anijain2305	2025-09-04 02:33:03 +00:00
Sun, Jiayi	57278d45f0	[Quant][Inductor][CPU] add qconv int8-mixed-bf16 patterns (#161487 ) Summary: Expand the patterns supported by qconv weight prepack, Specifically, expand the conv patterns of int8-mixed-bf16 datatype to support the following two cases: Case 1: the `out_dtype `of `dequantize_per_tensor `is `torch.float32` ``` dq_per_tensor dq_per_channel \| \| to_bf16 to_bf16 \ / Conv2d ``` Case 2: the `out_dtype `of `dequantize_per_tensor `is `torch.bfloat16` ``` dq_per_tensor dq_per_channel \ \| to_bf16 / Conv2d ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161487 Approved by: https://github.com/Xia-Weiwen, https://github.com/CaoE, https://github.com/jansel ghstack dependencies: #161486	2025-09-04 02:01:34 +00:00
Sun, Jiayi	cec0ff1228	[Quant][Inductor][CPU] add qlinear int8-mixed-bf16 patterns (#161486 ) Summary: Expand the patterns supported by qlinear weight prepack, Specifically, expand the linear patterns of int8-mixed-bf16 datatype to support the following two cases: Case 1: the `out_dtype` of `dequantize_per_tensor ` is `torch.float32` dq_per_tensor dq_per_channel \| \| to_bf16 to_bf16 \| \| OPT(reshape) permute \ / addmm/mm \| OPT(reshape) or dq_per_tensor dq_per_channel \| \| to_bf16 to_bf16 \| \| expand permute \ \| expand / bmm \| OPT(add) Case 2: the `out_dtype` of `dequantize_per_tensor ` is `torch.bfloat16` dq_per_tensor dq_per_channel \| \| to_bf16 \| OPT(reshape) permute \ / addmm/mm \| OPT(reshape) or dq_per_tensor dq_per_channel \| \| to_bf16 \| expand permute \ \| expand / bmm \| OPT(add) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161486 Approved by: https://github.com/Xia-Weiwen, https://github.com/jansel	2025-09-04 01:53:02 +00:00
Jacob Szwejbka	65985937d9	expose number of outputs in native runtime for unified runtime (#161723 ) This is only user outputs which is what we want. Spoke to @zhxchen17 though and it seems like nativeRT might have some bugs on propagating updates to things like input mutation or buffer mutation though. Something to take a look at in a follow up. Also I have no idea where the nativeRT tests are. Any pointers @zhxchen17 @SherlockNoMad Pull Request resolved: https://github.com/pytorch/pytorch/pull/161723 Approved by: https://github.com/zhxchen17	2025-09-04 01:20:31 +00:00
Laith Sakka	fbf3d2027d	use sym_or instead of any to avoid dde in calc_conv_nd_return_shape (#162084 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/162084 Approved by: https://github.com/aorenste Co-authored-by: Aaron Orenstein <aorenste@fb.com>	2025-09-04 01:20:22 +00:00
William Wen	8678d831c4	[dynamo] rename set_fullgraph to error_on_graph_break (#161739 ) Renaming `set_fullgraph` to `error_on_graph_break` for now. There are no semantic differences yet. In a followup PR, we will introduce a new `torch.compile` option `error_on_graph_break` that has lower priority than `fullgraph` so that `fullgraph` really returns 1 graph. I could keep `set_fullgraph` as a deprecated alias for `error_on_graph_break` for now, but I'm hoping that won't be necessary since it's still private API (there are no internal callsites yet, and there are no significant OSS callsites yet). cc @albanD @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @chenyang78 @kadeng @chauhang @amjames @Lucaskabela @mlazos @guilhermeleobas @xmfan as primary users for `set_fullgraph` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161739 Approved by: https://github.com/xmfan, https://github.com/Lucaskabela, https://github.com/anijain2305, https://github.com/mlazos	2025-09-04 01:15:06 +00:00
Saurabh Mishra	1281470155	[DCP][HuggingFace] Add Support for dequantization of SafeTensors checkpoints (#160682 ) This PR introduces the QuantizedHuggingFaceReader component which enables the reading and dequantization of the quantized tensors in the SafeTensors checkpoint. Following capabilities are introduced: - Configuration of the target DType and the block size. - Multi threaded dequantization for efficiency Test Plan: buck test //caffe2/test/distributed/checkpoint\:test_quantized_hf_storage ``` Time elapsed: 2:34.1s Tests finished: Pass 31. Fail 0. Fatal 0. Skip 0. Build failure 0 ``` Differential Revision: D80174674 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160682 Approved by: https://github.com/ankitageorge	2025-09-04 01:09:53 +00:00
Markus Hoehnerbach	9458d1ac3b	[inductor] pdl inductor option (disabled by default) (#160928 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160928 Approved by: https://github.com/eellison	2025-09-04 00:35:23 +00:00
Avik Chaudhuri	3c45af079a	kill allow_complex_guards_as_runtime_asserts (#161794 ) Summary: [reland] Since `allow_complex_guards_as_runtime_asserts` is now sync'd with `prefer_deferred_runtime_asserts_over_guards`, we can kill the former (especially since it was a export-only concept). Test Plan: updated tests Rollback Plan: Differential Revision: D81334984 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161794 Approved by: https://github.com/zhxchen17	2025-09-04 00:17:01 +00:00
PyTorch MergeBot	aad96a2022	Revert "Contiguous subgraph decomposition (#161241 )" This reverts commit d64718503728001a1e78168fd7f2d4ff23e57285. Reverted https://github.com/pytorch/pytorch/pull/161241 on behalf of https://github.com/jeffdaily due to breaks rocm mi300 tests ([comment](https://github.com/pytorch/pytorch/pull/161241#issuecomment-3251185098))	2025-09-04 00:14:22 +00:00
Rohit Manav	5f3cbc9442	fixed typo error (#162055 ) Fixes #162054 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162055 Approved by: https://github.com/RajeshvShiyal, https://github.com/malfet	2025-09-04 00:06:58 +00:00
Xu Han	a918bbad6a	[inductor] fix test output path 2 (#162085 ) Fix test_output_path_2 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162085 Approved by: https://github.com/angelayi, https://github.com/jansel	2025-09-04 00:03:47 +00:00
dolpm	8ec551bb35	[aot-compile] strip internal tracebacks for non-verbose graph breaks + include user file/lineno (#162005 ) pytest test/dynamo/test_aot_compile.py -k test_aot_compile_graph_break_error_fmt before ``` Traceback (most recent call last): File "/data/users/$USER/vllm-tests/graph-break.py", line 15, in <module> aot_compiled_fn = compiled.aot_compile((example_inputs, {})) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/eval_frame.py", line 717, in aot_compile return aot_compile_fullgraph( ^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/aot_compile.py", line 132, in aot_compile_fullgraph capture_output = convert_frame.fullgraph_capture( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 947, in fullgraph_capture dynamo_output = compile_frame( ^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 1020, in compile_frame bytecode, tracer_output = transform_code_object(code, transform) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/bytecode_transformation.py", line 1592, in transform_code_object tracer_output = transformations(instructions, code_options) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 992, in transform tracer_output = trace_frame( ^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 312, in _fn return fn(args, kwargs) ^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 821, in trace_frame run_tracer() File "/data/users/$USER/pytorch/torch/_dynamo/convert_frame.py", line 803, in run_tracer tracer.run() File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 1472, in run while self.step(): ^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 1342, in step 
self.dispatch_table[inst.opcode](self, inst) File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 902, in wrapper return inner_fn(self, inst) ^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 3364, in CALL self._call(inst) File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 3358, in _call self.call_function(fn, args, kwargs) File "/data/users/$USER/pytorch/torch/_dynamo/symbolic_convert.py", line 1260, in call_function self.push(fn.call_function(self, args, kwargs)) # type: ignore[arg-type] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/variables/lazy.py", line 212, in realize_and_forward return getattr(self.realize(), name)(args, *kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/variables/functions.py", line 1513, in call_function unimplemented_v2( File "/data/users/$USER/pytorch/torch/_dynamo/exc.py", line 596, in unimplemented_v2 raise Unsupported(msg) torch._dynamo.exc.Unsupported: Call to `torch._dynamo.graph_break()` Explanation: User-inserted graph break. Message: None Hint: Remove the `torch._dynamo.graph_break()` call. Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0025.html ``` after ``` Traceback (most recent call last): File "/data/users/$USER/vllm-tests/graph-break.py", line 15, in <module> aot_compiled_fn = compiled.aot_compile((example_inputs, {})) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/eval_frame.py", line 737, in aot_compile raise e.with_traceback(None) from e.__cause__ # User compiler error ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ torch._dynamo.exc.Unsupported: Call to `torch._dynamo.graph_break()` Explanation: User-inserted graph break. 
Message: None Hint: Remove the `torch._dynamo.graph_break()` call. Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0025.html from user code: File "/data/users/$USER/vllm-tests/graph-break.py", line 5, in foo torch._dynamo.graph_break() Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" ``` consistent w/ std torch.compile ``` Traceback (most recent call last): File "/data/users/$USER/vllm-tests/graph-break.py", line 16, in <module> res = compiled(example_inputs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/data/users/$USER/pytorch/torch/_dynamo/eval_frame.py", line 850, in compile_wrapper raise e.with_traceback(None) from e.__cause__ # User compiler error ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ torch._dynamo.exc.Unsupported: Call to `torch._dynamo.graph_break()` Explanation: User-inserted graph break. Message: None Hint: Remove the `torch._dynamo.graph_break()` call. Developer debug context: Called `torch._dynamo.graph_break()` with args `[]`, kwargs `{}` For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0025.html from user code: File "/data/users/$USER/vllm-tests/graph-break.py", line 5, in foo torch._dynamo.graph_break() Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162005 Approved by: https://github.com/zhxchen17, https://github.com/tugsbayasgalan	2025-09-03 23:19:47 +00:00
Catherine Lee	36d207fcaa	[CI] viable strict upgrade: Explicitly name which linux binary wheels should block (#162100 ) Reason: rocm binary builds should not block viable strict upgrade. It is queuing/canceled so viable strict is 1.2 days old Tested by mangling the workflow file to get to the actual call of the python script `python ../test-infra/tools/scripts/fetch_latest_green_commit.py --required-checks '["pull", "trunk", "lint", "^linux-binary-manywheel$", "^linux-binary-libtorch-release$", "linux-aarch64"]' --viable-strict-branch viable/strict --main-branch master`, which I then ran locally where I have credentials. It returned d64718503728001a1e78168fd7f2d4ff23e57285 which is green. Without this change, it returns 5e5870e858f60ff4bf87d03f3592097e934a9580, which is pretty old The other solution would have been to mark it as unstable I think Side note, why is it master and how is it working like that Pull Request resolved: https://github.com/pytorch/pytorch/pull/162100 Approved by: https://github.com/huydhn	2025-09-03 22:38:32 +00:00
Jeff Daily	99f356fa58	[ROCm] revamp miopen integration (#161687 ) Update sources under ATen/miopen and ATen/native/miopen to align with best practices. Avoid reshape_ calls inside backward operations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161687 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-03 22:28:09 +00:00
Jithun Nair	0af70e2353	Modify ROCm MI2xx-based workflows to run on cron schedule (#162103 ) To mitigate queueing on MI2xx runners since Cirrascale runners are offline. Match cron schedule of periodic.yml Pull Request resolved: https://github.com/pytorch/pytorch/pull/162103 Approved by: https://github.com/jeffdaily, https://github.com/seemethere	2025-09-03 21:51:03 +00:00
Jeff Daily	b1bb98ddeb	[ROCm] TunableOp should use HIP version, not ROCm version (#162067 ) Fixes #160874 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162067 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-03 21:42:23 +00:00
Howard Huang	abc447174c	[PP] Add profiling to schedule execution (#160753 ) Profiling title will be `str(action)` <img width="1545" height="694" alt="image" src="https://github.com/user-attachments/assets/60b3506b-b8d6-4ae0-8b32-0d51d45fa2f0" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160753 Approved by: https://github.com/wconstab	2025-09-03 21:31:50 +00:00
Arsh Zahed	734ce8eba9	Rename propagate_tensor_meta to make private again (#161744 ) Rename the wrapper `propagate_tensor_meta` added in #161334 to make it clearly private, and rename the existing LRU function to accommodate. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161744 Approved by: https://github.com/bdhirsh	2025-09-03 21:11:45 +00:00
Xinya Zhang	98efc9e93d	[ROCm] Bump AOTriton to 0.11b (#161754 ) Notable new features/optimizations for SDPA operators on AMD systems from AOTriton 0.11b: * Invoke AITER Assembly kernels on gfx942/gfx950 when inputs meet requirements - AITER ASM kernels deliver over 500TFLOPS training performance. See [AOTriton 0.11b Release Page](https://github.com/ROCm/aotriton/releases/tag/0.11b) for more details. * Now returns natural based `logsumexp` tensor, matching CUDA's behavior - PR #156903 is reverted in this PR as well since it is not needed anymore. * Enables `CausalVariant.LOWER_RIGHT` The build system changes drastically along with new packaging scheme of AOTriton 0.11 * AOTriton 0.11 packs GPU images separately from AOTriton runtime * `aotriton.cmake` now selectively downloads image packs according to `PYTORCH_ROCM_ARCH` * `aotriton.cmake` now only use pre-compiled runtime library that exactly matches the ROCM in the build environment. For PyTorch builds with ROCm versions not listed in the file, the build process will build AOTriton runtime without GPU images from source - This avoids any further ABI breaks like ROCM 6.4 -> 7.0 - recursive git clone is disabled since building AOTriton runtime does not require submodules. Bug fixes: * Fix a kernel bug introduced when implementing SWA Known Problems: * gfx1100 target (Radeon RX 7000 Series) is moved back to experimental status due to accuracy issues. Triton compiler fixes are needed to restore the support status. * Enabling TF32 tests affects accuracy for later non-TF32 tests on ROCM 7.0. This issue is under investigation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161754 Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily	2025-09-03 20:45:44 +00:00
Ke Wen	994f2a5dbc	[SymmMem][CI] Make sure group names are consistent (#162035 ) Unblocking #161741 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162035 Approved by: https://github.com/Skylion007, https://github.com/ngimel	2025-09-03 20:40:24 +00:00
Natalia Gimelshein	d1706d9128	[Symmetric memory] set handle type for ROCm (#161741 ) Fixes #161722 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161741 Approved by: https://github.com/kwen2501	2025-09-03 20:33:35 +00:00
arkadip-maitra	1aa7476885	fix to segmentation fault when empty tensor is passed to choose_qpara… (#161966 ) …ms_optimized Fixes #153326 Minimal code to reproduce error: ``` import torch tensor = torch.tensor([]) torch.choose_qparams_optimized( tensor, 0, 200, 0.16, 8 ) ``` Previous Output: `Segmentation fault` Now Output: ``` Traceback (most recent call last): File "/home/amaitra/work/tests/issue_153326.py", line 5, in <module> torch.choose_qparams_optimized( RuntimeError: input tensor is empty and has no data ``` Caused because `const float* input_row =input_tensor.const_data_ptr<float>();` becomes null Pull Request resolved: https://github.com/pytorch/pytorch/pull/161966 Approved by: https://github.com/Skylion007	2025-09-03 20:26:26 +00:00
Aaryaman Vasishta	8e23a1227b	[ROCm/Windows] Fix build failures and support some BLAS calls (#161981 ) * Support getrsBatched/geqrfBatched/gelsBatched on Windows ROCm (fixes https://github.com/ROCm/TheRock/issues/1367) * Fix windows pytorch build with USE_DISTRIBUTED=ON by default Pull Request resolved: https://github.com/pytorch/pytorch/pull/161981 Approved by: https://github.com/ScottTodd, https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-03 20:26:14 +00:00
Yulun Wang	850e1382a9	[hipify] Replace cudaStreamCaptureStatusNone (#161992 ) Replacing additional cuda symbols to hip symbols Differential Revision: D81420086 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161992 Approved by: https://github.com/jeffdaily, https://github.com/Skylion007	2025-09-03 20:23:32 +00:00
Ke Wen	3c0ff1b569	[SymmMem] Add root argument to broadcast op (#161090 ) It was missing earlier. Also added range check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161090 Approved by: https://github.com/fegin	2025-09-03 20:17:45 +00:00
Yiming Zhou	c465b3d52c	[2/n][export] Refactor PT2 Archive weight saving and loading (#161520 ) Summary: The saving (serialization) part of PT2 archive weight refactoring. The loading (deserialization part) has been landed in D80035490 Test Plan: CI Rollback Plan: Differential Revision: D80970931 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161520 Approved by: https://github.com/SherlockNoMad	2025-09-03 20:12:49 +00:00
andrewor14	f4c33cd44a	[pt2e] Avoid getting model device once per node (#159901 ) Summary: Previously, we call `assert_and_get_unique_device` once per node in both prepare and convert. This is expensive and unnecessary since the model device is the same across all nodes, so we should just call this once in the beginning and reuse the same model device across all the nodes. Test Plan: python test/test_quantization.py -k TestQuantizePT2E Pull Request resolved: https://github.com/pytorch/pytorch/pull/159901 Approved by: https://github.com/jerryzh168	2025-09-03 19:29:00 +00:00
Tugsbayasgalan (Tugsuu) Manlaibaatar	92576a594b	Prototype for building non-strict leak detector (#160456 ) Summary: Our strategy for detecting fake tensor leakage in non-strict for outside scope (side effects happening outside of model.forward) is: 1. We do gc.collect() before export and get the alive fake tensors 2. We dump the proxy to fake tensor map from make_fx tracer 3. We query gc again to get alive fake tensors 4. We take the delta between (1) and (3) 5. Filter out fake tensors that are: 1. Associated with `TrackedFake` (input tracking thing in symbolic_shapes) 2. Associated with `gm.meta` 6. Do ID match with the proxies and emit their stacktraces. We rely on (https://github.com/pytorch/pytorch/pull/159923) for other sources of leakages such as: 1. We failed to proxy an operator (like param.data) 2. We cache some tensor in model.forward (https://github.com/pytorch/pytorch/issues/155114) In general, we notice `gc.collect()` and querying gc for live objects are kinda slow. So we turn on this feature under env variable. We should document on export public facing documents that if you run into weird errors regarding fake tensors, you should look into turning on this env variable for further analysis. Test Plan: Test plan Rollback Plan: Differential Revision: D80003204 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160456 Approved by: https://github.com/pianpwk	2025-09-03 19:21:27 +00:00
Jithun Nair	cd529b686d	[ROCm] Use MI325 (gfx942) runners for binary smoke testing (#162044 ) ### Motivation * MI250 Cirrascale runners are currently having network timeout leading to huge queueing of binary smoke test jobs: <img width="483" height="133" alt="image" src="https://github.com/user-attachments/assets/17293002-78ad-4fc9-954f-ddd518bf0a43" /> * MI210 Hollywood runners (with runner names such as `pytorch-rocm-hw-`) are not suitable for these jobs, because they seem to take much longer to download artifacts: https://github.com/pytorch/pytorch/pull/153287#issuecomment-2918420345 (this is why these jobs were specifically targeting Cirrascale runners). However, it doesn't seem like Cirrascale runners are necessarily doing much better either e.g. [this recent build](https://github.com/pytorch/pytorch/actions/runs/17332256791/job/49231006755). Moving to MI325 runners should address the stability part at least, while also reducing load on limited MI2xx runner capacity. * However, I'm not sure if the MI325 runners will do any better on the artifact download part (this may need to be investigated more) cc @amdfaa * Also removing `ciflow/binaries` and `ciflow/binaries_wheel` label/tag triggers for `generated-linux-binary-manywheel-rocm-main.yml` because we already trigger ROCm binary build/test jobs via these labels/tags in `generated-linux-binary-manywheel-nightly.yml`. And for developers who want to trigger ROCm binary build/test jobs on their PRs, they can use the `ciflow/rocm-mi300` label/tag as per this PR. ### TODOs (cc @amdfaa): * Check that the workflow runs successfully on the MI325 runners in this PR. Note how long the test jobs take esp. the "Download Build Artifacts" step * Once this PR is merged, clear the queue of jobs targeting `linux.rocm.gpu.mi250` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162044 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-03 18:34:07 +00:00
Isuru Fernando	62c3f9a97f	[inductor] Follow integer overflow rules in TypedExpr (#161922 ) Fixes https://github.com/pytorch/pytorch/issues/161763 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161922 Approved by: https://github.com/jansel	2025-09-03 18:33:18 +00:00
Guilherme Leobas	8076a185c8	Offload set method execution to CPython when possible (#160763 ) Reduces CPython `test_set.py` runtime from 63.477s to 40.298s Pull Request resolved: https://github.com/pytorch/pytorch/pull/160763 Approved by: https://github.com/anijain2305	2025-09-03 18:26:05 +00:00
Ruben Rodriguez Buchillon	f00445b43e	[inductor][ez] add hook for heuristics to adjust kernel input nodes (#161339 ) # why - some templates e.g. scale_mm need to unsqueeze/squeeze the nodes for codegen and heuristics - unified place where we can just adjust them for the template # what - inside get_mm_configs, return not the passed in kernel inputs, but allow the template heuristic to adjust them if necessary - the default implementation right now just passes them back this diff just adds the functionality, but does not exercise it other than the default (passthrough) # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520572](https://our.internmc.facebook.com/intern/diff/D81520572) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161339 Approved by: https://github.com/eellison, https://github.com/jansel ghstack dependencies: #161123, #161124, #161125, #161126, #161336, #161338	2025-09-03 18:23:22 +00:00
Laith Sakka	3559c354ce	stop suggesting using guard_size_oblivious on data dependent errors (#160510 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160510 Approved by: https://github.com/ezyang	2025-09-03 18:07:59 +00:00
Aleksei Nikiforov	71992dd805	S390x: build nightly binaries for new pythons (#161920 ) Enable python 3.13t, 3.14 and 3.14t on s390x for nightly binaries Fixes #161515 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161920 Approved by: https://github.com/malfet	2025-09-03 17:38:38 +00:00
Gabriel Ferns	d647185037	Contiguous subgraph decomposition (#161241 ) ## Summary Adds a subgraph decomposition for addmm and mm that performs well on large `K` compared to `M` and `N`, and functions well as an alternative to `split-k` on AMD (transposed only), which does not support AMD currently. ## Background On AMD (MI300x), for a matmul A * B, if B is non-contiguous, the resulting matmul is quite a bit slower. For example: ``` args[0]: TensorBox(StorageBox( InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1])) )) args[1]: TensorBox(StorageBox( InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[1, 178176])) )) ``` is a lot slower than: ``` args[0]: TensorBox(StorageBox( InputBuffer(name='arg0_1', layout=FixedLayout('cuda:0', torch.float16, size=[1024, 178176], stride=[178176, 1])) )) args[1]: TensorBox(StorageBox( InputBuffer(name='arg1_1', layout=FixedLayout('cuda:0', torch.float16, size=[178176, 6144], stride=[6144, 1])) )) ``` This PR adds a subgraph decomposition to test out whether making B contiguous is faster than just using the normal kernels. 
## Data I ran this on unique non-contiguous shapes from torchbench/huggingface and got these speedups: ``` Parsed 420 unique shapes from benchmark output addmm improvements when best: addmm_16448x512x2048: +0.14% addmm_128x2048x2048: +0.01% addmm_128x768x1000: +0.75% addmm_12672x3072x768: +1.08% addmm_512x768x32000: +0.62% addmm_12608x384x384: +0.00% addmm_4160x1024x4096: +0.90% addmm_16x768x2: +0.56% addmm_12608x3072x768: +0.09% addmm_64x4096x1000: +2.77% addmm_256x1024x512: +1.99% addmm_30x256x256: +1.12% addmm_100480x128x384: +0.91% addmm_6400x2048x512: +0.25% addmm_61568x1024x256: +0.08% addmm_1x768x768: +0.93% addmm_12544x384x384: +0.19% addmm_128x512x1000: +0.77% addmm_2048x128x128: +1.32% addmm_128x3072x1000: +0.24% addmm_7936x512x2048: +0.07% addmm_8192x512x2048: +0.33% addmm_64x1024x1000: +1.43% addmm_128x2304x1000: +0.01% addmm_32768x256x2: +0.75% addmm_64x384x1152: +0.79% addmm_64x640x1000: +0.01% addmm_100480x128x128: +0.87% addmm_1152x3072x768: +1.13% addmm_8192x256x2048: +1.40% addmm_4096x128x768: +0.01% addmm_128x2560x1000: +0.01% addmm_12544x2048x512: +0.43% addmm_200704x24x96: +0.14% addmm_8448x512x2048: +0.96% addmm_50176x256x1024: +0.62% addmm_4160x4096x1024: +0.22% addmm_4096x768x768: +0.32% addmm_220x2048x512: +0.56% addmm_8x2048x1000: +1.12% addmm_256x197951x512: +26.99% addmm_401536x64x192: +0.60% addmm_2040x2048x512: +0.47% addmm_512x1024x256: +1.32% addmm_128x4096x1000: +1.67% addmm_12672x768x768: +0.34% addmm_128x368x1000: +0.77% addmm_96x1280x1000: +0.01% addmm_12544x512x2048: +0.41% addmm_6272x320x1280: +0.76% addmm_12544x3072x768: +0.09% addmm_64x384x1000: +0.39% mm improvements when best: mm_200704x128x512: +1.29% mm_663552x16x16: +0.80% mm_4096x768x768: +0.51% mm_131072x64x31: +0.24% mm_12544x1152x384: +0.11% mm_128x2048x2: +0.46% mm_262144x16x23: +0.62% mm_50176x576x192: +0.37% mm_131072x16x31: +0.26% ================================================================================ BENCHMARK ANALYSIS RESULTS 
================================================================================ Operation: addmm ---------------------------------------- Total shapes analyzed: 247 Average Subgraph placement: 3.38 Median Subgraph placement: 2.0 Subgraph is best choice: 52/247 shapes (21.1%) Average improvement when best: 1.15% Median improvement when best: 0.58% Largest improvement when best: +26.99% Operation: bmm ---------------------------------------- Total shapes analyzed: 85 Average Subgraph placement: 24.00 Median Subgraph placement: 21.0 Subgraph is best choice: 0/85 shapes (0.0%) Average improvement when best: N/A (never best) Median improvement when best: N/A (never best) Largest improvement when best: N/A (never best) Operation: mm ---------------------------------------- Total shapes analyzed: 88 Average Subgraph placement: 15.08 Median Subgraph placement: 4.0 Subgraph is best choice: 9/88 shapes (10.2%) Average improvement when best: 0.52% Median improvement when best: 0.46% Largest improvement when best: +1.29% ``` ## Results The largest shape gain, `256,197951,512`, seemed to be driven by a case where the extern kernel is way faster than the best triton configs on the recursive autotune: ``` addmm,Extern,extern_kernels.addmm,256,197951,512,0.38024500012397766 addmm,Triton,256,197951,512,32,256,16,2,2,4,2.005444049835205 addmm,Triton,256,197951,512,32,128,32,2,4,8,2.04189395904541 addmm,Triton,256,197951,512,64,128,16,2,4,8,2.1911399364471436 addmm,Triton,256,197951,512,64,128,32,2,4,8,2.496040105819702 addmm,Triton,256,197951,512,64,128,64,2,8,16,2.9306790828704834 addmm,Triton,256,197951,512,64,64,32,2,4,8,3.0347819328308105 ... 
``` Compared to the non-transposed autotune: ``` addmm,Subgraph,contiguous_addmm_1384,256,197951,512,0.5024129748344421 addmm,Extern,extern_kernels.addmm,256,197951,512,0.6881489753723145 addmm,Triton,256,197951,512,32,256,16,2,2,4,2.5115010738372803 addmm,Triton,256,197951,512,32,128,32,2,4,8,2.5167479515075684 addmm,Triton,256,197951,512,64,128,16,2,4,8,2.9507460594177246 addmm,Triton,256,197951,512,64,256,64,2,8,4,2.9673290252685547 addmm,Triton,256,197951,512,64,128,64,2,8,16,3.3906331062316895 addmm,Triton,256,197951,512,64,128,32,2,4,8,3.496859073638916 ``` It seems to perform really well for high values of `K` vs `N` and `M`. Testing this hypothesis with some custom shapes: ``` Parsed 64 unique shapes from benchmark output addmm improvements when best: addmm_128x16384x128: +0.18% addmm_128x262144x256: +38.24% addmm_128x200000x512: +14.76% addmm_256x800000x128: +0.06% addmm_131072x128x256: +0.27% addmm_128x256x131072: +0.25% addmm_2048x200000x64: +12.45% mm improvements when best: mm_128x16384x128: +0.18% mm_128x262144x256: +38.05% mm_128x200000x512: +9.47% mm_256x800000x128: +0.99% mm_512x6400000x256: +3.17% mm_524288x64x64: +0.29% mm_2048x200000x64: +11.19% mm_8192x1000000x256: +34.14% mm_128x4096x100000: +0.40% mm_128x3072x150000: +0.27% ================================================================================ BENCHMARK ANALYSIS RESULTS ================================================================================ Operation: addmm ---------------------------------------- Total shapes analyzed: 33 Average Subgraph placement: 4.39 Median Subgraph placement: 2.0 Subgraph is best choice: 7/33 shapes (21.2%) Average improvement when best: 9.46% Median improvement when best: 0.27% Largest improvement when best: +38.24% Operation: mm ---------------------------------------- Total shapes analyzed: 30 Average Subgraph placement: 7.63 Median Subgraph placement: 2.0 Subgraph is best choice: 10/30 shapes (33.3%) Average improvement when best: 9.81% Median 
improvement when best: 2.08% Largest improvement when best: +38.05% ``` ## Conclusion Contiguous Subgraph Decomposition seems worthwhile for `mm` and `addmm`, but not `bmm`, and has a very large improvement on low `M`, low `N`, and high `K` shapes. Data gathering scripts: https://gist.github.com/exclamaforte/4a896c064d301b27bf5ca0a4f8fc3866 ## Test Plan: New unit tests. Differential Revision: D80771648 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161241 Approved by: https://github.com/eellison	2025-09-03 17:02:59 +00:00
Guilherme Leobas	eb18d32bda	Add `range_iterator` (#161800 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161800 Approved by: https://github.com/anijain2305 ghstack dependencies: #161799	2025-09-03 16:55:04 +00:00
Guilherme Leobas	889f01eb73	Add CPython test `test_range` (#161799 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161799 Approved by: https://github.com/anijain2305	2025-09-03 16:55:04 +00:00
Xu Han	451ed93156	[inductor] fix split_aot_inductor_output_path on Windows. (#162058 ) fix split_aot_inductor_output_path on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162058 Approved by: https://github.com/angelayi	2025-09-03 16:53:38 +00:00
nandesuka	9491d289b3	Support generic dynamic shape with padding (#160997 ) Summary: Inductor has the following configurations: config.comprehensive_padding config.padding_alignment_bytes config.padding_stride_threshold In the case of static shape by enabling these three options Inductor will generate code for Flexible layout tensors that tries to pad up all stride dimensions to be a multiple of config.padding_alignment_bytes for strides above: config.padding_stride_threshold. In the case where dynamic shapes are enabled no padding is done today. This PR introduces the following configuration which allows the user to specify they wish to generate a padded stride even in the case of dynamic shape operations. This is mainly done so we don't break the previous behaviour of not padding up dynamic shape use cases. The config.padding_stride_threshold does not apply since the values of the strides are dynamic. config.pad_dynamic_shapes In addition to this a new mode "python_slow" has been added to launch grid calculation which achieves the same ceildiv behaviour that is generally applicable to integer division. This is done to prevent test regressions and make wrapper_fxir codegen more generic. Test Plan: CI Rollback Plan: Differential Revision: D80468808 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160997 Approved by: https://github.com/blaine-rister, https://github.com/jansel	2025-09-03 15:58:18 +00:00
Liao, Wei	c157cf6488	port distributed tensor parallel test files for Intel GPU (#161261 ) In this pr, we port test/distributed/parallel 4 test files and test/distributed/debug 1 test file for Intel GPU We could enable Intel GPU with following methods and try the best to keep the original code styles: 1. Use torch.accelerator for general gpu 2. Skip the case if running on xpu which has known issues Pull Request resolved: https://github.com/pytorch/pytorch/pull/161261 Approved by: https://github.com/guangyey, https://github.com/d4l3k	2025-09-03 15:03:32 +00:00
PyTorch MergeBot	bb950284c7	Revert "[inductor][ez] add hook for heuristics to adjust kernel input nodes (#161339 )" This reverts commit 90f50f7e68e120d9574e6e3189e37b4280010ad9. Reverted https://github.com/pytorch/pytorch/pull/161339 on behalf of https://github.com/jeanschmidt due to Breaks internal tests, check D81486248 for more details ([comment](https://github.com/pytorch/pytorch/pull/161339#issuecomment-3249600885))	2025-09-03 14:56:02 +00:00
PyTorch MergeBot	f27985b7e7	Revert "[CUDAGraph] add config to error on skipping cudagraph (#161862 )" This reverts commit 204697f0e695d82894c5010fbec664c4391f90cc. Reverted https://github.com/pytorch/pytorch/pull/161862 on behalf of https://github.com/jeanschmidt due to Breaks internal tests, see D81522732 for more details ([comment](https://github.com/pytorch/pytorch/pull/161862#issuecomment-3249582583))	2025-09-03 14:50:44 +00:00
PyTorch MergeBot	0cd6c56bdf	Revert "test: ensure editable cached wrapper is respected (#160943 )" This reverts commit bbedc71fd3267c639c38b4ec25eaa22f973d9c4d. Reverted https://github.com/pytorch/pytorch/pull/160943 on behalf of https://github.com/jeanschmidt due to See [D81486248](https://www.internalfb.com/diff/D81486248) for details on broken test ([comment](https://github.com/pytorch/pytorch/pull/160943#issuecomment-3249565671))	2025-09-03 14:46:35 +00:00
Nikita Shulga	b40d9432be	[BE] Cleanup stale comments/copy from `gemm` (#162001 ) Followup after https://github.com/pytorch/pytorch/pull/154012 Since the introduction of `gemm_no_downcast_stub` it's no longer necessary to allocate temporary array and then manually implement the `beta` logic in the codebase Pull Request resolved: https://github.com/pytorch/pytorch/pull/162001 Approved by: https://github.com/drisspg ghstack dependencies: #161999	2025-09-03 14:31:09 +00:00
Nikita Shulga	02c83f1334	[BLAS] Avoid downcasts for fp16fp16->fp32 BLAS (#161999 ) Followup after https://github.com/pytorch/pytorch/pull/154012 Fixes CPU part of https://github.com/pytorch/pytorch/issues/160841 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161999 Approved by: https://github.com/drisspg	2025-09-03 14:31:08 +00:00
Nikhil Patel	aed33a8fcb	[Inductor][Tritonparse] Get Inductor kernel params (#161953 ) Summary: Save the config args that Inductor burns into `inductor_metadata` so we can optionally pass them to any Jit Hooks that are set. This allows us to pass them to Tritonparse. Reviewed By: davidberard98, FindHao Differential Revision: D80994791 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161953 Approved by: https://github.com/FindHao	2025-09-03 14:11:27 +00:00
Huamin Li	b16d3f4c8c	[AOTI] Fix a bug from load_constants (#161887 ) Summary: we have ``` std::vector<size_t> constants_internal_offset( num_constants - num_folded_constants); ``` but the for loop does not consider it ``` for (size_t i = 0; i < num_constants; i++) { ... constants_internal_offset[i] ... ``` even in the for loop, it does ``` bool from_folded = this->constant_from_folded(i); if (from_folded) { continue; } ``` but `i` could still be wrong Rollback Plan: Differential Revision: D81425007 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161887 Approved by: https://github.com/angelayi	2025-09-03 07:45:16 +00:00
Edward Z. Yang	4ae57d448c	Make distributed modules importable even when backend not built (#159889 ) This PR is greatly simplified now that it stacked on top of a PR that builds with distributed always. We only need to stub functions that may not be defined due to a backend not being enabled. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159889 Approved by: https://github.com/wconstab ghstack dependencies: #160449	2025-09-03 07:33:55 +00:00
Edward Yang	90b08643c3	Always build USE_DISTRIBUTED. (#160449 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160449 Approved by: https://github.com/wconstab, https://github.com/albanD, https://github.com/dcci	2025-09-03 07:33:55 +00:00
Scott Wolchok	b0a3e58dd7	Add inline fast paths for SymInt operators (#161586 ) If SymInt::maybe_as_int() returns non-empty, then we get an inline fast path. The philosophy here (as with the previous PR) is to preserve performance in the "plain old ints" case. Observed time spent in SymInt functions in computeStorageNBytes to drop (and not cost shift elsewhere in the function) after this change, profiling detach() using code similar to the benchmark from #160580 and Linux perf. Differential Revision: [D81530107](https://our.internmc.facebook.com/intern/diff/D81530107) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161586 Approved by: https://github.com/ezyang ghstack dependencies: #161466	2025-09-03 06:54:47 +00:00
Scott Wolchok	fa1514acf1	Outline SymInt::maybe_as_int_slow_path (#161466 ) Keeps SymInt::maybe_as_int small enough to inline. Differential Revision: [D81530097](https://our.internmc.facebook.com/intern/diff/D81530097) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161466 Approved by: https://github.com/ezyang	2025-09-03 06:54:47 +00:00
FFFrog	827f0d4054	Using get_paths() to get correct installation path for PYTHONPATH (#161947 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161947 Approved by: https://github.com/albanD ghstack dependencies: #161845, #161903	2025-09-03 06:38:03 +00:00
Isalia20	2c03f0acc5	[MPS] enable cat op for sparse (#162007 ) Enable cat op for sparse on MPS Pull Request resolved: https://github.com/pytorch/pytorch/pull/162007 Approved by: https://github.com/malfet	2025-09-03 06:31:35 +00:00
Scott Wolchok	f8ffa9194e	Perf nitpicks on python_arg_parser's is_int_or_symint_list (#161998 ) This function has come up in DTensor perf work, and I had a nitpick on #160256 so here it is. I have neither compiled nor measured this, but am reasonably confident it's better nonetheless. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161998 Approved by: https://github.com/ezyang	2025-09-03 05:38:30 +00:00
fengqing.lu	50fc22dedf	[Intel GPU] Fix XPU SDPA default priority_order UT fail (#161690 ) Fixes #161483 When the whole `test/test_transformers.py` file is run, the case `test_default_priority_order` can pass because other xpu cases would call SDPA so that the priority order is set by `eec876deb6/aten/src/ATen/native/mkldnn/xpu/Attention.cpp (L98-L112)` However, when the case `test_default_priority_order` is run separately, the priority order is unset so that this case would fail. This PR fixes this case. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161690 Approved by: https://github.com/guangyey, https://github.com/drisspg	2025-09-03 04:43:27 +00:00
Tianyu Liu	e381d4b020	[DTensor] forbid view ops to redistribute when local split is impossible (#161950 ) This PR is a followup to https://github.com/pytorch/pytorch/pull/149764. In that PR, it only forbids illegal view due to `Flatten`; this PR also forbids illegal view caused by `Split`. This PR also updates the error message to be less about internal implementation details, which users may find confusing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161950 Approved by: https://github.com/ezyang	2025-09-03 04:40:11 +00:00
PyTorch UpdateBot	8875d6e394	[vllm hash update] update the pinned vllm hash (#161929 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161929 Approved by: https://github.com/pytorchbot	2025-09-03 04:26:38 +00:00
Wenyuan Chi	00636e0171	[Reland][Inductor] Prune configs that require more shared memory than the hardware limit. (#161996 ) Summary: This is a re-land of [PR161040](https://github.com/pytorch/pytorch/pull/161040), which had previously caused test failures on AMD GPUs. The tests are now configured to target only NVIDIA GPUs. This diff removes configurations that exceed the hardware shared memory limit, which causes the following compilation error: ``` No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 327680 Hardware limit:232448 Reducing block sizes or `num_stages` may help. ``` Test Plan: ``` pytest test/inductor/test_max_autotune.py pytest test/inductor/test_triton_heuristics.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161996 Approved by: https://github.com/coconutruben	2025-09-03 04:23:09 +00:00
PyTorch UpdateBot	09d2f1b631	[audio hash update] update the pinned audio hash (#161928 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161928 Approved by: https://github.com/pytorchbot	2025-09-03 04:22:55 +00:00
FFFrog	dac8a4b91c	Using pip3 install instead of python setup.py develop/install (#161903 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161903 Approved by: https://github.com/ezyang ghstack dependencies: #161845	2025-09-03 03:12:18 +00:00
FFFrog	d789451ff6	[OpenReg] Migrate Accelerator Document from source/notes into source/accelerator (#161845 ) As the title stated. As the document grows, it will accumulate more and more content, so in order to make it easier for users to read and easier for developers to maintain, we have split this file into several separate files and placed them in a dedicated directory called "accelerator". Pull Request resolved: https://github.com/pytorch/pytorch/pull/161845 Approved by: https://github.com/albanD	2025-09-03 03:12:18 +00:00
Eli Uriegas	0447f2d99b	build: Add fallback commands to setup.py (#162009 ) Adds fallback commands for the following: * python setup.py install * python setup.py develop Ideally these should just work and should provide backwards compat. Thought process here is that multiple people rely on these commands and just because setuptools wants to drop support for this I don't think a lot of our downstream users who build from source are expecting these to be gone. This should provide some room for developers to move away from these commands until we have a unified frontend for doing all of these commands that should abstract most of these away. Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/162009 Approved by: https://github.com/clee2000, https://github.com/atalman	2025-09-03 02:56:10 +00:00
William Wen	d5643e8f3a	[dynamo, nested graph breaks] support nested graph breaks that cause skipped frames (#160470 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160470 Approved by: https://github.com/anijain2305 ghstack dependencies: #159329, #159678, #159817, #160138, #159786	2025-09-03 02:47:07 +00:00
Ke Wen	9b81fe281d	[c10d] Lessen density of barrier warning (#162015 ) Warnings are great, but too dense when there are many ranks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162015 Approved by: https://github.com/d4l3k, https://github.com/H-Huang	2025-09-03 02:20:54 +00:00
Ruben Rodriguez Buchillon	90f50f7e68	[inductor][ez] add hook for heuristics to adjust kernel input nodes (#161339 ) # why - some templates e.g. scale_mm need to unsqueeze/squeeze the nodes for codegen and heuristics - unified place where we can just adjust them for the template # what - inside get_mm_configs, return not the passed in kernel inputs, but allow the template heuristic to adjust them if necessary - the default implementation right now just passes them back this diff just adds the functionality, but does not exercise it other than the default (passthrough) # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520572](https://our.internmc.facebook.com/intern/diff/D81520572) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161339 Approved by: https://github.com/eellison, https://github.com/jansel ghstack dependencies: #161123, #161124, #161125, #161126, #161336, #161338	2025-09-03 01:03:57 +00:00
Ruben Rodriguez Buchillon	877062c9d3	[inductor][choices][ez] pass through layout and input_nodes (#161338 ) # why - params already available in get_mm_configs - simplifies the code - adds a possibility to edit the nodes/layout in a centralized place # what - add layout and input_nodes into extra_kwargs - no other modifications # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D81520575](https://our.internmc.facebook.com/intern/diff/D81520575) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161338 Approved by: https://github.com/jansel, https://github.com/eellison ghstack dependencies: #161123, #161124, #161125, #161126, #161336	2025-09-03 01:03:57 +00:00
Ruben Rodriguez Buchillon	c31dee6fa5	[inductor][ez] ExternChoice with maybe_append_choice (#161336 ) # why - make the API for ExternChoice the same as KernelTemplate - make it possible to use the same retrieval point as templates # what - add a maybe_append_choice to ExternChoice that under the hood invokes self.bind This pr does not actuate the new path, but just exposes it # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py ``` Differential Revision: [D81520578](https://our.internmc.facebook.com/intern/diff/D81520578) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161336 Approved by: https://github.com/jansel ghstack dependencies: #161123, #161124, #161125, #161126	2025-09-03 01:03:57 +00:00
Ruben Rodriguez Buchillon	6cb13dd3cc	[inductor] move scaled_mm template args into heuristics (#161126 ) # why - another step towards get_mm_configs providing all the kwargs needed to add a choice from a template. This in turn will allow us to send all templates through one single call, and handle modifications # what - use the infrastructure for template heuristics to provide extra kwargs that are fixed for a template/op pair to provide the suffix args and epilogue function/fn for scaled_mm # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D80670914](https://our.internmc.facebook.com/intern/diff/D80670914) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161126 Approved by: https://github.com/jansel ghstack dependencies: #161123, #161124, #161125	2025-09-03 01:03:57 +00:00
Ruben Rodriguez Buchillon	cbf01c11ff	[inductor] move addmm/baddbmm template args into heuristics (#161125 ) # why - another step towards get_mm_configs providing all the kwargs needed to add a choice from a template. This in turn will allow us to send all templates through one single call, and handle modifications # what - use the infrastructure for template heuristics to provide extra kwargs that are fixed for a template/op pair to provide the prefix args and epilogue function/fn for addmm/baddbmm - expand kernelinputs to also be able to shuttle around non tensor inputs (scalars) as is needed for alpha and beta # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k addmm ``` Differential Revision: [D80670912](https://our.internmc.facebook.com/intern/diff/D80670912) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161125 Approved by: https://github.com/jansel ghstack dependencies: #161123, #161124	2025-09-03 01:03:57 +00:00
Ruben Rodriguez Buchillon	7cdfa520a6	[inductor] move tma workspace in heuristics (#161124 ) # why - another step towards get_mm_configs providing all the kwargs needed to add a choice from a template. This in turn will allow us to send all templates through one single call, and handle modifications # what use the infrastructure for template heuristics to provide extra kwargs that are fixed for a template/op pair to provide the workspace_arg for all the tma templates # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k tma ``` Differential Revision: [D80670915](https://our.internmc.facebook.com/intern/diff/D80670915) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161124 Approved by: https://github.com/jansel ghstack dependencies: #161123	2025-09-03 01:03:57 +00:00
Ruben Rodriguez Buchillon	1485ac3264	[inductor] add notion of extra_kwargs for mm_configs (#161123 ) # why - some kwargs are choice independent but rather always the same for a specific op or template - this enables us to track those differently than the choice ones, and thus enables interception of them cleaner - maybe_append_choices can then be simplified to just pass through the kwargs # what - hookup for template heuristics to have per template/op extra kwargs that are always the same, for all choices - hookup for the called to get_mm_configs to provide template/op kwargs to override some of the template/choice kwargs this pr does not use the new machinery, and everything is empty for now. subsequent prs start using it to simplify ops # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D80670916](https://our.internmc.facebook.com/intern/diff/D80670916) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161123 Approved by: https://github.com/jansel	2025-09-03 01:03:57 +00:00
Alex Malyshev	c5b8a10be5	Fix compiler errors in 3.14 stub definitions (#161792 ) The functions here expect to return pointers, but currently aren't returning anything. Make them return NULL. The properties array wants an extra set of braces. One pair for the array, another for the first item in the array. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161792 Approved by: https://github.com/Skylion007	2025-09-03 00:58:41 +00:00
Ke Wen	a02ee4a816	[SymmMem] Use non-blocking version of getmem (#162006 ) As titled, so that the `getmem` calls in the loop are non-blocking, so that we max out the issuance rate. Also had a single `nvshmem_quiet()` at the end to make sure all the getmem calls complete. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162006 Approved by: https://github.com/ngimel	2025-09-02 23:55:22 +00:00
xinan.lin	81b7b16618	Reland "[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#161142 )" (#161949 ) This PR relands #161142, which was reverted in order to be able to revert another PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161949 Approved by: https://github.com/jansel	2025-09-02 23:43:27 +00:00
PyTorch MergeBot	4cdaf8265d	Revert "Update Kineto submodule (#161572 )" This reverts commit d33840c542b387ab08ba49aa6c45aa9567fd9be7. Reverted https://github.com/pytorch/pytorch/pull/161572 on behalf of https://github.com/seemethere due to This appears as though its causing downstream build failures in inductor workflows and for developers working locally. Going to revert out of an abundance of caution. ([comment](https://github.com/pytorch/pytorch/pull/161572#issuecomment-3247121981))	2025-09-02 23:28:19 +00:00
Kevin Fu	874069fbe4	Log Const Folded Node (#161827 ) Summary: Log folded nodes for easier debugging. Test Plan: sandcastle. Rollback Plan: Reviewed By: henryoier Differential Revision: D81352098 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161827 Approved by: https://github.com/henryoier, https://github.com/yewentao256	2025-09-02 23:23:51 +00:00
Ke Wen	ab643e4dbb	[SymmMem] Increase minimum nthreads to cover sync needs in NVL72 (#161983 ) `sync_remote_blocks` maps threads to peers. Previously min nthreads is warp size, which is too small to cover NVL72. Bumping it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161983 Approved by: https://github.com/ngimel	2025-09-02 23:18:08 +00:00
Ke Wen	5a2da090ed	[SymmMem] Make sure CUDA runtime is initialized before NVSHMEM init (#161232 ) Previously, without calling `torch.empty` before NVSHMEM init, we see error below: ``` src/host/init/init.cu:nvshmemi_check_state_and_init:1117: nvshmem initialization failed, exiting src/host/util/cs.cpp:21: non-zero status: 16: Device or resource busy, exiting... mutex destroy failed ``` Fixing it by calling a `cudaFree(nullptr)` to make sure CUDA runtime is initialized before NVSHMEM init. Removing all `torch.empty(1)` calls from tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161232 Approved by: https://github.com/ngimel ghstack dependencies: #161214	2025-09-02 22:53:28 +00:00
Justin Chu	bd39e47fee	[ONNX] Default to dynamo export (#159646 ) Set dynamo=True and enable fallback. 1. Implemented the compatible behavior where BytesIO objects as `f` is accepted 2. Update tests to explicitly set dynamo=False #151693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159646 Approved by: https://github.com/titaiwangms	2025-09-02 22:45:55 +00:00
zhxchen17	e4bd0ff4f8	[aot precompile] Handle closure variables. (#161990 ) We previously assumed aot precompile should only work on non closures. This is hard to enforce in practice because we will see a lot of cases with decorators (e.g. hugging face models) ``` def check_inputs(fn): def _fn(self, args, kwargs): for arg in args: assert arg.shape[0] > 1 return fn(args, **kwargs) return _fn @check_inputs def foo(x, y): a = x + x b = y + y c = a + b return c ``` It doesn't make sense to not support these cases since they are straightforward to do. This PR adds the logic to handle closures and make sure they can be precompiled properly. Differential Revision: [D81509535](https://our.internmc.facebook.com/intern/diff/D81509535/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161990 Approved by: https://github.com/angelayi	2025-09-02 22:26:04 +00:00
PyTorch MergeBot	15c77a8cfd	Revert "Add inductor provenance mapping for cpp extern kernel (#161656 )" This reverts commit 5e5870e858f60ff4bf87d03f3592097e934a9580. Reverted https://github.com/pytorch/pytorch/pull/161656 on behalf of https://github.com/jeffdaily due to causing failures on ROCm MI300, will add label to PR ([comment](https://github.com/pytorch/pytorch/pull/161656#issuecomment-3246965676))	2025-09-02 22:19:19 +00:00
Kurt Mohler	791eff96c8	[MPS] Add `igamma/igammac` ops (#161927 ) Fixes #161725 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161927 Approved by: https://github.com/malfet	2025-09-02 20:52:02 +00:00
Chris Leonard	80dd397f19	Argsort doc stable kwargs (#161986 ) Fixes #129311 Updated torch.argsort documentation to reflect that the 'stable' parameter is a keyword argument and not a normal parameter. @albanD, @soulitzer Pull Request resolved: https://github.com/pytorch/pytorch/pull/161986 Approved by: https://github.com/soulitzer	2025-09-02 20:42:53 +00:00
orangeH25	a75e8cd270	Add api info for torch._C._nn.pyi (#161958 ) Fix part of #148404 APIs involved are as follows: - max_pool2d_with_indices - max_pool3d_with_indices - elu - glu - max_unpool2d - max_unpool3d Pull Request resolved: https://github.com/pytorch/pytorch/pull/161958 Approved by: https://github.com/ezyang	2025-09-02 20:39:20 +00:00
PyTorch MergeBot	4e42aa8ffc	Revert "Always build USE_DISTRIBUTED. (#160449 )" This reverts commit b7034e9c924412bfbe8ee25a22d7e95239b5ca65. Reverted https://github.com/pytorch/pytorch/pull/160449 on behalf of https://github.com/jeanschmidt due to Breaking internal builds, can't be landed with forward fix due to internal tooling problems ([comment](https://github.com/pytorch/pytorch/pull/160449#issuecomment-3246689684))	2025-09-02 20:28:42 +00:00
PyTorch MergeBot	420c52ecf3	Revert "Make distributed modules importable even when backend not built (#159889 )" This reverts commit 626cb7df8161dd4ecb4fe43b60f37ce9076f56b1. Reverted https://github.com/pytorch/pytorch/pull/159889 on behalf of https://github.com/jeanschmidt due to Breaking internal builds, can't be landed with forward fix due to internal tooling problems ([comment](https://github.com/pytorch/pytorch/pull/159889#issuecomment-3246677982))	2025-09-02 20:24:01 +00:00
PyTorch MergeBot	82f63c8f6d	Revert "[HOTFIX] Disable DISTRIBUTED_C10D_DIRECT_ACCESS for now (#161946 )" This reverts commit 5561e45758d59c94605873d5db48ed459c004c3b. Reverted https://github.com/pytorch/pytorch/pull/161946 on behalf of https://github.com/jeanschmidt due to Need to be reverted so https://github.com/pytorch/pytorch/pull/159889 can be ([comment](https://github.com/pytorch/pytorch/pull/161946#issuecomment-3246663376))	2025-09-02 20:18:52 +00:00
Xu Han	b4ad38279b	[AOTI] Add Windows-compatible implementation of the mmap-related funcs (#161805 ) Add Windows-compatible implementation of the mmap-related functions. This code was validated on a small development project: https://github.com/xuhancn/cross_os_mmap?tab=readme-ov-file#cross_os_mmap Pull Request resolved: https://github.com/pytorch/pytorch/pull/161805 Approved by: https://github.com/angelayi	2025-09-02 20:07:41 +00:00
Wei Wang	ef8aabd424	[CD][CUDA13][ARM] aarch64 binary seems to be missing Triton dependency (#161833 ) Requires: filelock, fsspec, jinja2, networkx, setuptools, sympy, typing-extensions Seems to be missing Triton. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161833 Approved by: https://github.com/tinglvv, https://github.com/Skylion007, https://github.com/atalman	2025-09-02 19:31:14 +00:00
Isalia20	dcf385395d	[MPS] Move sparsemps testing from test_mps to test_sparse (#161852 ) Moves Sparse MPS testing from test_mps to test_sparse. Lots of skips now but I expect to remove them iteratively once ops are implemented Pull Request resolved: https://github.com/pytorch/pytorch/pull/161852 Approved by: https://github.com/malfet	2025-09-02 19:04:11 +00:00
Animesh Jain	600c25e9a1	[dynamo] Graph break on torch.cuda.synchronize (#161925 ) Today, AOTDispatcher ignores cuda.synchronize. Even if we wrap it in some HOP, we need it to be a barrier op to prevent any inductor reordering. So we graph break on it. Fixes https://github.com/pytorch/pytorch/issues/160751 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161925 Approved by: https://github.com/zou3519, https://github.com/jansel, https://github.com/mlazos	2025-09-02 19:00:21 +00:00
Ke Wen	f981a7fa52	[SymmMem] Add device guard before alloc (#161214 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161214 Approved by: https://github.com/ngimel	2025-09-02 18:53:45 +00:00
sibuachu	b7e207ca9f	Make error message descriptive (#150627 ) (#159423 ) Summary: Adding the number of local shards to error messages makes it easier to debug. Test Plan: UT Differential Revision: D72396478 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159423 Approved by: https://github.com/Saiteja64	2025-09-02 17:54:39 +00:00
Shangdi Yu	5e5870e858	Add inductor provenance mapping for cpp extern kernel (#161656 ) Summary: Add inductor provenance mapping for cpp extern kernel Test Plan: ``` buck run fbcode//caffe2/test/inductor:provenance_tracing -- -r test_cpu_extern_kernel ``` Differential Revision: D81161751 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161656 Approved by: https://github.com/angelayi	2025-09-02 17:54:04 +00:00
Yu, Guangye	a99d8d39bc	Update torch-xpu-ops commit pin (#161919 ) # Motivation 1. Fallback some linalg functionality such as `linalg_eig`, `linalg_householder_product`, `linalg_solve_triangular` to CPU; 2. Fix codegen dependency bug. # Additional Context This PR aims to fix https://github.com/pytorch/pytorch/issues/161498 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161919 Approved by: https://github.com/EikanWang	2025-09-02 17:09:07 +00:00
PyTorch MergeBot	d6b74568e2	Revert "Add __init__.pyi to torch/linalg (#160750 )" This reverts commit 9a665ca3c472384e9d722bddba79e5a7680f1abd. Reverted https://github.com/pytorch/pytorch/pull/160750 on behalf of https://github.com/jeanschmidt due to Seems that those errors are legitimate, and there is no test plan. I'll be proceeding with a revert ([comment](https://github.com/pytorch/pytorch/pull/160750#issuecomment-3246095383))	2025-09-02 16:53:55 +00:00
Shivam Raikundalia	d33840c542	Update Kineto submodule (#161572 ) Differential Revision: D81087601 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161572 Approved by: https://github.com/cyyever, https://github.com/aaronenyeshi	2025-09-02 16:31:55 +00:00
Justin Chu	f0c391102b	[ONNX] Remove private members from torch.onnx (#161546 ) Remove import of two functions - _run_symbolic_function - _run_symbolic_method to the `torch.onnx` namespace. Signed-off-by: Justin Chu <justinchuby@users.noreply.github.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161546 Approved by: https://github.com/titaiwangms ghstack dependencies: #161323, #161449	2025-09-02 16:31:23 +00:00
Jagadish Krishnamoorthy	a8d6943d36	ROCm: Enable overload tests from test_matmul_cuda (#161540 ) This patch enables hipblaslt backend tests for test_mm_bmm_dtype_overload and test_addmm_baddmm_dtype_overload. Tests were disabled as part of #150812 Rocblas backend tests are not enabled yet, WIP. Test command PYTORCH_TEST_WITH_ROCM=1 pytest test/test_matmul_cuda.py -k 'test_mm_bmm_dtype_overload' -v PYTORCH_TEST_WITH_ROCM=1 pytest test/test_matmul_cuda.py -k 'test_addmm_baddmm_dtype_overload' -v Pull Request resolved: https://github.com/pytorch/pytorch/pull/161540 Approved by: https://github.com/jeffdaily	2025-09-02 16:27:42 +00:00
Justin Chu	d11720efdb	[ONNX] Remove unused logic from internal verification module (#161449 ) Signed-off-by: Justin Chu <justinchuby@users.noreply.github.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161449 Approved by: https://github.com/xadupre, https://github.com/titaiwangms ghstack dependencies: #161323	2025-09-02 16:22:49 +00:00
Edward Yang	9a1c5c0a07	Detect torch function in lists as well (#160256 ) We basically follow the same pattern we do for tensor arguments. The major downside is we now have to traverse the entirety of the int list / etc where previously we didn't have. Benchmark suggests 2% regression for relevant things. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160256 Approved by: https://github.com/albanD	2025-09-02 16:22:42 +00:00
Justin Chu	524b78d4f6	[ONNX] Refactor torchscript based exporter (#161323 ) Refactor torchscript based exporter logic to move them to a single (private) location for better code management. Original public module and method apis are preserved. - Updated module paths in `torch/csrc/autograd/python_function.cpp` accordingly - Removed `check_onnx_broadcast` from `torch/autograd/_functions/utils.py` because it is private&unused @albanD / @soulitzer could you review changes in `torch/csrc/autograd/python_function.cpp` and `torch/autograd/_functions/utils.py`? Thanks! ## BC Breaking - Deprecated members in `torch.onnx.verification` are removed Differential Revision: [D81236421](https://our.internmc.facebook.com/intern/diff/D81236421) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161323 Approved by: https://github.com/titaiwangms, https://github.com/angelayi	2025-09-02 16:10:30 +00:00
Wang, Chuanqi	793fc12aff	[CD] Fix setup-xpu action issue (#161934 ) Fix XPU CD test failure, refer https://github.com/pytorch/pytorch/actions/runs/17370923627/job/49315624191 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161934 Approved by: https://github.com/atalman	2025-09-02 16:03:44 +00:00
Boyuan Feng	204697f0e6	[CUDAGraph] add config to error on skipping cudagraph (#161862 ) Many users want a config to force all cuda ops captured by cudagraph. When not possible, pt2 should error. This PR adds `torch._inductor.triton.cudagraph_or_error` for that (default as False). Also added an environment variable `TORCHINDUCTOR_CUDAGRAPH_OR_ERROR` to control. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161862 Approved by: https://github.com/ezyang	2025-09-02 15:28:22 +00:00
Guilherme Leobas	789d494212	Defer loading hipify until it is needed (#160824 ) Saves a few milliseconds when running a test case: Before: ``` $ PYTORCH_TEST_WITH_DYNAMO=1 python test/dynamo/cpython/3_13/test_float.py GeneralFloatCases.test_float_pow frames [('total', 1), ('ok', 1)] inline_call [] . ---------------------------------------------------------------------- Ran 1 test in 1.497s ``` After: ``` $ PYTORCH_TEST_WITH_DYNAMO=1 python test/dynamo/cpython/3_13/test_float.py GeneralFloatCases.test_float_pow frames [('total', 1), ('ok', 1)] inline_call [] . ---------------------------------------------------------------------- Ran 1 test in 0.909s ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160824 Approved by: https://github.com/zou3519	2025-09-02 15:27:37 +00:00
DrStone71	bc4db2c27f	CUDA 13 -- sm_120 -- Nvidia 5090 -- ptxas warning : Value of threads … (#161380 ) Bug fix: I have opened an issue ( https://github.com/pytorch/pytorch/issues/161376 ) and I suggest this bug fix. With this method it compiles fine. Fixes #161376 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161380 Approved by: https://github.com/eqy, https://github.com/malfet Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>	2025-09-02 13:27:57 +00:00
PyTorch MergeBot	e304ea4e69	Revert "[BE] Update xpu driver repo for CD used almalinux 8.10 (#157356 )" This reverts commit c78bbdf4102d2c13bf6aa1abe4352aa7bca401ca. Reverted https://github.com/pytorch/pytorch/pull/157356 on behalf of https://github.com/chuanqi129 due to This PR has performance regression on some workloads ([comment](https://github.com/pytorch/pytorch/pull/157356#issuecomment-3245319046))	2025-09-02 13:20:38 +00:00
Jean Schmidt	1f820de639	[ci] Increase shards for linux-jammy-py3.10-clang18-asan on pull.yml to 7 (#161968 ) [ci] Increase shards for linux-jammy-py3.10-clang18-asan to 7	2025-09-02 14:08:47 +02:00
Rohit Singh Rathaur	fca2601c9d	Improve error message for unsupported padding config (#160866 ) Fixes #160053 The previous error message `Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now` was not clear now we have ``` python3 Python 3.13.5 \| packaged by conda-forge \| (main, Jun 16 2025, 08:27:50) [GCC 13.3.0] on linux Type "help", "copyright", "credits" or "license" for more information. >>> import torch ... import torch.nn.functional as F ... a = torch.empty(2,2,2,2) ... F.pad(a, (1,1), mode="circular") ... Traceback (most recent call last): File "<python-input-0>", line 4, in <module> F.pad(a, (1,1), mode="circular") ~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/rrathaur/Desktop/pytorch/torch/nn/functional.py", line 5294, in pad return torch._C._nn.pad(input, pad, mode, value) ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^ NotImplementedError: Padding size 2 is not supported for 4D input tensor. Supported combinations for non-constant padding: - 2D or 3D input: padding size = 2 (pads last dimension) - 3D or 4D input: padding size = 4 (pads last 2 dimensions) - 4D or 5D input: padding size = 6 (pads last 3 dimensions) >>> ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160866 Approved by: https://github.com/mikaylagawarecki	2025-09-02 07:15:59 +00:00
Yu, Guangye	f8746b878d	Add uuid to XPU device properties (#161392 ) # Motivation Fix https://github.com/intel/torch-xpu-ops/issues/1955 Refer to https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_intel_device_info.md#device-uuid, `ext::intel::info::device::uuid` returns `std::array<unsigned char, 16>` as the UUID. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161392 Approved by: https://github.com/EikanWang, https://github.com/albanD	2025-09-02 06:41:32 +00:00
Tianyu Liu	8703debf66	[DTensor] select strategy with no redistribute when redistribute cost is 0 (#161882 ) Before this PR, the `_select_strategy` always selects the first strategy with minimum redistribute cost. This causes unexpected behavior when - multiple strategies have 0 redistribute costs - the first one with 0 redistribute cost may perform local chunking E.g. in memory efficient SDPA, the default orders of candidate strategies have a `Shard(2)` one before the `Replicate()` one. https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/_ops/_matrix_ops.py#L500-L512 When the input is `Replicate()`, `_select_strategy` will pick the `Shard(2)` strategy and do local chunking first, before local computation. This is clearly unexpected to users. In this PR, we improve `_select_strategy` so that when multiple strategies have 0 redistribute cost, we prioritize the one which keeps input unchanged. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161882 Approved by: https://github.com/ezyang	2025-09-02 05:41:56 +00:00
bobrenjc93	1aeb421c34	Make pattern matcher resilient to ddes (#161843 ) Motivated by the following discord support chat: https://discord.com/channels/1189498204333543425/1409578286186758195 ``` import torch @torch.compile(fullgraph=True, mode='reduce-overhead') def get_mask(W: torch.Tensor, percentage_nonzeros: torch.Tensor): total_elements = W.numel() k = int(total_elements * percentage_nonzeros) top_k_indices = torch.topk(torch.abs(W).flatten(), k)[1] mask = torch.zeros(total_elements, dtype=torch.bool, device=W.device) mask.scatter_(0, top_k_indices, True) mask = mask.view(W.shape) return mask x = torch.randn((128, 64), device='cuda') p = torch.tensor(0.50, device='cuda') get_mask(x, p) ``` Results in ``` InductorError: GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(TruncToInt(zuf0), 1) (unhinted: Eq(TruncToInt(zuf0), 1)). (Size-like symbols: none) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161843 Approved by: https://github.com/ezyang	2025-09-02 05:16:13 +00:00
Edward Yang	5561e45758	[HOTFIX] Disable DISTRIBUTED_C10D_DIRECT_ACCESS for now (#161946 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161946 Approved by: https://github.com/msaroufim	2025-09-02 05:01:46 +00:00
soulitzer	8171d6052e	Clear custom autograd Function ctx.to_save earlier (#161171 ) Fixes https://github.com/pytorch/pytorch/issues/161186 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161171 Approved by: https://github.com/albanD	2025-09-02 03:26:31 +00:00
Dev Sashidhar	d5e0f4202b	Fixes broken memory_viz link in CUDA memory docs (#161426 ) Fixes #161375 The "Using the visualizer" section in torch_cuda_memory.md had a link to https://pytorch.org/memory_viz written in inline Markdown link form. Strangely the same syntax worked earlier on the page, as the issue reporter mentioned, but in this spot it's rendered as a broken link. I wasn't able to pinpoint why the second occurrence was treated differently, but switching it to the Markdown autolink form fixes the problem consistently. I tested this by rebuilding the docs locally with make html and serving the HTML with a local http.server. With the autolink, the link resolves correctly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161426 Approved by: https://github.com/soulitzer	2025-09-02 02:06:54 +00:00
Xuehai Pan	13d66e2a66	[BE][Easy] restore #157584 after #158288 (#158541 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158541 Approved by: https://github.com/ezyang	2025-09-02 02:06:50 +00:00
Edward Yang	bbedc71fd3	test: ensure editable cached wrapper is respected (#160943 ) ## Summary - add a test verifying that editing the local cache wrapper is picked up after Dynamo reset ## Testing - `lintrunner -a` (fails: FLAKE8 failure, TEST_HAS_MAIN failure, CODESPELL failure, PYFMT failure) - `PYTHONPATH=. python test/inductor/test_codecache.py TestPyCodeCache.test_editable_cached_wrapper -v` ------ https://chatgpt.com/codex/tasks/task_e_68a3aa3fcc9883239b17d1f4250d1e89 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160943 Approved by: https://github.com/xmfan	2025-09-02 01:48:30 +00:00
Animesh Jain	e9481b6617	[dynamo] Prevent unnecessary recompile on disabled functions in the compiled frame (#161883 ) Trying out a re-impl of https://github.com/pytorch/pytorch/pull/160934 The above PR led to OOM, most likely because of the cache holding to a nested function (which if not held in the cache would have been garbage collected), which holds on to cuda tensors in its closure. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161883 Approved by: https://github.com/jansel	2025-09-02 01:13:48 +00:00
gaoyufeng	1c1b28d5b6	Fix slice scatter dtype consistency (#160851 ) Fixes #147842 Fix torch.slice_scatter type inconsistency issue. I noticed previous PRs on this have stalled, so I'm opening this new PR. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160851 Approved by: https://github.com/soulitzer	2025-09-02 01:08:26 +00:00
Xu Han	2a5c0785e2	[AOTI] Split overly long strings into smaller pieces when their length exceeds 16000, to fix MSVC C2026. (#161850 ) Split overly long strings into smaller pieces when their length exceeds 16000, to fix MSVC C2026. Reproducer: ```cmd pytest test\inductor\test_aot_inductor.py -v -k test_runtime_checks_large_cpu ``` Error message: <img width="1660" height="174" alt="image" src="https://github.com/user-attachments/assets/56fcd9be-24cb-484b-bfdc-f719ff2650b8" /> For MSVC C2026: https://learn.microsoft.com/en-us/cpp/error-messages/compiler-errors-1/compiler-error-c2026?view=msvc-170 Splitting overly long strings into smaller pieces fixes this issue. Locally validated: <img width="1122" height="232" alt="image" src="https://github.com/user-attachments/assets/cac54cc9-be51-4a5d-b408-06755a4debd5" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161850 Approved by: https://github.com/jansel	2025-09-02 00:09:01 +00:00
Edward Z. Yang	626cb7df81	Make distributed modules importable even when backend not built (#159889 ) This PR is greatly simplified now that it stacked on top of a PR that builds with distributed always. We only need to stub functions that may not be defined due to a backend not being enabled. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159889 Approved by: https://github.com/wconstab ghstack dependencies: #160449	2025-09-01 23:00:21 +00:00
Edward Yang	b7034e9c92	Always build USE_DISTRIBUTED. (#160449 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160449 Approved by: https://github.com/wconstab, https://github.com/albanD, https://github.com/dcci	2025-09-01 23:00:21 +00:00
PyTorch MergeBot	13b65196db	Revert "Defer loading hipify until it is needed (#160824 )" This reverts commit 403a3a393cda7e60f503f3b04b8805a845dcf45d. Reverted https://github.com/pytorch/pytorch/pull/160824 on behalf of https://github.com/atalman due to Broke slow tests test_utils.py::TestHipifyTrie::test_special_char_export_trie_to_regex [GH job link](https://github.com/pytorch/pytorch/actions/runs/17387051351/job/49355619371) [HUD commit link](`403a3a393c`) ([comment](https://github.com/pytorch/pytorch/pull/160824#issuecomment-3243281628))	2025-09-01 21:34:13 +00:00
Guilherme Leobas	403a3a393c	Defer loading hipify until it is needed (#160824 ) Saves a few milliseconds when running a test case: Before: ``` $ PYTORCH_TEST_WITH_DYNAMO=1 python test/dynamo/cpython/3_13/test_float.py GeneralFloatCases.test_float_pow frames [('total', 1), ('ok', 1)] inline_call [] . ---------------------------------------------------------------------- Ran 1 test in 1.497s ``` After: ``` $ PYTORCH_TEST_WITH_DYNAMO=1 python test/dynamo/cpython/3_13/test_float.py GeneralFloatCases.test_float_pow frames [('total', 1), ('ok', 1)] inline_call [] . ---------------------------------------------------------------------- Ran 1 test in 0.909s ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160824 Approved by: https://github.com/zou3519	2025-09-01 20:57:41 +00:00
Ivan Komarov	cbfb005f7c	Fix type checking for persistent loads in the weights-only unpickler (#161661 ) The error message here implies that we can only call `self.persistent_load(...)` for ints or tuples, but due to the second part of the type check being inverted, weights-only unpickler will throw an exception iff `pid` is an int. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161661 Approved by: https://github.com/Skylion007	2025-09-01 19:57:19 +00:00
Huy Do	d232a95d4a	[BE] Consolidate inductor benchmark Docker images and rename jobs (#161536 ) We have 4 different versions of inductor benchmark Docker images used in CI at the moment: 1. `pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks` is used by almost all inductor jobs including nightly benchmark 2. `pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks` runs inductor unit tests with python 3.12 3. `pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks` runs inductor unit tests with python 3.13 4. `pytorch-linux-jammy-py3-gcc11-inductor-benchmarks` runs inductor unit tests on CPU My proposal here is to clean up (2) and (3) and to keep (1) under the same setup from https://ghcr.io/pytorch/torchbench. Simplicity is the key here as inductor workflows are getting more and more complex: 1. Unit tests for Python variant like 3.12 and 3.13 were useful when they were first added to CI. They are much less useful now. [Flambeau](https://hud.pytorch.org/flambeau/s/3876ec7b-43f0-42c6-bfbf-899035e5bb77) shows a 0.97 correlation between them. And we are also moving to 3.14 nowadays. I want to choose 3.12 for (1), but will do this separately. This is also what TorchBench and vLLM are using on CI. 1. We are gradually cleaning up 3.9 on CI https://github.com/pytorch/pytorch/issues/161167 Another BE change here is to rename the jobs in various inductor workflows because I think names like `linux-jammy-cuda12_8-py3_10-gcc9-inductor-build` are too long and confusing to look at, better just use human-friendly names like `inductor-build`. Other information is already spelled out in the build environment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161536 Approved by: https://github.com/zou3519	2025-09-01 19:07:08 +00:00
PyTorch MergeBot	17fa8eec4a	Revert "Fix conv exhaustive autotuning and expand Exhaustive test coverage (#159387 )" This reverts commit 4b4cdcfe3af10df624878985caac4e595fbab54c. Reverted https://github.com/pytorch/pytorch/pull/159387 on behalf of https://github.com/atalman due to need to revert due to merge conflicts, please feel free to merge it back in once conflicts are resolved ([comment](https://github.com/pytorch/pytorch/pull/159387#issuecomment-3242945661))	2025-09-01 17:08:27 +00:00
PyTorch MergeBot	54e275e0d8	Revert "[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#161142 )" This reverts commit c83cbd2f2a2de2e3258f07de77d8740743df6d2d. Reverted https://github.com/pytorch/pytorch/pull/161142 on behalf of https://github.com/jeanschmidt due to This PR needs to be reverted to be able to revert another PR, this is due to merge conflicts, I am sorry for this. Please feel free to rebase and merge at your earliest convenience ([comment](https://github.com/pytorch/pytorch/pull/161142#issuecomment-3242937640))	2025-09-01 17:03:50 +00:00
PyTorch MergeBot	63a9c23fe9	Revert "[CUDA] Reuse blocks with record_stream during CUDA Graph capture in the CUDACachingAllocator (#158352 )" This reverts commit 190c391a28845a14df26abb228d26aa813efb20c. Reverted https://github.com/pytorch/pytorch/pull/158352 on behalf of https://github.com/atalman due to Broke cuda 13.0 nightly builds https://github.com/pytorch/pytorch/actions/runs/17382188549/job/49341981474 ([comment](https://github.com/pytorch/pytorch/pull/158352#issuecomment-3242871629))	2025-09-01 16:27:03 +00:00
Ting Lu	fefee08164	[CD] Add CUDA 13.0 Windows build (#161663 ) Test CUDA 13.0 windows build Pull Request resolved: https://github.com/pytorch/pytorch/pull/161663 Approved by: https://github.com/malfet, https://github.com/atalman	2025-09-01 15:27:17 +00:00
PyTorch MergeBot	21fae99c18	Revert "[cuBLASLt][FP8] `cuBLASLt` appears to support float8 rowwise-scaling on H100 (#161305 )" This reverts commit 55c289d5c104c4959cc125c0fb4fb50c9fc71102. Reverted https://github.com/pytorch/pytorch/pull/161305 on behalf of https://github.com/atalman due to Broke test_matmul_cuda.py::TestFP8MatmulCUDA::test_float8_error_messages_cuda [GH job link](https://github.com/pytorch/pytorch/actions/runs/17309011599/job/49140215634) [HUD commit link](`1190b7f73e`) ([comment](https://github.com/pytorch/pytorch/pull/161305#issuecomment-3242652672))	2025-09-01 14:56:47 +00:00
PyTorch UpdateBot	2ba65472dd	[xla hash update] update the pinned xla hash (#161396 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161396 Approved by: https://github.com/pytorchbot	2025-09-01 11:43:03 +00:00
Frank Lin	190c391a28	[CUDA] Reuse blocks with record_stream during CUDA Graph capture in the CUDACachingAllocator (#158352 ) ## Introduction During CUDA Graph capture, the CUDA caching allocator currently defers reclaiming blocks until capture ends. This is because CUDA forbids querying events recorded during capture (the CUDA operation is not executed during the capture stage), so the allocator cannot use its normal event-based logic. However, capture records a DAG (we call it the capturing graph) of work. We can use the capturing graph to determine when a block’s old lifetime is fully before future work, and safely reuse it within the same capture. This PR adds an experimental flag `graph_capture_record_stream_reuse: True\|False (default: False)`. When enabled, the allocator inserts lightweight free markers and uses capture ordering to decide if a freed block is safe to reuse during capture. If the proof cannot be established, we fall back to the existing post-capture path. ## Terms * Free marker: A capture-legal no-op (created with `cudaGraphAddEmptyNode`) inserted after the last captured use of the block on each stream that used it. * Terminal: The set of the latest operations of the stream (or the capturing graph). Any newly captured op on that stream will attach after all nodes in this set. For a stream currently capturing, it is the set of nodes returned in `dependencies_out` by `cudaStreamGetCaptureInfo`. ## When can we reuse a block during capture? ### Strong Rule (Graph-Wide Safety) This rule provides a universal guarantee that a block is safe for reuse by any stream in the graph. > A block is safe to reuse if every free marker is a predecessor of every terminal of all active streams in the graph. Why it's safe: This rule establishes a strict global ordering. 
Since any new operation on any stream must be appended after that stream's terminals, this condition guarantees that the block's new lifetime begins only after its old lifetime has completely ended everywhere. This prevents lifetime overlaps when the graph is replayed, ensuring correctness. ### Per-stream Rule (A Practical Optimization) The strong rule, while safe, is often unnecessarily restrictive. The `DeviceCachingAllocator` introduces a crucial constraint that allows for a simpler check. In `DeviceCachingAllocator`, `get_free_block` only returns blocks whose `block->stream == p.stream()`. In other words, we never reuse a block on a stream different from the allocation stream. This means we don't need to verify safety across the entire graph. We only need to confirm that the block is safe to reuse from the perspective of its own allocation stream. > Reuse a block for allocations on stream S if every free marker is a predecessor of every node in the terminal set of S. In short, a block is considered reusable on stream S as long as all markers marking it "free" are guaranteed to complete before any new work that might need it on stream S begins. ## Implementation * On `free(block)` during capture * For each stream in `block->stream_uses` and the allocation stream, insert a free marker (empty node) and make it that stream’s tail. * If we cannot place markers for all such streams (for example, a stream is not in capture), defer to the post-capture path. * Otherwise, store the marker handles and keep the block in the capture-private structures. * On `allocate(stream)` during capture (attempt per-stream reclaim) * Query the allocation stream S’s terminal via `cudaStreamGetCaptureInfo`. * For each deferred block, check whether it is allocated on this stream, and each of its free markers is a predecessor of the terminal. * If yes, hand the block to S for immediate reuse within the same capture. 
* If no, keep it deferred; it will be reconsidered as capture progresses and S’s terminal advances. * On capture end * Any still-deferred blocks follow the existing post-capture reclamation (event insertion/polling). External behavior remains unchanged if we cannot prove safety during capture. ## Examples (2 streams) <img width="641" height="801" alt="pytorch-remove-cudagraph-defer-reclaiming (6)" src="https://github.com/user-attachments/assets/41adc835-d448-483b-99ba-b4341cb7d2a2" /> * Case 0 — Unsafe The two frees are not ordered with respect to each other. For stream 1, the other stream’s free marker does not precede this stream’s terminal, so the per-stream condition fails. Counterexample intuition for the unsafe setups: imagine `f2(x)` runs for a long time. If DeviceCachingAllocator reused block `x` on a stream whose terminal is not ordered after the free markers, the new lifetime could overlap the old one on replay, risking use-after-free or data corruption. The per-stream rule prevents exactly this. * Case 1 — Reusable on stream 1 Stream 1’s terminal is after both frees, so every free marker precedes stream 1’s terminal. The block is reusable for allocations on stream 1. * Case 2 — Not reusable on stream 2, but this cannot occur in `DeviceCachingAllocator` This depicts reusing the block on stream 2 while stream 1’s free is not yet ordered before stream 2’s terminal. Though the block is not safe to reuse on stream 2, DeviceCachingAllocator will not choose that block for stream 2 anyway: `get_free_block` rejects blocks whose `stream != p.stream()`. So this case is unreachable. * Case 3 — Safe (strong rule holds) In this scenario, the terminal nodes of all streams are positioned after the block's free markers, satisfying the strong rule. This guarantees the block is safe for reuse by any stream in the capturing graph. However, since `DeviceCachingAllocator ` only reuses a block on its original allocation stream, verifying this strong condition is unnecessary. 
We only need to ensure the per-stream rule is met for the specific stream requesting the block. * Case 4 — Freeing after a join See the note below. ## Edge Case: Freeing after a join Our current dependency tracking has a limitation in scenarios where a block is freed after a stream join; see @galv's [comments here](https://github.com/pytorch/pytorch/pull/158352#pullrequestreview-3112565198). In case 4, we have a missed opportunity. Because the block's usage is not explicitly marked, we cannot determine that the block's actual last use may have occurred much earlier, long before the join. As a result, we must wait for the subsequent join before the block can be reused. ## Thanks Thanks to @galv for his great idea around graph parsing and empty nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158352 Approved by: https://github.com/ngimel Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-09-01 09:25:01 +00:00
Raman-RH	20bfb2539d	Skip compilation when FX graph has no calls and returns empty (#160536 ) Fixes #160437 Summary: This PR avoids compiling empty FX graphs generated during graph breaks. If there are no calls in the graph, we can just return the empty list of instructions. More precisely, In compile_and_call_fx_graph, if the FX graph contains no calls (count_calls(self.graph) == 0) and the return value list is empty, we now return an empty instruction list immediately Impact: module: dynamo Pull Request resolved: https://github.com/pytorch/pytorch/pull/160536 Approved by: https://github.com/Lucaskabela	2025-09-01 08:32:22 +00:00
Eli Uriegas	dd2519abe8	ci: Update sphinx, disable google search by default (#161793 ) Includes fixes from https://github.com/pytorch/pytorch_sphinx_theme/pull/207 Signed-off-by: Eli Uriegas <eliuriegas@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161793 Approved by: https://github.com/malfet, https://github.com/albanD	2025-09-01 07:43:39 +00:00
Ke Wen	2f6b4b1ad3	[4/N][SymmMem] Add `get_remote_tensor` + move up `get_buffer` and `get_signal_pad` (#161533 ) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): `get_remote_tensor `: return a symmetric tensor given a peer rank. The difference between `get_buffer` API and `get_remote_tensor` API: - the former accepts an offset, whereas the latter doesn't - the latter returns a symmetric tensor at `hdl.offset` on `peer`. As a refactorization, this PR also moves the implementation of `get_buffer` and `get_signal_pad` to the `SymmetricMemory` level as their code is common to all backends. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161533 Approved by: https://github.com/ngimel ghstack dependencies: #161470, #161471, #161532	2025-09-01 07:02:06 +00:00
Zheng, Zhaoqiong	6737e2c996	update supported OS for Intel client GPU (#161699 ) update supported OS for Intel client GPU Pull Request resolved: https://github.com/pytorch/pytorch/pull/161699 Approved by: https://github.com/chuanqi129, https://github.com/malfet	2025-09-01 05:45:09 +00:00
PyTorch UpdateBot	67c31dcd36	[vllm hash update] update the pinned vllm hash (#161867 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161867 Approved by: https://github.com/pytorchbot	2025-09-01 04:37:13 +00:00
Yu, Guangye	cb1e31362c	Remove background thread UT on XPU to fix CI (#161844 ) # Motivation Because we revert `torch._C._set_allocator_settings` in https://github.com/pytorch/pytorch/pull/161626, this UT becomes invalid. Fix https://github.com/pytorch/pytorch/issues/161697 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161844 Approved by: https://github.com/gujinghui	2025-09-01 03:45:26 +00:00
Sean McGovern	9a665ca3c4	Add __init__.pyi to torch/linalg (#160750 ) Fixes #149639 In an effort to improve the type checking coverage, added a stub file for the torch/linalg directory. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160750 Approved by: https://github.com/Skylion007	2025-08-31 22:39:05 +00:00
Edward Yang	d9d6dde0f4	Leak Python filenames so that we can give good dispatcher errors. (#160418 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160418 Approved by: https://github.com/zou3519	2025-08-31 22:31:39 +00:00
Scott Wolchok	68738beff7	PythonArgs::toBool: order cheap mutually exclusive checks first (#161455 ) symbools are not identical with Py_True or Py_False, so we can do those cheap checks first and at least get plain old bools to go fast. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161455 Approved by: https://github.com/Skylion007 ghstack dependencies: #161301, #161292, #161304, #161308, #161315, #161317, #161328, #161329, #161432	2025-08-31 21:35:48 +00:00
Ke Wen	25f4aaed9e	[3/N][SymmMem] Expose offset field from handle (#161532 ) As titled, so that kernels relying on direct pointers can use base address and `hdl.offset` to access remote memory. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161532 Approved by: https://github.com/ngimel ghstack dependencies: #161470, #161471	2025-08-31 18:08:57 +00:00
Ke Wen	61e18b5304	[2/N][SymmMem] Add MemPool allocator and tests (#161471 ) (Porting most of #161008) Hooking SymmetricMemory Allocator to MemPool so that user can create symmetric tensors with regular `torch.zeros`, `torch.arange` etc factories. Also so that our ops can have functional variants that create `out` tensors on symmetric memory. To end users, this PR supports a python UI as follows: ``` allocator = symm_mem.get_mempool_allocator(device) mempool = torch.cuda.MemPool(allocator) with torch.cuda.use_mem_pool(mempool): tensor = torch.arange(numel, dtype=dtype, device=device) ``` Added tests for both use cases above. Differential Revision: [](https://our.internmc.facebook.com/intern/diff/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161471 Approved by: https://github.com/ngimel ghstack dependencies: #161470	2025-08-31 18:08:57 +00:00
Rohit Manav	e92cd94153	removed duplicate imports (#161685 ) Fixes #161684 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161685 Approved by: https://github.com/Skylion007, https://github.com/ezyang	2025-08-31 16:21:49 +00:00
Raman Kumar	0d421ace32	fix spelling of word - when (#160185 ) just found a typo while understanding the codebase while working on another PR This fixes typo in word `when` in files ``` native/cpu/PaddingKernel.cpp native/cpu/batch_norm_kernel.cpp ``` @eqy Pull Request resolved: https://github.com/pytorch/pytorch/pull/160185 Approved by: https://github.com/yewentao256, https://github.com/ezyang	2025-08-31 13:38:23 +00:00
Tan Hoang	91f0bcf43f	[c10d][nvshmem] add nvshmem build rules and dependency for libtorch_cuda (#159562 ) Summary: Add guarded build option for nvshmem-related c10d code with `-c fbcode.caffe2_use_nvshmem` Guarded clause include nvshmem device + host code (static-linked) + these 2 files: - `torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu` - `torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159562 Approved by: https://github.com/Skylion007, https://github.com/kwen2501	2025-08-31 12:56:51 +00:00
Xia, Weiwen	75bc23cfc3	[CPU][Inductor] Improve performance of A16W8 GEMM template (#161148 ) Summary This PR improves the performance of A16W8 GEMM template by - Removing the config with block_n=48 & block_m=16 as it is not very efficient. - Using AMX microkernel when M >= 5 so that we use AMX instead of AVX512 for M=5~31. - Converting int8 values to bf16 with intrinsics instead of `at::vec::convert` as the latter does not have optimized implementation for this case. We saw up to >10% performance gain in various cases of running Llama-3.1-8b-instruct. Test plan Already covered by UT. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161148 Approved by: https://github.com/CaoE, https://github.com/jansel	2025-08-31 09:56:29 +00:00
Natalia Gimelshein	377033757a	Use vectorized stores for all dtypes in cat (#161649 ) resurrecting #151818 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161649 Approved by: https://github.com/Skylion007	2025-08-31 05:42:41 +00:00
PyTorch UpdateBot	f612045ce1	[vllm hash update] update the pinned vllm hash (#161835 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161835 Approved by: https://github.com/pytorchbot	2025-08-31 04:24:04 +00:00
Xu Han	ad7b748686	[AOTI] fix ut, add extension file type for Windows. (#161851 ) fix ut, add extension file type for Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161851 Approved by: https://github.com/ezyang	2025-08-31 01:13:29 +00:00
Isalia20	f3697b033e	[MPS] add bunch of unary funcs for sparse tensors (#161846 ) adds bunch of unary functions for sparse tensors Pull Request resolved: https://github.com/pytorch/pytorch/pull/161846 Approved by: https://github.com/malfet	2025-08-30 21:13:05 +00:00
Lakshay Garg	2d31c3d99d	Pass shared_ptr by value (#161834 ) The way AsyncAllreduceCUDADeviceWork is currently implemented, using it will force a copy of `shared_ptr<gloo::Context>` because `std::move` does nothing for a const ref. This PR changes the param type to shared_ptr<> instead of the const ref. This allows more efficient parameter passing. Here's an example that demonstrates the issue: ```cpp #include <memory> #include <iostream> struct Foo {}; void useFoo_ref(const std::shared_ptr<Foo>& f) { std::shared_ptr<Foo> internal = std::move(f); std::cout << "use_count: " << internal.use_count() << '\n'; } void useFoo_val(std::shared_ptr<Foo> f) { std::shared_ptr<Foo> internal = std::move(f); std::cout << "use_count: " << internal.use_count() << '\n'; } int main() { std::shared_ptr<Foo> f1 = std::make_shared<Foo>(); useFoo_ref(std::move(f1)); // prints "use_count: 2" std::shared_ptr<Foo> f2 = std::make_shared<Foo>(); useFoo_val(std::move(f2)); // prints "use_count: 1" } ``` This also aligns well with [C++ Core Guidelines][1] for handling smart pointers. [1]: https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines?utm_source=chatgpt.com#Rr-summary-smartptrs Pull Request resolved: https://github.com/pytorch/pytorch/pull/161834 Approved by: https://github.com/Skylion007, https://github.com/eqy, https://github.com/kwen2501	2025-08-30 18:00:37 +00:00
PyTorch MergeBot	fb2d5ea697	Revert "[2/N][SymmMem] Add MemPool allocator and tests (#161471 )" This reverts commit b291dc9684d00396239a0c7786b7aac71bf69c05. Reverted https://github.com/pytorch/pytorch/pull/161471 on behalf of https://github.com/atalman due to Multiple internal failures on PR #https://github.com/pytorch/pytorch/pull/161471 will need to land it via co-dev ([comment](https://github.com/pytorch/pytorch/pull/161471#issuecomment-3239283585))	2025-08-30 14:00:29 +00:00
PyTorch MergeBot	2e1345a0f8	Revert "[3/N][SymmMem] Expose offset field from handle (#161532 )" This reverts commit ff9533970ad76ed1905b90df6515aca50354c193. Reverted https://github.com/pytorch/pytorch/pull/161532 on behalf of https://github.com/atalman due to Multiple internal failures on PR #https://github.com/pytorch/pytorch/pull/161471 will need to land it via co-dev ([comment](https://github.com/pytorch/pytorch/pull/161532#issuecomment-3239282308))	2025-08-30 13:57:50 +00:00
PyTorch MergeBot	684ae48c16	Revert "[4/N][SymmMem] Add `get_remote_tensor` + move up `get_buffer` and `get_signal_pad` (#161533 )" This reverts commit 95516ad7e6d92ed131fb6057b29ec52e73190e3c. Reverted https://github.com/pytorch/pytorch/pull/161533 on behalf of https://github.com/atalman due to Multiple internal failures on PR #[161471](https://github.com/pytorch/pytorch/pull/161471) will need to land it via co-dev ([comment](https://github.com/pytorch/pytorch/pull/161533#issuecomment-3239278635))	2025-08-30 13:51:22 +00:00
FFFrog	b93f87d67b	[OpenReg] Integrate Event&Stream from OpenReg Backend into PyTorch (#160100 ) We integrated the openreg backend’s `Stream` and `Event` into PyTorch, all of which are similar to other accelerators like `CUDA`, `XPUs`, etc. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160100 Approved by: https://github.com/albanD ghstack dependencies: #161603, #160099, #161773	2025-08-30 13:21:28 +00:00
FFFrog	6284881b2a	[OpenReg] Add tests of device and memory for OpenReg (#161773 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161773 Approved by: https://github.com/albanD ghstack dependencies: #161603, #160099	2025-08-30 13:21:28 +00:00
FFFrog	aae9cbb6c0	[OpenReg] Add Event&Stream Support for OpenReg Backend (#160099 ) Referring to the signatures and functions of `Stream` and `Event` in CUDA, we use CPU multithreading and conditional variables to implement equivalent capabilities as the underlying foundation of torch_openreg. Changes: - Add stream capabilities for OpenReg - Add event capabilities for OpenReg - Add kernel launch entrypoint for OpenReg - Add testcases about stream and event for OpenReg - Add example for OpenReg Pull Request resolved: https://github.com/pytorch/pytorch/pull/160099 Approved by: https://github.com/albanD ghstack dependencies: #161603	2025-08-30 13:21:21 +00:00
FFFrog	dad2e50ac5	[OpenReg] Rename cpu_fallback_blacklist to cpu_fallback_blocklist (#161603 ) As the title stated. Related Infos: https://github.com/pytorch/pytorch/pull/158644#discussion_r2301460839 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161603 Approved by: https://github.com/albanD	2025-08-30 13:21:15 +00:00
Aleksandar Samardžić	37da7b777b	Fix _scaled_grouped_mm not reported as unsupported on SM100. (#161780 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161780 Approved by: https://github.com/danielvegamyhre, https://github.com/ngimel, https://github.com/Skylion007, https://github.com/eqy	2025-08-30 12:33:51 +00:00
xinan.lin	c83cbd2f2a	[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#161142 ) Fixes #161384, Fixes #161162, Fixes #160946, Fixes #160947, Fixes #160948 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161142 Approved by: https://github.com/jansel	2025-08-30 11:09:07 +00:00
Mwiza Kunda	b994f6e3b3	[inductor] check block options after broadcasting and singleton dims have been removed (#161602 ) This will allow for some more cases to use tensor descriptors e.g. before the following block params would not match because the innermost dimension does not have stride 1 ```python block_params=BlockParameters(shape=[64, 4, 1, 1], block_shape=[((XBLOCK + 3)//4), Min(4, XBLOCK), 1, 1], strides=[0, 1, 0, 0], offsets=[(xoffset//4), ModularIndexing(xoffset, 1, 4), 0, 0]) ``` After broadcasting dimensions and singleton dimensions are removed: ```python block_params=BlockParameters(shape=[4], block_shape=[Min(4, XBLOCK)], strides=[1], offsets=[ModularIndexing(xoffset, 1, 4)]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161602 Approved by: https://github.com/jansel	2025-08-30 08:10:51 +00:00
yucai-intel	f44ad54bc6	Update torch-xpu-ops commit pin (#161152 ) Update the torch-xpu-ops commit to [8b58040ee32689487f660462f655085f31506dab](`8b58040ee3`), includes: - Add vectorization path on maxpool forward channel last - Add FlightRecorder support for ProcessGroupXCCL - Fix random build failure on codegen - Suppress dllexport warning on Windows - Make torch-xpu-ops build depend on ATen XPU Pull Request resolved: https://github.com/pytorch/pytorch/pull/161152 Approved by: https://github.com/EikanWang Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>	2025-08-30 07:19:24 +00:00
Scott Wolchok	4d3ab2669b	Stop trying to intern arguments in PyObject_FastGetAttrString (#161432 ) If we want them interned, we should intern at callsites. (The numpy reference has bit rotted; see `b222eb66c7 (diff-6bdb6105198083838f51c57b55b3a49472ed23043bb40018f1ea41138e687163)`) Profiling a simple torchdispatch benchmark with perf before/after seems to show that time spent copying std::strings and interning Python strings is gone, though there is some noise and the improvement is very small. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161432 Approved by: https://github.com/ezyang ghstack dependencies: #161301, #161292, #161304, #161308, #161315, #161317, #161328, #161329	2025-08-30 06:55:43 +00:00
Scott Wolchok	0ee8a4e281	Fix accidental copy in pushPyOutToStack (#161329 ) `auto` forces a copy. Confirmed this did something noticeable with perf. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161329 Approved by: https://github.com/zpcore, https://github.com/fduwjj, https://github.com/Skylion007, https://github.com/bdhirsh ghstack dependencies: #161301, #161292, #161304, #161308, #161315, #161317, #161328	2025-08-30 06:55:43 +00:00
Scott Wolchok	eb9526ae35	Avoid double hash lookup in torch._library.simple_registry (#161328 ) Not a huge cost, but free win is free. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161328 Approved by: https://github.com/Skylion007 ghstack dependencies: #161301, #161292, #161304, #161308, #161315, #161317	2025-08-30 06:55:43 +00:00
Scott Wolchok	302d860157	Improve assert perf in _python_dispatch._correct_storage_aliasing (#161317 ) This assertion was expensive because of is_traceable_wrapper_subclass. Finding a cheap check to run first that's likely to let us skip the rest seems to improve things significantly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161317 Approved by: https://github.com/ezyang, https://github.com/XilunWu, https://github.com/bdhirsh ghstack dependencies: #161301, #161292, #161304, #161308, #161315	2025-08-30 06:55:42 +00:00
Scott Wolchok	0c459f2921	Fix pybind enum efficiency issue in return_and_correct_aliasing (#161315 ) Scanning a list of pybind enums with `in` is slow. See NOTE in code for full explanation. This is a significant optimization; will be updating the torchdispatch/return_and_correct_aliasing portion of this stack with benchmark and results soonish. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161315 Approved by: https://github.com/Skylion007, https://github.com/bdhirsh ghstack dependencies: #161301, #161292, #161304, #161308	2025-08-30 06:55:42 +00:00
Scott Wolchok	b96bcb9fdb	Optimize _python_dispatch.return_and_correct_aliasing.get_write_alias (#161308 ) - Empty containers are Falsey - Hoist cheap checks first - Microbenchmarked single-element set access method Benchmark code: ``` import timeit to_test = [ ('list(x)', 'x = set([3])'), ('x[0]', 'x = [3]'), ('list(x)[0]', 'x = set([3])'), ('next(iter(x))', 'x = set([3])'), ] for (stmt, setup) in to_test: res = timeit.timeit(stmt=stmt, setup=setup) print(f"Time for `{stmt}`: {res}") ``` Result with Python 3.13 on Mac (with excess digits manually trimmed; directionally matches result on Linux) ``` Time for `list(x)`: 0.03418 Time for `x[0]`: 0.00852 Time for `list(x)[0]`: 0.03561 Time for `next(iter(x))`: 0.02278 ``` FWIW, I was surprised by this result, so I guess I'm glad I wrote the benchmark! Pull Request resolved: https://github.com/pytorch/pytorch/pull/161308 Approved by: https://github.com/Skylion007, https://github.com/bdhirsh ghstack dependencies: #161301, #161292, #161304	2025-08-30 06:55:42 +00:00
Scott Wolchok	2089ed3d5e	Use `is`, not ==, to check exact type matches in _python_dispatch (#161304 ) `is` checks object identity and is more efficient. Google seems to confirm it is the correct way to do an exact type check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161304 Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/bdhirsh ghstack dependencies: #161301, #161292	2025-08-30 06:55:42 +00:00
Scott Wolchok	1a64bf2636	Stop accessing func._schema in _python_dispatch.correct_storage_aliasing (#161292 ) func._schema is a pybind, accessing the arguments/returns is expensive, we have no reason to do it anyway, and even though #161301 makes accessing the arguments/returns less expensive, this still seems to improve performance. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161292 Approved by: https://github.com/wconstab, https://github.com/malfet, https://github.com/bdhirsh ghstack dependencies: #161301	2025-08-30 06:55:42 +00:00
Scott Wolchok	5d35b49ba7	Fix forced copying def_property_readonly for FunctionSchema & friends (#161301 ) This took me a bit to figure out and I'm pretty sure I've looked at this code before. Pybind uses `return_value_policy::reference_internal` for `def_property`, which [causes the owning object to be kept alive for the lifespan of the return value](https://pybind11.readthedocs.io/en/stable/advanced/functions.html), allowing the getter to safely avoid copying the property value. However, lambdas act like they return `auto`, not `decltype(auto)`, so our lambdas themselves were forcing copies! Testing: observed std::vector<Argument> copying disappear in Linux perf profile of someOpInfo._schema.arguments/returns (in _python_dispatch.correct_storage_aliasing). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161301 Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/wconstab	2025-08-30 06:55:42 +00:00
CaoE	db622842bc	[Inductor][CPP] Optimize config selecting for micro gemm when number of mxn blocks can not occupy all the threads (#161144 ) If the number of mxn blocks cannot occupy all the threads, using a smaller register block size gives better performance since the computing size per thread is smaller. It may get ~20% performance improvement for the real case `m1_n512_k4096`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161144 Approved by: https://github.com/leslie-fang-intel	2025-08-30 05:53:49 +00:00
Boyuan Feng	77d8e98e1b	[Inductor] update exp codegen for better precision (#161829 ) Prior to this PR, we have: ``` [Default Behavior] uses `tl.math.exp({x})`: eager diff: tensor(2.6935e-06, device='cuda:0', dtype=torch.float64) compile diff: tensor(9.2757e-06, device='cuda:0', dtype=torch.float64) eager_latency:0.0013996509159580942, compile_latency:0.0013981951951980592 TORCHINDUCTOR_USE_FAST_MATH=1 uses `tl.extra.libdevice.exp2(tmp0 * 1.4426950408889634)`: eager diff: tensor(2.2315e-06, device='cuda:0', dtype=torch.float64) compile diff: tensor(3.5329e-06, device='cuda:0', dtype=torch.float64) eager_latency:0.0013982331859319662, compile_latency:0.0013824134564199367 Update inductor to use `tl.extra.libdevice.exp(tmp0)`: eager diff: tensor(2.3421e-06, device='cuda:0', dtype=torch.float64) compile diff: tensor(2.3421e-06, device='cuda:0', dtype=torch.float64) eager_latency:0.0014109122834153282, compile_latency:0.0014062877025520593 ``` Since `tl.extra.libdevice.exp` leads to both better precision and on-par latency, we use it by default now. Note that `tl.extra.libdevice.exp` used to have a perf issue in [January 2025](https://github.com/triton-lang/triton/issues/5735) since it used `ex2.approx.f32` instead of `ex2.approx.ftz.f32`. So `tl.extra.libdevice.exp2(tmp0 * 1.4426950408889634)` was used as a workaround. I double checked that the issue is resolved and `tl.extra.libdevice.exp` also uses [ex2.approx.ftz.f32](https://github.com/triton-lang/triton/issues/5735#issuecomment-3238421293) today. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161829 Approved by: https://github.com/jansel	2025-08-30 04:56:51 +00:00
Tianren Gao	2fed4fb464	[FlexAttn] Fix Paged Attention Accuracy via Upper Mask Mod and Prevent Invalid Memory Access (#160861 ) Fixes #159247 Issue 1: Accuracy Problem with Non-Divisible KV Sequences --------------------------------------------------------- ### Background Paged attention in flex decoding produced inaccurate results when KV sequence length is not divisible by block size. For example, when `KV_S = 64` and `block_size = 128`, the output didn't match standard attention accuracy. ### Root Cause The current paged attention does not apply upper mask mod when converting from logical to physical mask mod. Instead, it uses a noop_mask by default which makes all the values unmasked, leading to an accuracy mismatch. Adding a upper mask mod according to the origin actual kv_len (64 in this test case) resolves the issue. ### Solution * Applied proper upper bound masking: Updated all calls to `convert_logical_block_mask` to pass `kv_len` as a tensor with proper shape `[B, KV_S]` to provide information of actual batched KV sequence length. The function now correctly applies upper bound checks using the actual KV sequence lengths for each batch ### Files Modified * `torch/nn/attention/experimental/_paged_attention.py`: Added `kv_len` parameter as a tensor to `get_mask_mod` and applied upper mask to the new mask mod. * `test/inductor/test_flex_attention.py`: Fixed all related `kv_len` parameter call in the tests * `test/inductor/test_flex_decoding.py`: Fixed all related `kv_len` parameter call in the tests Issue 2: Invalid Memory Access (IMA) in Triton Kernels ------------------------------------------------------ ### Background The Triton kernel for flex attention was experiencing invalid memory access errors when running with compute sanitizers, particularly with short KV sequences and small batch sizes. 
### Root Cause * Kernel launches CTAs (Cooperative Thread Arrays) proportional to GPU's multi-processor count (108 via `SPLIT_KV`) * With small workloads, many CTAs remain idle but still attempt to access `kv_indices` with invalid `indices_idx` values * This caused out-of-bounds memory access violations ### Solution Implemented boundary checks with early exit: 1. Added `MAX_VALID_KV_IDX` parameter in `torch/_inductor/kernel/flex/flex_decoding.py` * Calculate maximum valid KV index based on actual `kv_indices` tensor size and pass it to Triton template 2. Added early exit logic in `torch/_inductor/kernel/flex/templates/flex_decode.py.jinja` * Boundary checks before accessing `kv_indices` in both normal and full blocks * Idle CTAs with invalid `indices_idx` skip computation entirely This prevents invalid memory access while reducing wasted computation on idle thread blocks. Testing & Validation -------------------- ### Accuracy Tests * Added comprehensive test cases covering KV sequences not divisible by block sizes * Verified output matches standard attention for various sequence length combinations ### Sanitizer Results `========= COMPUTE-SANITIZER Starting standalone test_max_autotune... Running test_max_autotune on device: cuda max_autotune config: True test_max_autotune completed successfully! Test passed! ========= ERROR SUMMARY: 0 errors` Before: More than 13720 invalid memory access errors with sanitizers After: Clean execution with 0 errors Both fixes work together to ensure paged attention produces accurate results while running safely without memory access violations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160861 Approved by: https://github.com/BoyuanFeng	2025-08-30 04:50:23 +00:00
PyTorch UpdateBot	76f81b56d3	[audio hash update] update the pinned audio hash (#161836 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161836 Approved by: https://github.com/pytorchbot	2025-08-30 04:23:04 +00:00
Howard Huang	82d2d23e85	Add batch option for send/recv_object_list (#160342 ) `send_object_list` and `recv_object_list` use regular `send`/`recv` P2P ops which means that they will create 2-rank NCCL communicators between ranks if the communicators have not been initialized. This adds an option `use_batch` which will call the send/recv with `batch_isend_irecv` which will re-use the communicators already initialized for collectives in the group. --- BatchP2P ops, creates (or use existing) communicator keyed by device index Regular P2P Ops, creates (or use existing) dedicated 2-rank communicators keyed by “rank1:rank2” See: `c8205cb354/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (L3980-L4008)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160342 Approved by: https://github.com/wconstab	2025-08-30 03:29:09 +00:00
PyTorch MergeBot	e015de1969	Revert "Use vectorized stores for all dtypes (#161649 )" This reverts commit f0a517e333d6204f560d8061a4f70523060c93bf. Reverted https://github.com/pytorch/pytorch/pull/161649 on behalf of https://github.com/ngimel due to buggy ([comment](https://github.com/pytorch/pytorch/pull/161649#issuecomment-3238895967))	2025-08-30 03:13:40 +00:00
Nikita Shulga	0af56fc33e	Cleanup stale submodule directories after checkout (#161748 ) Fixes https://github.com/pytorch/pytorch/issues/161510 Test plan: ``` % cd third_party/kineto % git checkout fe80f9319479265f7a208e615e16a363b993d50c; git submodule update --init --recursive M libkineto/third_party/dynolog M libkineto/third_party/fmt M libkineto/third_party/googletest Previous HEAD position was 5e75018 Fix Local Time on Windows Builds (#1104) HEAD is now at fe80f93 Fix MSVC Error (#1134) Submodule path 'libkineto/third_party/dynolog': checked out 'd2ffe0a4e3acace628db49974246b66fc3e85fb1' Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp': checked out 'b1234816facfdda29845c46696a02998a4af115a' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/civetweb': checked out 'd7ba35bbb649209c66e582d5a0244ba988a15159' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/googletest': checked out 'e2239ee6043f73722e7aa812a459f54a28552929' Submodule path 'libkineto/third_party/fmt': checked out '40626af88bd7df9a5fb80be7b25ac85b122d6c21' Submodule path 'libkineto/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723' % git checkout 5e75018; git submodule update --init --recursive M libkineto/third_party/dynolog M libkineto/third_party/fmt M libkineto/third_party/googletest Previous HEAD position was fe80f93 Fix MSVC Error (#1134) HEAD is now at 5e75018 Fix Local Time on Windows Builds (#1104) warning: unable to rmdir 'third_party/prometheus-cpp': Directory not empty Submodule path 'libkineto/third_party/dynolog': checked out '7d04a0053a845370ae06ce317a22a48e9edcc74e' Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '58d77fa8070e8cec2dc1ed015d66b454c8d78850' Submodule path 'libkineto/third_party/fmt': checked out 
'0041a40c1350ba702d475b9c4ad62da77caea164' Submodule path 'libkineto/third_party/googletest': checked out '7aca84427f224eeed3144123d5230d5871e93347' % cd ../.. % git status HEAD detached from 649e397c6de Changes not staged for commit: (use "git add <file>..." to update what will be committed) (use "git restore <file>..." to discard changes in working directory) (commit or discard the untracked or modified content in submodules) modified: third_party/kineto (untracked content) % time git submodule foreach --recursive git clean -ffdx ... git submodule foreach --recursive git clean -ffdx 0.47s user 0.96s system 88% cpu 1.625 total % git status HEAD detached from 649e397c6de ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161748 Approved by: https://github.com/atalman	2025-08-30 01:30:44 +00:00
Irakli Salia	8627a19adf	[MPS] sparse add unary funcs + add for sparse tensors (#160839 ) Adds several unary functions and add. Enables tests for unary functions in test_sparse but not enabling other tests yet, needs more ops before we fully migrate to testing SparseMPS with `test_sparse.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160839 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-30 01:09:00 +00:00
eellison	ebfee60101	[WIP] more aggressive persistent reduction (#161055 ) Gives 18% speedup on rms norm (2048, 32768). And we have seen other instances where inductor is not aggressive enough about codegening persistent reductions - e.g. 39% on [this kernel from torch ao](https://github.com/pytorch/pytorch/issues/159769#issuecomment-3188568335). Codegen-ing persistent reductions can be risky if you run out of registers. Here, I'm effectively making persistent reductions an option of looped reductions by setting RBLOCK == rnumel, so that we can still fallback to looped reductions as needed. As criteria: - there needs to be significant memory savings from doing a persistent reduction (by keeping memory in register and avoiding another iteration over input) - we should not be coalescing on x dimension, otherwise large rblock will inhibit coalescing - we should not be especially register or arithmetic intensive (this last part uses mem_ops_per_thread, but could be improved). Still need to do dashboard run, although I'm not sure we get a lot of large rblock in our benchmarks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161055 Approved by: https://github.com/jansel	2025-08-30 01:08:45 +00:00
PyTorch MergeBot	6db872fa2c	Revert "Cleanup stale submodule directories after checkout (#161748 )" This reverts commit 0e45023cf9cbe1cf18279c1b0d391ea9464e7731. Reverted https://github.com/pytorch/pytorch/pull/161748 on behalf of https://github.com/malfet due to I still see the same failures, and could not understand from the log whether those checks are running or not ([comment](https://github.com/pytorch/pytorch/pull/161748#issuecomment-3238791895))	2025-08-30 01:04:11 +00:00
Nikita Shulga	7c30a9d7fc	[MPS] Add slow version of `kthvalue` (#161817 ) Which heavily borrows implementation logic from `topk` As this method is non-deterministic, modified the logic for cpu-ops indices comparison with just an equality statement, as by default random numbers picked for input tensor allow for quite a lot of overlaps Pull Request resolved: https://github.com/pytorch/pytorch/pull/161817 Approved by: https://github.com/dcci	2025-08-30 00:44:29 +00:00
Chien-Chin Huang	c1e504ec2f	[SymmMEM] Move AsyncTP tests to a separate test class (#161820 ) We move AsyncTP tests to a separate test suite because 1) Async TP ops are not the core symmetric memory APIs, they are more like applications, 2) MultiProcContinuousTest will skip all the following tests if a test fails (we should fix this too). We still want to get the test signals for the core symmetric memory APIs when Async TP ops fail. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161820 Approved by: https://github.com/kwen2501	2025-08-30 00:40:40 +00:00
Parshant Sharma	4ad9fbc83a	Unify TypeAlias definitions in optimizer.py (#161493 ) Fixes #160834 This issue unifies TypeAlias definitions in [optimizer.py](https://github.com/pytorch/pytorch/blob/main/torch/optim/optimizer.py) This ensures the following: - Consistency and Standardization - Enhanced IDE support - Prevents runtime confusion Pull Request resolved: https://github.com/pytorch/pytorch/pull/161493 Approved by: https://github.com/Skylion007	2025-08-30 00:35:02 +00:00
Wang, Chuanqi	0f81e7f640	[CI] Fix XPU ci test permission issue (#161389 ) Due to new test runners, refer https://github.com/pytorch/pytorch/actions/runs/17161094208/job/48694776064#step:2:124 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161389 Approved by: https://github.com/atalman	2025-08-30 00:03:59 +00:00
Isalia20	3daf20f8e1	[MPS] fix empty input in posneg functions (#161824 ) fix empty posneg function for mps: ```python import torch input_tensor = torch.empty(0, device="mps") out_pos = torch.isposinf(input_tensor) ``` Gives: ``` RuntimeError: [srcBuf length] > 0 INTERNAL ASSERT FAILED at "/Users/Irakli_Salia/Desktop/pytorch/aten/src/ATen/native/mps/OperationUtils.mm":551, please report a bug to PyTorch. Placeholder tensor is empty! ``` on main branch Pull Request resolved: https://github.com/pytorch/pytorch/pull/161824 Approved by: https://github.com/malfet	2025-08-29 23:12:04 +00:00
Zhang, Liangang	3e459491b5	Enable XPU path for FlexAttention (#143553 ) [#RFC153024](https://github.com/pytorch/pytorch/issues/153024) Motivation 1. The Attention has been the critical performance bottleneck in the current LLM models, and FlexAttention is a good choice to cover the broad variants in the transformers series models. With FlexAttention, it is easy for us to enable the paged attention and fused SDPA in the transformers repo on XPU device. Besides, it also provide a candidate to process attention in LLM ecosystem libraries ., e.g., vLLM, SGLang on XPU device. 2. FlexAttention is good start point to push the intel triton based GEMM kernel to be matured. FlexAttention provide both flexattention kernel and flexdecoding kernel to cover both compute bound and memory bound GEMM computation, and different shapes should also been supported to serve LLM inference., e.g. head_dim=64, 96, 128, 256. What does this PR do? 1. Enable the device type for Flexattention kernel and UTs to ensure all important UTs pass on XPU device. 2. For E2E model inference, ensure the functionality of LLM models inference with FlexAttention to be ready. Pull Request resolved: https://github.com/pytorch/pytorch/pull/143553 Approved by: https://github.com/EikanWang, https://github.com/drisspg Co-authored-by: Mao Yunfei <yunfei.mao@intel.com> Co-authored-by: Xingyuan Li <xingyuan.li@intel.com> Co-authored-by: majing <jing1.ma@intel.com> Co-authored-by: Xiao, Wang <wang.xiao@intel.com>	2025-08-29 23:10:58 +00:00
Andrey Talman	0e2c8af5a6	[CI/CD] Windows set git config --global core.ignorecase false (#161813 ) Make sure git on Windows has core.ignorecase set to false Pull Request resolved: https://github.com/pytorch/pytorch/pull/161813 Approved by: https://github.com/malfet	2025-08-29 23:04:43 +00:00
Ruben Rodriguez Buchillon	ea27464a79	[inductor][decompose k] disable on everything other than cuda (#161795 ) # why - untested so far # what - add an empty config heuristic for all devices for decompose k - the cuda heuristic, because it is more specific, will still be picked up - add notes explaining how to enable on other devices # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v -k "decompose_k" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161795 Approved by: https://github.com/PaulZhang12 ghstack dependencies: #161767	2025-08-29 22:41:27 +00:00
Ruben Rodriguez Buchillon	45eccf414f	[inductor][heuristics registry] missing heuristic is not an error anymore, cross device heuristics (#161767 ) # why - not having a heuristic is an error but should not crash, just provide 0 configs - some heuristics are cross device type - cleaner to be explicit about being cross device type than having to enumerate every possible device type # what - on registration, supply device_type=None (explicitly) to say this heuristic is cross device - test to guard the heuristics hierarchies # testing ``` python3 -bb -m pytest test/inductor/test_template_heuristics_registry.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161767 Approved by: https://github.com/PaulZhang12	2025-08-29 22:41:27 +00:00
Wang, Chuanqi	037f3bd475	[CI] Migrate XPU build and test to python 3.10 (#161708 ) Follow #161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161708 Approved by: https://github.com/malfet	2025-08-29 22:31:39 +00:00
PyTorch MergeBot	6e548c1a87	Revert "[CI] Migrate XPU build and test to python 3.10 (#161708 )" This reverts commit 2a70d98abf8256d3d768eff028fca20198579824. Reverted https://github.com/pytorch/pytorch/pull/161708 on behalf of https://github.com/ZainRizvi due to Sorry but this is causing rocm jobs to fail. See: test/inductor/test_max_autotune.py::TestMaxAutotuneSubproc::test_max_autotune_addmm_search_space_EXHAUSTIVE_dynamic_True [GH job link](https://github.com/pytorch/pytorch/actions/runs/17303310877/job/49125664617) [HUD commit link](`2a70d98abf`) ([comment](https://github.com/pytorch/pytorch/pull/161708#issuecomment-3238359944))	2025-08-29 21:49:15 +00:00
zhxchen17	eb78757708	[inductor] Lift fw_compiler and bw_compiler as toplevel functions. (#161762 ) This is a no-op refactor to compiler_fx which lifts the logic of fw_compiler and bw_compiler to toplevel, so that they can be reused in a different stack (e.g. precompile). Differential Revision: [D81292968](https://our.internmc.facebook.com/intern/diff/D81292968/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161762 Approved by: https://github.com/angelayi, https://github.com/yushangdi	2025-08-29 21:46:55 +00:00
David Berard	05eeb29976	[inductor][triton] support JITCallable._hash_lock (#161768 ) Fixes #161618 Triton # 7974 introduces a threading.RLock() in JITCallable, which is not pickle-able. This PR adds this field to the list of un-pickleable fields that need to be handled specially. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161768 Approved by: https://github.com/xuzhao9	2025-08-29 21:20:02 +00:00
Tristan T	18b4fdde8f	Add MTIA to floor_divide op (#161575 ) Summary: Missed file in op registration resulting in fallback during test Reviewed By: andyanwang, srsuryadev Differential Revision: D81085615 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161575 Approved by: https://github.com/albanD, https://github.com/malfet	2025-08-29 20:39:29 +00:00
PyTorch MergeBot	f6368e934e	Revert "[MPS] sparse add unary funcs + add for sparse tensors (#160839 )" This reverts commit 93c5112f46a978a029644ae599979416ead5c917. Reverted https://github.com/pytorch/pytorch/pull/160839 on behalf of https://github.com/atalman due to test_sparse_csr.py::TestSparseCompressedCPU::test_consistency_SparseCSR_asinh_cpu_complex64 [GH job link](https://github.com/pytorch/pytorch/actions/runs/17329155095/job/49201551217) [HUD commit link](`93c5112f46`) ([comment](https://github.com/pytorch/pytorch/pull/160839#issuecomment-3238093296))	2025-08-29 19:55:39 +00:00
Yidi Wu	bf6aaba0f7	[while_loop] avoid aliasing when body_fn never executes (#160670 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160670 Approved by: https://github.com/zou3519 ghstack dependencies: #160548, #160669	2025-08-29 19:36:37 +00:00
Yidi Wu	456493f7ed	[while_loop][inductor] remove offset check for while_loop (#160669 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160669 Approved by: https://github.com/zou3519 ghstack dependencies: #160548	2025-08-29 19:36:37 +00:00
Huy Do	c74e301455	Bump TorchBench version (#161461 ) To include the latest fixes from TorchBench. I'll setup a nightly commit hash update for this next Pull Request resolved: https://github.com/pytorch/pytorch/pull/161461 Approved by: https://github.com/malfet	2025-08-29 19:21:07 +00:00
Scott Wolchok	67457dbb9d	Fix non-const reference arguments in torch/csrc/jit/python/init.cpp (#161300 ) Shouldn't be any generated code impact, just fixing bad practice. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161300 Approved by: https://github.com/wconstab, https://github.com/malfet ghstack dependencies: #161286	2025-08-29 19:01:32 +00:00
Natalia Gimelshein	e9bbd28f22	make einsum produce contiguous inputs in more cases (#161755 ) Fixes #161729 Written by codex This won't produce contiguous inputs for all einsum applications, because we flatten all right-only and left-only dimensions, so if right and left operand dimensions are interleaved in output, we cannot (with current algo) produce contiguous output, however, for common cases like in the linked issue it works. Let's see what CI says Pull Request resolved: https://github.com/pytorch/pytorch/pull/161755 Approved by: https://github.com/malfet, https://github.com/albanD	2025-08-29 18:50:46 +00:00
PaulZhang12	348d781055	[Inductor] Update Outer Reduction Heuristic (#159093 ) Update outer reduction heuristics for significant speedups. HuggingFace: <img width="572" height="705" alt="Screenshot 2025-08-20 at 12 44 51 AM" src="https://github.com/user-attachments/assets/4872a23b-d136-423a-b2e6-187895bccba1" /> Average ~20% speedup on a kernel by kernel basis TorchBench: <img width="572" height="705" alt="Screenshot 2025-08-20 at 12 45 10 AM" src="https://github.com/user-attachments/assets/b8357b6d-6107-4104-b906-292a17d14d48" /> Average ~40% speedup on a kernel by kernel basis <img width="1705" height="729" alt="Screenshot 2025-08-21 at 5 50 32 PM" src="https://github.com/user-attachments/assets/a9715a2b-9e6c-4b33-ba9f-7870dc561e31" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159093 Approved by: https://github.com/jansel	2025-08-29 18:31:22 +00:00
Ting Lu	303f514d5b	[CI] Add basic CUDA 13.0 periodic test (#161013 ) https://github.com/pytorch/pytorch/issues/159779 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161013 Approved by: https://github.com/atalman Co-authored-by: Andrey Talman <atalman@fb.com> Co-authored-by: Aidyn-A <31858918+Aidyn-A@users.noreply.github.com>	2025-08-29 17:56:33 +00:00
Xu Han	f532f99822	[AOTI] normalize_path_separator zip file path (#161781 ) normalize_path_separator zip file path Pull Request resolved: https://github.com/pytorch/pytorch/pull/161781 Approved by: https://github.com/angelayi	2025-08-29 17:53:41 +00:00
Irakli Salia	93c5112f46	[MPS] sparse add unary funcs + add for sparse tensors (#160839 ) Adds several unary functions and add. Enables tests for unary functions in test_sparse but not enabling other tests yet, needs more ops before we fully migrate to testing SparseMPS with `test_sparse.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160839 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-29 16:28:58 +00:00
Mwiza Kunda	0f6a08a029	[inductor] Fix SubgraphInfo round trip (#161779 ) Currently `numels` is not specific to a created subgraph since it is not retrieved by `dataclasses.fields(SubgraphInfo)` due to it not being type annotated, see [ref](https://docs.python.org/3/library/dataclasses.html#module-dataclasses:~:text=The%20%40dataclass%20decorator%20examines%20the%20class%20to%20find%20fields.%20A%20field%20is%20defined%20as%20a%20class%20variable%20that%20has%20a%20type%20annotation.%20With%20two%20exceptions%20described%20below%2C%20nothing%20in%20%40dataclass%20examines%20the%20type%20specified%20in%20the%20variable%20annotation.). So for example the following would happen: ``` self.numels = {"x": sympy.Integer(5)} subgraph_name = "<x>" with self.create_subgraph_body(subgraph_name): self.numels = {"x", sympy.Integer(7)} # this would print that x has size 7, not the original value of 5 print(self.numels) # numels would be None because dataclasses.fields(SubgraphInfo) does not include numels # since it is not type annotated print(self.subgraph_bodies[subgraph_name]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161779 Approved by: https://github.com/eellison	2025-08-29 16:27:29 +00:00
Zain Rizvi	c8fa907e74	Check commit order (#161560 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161560 Approved by: https://github.com/malfet ghstack dependencies: #161558, #161637	2025-08-29 16:22:58 +00:00
ILCSFNO	b99a112688	Update optional tag for `interpolation` in `torch.quantile()` (#161706 ) Fixes #146156 Refix the issue with the extra needed fix. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161706 Approved by: https://github.com/soulitzer	2025-08-29 16:21:14 +00:00
Chien-Chin Huang	cd6d63f453	[SymmMEM] Fix test_empty_strided_p2p_persistent (#161677 ) test_empty_strided_p2p_persistent allocates persistent symm memory tensors. However, it uses the same alloc_id for different tests, which could cause trouble if these tests are run under the same process. This PR fixes the issue by using a different alloc_id for each test. https://github.com/pytorch/pytorch/pull/161668 should also fix the issue but we can land this PR for a safer test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161677 Approved by: https://github.com/kwen2501 ghstack dependencies: #161676	2025-08-29 16:11:58 +00:00
Nikita Shulga	0e45023cf9	Cleanup stale submodule directories after checkout (#161748 ) Fixes https://github.com/pytorch/pytorch/issues/161510 Test plan: ``` % cd third_party/kineto % git checkout fe80f9319479265f7a208e615e16a363b993d50c; git submodule update --init --recursive M libkineto/third_party/dynolog M libkineto/third_party/fmt M libkineto/third_party/googletest Previous HEAD position was 5e75018 Fix Local Time on Windows Builds (#1104) HEAD is now at fe80f93 Fix MSVC Error (#1134) Submodule path 'libkineto/third_party/dynolog': checked out 'd2ffe0a4e3acace628db49974246b66fc3e85fb1' Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp': checked out 'b1234816facfdda29845c46696a02998a4af115a' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/civetweb': checked out 'd7ba35bbb649209c66e582d5a0244ba988a15159' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/googletest': checked out 'e2239ee6043f73722e7aa812a459f54a28552929' Submodule path 'libkineto/third_party/fmt': checked out '40626af88bd7df9a5fb80be7b25ac85b122d6c21' Submodule path 'libkineto/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723' % git checkout 5e75018; git submodule update --init --recursive M libkineto/third_party/dynolog M libkineto/third_party/fmt M libkineto/third_party/googletest Previous HEAD position was fe80f93 Fix MSVC Error (#1134) HEAD is now at 5e75018 Fix Local Time on Windows Builds (#1104) warning: unable to rmdir 'third_party/prometheus-cpp': Directory not empty Submodule path 'libkineto/third_party/dynolog': checked out '7d04a0053a845370ae06ce317a22a48e9edcc74e' Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '58d77fa8070e8cec2dc1ed015d66b454c8d78850' Submodule path 'libkineto/third_party/fmt': checked out 
'0041a40c1350ba702d475b9c4ad62da77caea164' Submodule path 'libkineto/third_party/googletest': checked out '7aca84427f224eeed3144123d5230d5871e93347' % cd ../.. % git status HEAD detached from 649e397c6de Changes not staged for commit: (use "git add <file>..." to update what will be committed) (use "git restore <file>..." to discard changes in working directory) (commit or discard the untracked or modified content in submodules) modified: third_party/kineto (untracked content) % time git submodule foreach --recursive git clean -ffdx ... git submodule foreach --recursive git clean -ffdx 0.47s user 0.96s system 88% cpu 1.625 total % git status HEAD detached from 649e397c6de ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161748 Approved by: https://github.com/atalman	2025-08-29 14:07:06 +00:00
PyTorch MergeBot	823a329984	Revert "Cleanup stale submodule directories in checkout action (#161748 )" This reverts commit f3c5a82139539c63e6f08966e268c4160e138320. Reverted https://github.com/pytorch/pytorch/pull/161748 on behalf of https://github.com/malfet due to I put the check in the wrong place ([comment](https://github.com/pytorch/pytorch/pull/161748#issuecomment-3237080419))	2025-08-29 13:40:21 +00:00
Ankita George	f0a65cd6d6	Add pg argument to consolidate_safetensors_files_on_every_rank (#161421 ) Summary: Based on feedback on https://github.com/pytorch/torchtitan/pull/1625, adding a pg argument to consolidate_safetensors_files_on_every_rank so that we don't infer the pg and users can supply one if needed. Test Plan: ensure existing tests pass Rollback Plan: Differential Revision: D80954339 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161421 Approved by: https://github.com/fegin	2025-08-29 13:31:11 +00:00
Xilun Wu	627decb0ed	[DTensor] fix DTensorTestCase.destroy_pg() when device_type is "cpu" but CUDA device is available (#161015 ) Summary When `device_id` is not None, barrier() will choose the accelerator with the highest priority, which means if the test specifies to use CPU for testing while CUDA is available on the host, the barrier() will use CUDA. To avoid this and better respect `self.device_type`, we add this branch to force barrier() to use CPU when `self.device_type` is CPU and another accelerator is also available. Test `pytest test/distributed/tensor/test_dtensor_testbase.py` Debugging Output ``` # from init_process_group() init pg: backend=gloo, device_id = None default_pg has backend: gloo, device_types: [device(type='cuda'), device(type='cpu')] # from barrier() barrier: device_ids = [10], devices = [], device = None, PG=[device(type='cuda'), device(type='cpu')] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161015 Approved by: https://github.com/tianyu-l	2025-08-29 12:47:11 +00:00
zeshengzong	448a7e7e31	Fix `SequentialLR` deprecate warning about invoke `step(epoch)` (#149392 ) Fixes #116776 #76113 #113222 #67958 ## Changes - Refactor `LRScheduler.step` method, leave `epoch` check logic in public method `step` - Move update `lr` logic to `_update_lr` method - Make `SequentialLR` use `_update_lr` to avoid unnecessary warning message ## Test Result ```bash pytest test/optim/test_lrscheduler.py -vv ``` ![image](https://github.com/user-attachments/assets/e1c5527e-193e-4328-bf95-023139ea0416) Pull Request resolved: https://github.com/pytorch/pytorch/pull/149392 Approved by: https://github.com/janeyx99	2025-08-29 11:45:11 +00:00
Malay Bag	ed370ae4b0	[unflatten] Fix test by supporting both MappingKey and GetAttrKey (#161599 ) Summary: As title Test Plan: Run internal tests Rollback Plan: Differential Revision: D81115712 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161599 Approved by: https://github.com/tugsbayasgalan	2025-08-29 10:08:38 +00:00
David Berard	5859edf113	[BE][inductor] replace "and" -> "logical_and" in bucketize_binary_search (#160941 ) Get rid of these warnings: ``` /home/dberard/local/pytorch-env7/pytorch/torch/_inductor/runtime/triton_helpers.py:317: UserWarning: Logical operators 'and' and 'or' are deprecated for non-scalar tensors; please use '&' or '\|' instead ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160941 Approved by: https://github.com/malfet, https://github.com/jingsh	2025-08-29 09:27:13 +00:00
xinan.lin	5b701a6bb2	[AOTI][Intel GPU] Add XPU quantization ops to AOT Inductor. (#156572 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/156572 Approved by: https://github.com/EikanWang, https://github.com/angelayi ghstack dependencies: #157430	2025-08-29 09:19:44 +00:00
xinan.lin	48679ef966	[Refactor][XPU] Refactor XPU quantization op and add header files. (#157430 ) This PR refactors the XPU quantization ops to align their code structure with the CPU implementation for consistency. It also adds necessary header files to enable future integration with AOTI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157430 Approved by: https://github.com/angelayi	2025-08-29 09:19:44 +00:00
Natalia Gimelshein	0ca3a6085d	use host+device_id to make sure devices are unique in rendezvous request (#161756 ) Per title, for NVL72 systems where devices with the same indices on multiple hosts are within the same nvlink domain Pull Request resolved: https://github.com/pytorch/pytorch/pull/161756 Approved by: https://github.com/kwen2501	2025-08-29 09:09:45 +00:00
Yiming Zhou	a55d2beb50	[export] Support complex constant in serde (#161517 ) Summary: Fixes #160749 For a model like ``` class M(torch.nn.Module): def forward(self, x): s = torch.sin(x) z = 1j * s return z ``` Its graph will be ``` graph(): %x : [num_users=1] = placeholder[target=x] %sin : [num_users=1] = call_function[target=torch.ops.aten.sin.default](args = (%x,), kwargs = {}) %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sin, 1j), kwargs = {}) return (mul,) ``` `1j` will appear as a constant complex argument in the `aten.mul` Test Plan: buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_complex_constant Rollback Plan: Differential Revision: D80672323 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161517 Approved by: https://github.com/angelayi	2025-08-29 08:13:21 +00:00
Chien-Chin Huang	d8a0bdb0d3	[BE][SymmMEM] Change Optional to the shorthand expression for symmetric memory modules (#161676 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161676 Approved by: https://github.com/Skylion007	2025-08-29 07:31:16 +00:00
PyTorch UpdateBot	a7c949089a	[vllm hash update] update the pinned vllm hash (#161752 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161752 Approved by: https://github.com/pytorchbot	2025-08-29 04:54:31 +00:00
PyTorch UpdateBot	a6456bfa85	[audio hash update] update the pinned audio hash (#161753 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161753 Approved by: https://github.com/pytorchbot	2025-08-29 04:52:58 +00:00
Nikita Shulga	f3c5a82139	Cleanup stale submodule directories in checkout action (#161748 ) Fixes https://github.com/pytorch/pytorch/issues/161510 Test plan: ``` % cd third_party/kineto % git checkout fe80f9319479265f7a208e615e16a363b993d50c; git submodule update --init --recursive M libkineto/third_party/dynolog M libkineto/third_party/fmt M libkineto/third_party/googletest Previous HEAD position was 5e75018 Fix Local Time on Windows Builds (#1104) HEAD is now at fe80f93 Fix MSVC Error (#1134) Submodule path 'libkineto/third_party/dynolog': checked out 'd2ffe0a4e3acace628db49974246b66fc3e85fb1' Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp': checked out 'b1234816facfdda29845c46696a02998a4af115a' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/civetweb': checked out 'd7ba35bbb649209c66e582d5a0244ba988a15159' Submodule path 'libkineto/third_party/dynolog/third_party/prometheus-cpp/3rdparty/googletest': checked out 'e2239ee6043f73722e7aa812a459f54a28552929' Submodule path 'libkineto/third_party/fmt': checked out '40626af88bd7df9a5fb80be7b25ac85b122d6c21' Submodule path 'libkineto/third_party/googletest': checked out '52eb8108c5bdec04579160ae17225d66034bd723' % git checkout 5e75018; git submodule update --init --recursive M libkineto/third_party/dynolog M libkineto/third_party/fmt M libkineto/third_party/googletest Previous HEAD position was fe80f93 Fix MSVC Error (#1134) HEAD is now at 5e75018 Fix Local Time on Windows Builds (#1104) warning: unable to rmdir 'third_party/prometheus-cpp': Directory not empty Submodule path 'libkineto/third_party/dynolog': checked out '7d04a0053a845370ae06ce317a22a48e9edcc74e' Submodule path 'libkineto/third_party/dynolog/third_party/googletest': checked out '58d77fa8070e8cec2dc1ed015d66b454c8d78850' Submodule path 'libkineto/third_party/fmt': checked 
out '0041a40c1350ba702d475b9c4ad62da77caea164' Submodule path 'libkineto/third_party/googletest': checked out '7aca84427f224eeed3144123d5230d5871e93347' % cd ../.. % git status HEAD detached from 649e397c6de Changes not staged for commit: (use "git add <file>..." to update what will be committed) (use "git restore <file>..." to discard changes in working directory) (commit or discard the untracked or modified content in submodules) modified: third_party/kineto (untracked content) % time git submodule foreach --recursive git clean -ffdx ... git submodule foreach --recursive git clean -ffdx 0.47s user 0.96s system 88% cpu 1.625 total % git status HEAD detached from 649e397c6de ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161748 Approved by: https://github.com/atalman	2025-08-29 03:21:31 +00:00
Angela Yi	5c306c3ccb	[fx] Add lru_cache to warning (#161721 ) Summary: Added lru_cache to the warning message to avoid flooding logs Test Plan: CI Rollback Plan: Differential Revision: D81245618 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161721 Approved by: https://github.com/pianpwk	2025-08-29 02:25:45 +00:00
Dylan Maloy	c1cb1cb26e	fix tests caused by has_triton (#161737 ) Summary: this will only cause it in the event that we are serializing a triton hop. there are a few tests that do weird mocking stuff that this function doesn't like, so this will prevent it from being called there. Test Plan: att Rollback Plan: Differential Revision: D81261486 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161737 Approved by: https://github.com/angelayi	2025-08-29 02:25:35 +00:00
drisspg	5cb1d71e59	[Flex] Fix float16 default config 128 headdim (#161647 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161647 Approved by: https://github.com/v0i0	2025-08-29 01:48:06 +00:00
Justin Chu	d153af713e	[ez] Improve formatting in error messages for dynamic shapes (#161573 ) Show the repr of `dim` to make the message more clear. Example: before `but got batch instead`, after `but got "batch" instead` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161573 Approved by: https://github.com/angelayi	2025-08-28 23:52:58 +00:00
PyTorch MergeBot	9b67d8e344	Revert "[RELAND] Close some sources of fake tensor leakage (#161589 )" This reverts commit 5790b009751e6ebba35d3e6d05e7c1b135553eee. Reverted https://github.com/pytorch/pytorch/pull/161589 on behalf of https://github.com/atalman due to [GH job link](https://github.com/pytorch/pytorch/actions/runs/17305150611/job/49128381649) [HUD commit link](`5790b00975`) ([comment](https://github.com/pytorch/pytorch/pull/161589#issuecomment-3235224249))	2025-08-28 23:19:36 +00:00
PyTorch MergeBot	47742081c9	Revert "kill allow_complex_guards_as_runtime_asserts (#160198 )" This reverts commit 69d91b94ba5366f4444d8cb8fd3dab4de4f04d3d. Reverted https://github.com/pytorch/pytorch/pull/160198 on behalf of https://github.com/jeffdaily due to let's revert again instead of waiting for forward fix, see earlier comments ([comment](https://github.com/pytorch/pytorch/pull/160198#issuecomment-3235165462))	2025-08-28 22:50:37 +00:00
drisspg	fffa62fa12	Ensure large tensor int32 -> int64 indexing is enabled (#157767 ) Fixes: #https://github.com/pytorch/pytorch/issues/157446 I think that this delta is worth the switch form block-ptrs especially since they are deprecated ## Perf Summary A is nightly B is this diff, so `negative` means this diff improves perf TOP 5 differences <img width="805" height="754" alt="Screenshot 2025-08-24 at 5 49 49 PM" src="https://github.com/user-attachments/assets/aa359cdf-ee9a-427d-be72-1b9aef6f3115" /> <details> <summary><strong>Full perf table (click to expand)</strong></summary> \| attn_type \| dtype \| shape(B,Hq,M,Hkv,N,D) \| TFlops Version A \| TFlops Version B \| \| --- \| --- \| --- \| --- \| --- \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 258.38834144791923 \| 258.6353685004612 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 142.2192450677751 \| 140.12393320464972 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 122.32683823617003 \| 118.51603755647925 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 142.48556906165314 \| 137.24259849208627 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 86.59814488695922 \| 84.59431398586257 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 288.52679758135764 \| 292.9174195871856 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 172.25541683643277 \| 172.94326459828508 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 164.40864610599826 \| 165.035129576335 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 176.54876886433945 \| 175.08057670028145 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 125.22491679812626 \| 121.06201152859151 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 339.11952481874283 \| 339.0132835601695 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 227.58583240284406 \| 
228.21824999409597 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 185.98569659868966 \| 182.32850843255093 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 188.9495725191772 \| 180.31385312481657 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 106.25789530994302 \| 106.55084959448476 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 357.6430536888533 \| 363.30843452247274 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 262.3241154406613 \| 265.73250045488 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 249.30498953911416 \| 249.35928192833785 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 224.74126243851808 \| 223.71776504077988 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 168.26977014013707 \| 165.47991483333809 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 382.8178701785897 \| 384.34752965862685 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 308.1449710013853 \| 311.0653716044644 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 251.96365252505072 \| 243.92283557225903 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 226.69316232745368 \| 215.22769268913356 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 153.34142545296405 \| 151.9312673939401 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 396.0998000753126 \| 398.35036286102473 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 333.5198415274966 \| 344.6354466169716 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 310.5955933379696 \| 305.66347819546 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 260.4012412689896 \| 259.758666997307 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 234.13034252182635 \| 227.61676497283614 \| \| noop \| torch.bfloat16 \| 
(2, 16, 8192, 16, 8192, 64) \| 396.17615538477196 \| 401.1419104525502 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 359.98648311998414 \| 360.8285563463094 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 291.97720707257736 \| 281.41694809965253 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 250.1703628419691 \| 238.556760291579 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 199.50782826294306 \| 191.52327358439223 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 411.0632004785396 \| 413.6362648405517 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 382.9404387613185 \| 397.74886235657607 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 357.0998545146633 \| 350.5115200772392 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 281.8033924428203 \| 281.98601309215843 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 282.56595134222135 \| 277.4565795466672 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 408.89838018149516 \| 405.14531386840076 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 396.07662058160264 \| 393.4598228299578 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 317.8822887267849 \| 304.754931401036 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 265.8801304948243 \| 254.22961974295112 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 227.87390579965614 \| 222.19481980110393 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 427.36821778477025 \| 431.3766620314935 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 410.67994346825 \| 423.4666944003808 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 381.1968748374038 \| 381.77668006420424 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 
292.5540046358546 \| 296.5439130720502 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 321.04573768858114 \| 310.7423616656888 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 427.46148866769903 \| 426.162091037068 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 419.75580537687347 \| 421.88640120274334 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 337.3208051798903 \| 327.4912454675092 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 276.5638854539581 \| 262.988360558083 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 250.82791326036886 \| 245.07367032501736 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 435.8055824506086 \| 441.8803729460534 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 432.02638235921006 \| 450.33161016596273 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 402.25525939224883 \| 393.8564689669916 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 297.5337286675904 \| 297.0131881135074 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 343.8697037899545 \| 329.8194073407783 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 267.58912366821056 \| 256.91606054118375 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 150.81723692609629 \| 146.32172267858743 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 129.51029293209245 \| 122.72144394093334 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 147.627656359087 \| 141.68956350566188 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 87.55100546003591 \| 84.91293287692788 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 299.5931492743986 \| 305.884253766691 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 179.39026367843837 \| 181.64741311605096 \| \| 
alibi \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 173.93547669282367 \| 173.23972950980564 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 185.90234171599252 \| 182.80844545446686 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 128.08176696266082 \| 123.27722685662111 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 340.50674552770664 \| 338.9071088484576 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 225.4438318650432 \| 230.22899884832975 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 194.15123248528312 \| 185.02793973094865 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 200.74289714108176 \| 191.76606719670647 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 107.03564946728423 \| 106.82432377861258 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 371.31799283918406 \| 379.7555394732925 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 275.97762744310455 \| 276.71106853992995 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 261.6648679783462 \| 259.4127232060398 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 237.03108223577615 \| 233.92710216149527 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 172.13926800371152 \| 168.74390922407585 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 381.50199487767276 \| 383.9043681999597 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 307.9748883093411 \| 312.2403515462001 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 251.11319684705438 \| 243.17870127827277 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 236.3253127246763 \| 223.81250201769552 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 154.55693991756874 \| 153.11360584987685 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 
407.11400078586615 \| 413.53709886086557 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 348.1705797722622 \| 360.09771155957367 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 321.8593280850388 \| 318.2882327401255 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 270.089032013835 \| 268.767323026064 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 238.07324557907788 \| 228.09842078362692 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 399.8172853171901 \| 401.0954526332136 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 363.4387330438581 \| 364.13111024232677 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 294.1752429133857 \| 283.7235663368415 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 256.8389394007649 \| 246.91771015606483 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 199.3378564292656 \| 192.40439590901758 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 425.5150965556111 \| 430.8190098707553 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 396.00437184073013 \| 411.3873625655787 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 369.92803661607815 \| 361.43244467343663 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 293.4277354412933 \| 295.2529537595746 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 288.0208673072841 \| 281.51896404878863 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 408.3005367220567 \| 408.96116482298913 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 396.90095962766304 \| 396.87385456176486 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 319.0534576137999 \| 302.50950358107764 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 270.3334977708081 \| 258.8506349486557 \| \| document_mask \| 
torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 227.46824134365394 \| 222.23759438128766 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 438.24247309479694 \| 437.7975163205371 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 428.34012029699227 \| 433.3215899950434 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 386.52672049728875 \| 388.26216893354984 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 302.71976814728083 \| 302.3574867306459 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 327.39760662780986 \| 308.6348428844912 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 423.31308678262695 \| 426.6306972137279 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 412.6983690923106 \| 419.4961977664297 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 337.41003544742273 \| 324.2155049126126 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 278.7755890910794 \| 265.9194286636502 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 251.55678254755364 \| 244.8843180141462 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 452.5930781172308 \| 457.7117122300742 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 445.05676260348116 \| 463.9304535499636 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 415.78302138389415 \| 406.29229555271456 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 308.0311067300895 \| 304.91354721414314 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 351.43943626809335 \| 329.4476923070317 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 295.1801525813241 \| 291.36521287398904 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 183.23250549178067 \| 182.35421238887605 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 
151.56832453117747 \| 151.3422139154794 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 171.02111935180432 \| 160.72516856727913 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 74.05765122783826 \| 74.5885345035243 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 314.3587394591763 \| 319.2938677773619 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 224.57002084153177 \| 225.48868542008177 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 216.00964804143052 \| 215.39576159953486 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 216.1174237618258 \| 214.28437413525663 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 121.08920423648368 \| 119.55813661872644 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 362.2193857281911 \| 360.05005804275936 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 279.8840217430121 \| 279.5437918286659 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 227.76617121021982 \| 222.8655938229316 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 215.43141176970562 \| 207.71852284994702 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 121.35588364218539 \| 121.20636565046884 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 365.1545280898012 \| 373.37585444987326 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 304.360119952975 \| 309.1247297936263 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 287.2603904544586 \| 289.25547903162595 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 257.9852675272418 \| 257.59069234098115 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 188.35158496670232 \| 184.24683960154857 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 389.9744911369211 \| 388.43466897254166 \| \| 
causal \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 345.9228295166513 \| 342.63034895210126 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 279.56334658247437 \| 271.2724375402088 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 245.66477202810066 \| 233.49688207371258 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 170.3270720653187 \| 166.23863845657382 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 400.0041140827554 \| 402.11182445396497 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 363.64641830327434 \| 375.9288663364792 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 341.5776139573363 \| 335.1160003213424 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 281.1811770268521 \| 280.21438270014005 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 247.78716118997716 \| 245.3269825179633 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 403.794126680488 \| 405.2353919019577 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 387.079178426863 \| 385.1461762057035 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 309.7847188173431 \| 298.0443968374749 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 262.4721750159666 \| 250.81679725428586 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 205.70866004479979 \| 202.9620839129557 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 413.380982988662 \| 418.40270594263103 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 398.450064800682 \| 409.6794973994029 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 372.26297458194466 \| 364.44415106552196 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 293.0818569905912 \| 292.85172400643984 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) 
\| 296.46717085592087 \| 285.76362010612763 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 419.3186786037592 \| 426.08801580934437 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 408.1648467766632 \| 409.4122254207817 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 329.24396020457345 \| 313.5200995121138 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 274.61257504571876 \| 255.7801815432177 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 232.63806001220684 \| 230.03020843492314 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 435.0785891054788 \| 440.39101804225345 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 424.86925312752817 \| 435.18898057396825 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 393.000417896268 \| 395.11543361225256 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 297.7755459218185 \| 300.7208114715287 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 331.71570861760534 \| 318.07127352552885 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 424.58602747137405 \| 425.84897078470715 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 422.66607285025725 \| 423.5524945535485 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 344.8625760048626 \| 331.6793888458635 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 282.0787281511649 \| 263.7895634445868 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 252.7301927385177 \| 245.41844170037427 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 437.0658069164588 \| 442.9101960063628 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 433.13788271434646 \| 452.3873572709863 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 404.0959191546953 \| 
396.7077863894884 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 300.45502211883206 \| 301.3439134717943 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 344.11003202413934 \| 330.8897663350314 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 298.4364205341705 \| 291.6793556507056 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 187.6382133139633 \| 191.05409897308772 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 156.55822078636112 \| 154.178925976516 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 173.47765221825162 \| 169.30862508068464 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 74.5885345035243 \| 74.52689061607104 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 323.12233826013045 \| 328.53889207933514 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 236.75872140126316 \| 235.8378325547398 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 227.17836523816675 \| 226.75357076139966 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 224.07209453308036 \| 224.07209453308036 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 122.85572156047981 \| 121.11642183704716 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 361.3123326658092 \| 360.71014086458337 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 281.5287983927017 \| 281.94301754758345 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 232.7456696285686 \| 226.50976826432776 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 221.5612361744038 \| 214.96188822837055 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 121.38311528944315 \| 120.85441868178513 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 380.2579019244734 \| 389.2520157863988 \| \| causal \| torch.bfloat16 \| (4, 16, 
2048, 4, 2048, 128) \| 316.95230660496924 \| 317.87597790618906 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 301.07968126657323 \| 298.02424098422983 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 267.2240756921594 \| 267.16353549228154 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 189.82761622494257 \| 186.736450261963 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 389.88665375406805 \| 387.9125133037077 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 348.70619958684887 \| 346.6750499749774 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 280.5472989906087 \| 271.22300822012187 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 250.02397620165968 \| 241.22532776331445 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 171.67817496107645 \| 166.95679280483972 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 412.626880230807 \| 417.60238657950777 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 374.8829313933945 \| 389.4448546468815 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 353.20410434172436 \| 345.7072490717473 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 292.51045924209586 \| 291.66621022138287 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 251.6264062063495 \| 248.45110052911542 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 404.0155784550126 \| 401.90546837237514 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 384.4389015599863 \| 386.9684324594344 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 313.3731284132225 \| 298.17074251037894 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 264.19199737284265 \| 252.8982463999916 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 207.03696315185684 \| 202.86697323136772 \| 
\| noop \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 428.2436763312506 \| 433.45005568619536 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 411.8516531869893 \| 428.2753623461049 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 384.9095037182509 \| 372.90888743000744 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 303.2438915629836 \| 302.05095952914337 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 301.8689122735564 \| 285.0363190513223 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 423.13592231504805 \| 420.3991500185611 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 407.44527331585493 \| 408.5064370765247 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 330.50050996167414 \| 316.8763979925965 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 274.6833786307413 \| 259.86098862141324 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 232.24019584158367 \| 226.52040268160232 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 444.4596314237808 \| 455.99558915752266 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 437.4245561244369 \| 455.98275147271966 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 397.3350686877605 \| 397.88875599028063 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 308.53809114394545 \| 307.1359822042007 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 331.32379843423774 \| 316.85293191675646 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 422.4622274366379 \| 425.0407156418684 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 420.9547052783101 \| 430.33779243510276 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 345.50265346504085 \| 332.094855328957 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 4, 
32768, 64) \| 280.81715528243365 \| 264.6543640282054 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 252.25635200421783 \| 245.46235499490305 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 452.5524207341139 \| 461.7512032176736 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 445.2316469907137 \| 464.4523799578466 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 416.87264016717023 \| 409.17124592157046 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 309.42579489389846 \| 307.9734464665731 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 350.50782004300623 \| 330.98959545427294 \| </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157767 Approved by: https://github.com/Skylion007	2025-08-28 22:43:59 +00:00
can-gaa-hou	c0ed87c82d	[Dynamo] Fix weakref.proxy error when `torch.compile` (#161508 ) Fixes #159258 The error occurs when we attempt to create a weak reference from a weak reference proxy. `e9d42b3880/torch/_dynamo/guards.py (L2910-L2915)` In fact, we shouldn't create a weak reference from another reference or proxy, as it would check in CPython. `f60f8225ed/Objects/weakrefobject.c (L410-L418)` However, `__weakrefoffset__` is not equal to 0 when the `guarded_object` is in `weakref.ProxyTypes`, and it will wrongly create a weak reference for the `weakref.ProxyTypes`. I think this could be a bug from CPython, but we can prevent it by adding more weakref type checks (`weakref.ProxyTypes` contains `weakref.ProxyType` and `weakref.CallableProxyType`) here. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161508 Approved by: https://github.com/Lucaskabela, https://github.com/anijain2305, https://github.com/malfet	2025-08-28 22:34:18 +00:00
Aleksei Nikiforov	1069a08dac	Enable more nightly tests on s390x (#160893 ) Enable more nightly tests on s390x Pull Request resolved: https://github.com/pytorch/pytorch/pull/160893 Approved by: https://github.com/malfet	2025-08-28 22:20:55 +00:00
soulitzer	1190b7f73e	Support Triton kernels in SAC region (#161541 ) SAC interaction with triton kernel: - In eager, triton ops are not dispatchable, and so it is always ignored by SAC, i.e., always recomputed. - In compile, although we wrap triton kernels into HOPs, allowing us to intercept them, we still recompute by default rather than save by default, so that compile maintains the invariant of using less memory than eager. - If you want to do something else (e.g. save the output of your triton kernel) you should wrap it in a custom op. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161541 Approved by: https://github.com/drisspg, https://github.com/zou3519, https://github.com/xmfan	2025-08-28 21:15:25 +00:00
PyTorch MergeBot	f46e4bcf43	Revert "Add ciflow/vllm to vLLM commit hash update PR(s) (#161678 )" This reverts commit 0e358050304c6a350dae2bce497bd1867ecc3c9f. Reverted https://github.com/pytorch/pytorch/pull/161678 on behalf of https://github.com/yangw-dev due to we want to keep the vllm pin updated now, right now we have some failure ([comment](https://github.com/pytorch/pytorch/pull/161678#issuecomment-3234876332))	2025-08-28 20:42:19 +00:00
Ruben Rodriguez Buchillon	496052faf6	[inductor][decompose-k] make part of template heuristics (#161098 ) # why - enable it to go through common template heuristics point - make it easier to use in common extension point e.g. lookup table # what - break template heuristic into base + triton - move k_split generation logic into a templateheuristic for decompose k - register through normal mechanism - to make testing work, add a context manager to temporarily set template heuristics for a template/op to empty (effectively skipping it). This is used for decompose k test to disable triton choices # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D80670918](https://our.internmc.facebook.com/intern/diff/D80670918) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161098 Approved by: https://github.com/jansel ghstack dependencies: #161026, #161097	2025-08-28 20:14:48 +00:00
Ruben Rodriguez Buchillon	f641effe19	[inductor][ez] move template heuristics into dir (#161097 ) # why - simplify the expansion of heuristics beyond just triton (e.g. decomposeK) # what - move template heuristics and registry into its own folder - adjust imports accordingly # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py -v ``` Differential Revision: [D80670917](https://our.internmc.facebook.com/intern/diff/D80670917) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161097 Approved by: https://github.com/PaulZhang12, https://github.com/jansel ghstack dependencies: #161026	2025-08-28 20:14:48 +00:00
Ruben Rodriguez Buchillon	688acf0b83	[inductor][mm] restructure decompose k (#161026 ) # why - make it easier to integrate into lookup table later # what - current version generates templates on the fly and uses them to generate a single choice - lookup table and performance model work best when there is a stable set of templates (with predictable names) and those are then parametrized - this change makes it so that there is a single DecomposeK template with a stable name, and the k split is the only parametrization we do # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py::TestMaxAutotune::test_max_autotune_decompose_k_dynamic_False_bfloat16_sizes1 -v ``` Differential Revision: [D80670913](https://our.internmc.facebook.com/intern/diff/D80670913) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161026 Approved by: https://github.com/PaulZhang12, https://github.com/jansel	2025-08-28 20:14:41 +00:00
Natalia Gimelshein	f0a517e333	Use vectorized stores for all dtypes (#161649 ) resurrecting #151818 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161649 Approved by: https://github.com/Skylion007	2025-08-28 20:06:29 +00:00
Kevin Fu	bacdd985a9	[PT2] Add fastResizeToZero to all static dispatch kernels (#161679 ) Summary: Add fastResizeToZero whenever we are reusing output tensors. Otherwise it keeps throwing warning ``` Warning: An output with one or more elements was resized since it had shape [10], which does not match the required output shape [181]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (function _resize_output_check) ``` Test Plan: Run local replayer. ``` MODEL_TYPE=ads_mtml_offsite_cvr_oba_optout_dedicated_model MODEL_ENTITY_ID=786096203 SNAPSHOT_ID=11 HARDWARE_TYPE=1 ./sigrid/predictor/scripts/start_gpu_with_gif.sh ${MODEL_ENTITY_ID}_${SNAPSHOT_ID} /data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID} 3443 2>&1 \| tee ~/logs/${MODEL_TYPE}/predictor_${MODEL_ENTITY_ID}_${SNAPSHOT_ID} sigrid/predictor/scripts/start_gpu_replayer_localhost_with_gif.sh ${MODEL_ENTITY_ID}_${SNAPSHOT_ID} 1000 ${MODEL_TYPE} /data/users/$USER/requests/filter_requests_ads_mtml_offsite_cvr_oba_optout_dedicated_model_100 localhost /data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID} false 3443 false 2>&1 \| tee ~/logs/${MODEL_TYPE}/replayer_${MODEL_ENTITY_ID}_${SNAPSHOT_ID} ``` Before: P1921177565 After: P1921178087 Rollback Plan: Differential Revision: D81177596 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161679 Approved by: https://github.com/henryoier	2025-08-28 19:58:40 +00:00
RajeshvShiyal	1621b5494c	Removed redundant dtype conversion in scaled_dot_product_attention docstring example (#161613 ) Suggested changes done for Fixes #161611. Removed the line attn_bias.to(query.dtype) entirely Fixes #161611 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161613 Approved by: https://github.com/mikaylagawarecki	2025-08-28 19:58:07 +00:00
Avik Chaudhuri	69d91b94ba	kill allow_complex_guards_as_runtime_asserts (#160198 ) Summary: Since `allow_complex_guards_as_runtime_asserts` is now sync'd with `prefer_deferred_runtime_asserts_over_guards`, we can kill the former (especially since it was a export-only concept). Test Plan: updated tests Rollback Plan: Differential Revision: D79903317 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160198 Approved by: https://github.com/ezyang	2025-08-28 19:36:19 +00:00
Dmitry Nikolaev	b76f6d117a	[ROCm] fix numpy version detection and adjust fudge_factors for MI355 (#161429 ) This PR fixes: - Numpy >= 2.1 version detection (instead of python 3.13 version detection) to skip some tests (numpy 2.1 can be installed for older python versions) ``` test_quantization.py::TestDynamicQuantizedOps::test_qlinear test_quantization.py::TestDynamicQuantizedOps::test_qlinear_legacy test_quantization.py::TestQuantizedLinear::test_qlinear test_quantization.py::TestQuantizedLinear::test_qlinear_leaky_relu test_quantization.py::TestQuantizedLinear::test_qlinear_relu test_quantization.py::TestQuantizedLinear::test_qlinear_tanh test_quantization.py::TestQuantizedLinear::test_qlinear_with_input_q_dq_qweight_dq_output_fp32 ``` - A couple of SDPA tests on MI355 by adjusting fudge_factors: ``` test_transformers.py::TestSDPACudaOnlyCUDA::test_mem_efficient_attention_attn_mask_vs_math_ref_grads_batch_size_1_seq_len_q_2048_seq_len_k_8_head_dim_8_is_causal_False_dropout_p_0_0_float32_scale_l1_cuda_float32 test_transformers.py::TestSDPACudaOnlyCUDA::test_mem_efficient_attention_vs_math_ref_grads_batch_size_8_seq_len_q_2048_seq_len_k_8_head_dim_128_is_causal_True_dropout_p_0_0_float32_scale0_cuda_float32 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161429 Approved by: https://github.com/jeffdaily	2025-08-28 19:32:09 +00:00
Karthick Panner Selvam	130e50afff	[Inductor] Add DeviceAssert op to enable device-side assertion in torch.compile (#160677 ) This PR introduces a device_assert op to trigger device-side assertions within torch.compile. This implementation is based on the suggestion in [this comment](https://github.com/pytorch/pytorch/issues/147282#issuecomment-2756056084). Changes Included - Implemented device_assert op and overrides has_side_effect to return True to avoid removal by dead code elimination. - Commented out the assert_async_msg_decomp and functional_assert_async_msg_decomp decompositions to disable the default assert decomposition inside Inductor. - Added lowering for torch.ops.aten._assert_async.msg to convert assert calls into the ops_handler. - Implemented the codegen method for the device_assert op. This supports generating C++ and Triton code. - Added test cases to verify both "should throw" and "should not throw" scenarios. Fixes #147282 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160677 Approved by: https://github.com/mlazos, https://github.com/atalman	2025-08-28 18:57:34 +00:00
Shangdi Yu	30ab87c884	[inductor] don't append None to choices (#161672 ) Summary: don't append None as a choice to choices in autotune Test Plan: See internal Diff Differential Revision: D81188644 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161672 Approved by: https://github.com/angelayi	2025-08-28 18:48:50 +00:00
PyTorch MergeBot	049c08eda8	Revert "[dynamo] [guard] Add caching for inside torch.compile.disable function to avoid unnecessary recompilation. (#160934 )" This reverts commit 8f31aa97a3e1e17bed29b6cedf9884f0c6b145e9. Reverted https://github.com/pytorch/pytorch/pull/160934 on behalf of https://github.com/anijain2305 due to causes memory leak leading to OOMs ([comment](https://github.com/pytorch/pytorch/pull/160934#issuecomment-3234426359))	2025-08-28 17:56:36 +00:00
dolpm	affd071858	[export] serialization support for triton_kernel_wrapper_functional (#161314 ) Summary: att Test Plan: buck2 test mode/opt //caffe2/test:test_export -- test_triton_hop Rollback Plan: Differential Revision: D80827767 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161314 Approved by: https://github.com/angelayi	2025-08-28 17:42:47 +00:00
angelayi	dac062f23b	Add aoti to mps benchmarks (#160741 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160741 Approved by: https://github.com/malfet, https://github.com/huydhn	2025-08-28 17:32:29 +00:00
Wang, Chuanqi	2a70d98abf	[CI] Migrate XPU build and test to python 3.10 (#161708 ) Follow #161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161708 Approved by: https://github.com/malfet	2025-08-28 17:27:11 +00:00
eqy	55c289d5c1	[cuBLASLt][FP8] `cuBLASLt` appears to support float8 rowwise-scaling on H100 (#161305 ) Following #157905 I think the macro around ``` TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); ``` was never updated and this would cause `float8` tests to fail. Also it appears the `Lt` accepts two inputs with `e4m3` and `e5m2` dtypes simultaneously, so removing that check here as well... CC @lw Pull Request resolved: https://github.com/pytorch/pytorch/pull/161305 Approved by: https://github.com/Skylion007, https://github.com/drisspg, https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-28 17:04:25 +00:00
Nikita Shulga	2042d2174a	[MPS] Migrate round unary op to Metal (#161712 ) And actually use the right function, as [`torch.round`](https://docs.pytorch.org/docs/stable/generated/torch.round.html) doesn't use `std::round`, but rather `std::rint`, which can be easily seen by running something like ```python import torch print(torch.arange(-3., 3., step=.5, device='mps').round()) print(torch.arange(-3., 3., step=.5, device='mps').cpu().round()) ``` Before this change it printed ``` tensor([-3., -3., -2., -2., -1., -1., 0., 1., 1., 2., 2., 3.], device='mps:0') tensor([-3., -2., -2., -2., -1., -0., 0., 0., 1., 2., 2., 2.]) ``` But after this change results match Pull Request resolved: https://github.com/pytorch/pytorch/pull/161712 Approved by: https://github.com/dcci	2025-08-28 16:45:07 +00:00
Will Constable	4fd761fecc	[DTensor] Wrap sharding prop error with contextual exception (#161574 ) Mainly, this helps tell the user more info about the operator that failed to run if it fails during sharding propagation. Previously, only this exception would be raised: ``` RuntimeError: ('Attempted to flatten sharded dimension 1, ', 'but only the leftmost dim of a Flatten can be sharded.') ``` Now you get both the above exception as well as ``` The above exception was the direct cause of the following exception: RuntimeError: Sharding propagation failed for Op(op=aten.view.default, args_schema=Spec((Replicate(), Shard(dim=0), Shard(dim=1), Shard(dim=2)) on (8, 8, 4)), [64, 4] @ mesh: (1, 2, 2, 2)) ``` <stacktrace omitted> <details><summary>detailed error</summary> ``` ====================================================================== ERROR: test_linear (__main__.TestDTensor) ---------------------------------------------------------------------- Traceback (most recent call last): File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 668, in wrapper self._join_processes(fn) File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 932, in _join_processes self._check_return_codes(fn, elapsed_time) File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 972, in _check_return_codes raise RuntimeError(error) RuntimeError: Process 4 exited with error code 10 and exception: Traceback (most recent call last): File "/data/users/whc/pytorch/torch/distributed/tensor/_dispatch.py", line 150, in dispatch self.sharding_propagator.propagate(op_info) File "/data/users/whc/pytorch/torch/distributed/tensor/_sharding_prop.py", line 309, in propagate OutputSharding, self.propagate_op_sharding(op_info.schema) File "/data/users/whc/pytorch/torch/distributed/tensor/_sharding_prop.py", line 45, in __call__ return self.cache(args, kwargs) File 
"/data/users/whc/pytorch/torch/distributed/tensor/_sharding_prop.py", line 329, in propagate_op_sharding_non_cached op_strategy = self.op_strategy_funcs[op_schema.op](strategy_schema) File "/data/users/whc/pytorch/torch/distributed/tensor/_ops/_view_ops.py", line 673, in reshape_strategy input_tgt_placements, output_placements = propagate_shape_and_sharding( File "/data/users/whc/pytorch/torch/distributed/tensor/_ops/_view_ops.py", line 601, in propagate_shape_and_sharding in_dim = get_in_dim_to_shard(cmd) File "/data/users/whc/pytorch/torch/distributed/tensor/_ops/_view_ops.py", line 537, in get_in_dim_to_shard raise RuntimeError( RuntimeError: ('Attempted to flatten sharded dimension 1, ', 'but only the leftmost dim of a Flatten can be sharded.') The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 816, in run_test getattr(self, test_name)() File "/data/users/whc/pytorch/torch/testing/_internal/common_distributed.py", line 670, in wrapper fn() File "/data/users/whc/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper method(args, *kwargs) File "/data/users/whc/pytorch/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 490, in wrapper raise e File "/data/users/whc/pytorch/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 487, in wrapper func(self, args, *kwargs) # type: ignore[misc] File "/data/users/whc/pytorch/test.py", line 60, in test_linear print("results: ", distributed_linear(distributed_input)) File "/data/users/whc/pytorch/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl return self._call_impl(args, *kwargs) File "/data/users/whc/pytorch/torch/nn/modules/module.py", line 1786, in _call_impl return forward_call(args, *kwargs) File "/data/users/whc/pytorch/torch/nn/modules/linear.py", line 134, in forward return F.linear(input, self.weight, self.bias) File 
"/data/users/whc/pytorch/torch/_compile.py", line 53, in inner return disable_fn(args, *kwargs) File "/data/users/whc/pytorch/torch/_dynamo/eval_frame.py", line 1005, in _fn return fn(args, **kwargs) File "/data/users/whc/pytorch/torch/distributed/tensor/_api.py", line 358, in __torch_dispatch__ return DTensor._op_dispatcher.dispatch( File "/data/users/whc/pytorch/torch/distributed/tensor/_dispatch.py", line 163, in dispatch raise RuntimeError( RuntimeError: Sharding propagation failed for Op(op=aten.view.default, args_schema=Spec((Replicate(), Shard(dim=0), Shard(dim=1), Shard(dim=2)) on (8, 8, 4)), [64, 4] @ mesh: (1, 2, 2, 2)) ``` </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161574 Approved by: https://github.com/zpcore, https://github.com/XilunWu	2025-08-28 15:56:15 +00:00
PyTorch MergeBot	a8270dd124	Revert "kill allow_complex_guards_as_runtime_asserts (#160198 )" This reverts commit 196232bb935cb346f143d5c39e9a73c44121a033. Reverted https://github.com/pytorch/pytorch/pull/160198 on behalf of https://github.com/atalman due to dynamo/test_activation_checkpointing.py::ActivationCheckpointingViaTagsTestsCUDA::test_compile_selective_checkpoint_triton_kernel_cuda [GH job link](https://github.com/pytorch/pytorch/actions/runs/17289619543/job/49074475338) [HUD commit link](`196232bb93`) ([comment](https://github.com/pytorch/pytorch/pull/160198#issuecomment-3234013520))	2025-08-28 15:40:37 +00:00
Jane Xu	63632fc7ee	Add new_zeros dtype variant to the shim and as a stable op (#161597 ) In case we want this before 2.9 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161597 Approved by: https://github.com/mikaylagawarecki	2025-08-28 13:57:24 +00:00
PyTorch MergeBot	05d0f11dbd	Revert "Add test coverage to tf32 in max autotune mm configs (#161545 )" This reverts commit e9d34b2438d65d6d16109e2416f3698de20f85c2. Reverted https://github.com/pytorch/pytorch/pull/161545 on behalf of https://github.com/atalman due to inductor/test_max_autotune.py::TestMaxAutotuneRemoteCache::test_get_mm_configs_float32_precision_ieee [GH job link](https://github.com/pytorch/pytorch/actions/runs/17283985553/job/49058214260) [HUD commit link](`e9d34b2438`) ([comment](https://github.com/pytorch/pytorch/pull/161545#issuecomment-3233569771))	2025-08-28 13:46:47 +00:00
PyTorch MergeBot	ef0483d74c	Revert "Ensure large tensor int32 -> int64 indexing is enabled (#157767 )" This reverts commit b36a20d368733740a8507b3109d193c88930323a. Reverted https://github.com/pytorch/pytorch/pull/157767 on behalf of https://github.com/atalman due to need to revert https://github.com/pytorch/pytorch/pull/157767 internal tests ([comment](https://github.com/pytorch/pytorch/pull/157767#issuecomment-3233558168))	2025-08-28 13:44:41 +00:00
PyTorch MergeBot	5432966253	Revert "Remove test since it ooms on CI (#161644 )" This reverts commit 443452ca2f5beef58019f4e7e7e31c0526aee0fc. Reverted https://github.com/pytorch/pytorch/pull/161644 on behalf of https://github.com/atalman due to need to revert https://github.com/pytorch/pytorch/pull/157767 internal tests ([comment](https://github.com/pytorch/pytorch/pull/161644#issuecomment-3233550883))	2025-08-28 13:41:58 +00:00
PyTorch MergeBot	e9975f501c	Revert "Support Triton kernels in SAC region (#161541 )" This reverts commit 149c68071ca033d5e3427e63e05d9969bd4961e4. Reverted https://github.com/pytorch/pytorch/pull/161541 on behalf of https://github.com/malfet due to Broke some tests in trunk workflow, see https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=trunk%20%2F%20linux-jammy-cuda12.8 ([comment](https://github.com/pytorch/pytorch/pull/161541#issuecomment-3233457206))	2025-08-28 13:14:53 +00:00
xinan.lin	07f76517e7	[Inductor][Windows] Fix Windows test case failure. (#161497 ) Fixes Windows test case failures: - TritonCodeGenTests.test_inductor_sequence_nr - TritonCodeGenTests.test_indirect_device_assert - CompiledOptimizerTests.test_static_address_finalizer Pull Request resolved: https://github.com/pytorch/pytorch/pull/161497 Approved by: https://github.com/jansel	2025-08-28 12:40:42 +00:00
xinan.lin	3519969e4f	[Intel GPU] Enable tensor memory descriptor in triton template for XPU. (#161600 ) As Intel Triton now supports tensor descriptor, this PR updates the pinned Intel Triton version and introduces support for Triton MM template with tensor descriptor on XPU. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161600 Approved by: https://github.com/EikanWang, https://github.com/jansel	2025-08-28 12:39:58 +00:00
Tugsbayasgalan Manlaibaatar	5790b00975	[RELAND] Close some sources of fake tensor leakage (#161589 ) Reland of https://github.com/pytorch/pytorch/pull/159923 Couple of fixes: 1. When we run into an operation we didn't proxy, we end up emitting fake constants. We detect this and warn using the FQN of the lifted constant. We warn because some internal users complained it was regressing their exportability. 2. Previous attribute mutation detection logic in non-strict didn't account for nested module structure. This fixes a silent incorrectness issue of exporting esm and qwen in non-strict 3. We modify yolov3 to fix the previous silently incorrect behaviour 4. We use strict export for levit_128 because it errors in non-strict due to more strict side effect checking When upgrading the torchbench pin, opacus_cifar10 seems to not run on eager anymore. I verified this by pushing a temporary PR on master with the new pin. So I added it to the expect_fail list. Differential Revision: [D81133908](https://our.internmc.facebook.com/intern/diff/D81133908) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161589 Approved by: https://github.com/avikchaudhuri	2025-08-28 09:46:42 +00:00
Eddie Yan	2e77a08b95	[cuDNN][TF32] Account for TF32 in `test_super_resolution_cuda` (#161662 ) cuDNN seems to be dispatching to TF32 kernels on B200 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161662 Approved by: https://github.com/Skylion007	2025-08-28 08:42:34 +00:00
Avik Chaudhuri	196232bb93	kill allow_complex_guards_as_runtime_asserts (#160198 ) Summary: Since `allow_complex_guards_as_runtime_asserts` is now sync'd with `prefer_deferred_runtime_asserts_over_guards`, we can kill the former (especially since it was a export-only concept). Test Plan: updated tests Rollback Plan: Differential Revision: D79903317 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160198 Approved by: https://github.com/ezyang	2025-08-28 07:59:29 +00:00
PyTorch MergeBot	fa76256603	Revert "[dynamic shapes] use prims_common contiguity in create_example_tensors (#160933 )" This reverts commit 33c3794533844236a6e30ba377e0a6802b279fc8. Reverted https://github.com/pytorch/pytorch/pull/160933 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160933#issuecomment-3232305708))	2025-08-28 07:39:26 +00:00
Gabriel Ferns	d2d4a3c539	Select Algorithm clear feedback savers (#161654 ) Add `clear_feedback_savers` and tests for the feedback functionality. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161654 Approved by: https://github.com/masnesral	2025-08-28 06:56:03 +00:00
Ke Wen	95516ad7e6	[4/N][SymmMem] Add `get_remote_tensor` + move up `get_buffer` and `get_signal_pad` (#161533 ) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): `get_remote_tensor `: return a symmetric tensor given a peer rank. The difference between `get_buffer` API and `get_remote_tensor` API: - the former accepts an offset, whereas the latter doesn't - the latter returns a symmetric tensor at `hdl.offset` on `peer`. As a refactorization, this PR also moves the implementation of `get_buffer` and `get_signal_pad` to the `SymmetricMemory` level as their code is common to all backends. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161533 Approved by: https://github.com/ngimel ghstack dependencies: #161470, #161471, #161532	2025-08-28 06:47:35 +00:00
Ke Wen	ff9533970a	[3/N][SymmMem] Expose offset field from handle (#161532 ) As titled, so that kernels relying on direct pointers can use base address and `hdl.offset` to access remote memory. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161532 Approved by: https://github.com/ngimel ghstack dependencies: #161470, #161471	2025-08-28 06:39:12 +00:00
Ke Wen	b291dc9684	[2/N][SymmMem] Add MemPool allocator and tests (#161471 ) (Porting most of #161008) Hooking SymmetricMemory Allocator to MemPool so that user can create symmetric tensors with regular `torch.zeros`, `torch.arange` etc factories. Also so that our ops can have functional variants that create `out` tensors on symmetric memory. To end users, this PR supports a python UI as follows: ``` allocator = symm_mem.get_mempool_allocator(device) mempool = torch.cuda.MemPool(allocator) with torch.cuda.use_mem_pool(mempool): tensor = torch.arange(numel, dtype=dtype, device=device) ``` Added tests for both use cases above. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161471 Approved by: https://github.com/ngimel ghstack dependencies: #161470	2025-08-28 06:31:29 +00:00
Oguz Ulgen	0fd63fd88b	Guard config copy for pickle errors (#161659 ) Differential Revision: [D81168335](https://our.internmc.facebook.com/intern/diff/D81168335) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161659 Approved by: https://github.com/zou3519	2025-08-28 06:27:48 +00:00
Ke Wen	eec876deb6	[SymmMem] Isolate set_device tests to avoid hang (#161668 ) `test_symmetric_memory.py` hangs like this: ``` SymmetricMemoryTest::test_empty_strided_p2p_persistent_set_device_False PASSED [5.6364s] SymmetricMemoryTest::test_empty_strided_p2p_persistent_set_device_True ... ``` This set of tests parameterizes whether the user sets the device before calling `symm_mem.empty`. However, such parametrization does not work well with `MultiProcContinuousTest` because the set device will "contaminate" the next test function. The solution is to move the "set device" tests to a separate test suite using the traditional `MultiProcessTestCase`, which would respawn processes every time. The hang is gone now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161668 Approved by: https://github.com/fegin	2025-08-28 05:43:49 +00:00
Yang Wang	c83b43d7a8	[1/2]Add summary report for vllm build (#161565 ) Demo Run https://github.com/pytorch/pytorch/actions/runs/17259533323?pr=161565 <img width="1538" height="720" alt="image" src="https://github.com/user-attachments/assets/64f6d7b4-cac6-4c12-863c-b15514bb8810" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161565 Approved by: https://github.com/huydhn	2025-08-28 05:25:55 +00:00
Mikayla Gawarecki	d3d9eb4777	Error when TORCH_STABLE_ONLY is defined in TensorBase.h (#161658 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161658 Approved by: https://github.com/albanD	2025-08-28 04:36:31 +00:00
PyTorch UpdateBot	a65db6dc4c	[vllm hash update] update the pinned vllm hash (#161363 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161363 Approved by: https://github.com/pytorchbot	2025-08-28 04:14:19 +00:00
soulitzer	149c68071c	Support Triton kernels in SAC region (#161541 ) SAC interaction with triton kernel: - In eager, triton ops are not dispatchable, and so it is always ignored by SAC, i.e., always recomputed. - In compile, although we wrap triton kernels into HOPs, allowing us to intercept them, we still recompute by default rather than save by default, so that compile maintains the invariant of using less memory than eager. - If you want to do something else (e.g. save the output of your triton kernel) you should wrap it in a custom op. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161541 Approved by: https://github.com/drisspg, https://github.com/zou3519 ghstack dependencies: #160781	2025-08-28 03:54:46 +00:00
xinan.lin	bae01479c3	[Inductor UT] Re-enable test_torchinductor_opinfo.py on XPU. (#161477 ) The PR #160222 replaced @skipCUDAIf with @requires_cuda_and_triton in test_torchinductor_opinfo.py, which caused the CI jobs for other devices to skip this large test suite. We attempted to revert #160222 but ran into conflicts. I then opened #160936 to revert the changes from #160222, but that resulted in CPU CI job timeouts. I also filed issue #161132 for assistance, but haven’t received a response yet. To minimize the impact, this PR re-enables the test suite on XPU first. I will continue to seek help on re-enabling it for CPU afterwards. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161477 Approved by: https://github.com/jansel	2025-08-28 03:29:21 +00:00
cyy	8939d151d0	Use std::apply for CPU code (#152526 ) The supported compilers are recent enough to enable std::apply in C++17. Pull Request resolved: https://github.com/pytorch/pytorch/pull/152526 Approved by: https://github.com/ezyang	2025-08-28 02:47:54 +00:00
rzou	5edc3d814f	Add option for TorchDispatchMode to ignore torch.compile internals (#161648 ) If TorchDispatchMode.ignore_compile_internals() is True, then we turn off the TorchDispatchMode during the compilation process, instead turning it back on during runtime of the compiled artifact. Test Plan: - new test Pull Request resolved: https://github.com/pytorch/pytorch/pull/161648 Approved by: https://github.com/bdhirsh	2025-08-28 02:41:33 +00:00
rzou	199c3633bf	Fix Inductor Periodic (#161617 ) Models are now passing accuracy. # of graph breaks is larger because these were not actually tested in CI (if the model fails accuracy we do not assert on # of graph breaks). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161617 Approved by: https://github.com/anijain2305	2025-08-28 02:36:08 +00:00
Gabriel Ferns	e9d34b2438	Add test coverage to tf32 in max autotune mm configs (#161545 ) Add a test to make sure that the configs are using the correct setting of tf32 to prevent regression. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161545 Approved by: https://github.com/coconutruben	2025-08-28 02:27:58 +00:00
Simon Fan	be1612201d	[export] Support AC HOP in pre-dispatch (#161479 ) Adds the pre-dispatch handling for the AC hop. This lets the HOP pre-dispatch export without actually pre-dispatch tracing into it. However, this is not sufficient to support AC in export: - because the HOP body will still be in torch IR, so it will fail export verifiers - the exported module also can't be run in eager because the AC HOP relies on the partitioner to embed RNG state saving/restoring So it must be lowered by AOT Autograd into post-dispatch first before being executed. It suffices for my purposes though. If users had checkpoint API use in their exported model, the behavior goes from silently incorrect to a validation error. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161479 Approved by: https://github.com/ydwu4 ghstack dependencies: #161353	2025-08-28 01:46:25 +00:00
Simon Fan	15670f9075	[dtensor] support local_map as a decorator (#161353 ) And extract it out as a convenience function for dynamo to wrap Pull Request resolved: https://github.com/pytorch/pytorch/pull/161353 Approved by: https://github.com/zpcore	2025-08-28 01:46:25 +00:00
Huy Do	0e35805030	Add ciflow/vllm to vLLM commit hash update PR(s) (#161678 ) As it should be, otherwise, PR(s) like https://github.com/pytorch/pytorch/pull/161121 were merged without the signals it needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161678 Approved by: https://github.com/atalman	2025-08-28 01:35:04 +00:00
Shangdi Yu	92c2daebb6	Add inductor provenance tracking artifacts to cache (#161440 ) Summary: - Add inductor provenance tracking artifacts to cache - Update the tlparse version pin to `0.4.0`. The old tlparse version errors out on the new tlparse output. The lowest tlparse version that works is `0.3.42`. tlparse error: ``` thread 'main' panicked at src/parsers.rs:671:71: called `Result::unwrap()` on an `Err` value: Error("EOF while parsing a value", line: 1, column: 0) stack backtrace: 0: 0x55e4ff1c7f00 - <std::sys::backtrace::BacktraceLock::print::DisplayBacktrace as core::fmt::Display>::fmt::h6d42cc84fc840290 1: 0x55e4ff1ee503 - core::fmt::write::h5af61a909e3ec64d 2: 0x55e4ff1c4c33 - std::io::Write::write_fmt::h5a7b54aa6e4a315d 3: 0x55e4ff1c7d52 - std::sys::backtrace::BacktraceLock::print::h555579e7396c26ac 4: 0x55e4ff1c8caf - std::panicking::default_hook::{{closure}}::h9128866118196224 5: 0x55e4ff1c8b1a - std::panicking::default_hook::h52e9e7314e0255f6 6: 0x55e4ff1c9652 - std::panicking::rust_panic_with_hook::h541791bcc774ef34 7: 0x55e4ff1c93fa - std::panicking::begin_panic_handler::{{closure}}::h6479a2f0137c7d19 8: 0x55e4ff1c8419 - std::sys::backtrace::__rust_end_short_backtrace::ha04e7c0fc61ded91 9: 0x55e4ff1c908d - rust_begin_unwind 10: 0x55e4fef7a030 - core::panicking::panic_fmt::h5764ee7030b7a73d 11: 0x55e4fef7a406 - core::result::unwrap_failed::h3ff7104a9ace307a 12: 0x55e4fefb3c56 - <tlparse::parsers::ArtifactParser as tlparse::parsers::StructuredLogParser>::parse::h20bc51a17ffc494a 13: 0x55e4fef9669a - tlparse::run_parser::h20c7729f151eec62 14: 0x55e4fef99a1b - tlparse::parse_path::he4892147f47fbade 15: 0x55e4fef7c760 - tlparse::main::hdc05613b32f4f53b 16: 0x55e4fef89263 - std::sys::backtrace::__rust_begin_short_backtrace::h15f188f3edf42596 17: 0x55e4fef8827d - std::rt::lang_start::{{closure}}::he2c21e32a442538e 18: 0x55e4ff1be0f0 - std::rt::lang_start_internal::h15895544e2012228 19: 0x55e4fef83975 - main 20: 0x7f0b3662a610 - __libc_start_call_main 21: 
0x7f0b3662a6c0 - __libc_start_main_alias_2 22: 0x55e4fef7a610 - <unknown> 23: 0x0 - <unknown> ``` Test Plan: ``` buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r test_kernel_information_generation python test/dynamo/test_structured_trace.py -k test_chromium_event ``` Differential Revision: D80976585 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161440 Approved by: https://github.com/oulgen	2025-08-28 01:16:02 +00:00
Paul de Supinski	768a1017c5	Allow parallel start NUMA binding (#161576 ) # Context In #161183, we added NUMA-binding support for `Callable` entrypoints to `elastic_launch`. However, we would raise an exception if the subprocesses would be spawned in parallel via `ThreadPoolExecutor`, which is an option configurable via the `TORCH_MP_PARALLEL_START` environment variable (see diff). The logic here was that `os.sched_setaffinity`, which we used to set CPU affinities, is [per process](https://docs.python.org/3/library/os.html#os.sched_setaffinity), so there could be a race condition during a parallel start: > Restrict the process with PID pid (or the current process if zero) to a set of CPUs. mask is an iterable of integers representing the set of CPUs to which the process should be restricted. But on further reading, the Linux docs say [`sched_setaffinity` is per thread.](https://man7.org/linux/man-pages/man2/sched_setaffinity.2.html) As it turns out, the Python doc is a misnomer. I [verified that `sched_setaffinity` only affects the calling thread, not the entire calling process.](https://gist.github.com/pdesupinski/7e2de3cbe5bb48d489f257b83ccddf07) The upshot is that we actually can safely use the inheritance trick from #161183 even with parallel start, since the setting will be inherited from the calling thread, and `os.sched_setaffinity` only affects the calling thread. # This PR Remove restrictions against parallel start for NUMA binding. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161576 Approved by: https://github.com/d4l3k	2025-08-28 01:15:58 +00:00
Lakshay Garg	0c4a79b7e0	Replace some calls to new with make_{unique,shared} (#160581 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160581 Approved by: https://github.com/malfet	2025-08-28 00:30:45 +00:00
Son Nguyen	9b02435e9f	Improve Scheduler init duration (#161491 ) Early exit merge_loops() if config.loop_ordering_after_fusion is false. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161491 Approved by: https://github.com/jansel	2025-08-28 00:27:51 +00:00
Will Constable	fd60117051	[C10D] add _summarize_ranks util (#160284 ) Prints ranges of ranks succinctly. e.g. For a strided list of ranks, summarizes down to start:stop:step ``` 0:4096:512 ``` Omits step if it's 1 ``` 0:8 ``` Note: endpoints are exclusive. This may not be intuitive to everyone, but in the first example above the last rank is 3584, and in the second it is 7. Currently, does not support combinations of striding _and_ range (e.g. cannot generate a representation like "0:2, 4:6, ..., 12:14"). Is this needed / useful? If so it could be added. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160284 Approved by: https://github.com/XilunWu	2025-08-28 00:17:53 +00:00
Pian Pawakapan	97a548b640	[PGO] skip allowlist logging for empty graphs (#161530 ) Summary: reduces spurious logging Test Plan: test_pgo Rollback Plan: Differential Revision: D81060182 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161530 Approved by: https://github.com/bobrenjc93, https://github.com/mlazos	2025-08-28 00:12:13 +00:00
PyTorch MergeBot	c55bdb26e1	Revert "[Inductor] Add DeviceAssert op to enable device-side assertion in torch.compile (#160677 )" This reverts commit 378edb047f83dfb84c2d9c032bddebc5e0147b8f. Reverted https://github.com/pytorch/pytorch/pull/160677 on behalf of https://github.com/atalman due to new test is failing internally ([comment](https://github.com/pytorch/pytorch/pull/160677#issuecomment-3230152168))	2025-08-27 23:45:12 +00:00
PyTorch MergeBot	903181bb6f	Revert "[2/N][SymmMem] Add MemPool allocator and tests (#161471 )" This reverts commit 4ed71d5412d58746d23f16689cab61da0e8149ef. Reverted https://github.com/pytorch/pytorch/pull/161471 on behalf of https://github.com/atalman due to failing internal builds ([comment](https://github.com/pytorch/pytorch/pull/161471#issuecomment-3230069186))	2025-08-27 23:18:36 +00:00
David Berard	ba201082b6	[TorchScript] ProfilingExecutor - RemoveProfileNodesAndSpecializeTypes None handling (#161538 ) ProfilingGraphExecutor works like this: 1. Do some unrelated JIT optimizations 2. Add profiling nodes to collect JIT information like tensor dtypes and shapes 3. Do some more unrelated JIT optimizations 4. Remove the profiling nodes and extract the tensor info, and then use the JIT tensor info to do optimizations. This PR is intended to fix a bug in Step 4, where the profiling nodes were removed. It was previously assumed that all the things that were profiled were either Tensors or Optional[Tensor]s - otherwise, step 2 would not have introduced a profiling node. However, we saw a case where step 3 would replace Optional[Tensor] inputs with `None` inputs (e.g. if a conditional that returned a Tensor or a None could be statically known to only follow the `None` branch). To fix this, we essentially just modify the RemoveProfileNodesAndSpecializeTypes assert so that it accepts Tensors, Optional[Tensor]s, or None (the new part). Note that this issue is probably somewhat uncommon (maybe why we didn't see it for the first 4 years that this code existed). I expect that, typically, any time that step 3 would convert `Optional[Tensor] -> None`, step 1 would have already done that. So it's difficult to reproduce in an end-to-end TorchScript workload. Differential Revision: [D81068172](https://our.internmc.facebook.com/intern/diff/D81068172) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161538 Approved by: https://github.com/nmacchioni	2025-08-27 23:12:15 +00:00
PyTorch MergeBot	8fc2467fe5	Revert "[3/N][SymmMem] Expose offset field from handle (#161532 )" This reverts commit 68d395d61e9d4601ab1e2bca56eb28253572c662. Reverted https://github.com/pytorch/pytorch/pull/161532 on behalf of https://github.com/atalman due to need to revert https://github.com/pytorch/pytorch/pull/161471 internal failure ([comment](https://github.com/pytorch/pytorch/pull/161532#issuecomment-3230016806))	2025-08-27 23:06:55 +00:00
drisspg	30edac5da6	Updates to CuTe DSL template renderer (#161117 ) # Summary This adds a few more render functions available to template writers, specifically get_output and modification. The reasons why are more clear in the next PR in this stack. <img width="1645" height="364" alt="Screenshot 2025-08-21 at 1 48 50 PM" src="https://github.com/user-attachments/assets/2d508fda-4273-43ef-9edf-086e592e9249" /> Majority of the new cod is around the OpOverrides for CuTe DSL. It is alot to test and most of the actual testing I have been doing is via score_mods to the flash_attention at the next layer of this stack. A bunch of score mods that me and Claude came up with , that exercise the actual ops. ``` Py def causal_mask(score, b, h, q_idx, kv_idx): """Causal attention mask.""" return torch.where(q_idx >= kv_idx, score, float("-inf")) def relative_bias(score, b, h, token_q, token_kv): """Relative position bias.""" return score + torch.abs(token_q - token_kv) def relative_bias_v2(score, b, h, token_q, token_kv): """Relative position bias with factor of 2.""" return score + 2 * torch.abs(token_q - token_kv) def times_two(score, b, h, q_idx, kv_idx): """Simple score modification that doubles the score.""" return score * 2 def alibi_bias(score, b, h, q_idx, kv_idx): """ALiBi (Attention with Linear Biases) - used in some modern models.""" # Different slopes for different heads slope = 2 ** (-8 * (h + 1) / 8) # Simplified version return score - slope * torch.abs(q_idx - kv_idx) def sliding_window(score, b, h, q_idx, kv_idx, window_size=256): """Sliding window attention - only attend to nearby tokens.""" return torch.where( torch.abs(q_idx - kv_idx) <= window_size, score, float("-inf") ) def block_diagonal(score, b, h, q_idx, kv_idx, block_size=64): """Block diagonal attention pattern.""" q_block = q_idx // block_size kv_block = kv_idx // block_size return torch.where(q_block == kv_block, score, float("-inf")) def additive_bias(score, b, h, q_idx, kv_idx): """Test simple 
addition with position-based bias.""" return score + (q_idx + kv_idx) * 0.01 def multiplicative_decay(score, b, h, q_idx, kv_idx): """Test multiplication with distance-based decay.""" distance = torch.abs(q_idx - kv_idx) return score * torch.exp(-0.1 * distance) def sine_wave_bias(score, b, h, q_idx, kv_idx): """Test trigonometric functions.""" return score + 0.1 * torch.sin(2 * math.pi * (q_idx - kv_idx) / 64) def log_distance_penalty(score, b, h, q_idx, kv_idx): """Test logarithmic operations.""" distance = torch.abs(q_idx - kv_idx).float() return score - torch.log(1 + distance) def alternating_mask(score, b, h, q_idx, kv_idx): """Test with alternating pattern - good for branch prediction.""" return torch.where((q_idx + kv_idx) % 2 == 0, score, float("-inf")) def head_specific_pattern(score, b, h, q_idx, kv_idx): """Different behavior per attention head.""" even_head = h % 2 == 0 causal = q_idx >= kv_idx return torch.where(even_head & causal, score, float("-inf")) def sparse_strided(score, b, h, q_idx, kv_idx, stride=4): """Sparse attention with strided pattern.""" return torch.where( (kv_idx % stride == 0) \| (q_idx == kv_idx), score, float("-inf") ) def causal_with_global(score, b, h, q_idx, kv_idx): """Causal mask but first few tokens are globally attended.""" is_causal = q_idx >= kv_idx is_global = kv_idx < 4 return torch.where(is_causal \| is_global, score, float("-inf")) def dilated_attention(score, b, h, q_idx, kv_idx, dilation_rate=2): """Dilated attention pattern - exponentially increasing gaps.""" distance = torch.abs(q_idx - kv_idx) is_attended = (distance == 0) \| ((distance > 0) & ((distance & (distance - 1)) == 0)) return torch.where(is_attended, score, float("-inf")) ``` Example outputs: ``` [Test Suite] Config: batch=4, heads=32, seq_q=8192, seq_kv=8192, dim=128 [Test 1: none] [No score_mod, flash='enabled'] Found flash_attncute: True [No score_mod, flash='disabled'] Found flash_attncute: False ✓ Outputs match between flash enabled/disabled ✓ 
Output matches eager SDPA (rtol=0.001, atol=0.001) [Test 2: causal] [With score_mod, flash='enabled'] Found flash_attncute: True [With score_mod, flash='disabled'] Found flash_attncute: False ✗ Outputs differ between flash modes: Tensor-likes are not close! Mismatched elements: 17879 / 134217728 (0.0%) Greatest absolute difference: 0.0078125 at index (0, 15, 15, 60) (up to 0.001 allowed) Greatest relative difference: 2.5 at index (3, 22, 153, 126) (up to 0.001 allowed) [Test 3: rel_bias] [With score_mod, flash='enabled'] Found flash_attncute: True [With score_mod, flash='disabled'] Found flash_attncute: False ✗ Outputs differ between flash modes: Tensor-likes are not close! Mismatched elements: 12836 / 134217728 (0.0%) Greatest absolute difference: 0.015625 at index (0, 3, 2775, 84) (up to 0.001 allowed) Greatest relative difference: 11.8125 at index (3, 28, 4095, 76) (up to 0.001 allowed) [Test 4: rel_bias_v2] ``` This is bfloat16 and there are no major differences. The list of pointwise ops here isn't exhaustive but it is fairly covering Pull Request resolved: https://github.com/pytorch/pytorch/pull/161117 Approved by: https://github.com/mlazos	2025-08-27 23:01:31 +00:00
Avik Chaudhuri	12c0cf3fab	switch prefer_deferred_runtime_asserts_over_guards in export (#160111 ) Summary: In preparation for checking shape guards in export, this PR effectively switches `prefer_deferred_runtime_asserts_over_guards` to `False`, matching Dynamo. Actually that's a lie: we switch it to `allow_complex_guards_as_runtime_asserts`, which is `False` by default but can be controlled via an internally API to be `True`. This makes the two flags synchronized, so we should be able to kill `allow_complex_guards_as_runtime_asserts` at this point. Test Plan: updated tests Rollback Plan: Differential Revision: D79734206 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160111 Approved by: https://github.com/tugsbayasgalan	2025-08-27 22:51:10 +00:00
Zain Rizvi	6b051d7de3	[BE] Refactor trymerge for readability (#161637 ) Two changes: - Extract getting the last_commit's sha into its own function - Rename merge_changes to merge_changes_locally to better explain its functionality Pull Request resolved: https://github.com/pytorch/pytorch/pull/161637 Approved by: https://github.com/seemethere, https://github.com/malfet ghstack dependencies: #161558	2025-08-27 22:44:00 +00:00
rebeccajae	ee0ec21191	Ensure that tensors are contiguous before using no-graph MPS impl (#161641 ) Fixes #161640 Check if tensors are contiguous before using the no-graph implementation. Using the script in the issue above with this change I get expected results. ``` MPS contiguous result sample: tensor([ 1.3600, -2.9516, 1.3207, -3.5132, 1.7061], device='mps:0') MPS non-contig result sample: tensor([ 1.3600, -2.9516, 1.3207, -3.5132, 1.7061], device='mps:0') CPU non-contig result sample: tensor([ 1.3600, -2.9516, 1.3207, -3.5132, 1.7061]) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161641 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-27 22:31:57 +00:00
Xinran / Allan Rui	7da02bf8af	Skip const folding with symbolic expression (#161437 ) Summary: When performing constant folding, we must skip over operators that have symbolic `fill_value`. Test Plan: CI Rollback Plan: Reviewed By: kalpit-meta-1 Differential Revision: D80965936 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161437 Approved by: https://github.com/StellarrZ	2025-08-27 22:09:58 +00:00
William Wen	1041805c1e	[dynamo, nested graph breaks] prevent excessive recompilations (#159786 ) Nested continuation function code objects are now unique w.r.t. stack trace below (and including) the current code object. Without this change, e.g. in the added test, `f3` would be recompiled on the second graph break. Followup: we can skip guards on continuation functions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159786 Approved by: https://github.com/anijain2305 ghstack dependencies: #159329, #159678, #159817, #160138	2025-08-27 21:53:37 +00:00
William Wen	6562646dab	[dynamo, nested graph breaks] clean up comments and codegen (#160138 ) Fix comments to reflect that we no longer codegen cells to be sent to resume function as inputs - they are instead codegen'd after the unsupported instruction in order to build resume functions that are closures. Also simplify some codegen. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160138 Approved by: https://github.com/anijain2305 ghstack dependencies: #159329, #159678, #159817	2025-08-27 21:53:37 +00:00
William Wen	d0a242e547	[dynamo, nested graph breaks] support nested closures (#159817 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159817 Approved by: https://github.com/anijain2305 ghstack dependencies: #159329, #159678	2025-08-27 21:53:37 +00:00
William Wen	3f8090809f	[dynamo, nested graph breaks] support nested graph breaks x context managers (#159678 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159678 Approved by: https://github.com/anijain2305 ghstack dependencies: #159329	2025-08-27 21:53:37 +00:00
William Wen	10d93325b1	[dynamo, nested graph breaks] support very simple nested graph breaks (#159329 ) e.g. this graph breaks once now: ```python import torch torch._dynamo.config.nested_graph_breaks = True def inner(x): x = x + 1 torch._dynamo.graph_break() return x + 2 @torch.compile(backend="eager") def outer(x): return inner(x) print(outer(torch.ones(3))) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159329 Approved by: https://github.com/anijain2305	2025-08-27 21:53:37 +00:00
Animesh Jain	68fa882dad	[dynamo] Correctly track mutation class source for MutableMappingVariable (#161568 ) Fixes https://github.com/pytorch/pytorch/issues/161505 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161568 Approved by: https://github.com/Lucaskabela, https://github.com/malfet	2025-08-27 21:47:17 +00:00
Yu, Guangye	b9c6aa1e17	Revert "Refactor CUDAAllocatorConfig to reuse AcceleratorAllocatorConfig (#150312 )" (#161628 ) This reverts commit ae1a706444d6c0a6019ffc936c8b36574335a5d5. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161628 Approved by: https://github.com/atalman ghstack dependencies: #161625, #161626, #161627	2025-08-27 21:37:14 +00:00
Yu, Guangye	b7b9fb9962	Revert "Deprecate overleap functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165 )" (#161627 ) This reverts commit c1145852a5eac96f5551b5d1805109ce4dc5e1fa. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161627 Approved by: https://github.com/atalman ghstack dependencies: #161625, #161626	2025-08-27 21:37:14 +00:00
Yu, Guangye	c03d8d4082	Revert "Generalize torch._C._set_allocator_settings to be generic (#156175 )" (#161626 ) This reverts commit 908c5cc4c0f22d141776bde47c296b5186691855. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161626 Approved by: https://github.com/atalman ghstack dependencies: #161625	2025-08-27 21:37:14 +00:00
clr	40f46b09c7	async_compile: Fix the wait method to actually wait (#161561 ) This method never triggered. It's used in 2 tests and they pass, so no serious concern. Note that I did introduce and fix a latent bug, which is if we called shutdown_compile_workers, jobs would crash with this change due to ready_future being finished if we called wait. However we only call wait in tests so that bug is fine. The other behaviour, is that if you called shutdown, I believe we may potentially block on your first triton compile after that, until the pool was ready. This should correctly switch to direct mode, until the pool is ready on later warmups. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161561 Approved by: https://github.com/masnesral ghstack dependencies: #161452	2025-08-27 21:35:31 +00:00
clr	0d6597138c	inductor: Log the specific triton kernel that fails (#161452 ) Added an optional name argument to SubprocPool.submit. We record this in a dictionary, and when raising exceptions, add the name. We manage the lifecycle the same as the pending futures. Added a specific testcase to make sure this logs correctly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161452 Approved by: https://github.com/masnesral	2025-08-27 21:35:31 +00:00
Yu, Guangye	06ddaf1e0a	Revert "Back out "Deprecate overleap functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165 )" (#160999 )" (#161625 ) This reverts commit a818fa77e3a72271f144514ef349c5a666313205. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161625 Approved by: https://github.com/atalman	2025-08-27 21:34:12 +00:00
Blaine Burton Rister	26d0ff1cba	[AOTI-FX] Enhance launch grid FloorDiv replacement using sympy.together. (#161582 ) # Feature 2d launch grids with dynamic shapes can contain sympy expressions like `floor(x / 128 + y / 128)`. This breaks the dynamic shapes tracer which only supports `FloorDiv`, and not `floor`. To handle this case, call `sympy.together` prior to pattern matching to convert this to `floor((x + y) / 128)`. Then, we can recognize the pattern and map it to `FloorDiv(x + y, 128)`. # Test plan Added a custom Triton test exposing this. The test calls a 2d autotuned kernel with dynamic shapes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161582 Approved by: https://github.com/nandesuka	2025-08-27 21:31:28 +00:00
zhxchen17	c36d18d7e8	[rfc] aot precompile with custom backend api (#161383 ) Adding a new feature to torch.compile(fullgraph=True) which "aot_compile" a function with given example inputs. On user side it should look like: ``` def foo(x, y): return x + y compiled_fn = torch.compile(fullgraph=True).aot_compile(((torch.randn(3, 4), torch.randn(3, 4)), {})) ``` This is different from the traditional `torch.compile` workflow where compiled object will be a drop-in replacement for the original eager model: ``` tensor input -> torch.compile() -> tensor output (and populates the cache entry) ``` `aot_compile` will instead return a compiled function as result, and it's purely functional and doesn't populate the compile cache entry in dynamo: ``` tensor input -> aot_compile() -> compiled function ``` The aot compiled function will be savable and loadable on disk as well: ``` torch.compile(fullgraph=True).aot_compile(...).save_compiled_function('my/path') compiled_fn = torch.compiler.load_compiled_function("my/path") ``` Right now we treat compiler backend as a blackbox and it needs to implement the following interface to make compile artifacts serializable: ``` class SerializableCallable: def save_compile_artifacts(): .... def load_compile_artifacts(): .... ``` We haven't implemented this for inductor yet, but this shouldn't be an issue since we gate this feature through `torch._dynamo.config.aot_compile` (which defaults to False), and this will be left as follow up PR to the current PR. Differential Revision: [D80914270](https://our.internmc.facebook.com/intern/diff/D80914270/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161383 Approved by: https://github.com/tugsbayasgalan	2025-08-27 21:26:25 +00:00
PyTorch MergeBot	014b98dd09	Revert "Add inductor backend to device interface; make minifier_tests more device agnostic (#151314 )" This reverts commit 77bc959fe122bfd131e339ca36cab445a1860806. Reverted https://github.com/pytorch/pytorch/pull/151314 on behalf of https://github.com/atalman due to sorry change is failing internally ([comment](https://github.com/pytorch/pytorch/pull/151314#issuecomment-3229774015))	2025-08-27 21:21:19 +00:00
PyTorch MergeBot	38ed57d446	Revert "Updates to CuTe DSL template renderer (#161117 )" This reverts commit 1750cc80374a9dd22fc26701c0602ae11a62baf0. Reverted https://github.com/pytorch/pytorch/pull/161117 on behalf of https://github.com/atalman due to will need to revert to unblock revert of https://github.com/pytorch/pytorch/pull/151314 ([comment](https://github.com/pytorch/pytorch/pull/161117#issuecomment-3229754295))	2025-08-27 21:17:25 +00:00
Benjamin Glass	007935a802	[cpp_wrapper] Swap to new PyBind11 simple GIL header (#161063 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161063 Approved by: https://github.com/Skylion007 ghstack dependencies: #160754	2025-08-27 21:15:01 +00:00
Benjamin Glass	cbc53b7696	Update pybind11 submodule to 3.0.1 (#160754 ) Upgrade to PyBind11 v3. This allows us to strip out our own (possibly broken?) handling of the C++ ABI when building extensions, in favor of the more-complete PyBind11 internal handling. Fixes a few test failures due to https://github.com/pybind/pybind11/issues/5774, which effectively makes the `__qualname__` attribute of functions platform-dependent. Test plan: CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/160754 Approved by: https://github.com/Skylion007	2025-08-27 21:15:01 +00:00
Zain Rizvi	624bc36163	Ensure the comment id is always passed in to trymerge (#161558 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161558 Approved by: https://github.com/seemethere, https://github.com/malfet	2025-08-27 19:53:28 +00:00
Wang, Chuanqi	06c7516994	[BE] Upgrade XPU support package to 2025.2 (#158733 ) Including below changes, - Add XPU support package 2025.2 build and test in CI for both Linux and Windows - Keep XPU support package 2025.1 build in CI to ensure no break issue until PyTorch 2.9 release - Upgrade XPU support package from 2025.1 to 2025.2 in CD for both Linux and Windows - Rename Linux CI job name & image name to n & n-1 - Update XPU runtime pypi packages dependencies of CD wheels - Remove deprecated support package version docker image build Pull Request resolved: https://github.com/pytorch/pytorch/pull/158733 Approved by: https://github.com/EikanWang, https://github.com/atalman	2025-08-27 19:33:38 +00:00
William Wen	2efcf9d081	[dynamo] Fix graph break registry loading in fbcode (#161550 ) Summary: Add `torch/_dynamo/graph_break_registry.json` as an internal dependency. Minor related fixes. Test Plan: Test on OSS. Rollback Plan: Differential Revision: D81078973 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161550 Approved by: https://github.com/Lucaskabela, https://github.com/anijain2305	2025-08-27 19:25:15 +00:00
drisspg	443452ca2f	Remove test since it ooms on CI (#161644 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161644 Approved by: https://github.com/BoyuanFeng	2025-08-27 19:11:29 +00:00
Roman Bobniev	47ecd2042f	[ONNX] Fix index_put_ usage (#161263 ) Summary: It's hard to understand how it's working in most of our models, but in general it looks like `aten::copy_` is replaced incorrectly. There are two schemas for `aten::copy_`: 1. `aten::copy_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)` 2. `aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)` According to the logic in the comments we don't need one of the parameters for `aten::index_put_`. It seems logic has been inferred from ordinary `aten::copy` where there could be a third parameter which is `non_blocking` flag. Depending on the execution environment the sliced copying can be replaced either by first schema or by second schema with explicitly setting default parameter to `False`. If first schema is selected it will lead to the crash (which is easily to catch in our prod env). In case of the second schema selection, there is no crash, but the third parameter is treated as `accumulate` parameter of the `index_put_` function which doesn't make sense. So, in any case usage of the third parameter must be removed from the `aten::copy_` replacement. For more details and check this post: https://fb.workplace.com/groups/1405155842844877/permalink/25337687649165028/ Test Plan: The test fails in production envirounment only. In the test env `non_blocking` flag is mapped as `False` to the `acumulate` flag, which doesn't cause test to fail, but has no sense in terms of flags mapping. 
The export works without errors, before the fix it was failing with accessing by index out of bounds vector, like this: ``` 1095 _C._jit_onnx_log("Torch IR graph at exception: ", graph) File ~/.bento/kernels/bento_kernel_gaia_ml/1578/bento_kernel_gaia_ml_binary-inplace#link-tree/torch/onnx/utils.py:636, in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict, dynamic_axes, input_names, module) 629 _C._jit_pass_lower_all_tuples(graph) 630 # in _jit_pass_onnx, symbolic functions are called for each node for conversion. 631 # However, there are nodes that cannot be converted without additional context. 632 # For example, the number of outputs from split (and whether it is static or dynamic) is unknown 633 # until the point where it is unpacked by listUnpack node. 634 # This pass does a preprocess, and prepares the nodes such that enough context can be received 635 # by the symbolic function. --> 636 _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module) 637 _C._jit_pass_onnx_preprocess(graph) 639 # onnx does not support tuples, so try to remove them RuntimeError: vector::_M_range_check: __n (which is 2) >= this->size() (which is 2) ``` The test script: ``` import torch as th import tempfile class CopyTest(th.nn.Module): def forward( self, input_th: th.Tensor ): to_fill = th.ones((3, 3)) to_fill[:, 0] = input_th[:, 0] return to_fill m = CopyTest() test_tensor = th.zeros((3, 3)) with tempfile.NamedTemporaryFile() as f: th.onnx.export( m, (test_tensor,), f, export_params=True, opset_version=17, do_constant_folding=True, input_names=["input"], output_names=["features"], dynamo=False, ) ``` The exported model test: ``` import torch import onnx import onnxruntime model_name = '/home/ironsided/test_model.onnx' onnx_model = onnx.load(model_name) onnx.checker.check_model(onnx_model) example_inputs = (torch.zeros(3, 3),) onnx_inputs = [tensor.numpy(force=True) for tensor in example_inputs] print(f"Input length: 
{len(onnx_inputs)}") print(f"Sample input: {onnx_inputs}") ort_session = onnxruntime.InferenceSession( model_name, providers=["CPUExecutionProvider"] ) onnxruntime_input = {input_arg.name: input_value for input_arg, input_value in zip(ort_session.get_inputs(), onnx_inputs)} # ONNX Runtime returns a list of outputs onnxruntime_outputs = ort_session.run(None, onnxruntime_input)[0] print(onnxruntime_outputs) ``` The produced result is correct: ``` Input length: 1 Sample input: [array([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]], dtype=float32)] [[0. 1. 1.] [0. 1. 1.] [0. 1. 1.]] ``` Rollback Plan: Differential Revision: D80797028 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161263 Approved by: https://github.com/justinchuby, https://github.com/jermenkoo	2025-08-27 18:53:13 +00:00
drisspg	1750cc8037	Updates to CuTe DSL template renderer (#161117 ) # Summary This adds a few more render functions available to template writers, specifically get_output and modification. The reasons why are more clear in the next PR in this stack. <img width="1645" height="364" alt="Screenshot 2025-08-21 at 1 48 50 PM" src="https://github.com/user-attachments/assets/2d508fda-4273-43ef-9edf-086e592e9249" /> Majority of the new cod is around the OpOverrides for CuTe DSL. It is alot to test and most of the actual testing I have been doing is via score_mods to the flash_attention at the next layer of this stack. A bunch of score mods that me and Claude came up with , that exercise the actual ops. ``` Py def causal_mask(score, b, h, q_idx, kv_idx): """Causal attention mask.""" return torch.where(q_idx >= kv_idx, score, float("-inf")) def relative_bias(score, b, h, token_q, token_kv): """Relative position bias.""" return score + torch.abs(token_q - token_kv) def relative_bias_v2(score, b, h, token_q, token_kv): """Relative position bias with factor of 2.""" return score + 2 * torch.abs(token_q - token_kv) def times_two(score, b, h, q_idx, kv_idx): """Simple score modification that doubles the score.""" return score * 2 def alibi_bias(score, b, h, q_idx, kv_idx): """ALiBi (Attention with Linear Biases) - used in some modern models.""" # Different slopes for different heads slope = 2 ** (-8 * (h + 1) / 8) # Simplified version return score - slope * torch.abs(q_idx - kv_idx) def sliding_window(score, b, h, q_idx, kv_idx, window_size=256): """Sliding window attention - only attend to nearby tokens.""" return torch.where( torch.abs(q_idx - kv_idx) <= window_size, score, float("-inf") ) def block_diagonal(score, b, h, q_idx, kv_idx, block_size=64): """Block diagonal attention pattern.""" q_block = q_idx // block_size kv_block = kv_idx // block_size return torch.where(q_block == kv_block, score, float("-inf")) def additive_bias(score, b, h, q_idx, kv_idx): """Test simple 
addition with position-based bias.""" return score + (q_idx + kv_idx) * 0.01 def multiplicative_decay(score, b, h, q_idx, kv_idx): """Test multiplication with distance-based decay.""" distance = torch.abs(q_idx - kv_idx) return score * torch.exp(-0.1 * distance) def sine_wave_bias(score, b, h, q_idx, kv_idx): """Test trigonometric functions.""" return score + 0.1 * torch.sin(2 * math.pi * (q_idx - kv_idx) / 64) def log_distance_penalty(score, b, h, q_idx, kv_idx): """Test logarithmic operations.""" distance = torch.abs(q_idx - kv_idx).float() return score - torch.log(1 + distance) def alternating_mask(score, b, h, q_idx, kv_idx): """Test with alternating pattern - good for branch prediction.""" return torch.where((q_idx + kv_idx) % 2 == 0, score, float("-inf")) def head_specific_pattern(score, b, h, q_idx, kv_idx): """Different behavior per attention head.""" even_head = h % 2 == 0 causal = q_idx >= kv_idx return torch.where(even_head & causal, score, float("-inf")) def sparse_strided(score, b, h, q_idx, kv_idx, stride=4): """Sparse attention with strided pattern.""" return torch.where( (kv_idx % stride == 0) \| (q_idx == kv_idx), score, float("-inf") ) def causal_with_global(score, b, h, q_idx, kv_idx): """Causal mask but first few tokens are globally attended.""" is_causal = q_idx >= kv_idx is_global = kv_idx < 4 return torch.where(is_causal \| is_global, score, float("-inf")) def dilated_attention(score, b, h, q_idx, kv_idx, dilation_rate=2): """Dilated attention pattern - exponentially increasing gaps.""" distance = torch.abs(q_idx - kv_idx) is_attended = (distance == 0) \| ((distance > 0) & ((distance & (distance - 1)) == 0)) return torch.where(is_attended, score, float("-inf")) ``` Example outputs: ``` [Test Suite] Config: batch=4, heads=32, seq_q=8192, seq_kv=8192, dim=128 [Test 1: none] [No score_mod, flash='enabled'] Found flash_attncute: True [No score_mod, flash='disabled'] Found flash_attncute: False ✓ Outputs match between flash enabled/disabled ✓ 
Output matches eager SDPA (rtol=0.001, atol=0.001) [Test 2: causal] [With score_mod, flash='enabled'] Found flash_attncute: True [With score_mod, flash='disabled'] Found flash_attncute: False ✗ Outputs differ between flash modes: Tensor-likes are not close! Mismatched elements: 17879 / 134217728 (0.0%) Greatest absolute difference: 0.0078125 at index (0, 15, 15, 60) (up to 0.001 allowed) Greatest relative difference: 2.5 at index (3, 22, 153, 126) (up to 0.001 allowed) [Test 3: rel_bias] [With score_mod, flash='enabled'] Found flash_attncute: True [With score_mod, flash='disabled'] Found flash_attncute: False ✗ Outputs differ between flash modes: Tensor-likes are not close! Mismatched elements: 12836 / 134217728 (0.0%) Greatest absolute difference: 0.015625 at index (0, 3, 2775, 84) (up to 0.001 allowed) Greatest relative difference: 11.8125 at index (3, 28, 4095, 76) (up to 0.001 allowed) [Test 4: rel_bias_v2] ``` This is bfloat16 and there are no major differences. The list of pointwise ops here isn't exhaustive but it is fairly covering Pull Request resolved: https://github.com/pytorch/pytorch/pull/161117 Approved by: https://github.com/mlazos	2025-08-27 18:39:09 +00:00
Sandeep Narendranath Karjala	ec585ceab4	[inductor] structured-log graph execution order + test (#160448 ) Summary: - Emit a structured trace per compiled graph execution to reconstruct execution order in TLParse. - Adds debug.log_graph_execution(name) called from `CompiledFxGraph.__call__`, producing an artifact named inductor_graph_execution with payload {"graph": "graph_<id>"}. Testing: - Add inline test to verify structure and output Pull Request resolved: https://github.com/pytorch/pytorch/pull/160448 Approved by: https://github.com/xmfan	2025-08-27 18:12:46 +00:00
Yidi Wu	16ce6a4aad	[hop] move insert_deferred_runtime_asserts under subtracer (#161416 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161416 Approved by: https://github.com/pianpwk ghstack dependencies: #160548	2025-08-27 17:43:02 +00:00
Yang Wang	3345a7ff8a	[VLLM][FLASHINFER UPDATE] (#161537 ) VLLM build x torch fails due to flashinfer build fail, detected that vllm team recently changed the point to flashinfer Pull Request resolved: https://github.com/pytorch/pytorch/pull/161537 Approved by: https://github.com/huydhn	2025-08-27 17:41:26 +00:00
Huy Do	55e6ea105c	Fix running the benchmark jobs twice (#161619 ) I made a mistake in https://github.com/pytorch/pytorch/pull/160935 removing this condition check. This ran the benchmark job twice for schedule jobs, i.e. https://github.com/pytorch/pytorch/actions/runs/17266546494. This was missed during testing because `pull_request` and `workflow_dispatch` were working ok. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161619 Approved by: https://github.com/anijain2305	2025-08-27 17:18:10 +00:00
lakshayg	a3fa1b8c2a	Set USE_NVSHMEM only if USE_DISTRIBUTED is set (#161451 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161451 Approved by: https://github.com/eqy	2025-08-27 17:11:19 +00:00
Chris Leonard	620d52e882	Fix sort doc error (#161539 ) Fixes #129298. Updated torch.sort documentation so that the 'stable' parameter is a Keyword Argument. This is how it's implemented in PyTorch. @malfet Pull Request resolved: https://github.com/pytorch/pytorch/pull/161539 Approved by: https://github.com/soulitzer	2025-08-27 17:01:53 +00:00
PyTorch MergeBot	69c7b16e6f	Revert "Back out "Refactor CUDAAllocatorConfig to reuse AcceleratorAllocatorConfig (#150312 )" (#161002 )" This reverts commit a03cc53e6f6e2fe67316cb8c74c25f5b953f445b. Reverted https://github.com/pytorch/pytorch/pull/161002 on behalf of https://github.com/guangyey due to This PR breaks CI TestCudaMallocAsync::test_allocator_settings ([comment](https://github.com/pytorch/pytorch/pull/161002#issuecomment-3228980897))	2025-08-27 16:52:22 +00:00
Guilherme Leobas	379ebdaf5e	[OrderedDict] Implement `OrderedDict.popitem(last=...)` (#155153 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155153 Approved by: https://github.com/anijain2305 ghstack dependencies: #160156, #155072, #155152	2025-08-27 15:46:40 +00:00
Guilherme Leobas	7c8f049d54	[OrderedDict] Implement `OrderedDict.move_to_end(key, last=False)` (#155152 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155152 Approved by: https://github.com/anijain2305 ghstack dependencies: #160156, #155072	2025-08-27 15:46:40 +00:00
Guilherme Leobas	e3718c4855	[dict] Implement dict.__ior__ and fix return type in dict.__or__ (#155072 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155072 Approved by: https://github.com/anijain2305 ghstack dependencies: #160156	2025-08-27 15:46:40 +00:00
Guilherme Leobas	2d44969bbd	Wrap class definitions in `set_fullgraph(False)` in `test_dict`/`test_ordered_dict` (#160156 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160156 Approved by: https://github.com/zou3519	2025-08-27 15:46:40 +00:00
Irem Yuksel	a2af6a9d6b	Run WoArm64 CI every 4 hours (#161504 ) Since WoArm64 isn’t part of CI yet, this PR schedules the workflow to increase visibility and insights. It will execute every 4 hours and still support manual runs via the `ciflow/win-arm64` tag. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161504 Approved by: https://github.com/seemethere, https://github.com/atalman	2025-08-27 15:46:34 +00:00
PyTorch MergeBot	28af843ee0	Revert "Fix index_add for int64 input + zerodim index (#161511 )" This reverts commit d51486616cb3fe54bc298669a88059be56c1fb22. Reverted https://github.com/pytorch/pytorch/pull/161511 on behalf of https://github.com/clee2000 due to broke test_indexing.py::TestIndexingCPU::test_index_add_zerodim_index_floating_alpha_cpu [GH job link](https://github.com/pytorch/pytorch/actions/runs/17257089116/job/48971728595) [HUD commit link](`d51486616c`) on dynamo? ([comment](https://github.com/pytorch/pytorch/pull/161511#issuecomment-3228705842))	2025-08-27 15:38:11 +00:00
Karthick Panner Selvam	378edb047f	[Inductor] Add DeviceAssert op to enable device-side assertion in torch.compile (#160677 ) This PR introduces a device_assert op to trigger device-side assertions within torch.compile. This implementation is based on the suggestion in [this comment](https://github.com/pytorch/pytorch/issues/147282#issuecomment-2756056084). Changes Included - Implemented device_assert op and overrides has_side_effect to return True to avoid removal by dead code elimination. - Commented out the assert_async_msg_decomp and functional_assert_async_msg_decomp decompositions to disable the default assert decomposition inside Inductor. - Added lowering for torch.ops.aten._assert_async.msg to convert assert calls into the ops_handler. - Implemented the codegen method for the device_assert op. This supports generating C++ and Triton code. - Added test cases to verify both "should throw" and "should not throw" scenarios. Fixes #147282 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160677 Approved by: https://github.com/mlazos	2025-08-27 14:49:20 +00:00
FFFrog	d2db6c86b0	[OpenReg] Add Develop Notes for Integrating New Backend into PyTorch (#158644 ) To facilitate the integration of new backends, we plan to publish a new development note that details all the key components, hoping to speed up the development of other accelerators. This PR is the beginning of this note and covers the registration of operators; we will gradually improve it and keep it in sync with OpenReg's code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158644 Approved by: https://github.com/albanD	2025-08-27 14:47:25 +00:00
Animesh Jain	a3c1cbdbc6	[dynamo][higher order ops] Refactor for out spec (#161354 ) Preparing for the next PR to add more info in the output spec. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161354 Approved by: https://github.com/zou3519	2025-08-27 14:41:18 +00:00
Ting Lu	9632f4ea9f	[CD] [aarch64] Add CUDA 13.0 sbsa nightly build (#161257 ) https://github.com/pytorch/pytorch/issues/159779 CUDA SBSA build for CUDA 13.0 1. Supported archs: sm_80 to sm_120. Including support for Thor (sm_110), SPARK (sm_121), GB300 (sm_103). "This release adds support of SM110 GPUs for arm64-sbsa on Linux." from 13.0 release notes https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html 2. Use -compress-mode=size for binary size reduction, 13.0 wheel is 2.18 GB, when compared with 12.9 3.28 GB, that is 1.1 GB of savings and ~33.5% smaller. 3. Refactored the libs_to_copy list with common libs, and version_specific_libs. TODO: add the other CUDA archs in the existing support matrix of x86 to SBSA build as well Pull Request resolved: https://github.com/pytorch/pytorch/pull/161257 Approved by: https://github.com/nWEIdia, https://github.com/atalman	2025-08-27 14:38:07 +00:00
Animesh Jain	3d406429b0	[dynamo][vllm] Support typing.get_type_hints (#161362 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161362 Approved by: https://github.com/Skylion007, https://github.com/StrongerXi, https://github.com/jansel	2025-08-27 09:55:31 +00:00
Shangdi Yu	9a12bab0d3	Add debug handle to inductor provenance tracking (#161110 ) Summary: Use debug handle on kernel names to distinguish different calls to the same kernel. Previous kernel name: kernel_name New kernel name: kernel_name:debug_handle We add the debug handle to the tlparse artifacts: `inductor_provenance_tracking_node_mappings` and `inductor_provenance_tracking_kernel_stack_traces`. We also add debug handles in the comments of the generated code so we can map to them in the provenance tracking highlighter tool: https://github.com/pytorch/tlparse/pull/134 Example output code is below. If a kernel doesn't have a debug handle, the `[Provenance debug handles]` comment line will not be written. ``` # Topologically Sorted Source Nodes: [y, z], Original ATen: [aten.addmm, aten.gelu] # [Provenance debug handles] triton_poi_fused_addmm_gelu_2:3 stream0 = get_raw_stream(0) triton_poi_fused_addmm_gelu_2.run(buf4, primals_5, 300, stream=stream0) ``` The debug handles will also be used by downstream profilers such as zoomer. Test Plan: ``` buck run mode/opt fbcode//caffe2/test/inductor:provenance_tracing ``` Rollback Plan: Differential Revision: D78994959 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161110 Approved by: https://github.com/angelayi	2025-08-27 04:56:11 +00:00
Manuel Candales	d51486616c	Fix index_add for int64 input + zerodim index (#161511 ) Fixes #161446 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161511 Approved by: https://github.com/malfet	2025-08-27 04:11:10 +00:00
Animesh Jain	07a4e9fea8	[benchmarks] Skip mobilenetv3_large_100 in CI for accuracy (#161570 ) To keep the CI green - https://github.com/pytorch/pytorch/issues/161419 It's unclear if this is a real failure, and debugging it is non-trivial. Skipping for now to keep the CI green. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161570 Approved by: https://github.com/BoyuanFeng, https://github.com/zou3519	2025-08-27 03:44:04 +00:00
Michael Lazos	be55d7ac9e	Revert "[Dynamo] Allow inlining into AO quantization modules (#152934 )" (#161567 ) This reverts commit 20e2ca3e29ce9eb33eef17db077696222c175764. Fixes https://github.com/pytorch/pytorch/issues/157434 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161567 Approved by: https://github.com/Lucaskabela	2025-08-27 03:33:04 +00:00
William Wen	8b78ba07b1	[dynamo, nested graph breaks] add nested graph break tests (#144516 ) Note: nested graph break tests (and wrapped tests) are xfailed/skipped for now - we will iteratively enable the tests as more of the nested graph break implementation is complete. Differential Revision: [D81084809](https://our.internmc.facebook.com/intern/diff/D81084809) Pull Request resolved: https://github.com/pytorch/pytorch/pull/144516 Approved by: https://github.com/anijain2305	2025-08-27 03:00:56 +00:00
drisspg	b36a20d368	Ensure large tensor int32 -> int64 indexing is enabled (#157767 ) Fixes: #https://github.com/pytorch/pytorch/issues/157446 I think that this delta is worth the switch form block-ptrs especially since they are deprecated ## Perf Summary A is nightly B is this diff, so `negative` means this diff improves perf TOP 5 differences <img width="805" height="754" alt="Screenshot 2025-08-24 at 5 49 49 PM" src="https://github.com/user-attachments/assets/aa359cdf-ee9a-427d-be72-1b9aef6f3115" /> <details> <summary><strong>Full perf table (click to expand)</strong></summary> \| attn_type \| dtype \| shape(B,Hq,M,Hkv,N,D) \| TFlops Version A \| TFlops Version B \| \| --- \| --- \| --- \| --- \| --- \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 258.38834144791923 \| 258.6353685004612 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 142.2192450677751 \| 140.12393320464972 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 122.32683823617003 \| 118.51603755647925 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 142.48556906165314 \| 137.24259849208627 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 86.59814488695922 \| 84.59431398586257 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 288.52679758135764 \| 292.9174195871856 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 172.25541683643277 \| 172.94326459828508 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 164.40864610599826 \| 165.035129576335 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 176.54876886433945 \| 175.08057670028145 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 125.22491679812626 \| 121.06201152859151 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 339.11952481874283 \| 339.0132835601695 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 227.58583240284406 \| 
228.21824999409597 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 185.98569659868966 \| 182.32850843255093 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 188.9495725191772 \| 180.31385312481657 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 106.25789530994302 \| 106.55084959448476 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 357.6430536888533 \| 363.30843452247274 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 262.3241154406613 \| 265.73250045488 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 249.30498953911416 \| 249.35928192833785 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 224.74126243851808 \| 223.71776504077988 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 168.26977014013707 \| 165.47991483333809 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 382.8178701785897 \| 384.34752965862685 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 308.1449710013853 \| 311.0653716044644 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 251.96365252505072 \| 243.92283557225903 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 226.69316232745368 \| 215.22769268913356 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 153.34142545296405 \| 151.9312673939401 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 396.0998000753126 \| 398.35036286102473 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 333.5198415274966 \| 344.6354466169716 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 310.5955933379696 \| 305.66347819546 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 260.4012412689896 \| 259.758666997307 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 234.13034252182635 \| 227.61676497283614 \| \| noop \| torch.bfloat16 \| 
(2, 16, 8192, 16, 8192, 64) \| 396.17615538477196 \| 401.1419104525502 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 359.98648311998414 \| 360.8285563463094 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 291.97720707257736 \| 281.41694809965253 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 250.1703628419691 \| 238.556760291579 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 199.50782826294306 \| 191.52327358439223 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 411.0632004785396 \| 413.6362648405517 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 382.9404387613185 \| 397.74886235657607 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 357.0998545146633 \| 350.5115200772392 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 281.8033924428203 \| 281.98601309215843 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 282.56595134222135 \| 277.4565795466672 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 408.89838018149516 \| 405.14531386840076 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 396.07662058160264 \| 393.4598228299578 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 317.8822887267849 \| 304.754931401036 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 265.8801304948243 \| 254.22961974295112 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 227.87390579965614 \| 222.19481980110393 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 427.36821778477025 \| 431.3766620314935 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 410.67994346825 \| 423.4666944003808 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 381.1968748374038 \| 381.77668006420424 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 
292.5540046358546 \| 296.5439130720502 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 321.04573768858114 \| 310.7423616656888 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 427.46148866769903 \| 426.162091037068 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 419.75580537687347 \| 421.88640120274334 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 337.3208051798903 \| 327.4912454675092 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 276.5638854539581 \| 262.988360558083 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 250.82791326036886 \| 245.07367032501736 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 435.8055824506086 \| 441.8803729460534 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 432.02638235921006 \| 450.33161016596273 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 402.25525939224883 \| 393.8564689669916 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 297.5337286675904 \| 297.0131881135074 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 343.8697037899545 \| 329.8194073407783 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 267.58912366821056 \| 256.91606054118375 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 150.81723692609629 \| 146.32172267858743 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 129.51029293209245 \| 122.72144394093334 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 147.627656359087 \| 141.68956350566188 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 87.55100546003591 \| 84.91293287692788 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 299.5931492743986 \| 305.884253766691 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 179.39026367843837 \| 181.64741311605096 \| \| 
alibi \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 173.93547669282367 \| 173.23972950980564 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 185.90234171599252 \| 182.80844545446686 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 128.08176696266082 \| 123.27722685662111 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 340.50674552770664 \| 338.9071088484576 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 225.4438318650432 \| 230.22899884832975 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 194.15123248528312 \| 185.02793973094865 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 200.74289714108176 \| 191.76606719670647 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 107.03564946728423 \| 106.82432377861258 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 371.31799283918406 \| 379.7555394732925 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 275.97762744310455 \| 276.71106853992995 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 261.6648679783462 \| 259.4127232060398 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 237.03108223577615 \| 233.92710216149527 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 172.13926800371152 \| 168.74390922407585 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 381.50199487767276 \| 383.9043681999597 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 307.9748883093411 \| 312.2403515462001 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 251.11319684705438 \| 243.17870127827277 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 236.3253127246763 \| 223.81250201769552 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 154.55693991756874 \| 153.11360584987685 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 
407.11400078586615 \| 413.53709886086557 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 348.1705797722622 \| 360.09771155957367 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 321.8593280850388 \| 318.2882327401255 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 270.089032013835 \| 268.767323026064 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 238.07324557907788 \| 228.09842078362692 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 399.8172853171901 \| 401.0954526332136 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 363.4387330438581 \| 364.13111024232677 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 294.1752429133857 \| 283.7235663368415 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 256.8389394007649 \| 246.91771015606483 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 199.3378564292656 \| 192.40439590901758 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 425.5150965556111 \| 430.8190098707553 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 396.00437184073013 \| 411.3873625655787 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 369.92803661607815 \| 361.43244467343663 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 293.4277354412933 \| 295.2529537595746 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 288.0208673072841 \| 281.51896404878863 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 408.3005367220567 \| 408.96116482298913 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 396.90095962766304 \| 396.87385456176486 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 319.0534576137999 \| 302.50950358107764 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 270.3334977708081 \| 258.8506349486557 \| \| document_mask \| 
torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 227.46824134365394 \| 222.23759438128766 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 438.24247309479694 \| 437.7975163205371 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 428.34012029699227 \| 433.3215899950434 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 386.52672049728875 \| 388.26216893354984 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 302.71976814728083 \| 302.3574867306459 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 327.39760662780986 \| 308.6348428844912 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 423.31308678262695 \| 426.6306972137279 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 412.6983690923106 \| 419.4961977664297 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 337.41003544742273 \| 324.2155049126126 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 278.7755890910794 \| 265.9194286636502 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 251.55678254755364 \| 244.8843180141462 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 452.5930781172308 \| 457.7117122300742 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 445.05676260348116 \| 463.9304535499636 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 415.78302138389415 \| 406.29229555271456 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 308.0311067300895 \| 304.91354721414314 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 351.43943626809335 \| 329.4476923070317 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 295.1801525813241 \| 291.36521287398904 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 183.23250549178067 \| 182.35421238887605 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 
151.56832453117747 \| 151.3422139154794 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 171.02111935180432 \| 160.72516856727913 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 74.05765122783826 \| 74.5885345035243 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 314.3587394591763 \| 319.2938677773619 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 224.57002084153177 \| 225.48868542008177 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 216.00964804143052 \| 215.39576159953486 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 216.1174237618258 \| 214.28437413525663 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 121.08920423648368 \| 119.55813661872644 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 362.2193857281911 \| 360.05005804275936 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 279.8840217430121 \| 279.5437918286659 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 227.76617121021982 \| 222.8655938229316 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 215.43141176970562 \| 207.71852284994702 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 121.35588364218539 \| 121.20636565046884 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 365.1545280898012 \| 373.37585444987326 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 304.360119952975 \| 309.1247297936263 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 287.2603904544586 \| 289.25547903162595 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 257.9852675272418 \| 257.59069234098115 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 188.35158496670232 \| 184.24683960154857 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 389.9744911369211 \| 388.43466897254166 \| \| 
causal \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 345.9228295166513 \| 342.63034895210126 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 279.56334658247437 \| 271.2724375402088 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 245.66477202810066 \| 233.49688207371258 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 170.3270720653187 \| 166.23863845657382 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 400.0041140827554 \| 402.11182445396497 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 363.64641830327434 \| 375.9288663364792 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 341.5776139573363 \| 335.1160003213424 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 281.1811770268521 \| 280.21438270014005 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 247.78716118997716 \| 245.3269825179633 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 403.794126680488 \| 405.2353919019577 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 387.079178426863 \| 385.1461762057035 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 309.7847188173431 \| 298.0443968374749 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 262.4721750159666 \| 250.81679725428586 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 205.70866004479979 \| 202.9620839129557 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 413.380982988662 \| 418.40270594263103 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 398.450064800682 \| 409.6794973994029 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 372.26297458194466 \| 364.44415106552196 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 293.0818569905912 \| 292.85172400643984 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) 
\| 296.46717085592087 \| 285.76362010612763 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 419.3186786037592 \| 426.08801580934437 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 408.1648467766632 \| 409.4122254207817 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 329.24396020457345 \| 313.5200995121138 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 274.61257504571876 \| 255.7801815432177 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 232.63806001220684 \| 230.03020843492314 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 435.0785891054788 \| 440.39101804225345 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 424.86925312752817 \| 435.18898057396825 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 393.000417896268 \| 395.11543361225256 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 297.7755459218185 \| 300.7208114715287 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 331.71570861760534 \| 318.07127352552885 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 424.58602747137405 \| 425.84897078470715 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 422.66607285025725 \| 423.5524945535485 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 344.8625760048626 \| 331.6793888458635 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 282.0787281511649 \| 263.7895634445868 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 252.7301927385177 \| 245.41844170037427 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 437.0658069164588 \| 442.9101960063628 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 433.13788271434646 \| 452.3873572709863 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 404.0959191546953 \| 
396.7077863894884 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 300.45502211883206 \| 301.3439134717943 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 344.11003202413934 \| 330.8897663350314 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 298.4364205341705 \| 291.6793556507056 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 187.6382133139633 \| 191.05409897308772 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 156.55822078636112 \| 154.178925976516 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 173.47765221825162 \| 169.30862508068464 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 74.5885345035243 \| 74.52689061607104 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 323.12233826013045 \| 328.53889207933514 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 236.75872140126316 \| 235.8378325547398 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 227.17836523816675 \| 226.75357076139966 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 224.07209453308036 \| 224.07209453308036 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 122.85572156047981 \| 121.11642183704716 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 361.3123326658092 \| 360.71014086458337 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 281.5287983927017 \| 281.94301754758345 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 232.7456696285686 \| 226.50976826432776 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 221.5612361744038 \| 214.96188822837055 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 121.38311528944315 \| 120.85441868178513 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 380.2579019244734 \| 389.2520157863988 \| \| causal \| torch.bfloat16 \| (4, 16, 
2048, 4, 2048, 128) \| 316.95230660496924 \| 317.87597790618906 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 301.07968126657323 \| 298.02424098422983 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 267.2240756921594 \| 267.16353549228154 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 189.82761622494257 \| 186.736450261963 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 389.88665375406805 \| 387.9125133037077 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 348.70619958684887 \| 346.6750499749774 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 280.5472989906087 \| 271.22300822012187 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 250.02397620165968 \| 241.22532776331445 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 171.67817496107645 \| 166.95679280483972 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 412.626880230807 \| 417.60238657950777 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 374.8829313933945 \| 389.4448546468815 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 353.20410434172436 \| 345.7072490717473 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 292.51045924209586 \| 291.66621022138287 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 251.6264062063495 \| 248.45110052911542 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 404.0155784550126 \| 401.90546837237514 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 384.4389015599863 \| 386.9684324594344 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 313.3731284132225 \| 298.17074251037894 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 264.19199737284265 \| 252.8982463999916 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 207.03696315185684 \| 202.86697323136772 \| 
\| noop \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 428.2436763312506 \| 433.45005568619536 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 411.8516531869893 \| 428.2753623461049 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 384.9095037182509 \| 372.90888743000744 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 303.2438915629836 \| 302.05095952914337 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 301.8689122735564 \| 285.0363190513223 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 423.13592231504805 \| 420.3991500185611 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 407.44527331585493 \| 408.5064370765247 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 330.50050996167414 \| 316.8763979925965 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 274.6833786307413 \| 259.86098862141324 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 232.24019584158367 \| 226.52040268160232 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 444.4596314237808 \| 455.99558915752266 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 437.4245561244369 \| 455.98275147271966 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 397.3350686877605 \| 397.88875599028063 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 308.53809114394545 \| 307.1359822042007 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 331.32379843423774 \| 316.85293191675646 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 422.4622274366379 \| 425.0407156418684 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 420.9547052783101 \| 430.33779243510276 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 345.50265346504085 \| 332.094855328957 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 4, 
32768, 64) \| 280.81715528243365 \| 264.6543640282054 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 252.25635200421783 \| 245.46235499490305 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 452.5524207341139 \| 461.7512032176736 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 445.2316469907137 \| 464.4523799578466 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 416.87264016717023 \| 409.17124592157046 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 309.42579489389846 \| 307.9734464665731 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 350.50782004300623 \| 330.98959545427294 \| </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157767 Approved by: https://github.com/Skylion007	2025-08-27 02:45:20 +00:00
PyTorch MergeBot	de58505890	Revert "[Inductor] Add DeviceAssert op to enable device-side assertion in torch.compile (#160677 )" This reverts commit cddcaa19035d6414a351be7c7b16c47d5a0c3466. Reverted https://github.com/pytorch/pytorch/pull/160677 on behalf of https://github.com/karthickai due to This is breaking tests on Rocm ([comment](https://github.com/pytorch/pytorch/pull/160677#issuecomment-3226541063))	2025-08-27 02:36:42 +00:00
atalman	6913529ff8	Move non inductor workflows to Python 3.9 -> 3.10 (#161182 ) Related to: https://github.com/pytorch/pytorch/issues/161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161182 Approved by: https://github.com/malfet, https://github.com/huydhn, https://github.com/seemethere	2025-08-27 02:32:24 +00:00
Gabriel Ferns	4b4cdcfe3a	Fix conv exhaustive autotuning and expand Exhaustive test coverage (#159387 ) - Fix Conv exhaustive. - Fix AMD config pruning. - Expand exhaustive test suite. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159387 Approved by: https://github.com/coconutruben	2025-08-27 01:54:50 +00:00
Ke Wen	68d395d61e	[3/N][SymmMem] Expose offset field from handle (#161532 ) As titled, so that kernels relying on direct pointers can use base address and `hdl.offset` to access remote memory. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161532 Approved by: https://github.com/ngimel ghstack dependencies: #161470, #161471	2025-08-27 00:49:06 +00:00
Ke Wen	4ed71d5412	[2/N][SymmMem] Add MemPool allocator and tests (#161471 ) (Porting most of #161008) Hooking SymmetricMemory Allocator to MemPool so that user can create symmetric tensors with regular `torch.zeros`, `torch.arange` etc factories. Also so that our ops can have functional variants that create `out` tensors on symmetric memory. To end users, this PR supports a python UI as follows: ``` allocator = symm_mem.get_mempool_allocator(device) mempool = torch.cuda.MemPool(allocator) with torch.cuda.use_mem_pool(mempool): tensor = torch.arange(numel, dtype=dtype, device=device) ``` Added tests for both use cases above. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161471 Approved by: https://github.com/ngimel ghstack dependencies: #161470	2025-08-27 00:49:06 +00:00
Ke Wen	8dd5aa9689	[1/N][SymmMem] Add offset to handle, cache on base address (#161470 ) For the kernels that need peer pointers directly, the rendezvous handle should allow user to get the offset of tensor wrt to base allocation address. Thus the need to add an `offset` field to SymmMem handle. But we don't want to cache all the handles just bc they have different offsets, hence the search and cache logic below: (i) At rendezvous, the search key is still `x.storage().data_ptr()`, like now, but it should do search in 2 parts - one is just dictionary lookup, like today, if that failed, it needs to search `allocations_` to see if the storage ptr falls in one of the segments. This is possible as we have all segments recorded during alloc. (ii) If this segment hasn't been rendezvoused, we rendezvous it, cache it in the `symm_mem_` map with its base address as key. (iii) We still need to return a handle for the current tensor, with a corresponding offset. This handle will be a shallow copy of the base handle, with the offset adjusted. Some impl details: (i.1) If we find a matching allocation, we can immediately use the allocation base address to do a re-search in `symm_mem_`. (iii.1) To make the handle copy shallow, we move the common information -- base ptrs, base signal pad, etc -- to a structure referenced by both handles. The structure is called `NVSHMEMPeerAllocInfo`. A copy of handle just adds one more `intrusive_ptr` to it. The handle copy constructor accepts an `offset` argument. Test: Existing tests should not fail. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161470 Approved by: https://github.com/ngimel	2025-08-27 00:49:06 +00:00
Angela Yi	8ff9485815	[export] Update unflattening dynamo.disable (#161306 ) Summary: Doing inline disabling causes recompiles with the reason "Cache line invalidated because L['___stack0'] got deallocated" Test Plan: CI Rollback Plan: Differential Revision: D80816956 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161306 Approved by: https://github.com/pianpwk	2025-08-27 00:27:16 +00:00
William Wen	b074cbaedd	[dynamo] allow resume functions to have name in both freevars and varnames (#161544 ) fixes https://github.com/pytorch/pytorch/issues/161542 Differential Revision: [D81073109](https://our.internmc.facebook.com/intern/diff/D81073109) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161544 Approved by: https://github.com/StrongerXi, https://github.com/anijain2305	2025-08-27 00:25:16 +00:00
Scott Wolchok	80bf883d21	Replace manual cache in _python_dispatch.get_alias_info with functools.cache (#161286 ) In addition to being more code, the manual cache was doing an extra dictionary lookup on each cache hit. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161286 Approved by: https://github.com/wconstab	2025-08-27 00:17:51 +00:00
Blaine Burton Rister	9de9d25f8d	[Inductor-FX] Support custom triton kernels (#161474 ) # Feature Add support for custom Triton kernels to the FX backend. This turned out not to require any new features, except for a minor change to handle `tl.constexpr` arguments which are not part of the autotuning config. # Caveat This may not cover every possible case. For example, we might need more features for autotuning custom Triton code. This PR entirely skips the [custom codegen ](https://github.com/pytorch/pytorch/blob/main/torch/_higher_order_ops/triton_kernel_wrap.py#L1034-L1039) for user-defined grid functions, but there may be edge cases requiring this logic. However, this PR seems to do a reasonable job as many of the grids end up being written into Inductor/Triton metadata and don't require special codegen. As a follow up, I'm planning to test this against all of AOTI's custom Triton kernel tests. # Test plan Added a CI test using a custom Triton kernel. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161474 Approved by: https://github.com/angelayi	2025-08-27 00:15:19 +00:00
Malay Bag	dbc903a94a	[APS IR] Minor fix - use GetAttrKey in get_keystr to match with flat args path in unflatten (#161453 ) Summary: While passing path info to [_check_input_constraints_for_graph](https://www.internalfb.com/code/fbsource/[6b5b2dc35902a26ce265e3c0ae5189a3faba1d38]/fbcode/caffe2/torch/export/unflatten.py?lines=594), GetAttrKey is used to specify path str. To match that, get_keystr should also use GetAttrKey. Test Plan: Existing tests ``` buck run mode/opt caffe2/test:test_export -- -r unflatten ``` ``` Ran 413 tests in 204.533s OK (skipped=1, expected failures=13) ``` Rollback Plan: Differential Revision: D80984083 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161453 Approved by: https://github.com/tugsbayasgalan	2025-08-27 00:05:20 +00:00
PyTorch MergeBot	1b34e04485	Revert "Update pybind11 submodule to 3.0.1 (#160754 )" This reverts commit 660b0b8128181d11165176ea3f979fa899f24db1. Reverted https://github.com/pytorch/pytorch/pull/160754 on behalf of https://github.com/atalman due to please see https://github.com/pytorch/pytorch/pull/160754#issuecomment-3226051449 ([comment](https://github.com/pytorch/pytorch/pull/160754#issuecomment-3226078102))	2025-08-26 23:35:22 +00:00
PyTorch MergeBot	1ce423274d	Revert "[cpp_wrapper] Swap to new PyBind11 simple GIL header (#161063 )" This reverts commit 74c4c758afa8c28162f00a456c185552e1159fd3. Reverted https://github.com/pytorch/pytorch/pull/161063 on behalf of https://github.com/atalman due to sorry broke vllm tests please see https://github.com/pytorch/pytorch/pull/160754#issuecomment-3226051449 ([comment](https://github.com/pytorch/pytorch/pull/161063#issuecomment-3226065212))	2025-08-26 23:31:23 +00:00
PyTorch MergeBot	4e630f0629	Revert "[Inductor] Update Outer Reduction Heuristic (#159093 )" This reverts commit ca9fe0107e165a4a4147325ff6d34235ebde447f. Reverted https://github.com/pytorch/pytorch/pull/159093 on behalf of https://github.com/PaulZhang12 due to Addressing internal implications then relanding ([comment](https://github.com/pytorch/pytorch/pull/159093#issuecomment-3225942525))	2025-08-26 22:37:56 +00:00
Karthick Panner Selvam	cddcaa1903	[Inductor] Add DeviceAssert op to enable device-side assertion in torch.compile (#160677 ) This PR introduces a device_assert op to trigger device-side assertions within torch.compile. This implementation is based on the suggestion in [this comment](https://github.com/pytorch/pytorch/issues/147282#issuecomment-2756056084). Changes Included - Implemented device_assert op and overrides has_side_effect to return True to avoid removal by dead code elimination. - Commented out the assert_async_msg_decomp and functional_assert_async_msg_decomp decompositions to disable the default assert decomposition inside Inductor. - Added lowering for torch.ops.aten._assert_async.msg to convert assert calls into the ops_handler. - Implemented the codegen method for the device_assert op. This supports generating C++ and Triton code. - Added test cases to verify both "should throw" and "should not throw" scenarios. Fixes #147282 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160677 Approved by: https://github.com/mlazos	2025-08-26 22:33:23 +00:00
soulitzer	1e4dfeeb06	Add early_stop kwarg to torch.utils.checkpoint (#160781 ) We already have a context manager "set_checkpoint_early_stop". This PR adds a kwarg that toggles the same setting. It is also useful to have a kwarg version of the setting in addition to the context manager because it is annoying to apply a context manager when the AC is being applied via CheckpointWrapper. Similar to the "debug" kwarg and the corresponding "set_checkpoint_debug_enabled" context manager, the context manager defaults to None and overrides the local setting when non-None. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160781 Approved by: https://github.com/tianyu-l	2025-08-26 22:32:35 +00:00
angelayi	4d078cfc4e	[fx] Add is_fx_symbolic_tracing flag (#161385 ) Fixes https://github.com/pytorch/pytorch/issues/135276 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161385 Approved by: https://github.com/pianpwk	2025-08-26 22:26:27 +00:00
Ti-Tai Wang	da838f65af	[ONNX] Drop draft_export in exporter API (#161454 ) If the ONNX exporter falls back to draft_export with big models, it takes forever for users and possibly spams the printout, which keeps users from seeing their stack trace with strict=False. We could consider making another API for draft_export as a debugging tool, or combining it with report=True when "model is small"? Pull Request resolved: https://github.com/pytorch/pytorch/pull/161454 Approved by: https://github.com/justinchuby	2025-08-26 22:13:43 +00:00
gaoyufeng	cde54fe4e9	fix-unpin-memory-tensor-param (#160992 ) Fixes #160983 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160992 Approved by: https://github.com/ngimel	2025-08-26 21:55:25 +00:00
soulitzer	e06d1d6610	[BE] Improve torch.inference_mode docs and error message (#161164 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161164 Approved by: https://github.com/sfc-gh-sbekman, https://github.com/janeyx99	2025-08-26 20:58:56 +00:00
Hashem Hashemi	b2db293abc	[ROCm] No-fence global reduce (#161180 ) This change removes need for fences in global_reduce by converting the stores to reduce_buffer[] into atomics+return. This is crucial for perf in architectures with split caches (e.g. MI300), where fences are inherently costly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161180 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-26 20:43:59 +00:00
PyTorch MergeBot	6686974ddd	Revert "[dynamo, nested graph breaks] add nested graph break tests (#144516 )" This reverts commit 9a756c2d710a0680bac93ab0b42db519ec2dc6cf. Reverted https://github.com/pytorch/pytorch/pull/144516 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/144516#issuecomment-3225659358))	2025-08-26 20:40:17 +00:00
eqy	3d82256a86	[FP8][cuBLAS][SM100] cuBLAS doesn't support rowwise-scaling on `sm110` or `sm120` either (#161236 ) See also #160693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161236 Approved by: https://github.com/Skylion007	2025-08-26 20:40:11 +00:00
PyTorch MergeBot	a4fb65701b	Revert "[dynamo, nested graph breaks] support very simple nested graph breaks (#159329 )" This reverts commit 8dab6d4c414bf997297804008c3da893e69cd51f. Reverted https://github.com/pytorch/pytorch/pull/159329 on behalf of https://github.com/atalman due to failing internally ([comment](https://github.com/pytorch/pytorch/pull/159329#issuecomment-3225617445))	2025-08-26 20:24:10 +00:00
PyTorch MergeBot	6afd766401	Revert "[dynamo, nested graph breaks] support nested graph breaks x context managers (#159678 )" This reverts commit 02fa5bf6d80fa4baa6bb6dd2fa6a16d88852da91. Reverted https://github.com/pytorch/pytorch/pull/159678 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/159678#issuecomment-3225597425))	2025-08-26 20:16:36 +00:00
PyTorch MergeBot	a7aa480e55	Revert "[dynamo, nested graph breaks] support nested closures (#159817 )" This reverts commit ef0ef6f93f7ef6d16d71a6997b72185504acd4b6. Reverted https://github.com/pytorch/pytorch/pull/159817 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/159817#issuecomment-3225586996))	2025-08-26 20:13:33 +00:00
PyTorch MergeBot	9f6e1b8730	Revert "[ROCm] SDPA fix mem fault when dropout is enabled (#154864 )" This reverts commit 3caddd4daa5b1a167663c07219e065e86247ad76. Reverted https://github.com/pytorch/pytorch/pull/154864 on behalf of https://github.com/atalman due to reverted internally ([comment](https://github.com/pytorch/pytorch/pull/154864#issuecomment-3225554119))	2025-08-26 20:03:59 +00:00
PyTorch MergeBot	caf98fde0d	Revert "[dynamo, nested graph breaks] clean up comments and codegen (#160138 )" This reverts commit ac6316caaa74513cbcf3c7f9269bc23cd74749db. Reverted https://github.com/pytorch/pytorch/pull/160138 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/160138#issuecomment-3225546707))	2025-08-26 20:01:26 +00:00
PyTorch MergeBot	46576f5a16	Revert "[dynamo, nested graph breaks] prevent excessive recompilations (#159786 )" This reverts commit 67d31f6b281d3b15b205756fc7ebc450cdde1dab. Reverted https://github.com/pytorch/pytorch/pull/159786 on behalf of https://github.com/atalman due to failing internal tests ([comment](https://github.com/pytorch/pytorch/pull/159786#issuecomment-3225535752))	2025-08-26 19:54:22 +00:00
Charlie West-Taylor	77bc959fe1	Add inductor backend to device interface; make minifier_tests more device agnostic (#151314 ) Tried to decouple the always cpu <=> c++, cuda <=> triton assumption. Tried to keep it relatively simple by just guarding things more specifically, at the moment. Pull Request resolved: https://github.com/pytorch/pytorch/pull/151314 Approved by: https://github.com/eellison	2025-08-26 19:40:37 +00:00
Jeff Daily	262640fd22	[ROCm][CI] restore test_flex_attention tests (#161519 ) Reverts #161450 and targets specific subtests to skip on MI200. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161519 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-26 19:31:30 +00:00
Zhengxu Chen	74124d1b46	[reland] [dynamo] Refactor convert_frame.compile_frame to be self contained function. [5/n] (#161514 ) Summary: convert_frame.compile_frame used to take a callback transform function which will capture the frame object it has, but the frame information is not passed directly into compile_frame function. This PR changes the signature of compile_frame so that frame information is directly passed in the function without taking a callback. This makes it easier to build fullgraph capture API on top of compile_frame. Test Plan: CI Rollback Plan: Differential Revision: D81041296 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161514 Approved by: https://github.com/tugsbayasgalan	2025-08-26 19:16:05 +00:00
Joshua Su	a03cc53e6f	Back out "Refactor CUDAAllocatorConfig to reuse AcceleratorAllocatorConfig (#150312 )" (#161002 ) Summary: reverting this diff since it caused S551328. Please see D80217492 for details. Test Plan: NA Rollback Plan: Differential Revision: D80553588 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161002 Approved by: https://github.com/jingsh, https://github.com/izaitsevfb	2025-08-26 19:04:13 +00:00
Yidi Wu	00efeabc29	[hop] make materialize_as_graph disable pre-existing dispatch modes (#161220 ) For materializing_as_subgraph, we just want to trace a graph. The handling of different modes should register their own logic. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161220 Approved by: https://github.com/Lucaskabela	2025-08-26 18:52:38 +00:00
Arsh Zahed	d4703fb91c	[dtensor] Add propagate_tensor_meta function that skips cache if _are_we_tracing (#161334 ) Fixes an issue where the log softmax handler checked the tensor metadata cache without checking for tracing or symints. Probably best to merge this after #160798, but not strictly blocking. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161334 Approved by: https://github.com/xmfan	2025-08-26 18:46:58 +00:00
Tom Ritchford	cd87f30295	DOC: Clarify documentation for torch.matmul and fix a typo (#161424 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161424 Approved by: https://github.com/AlannaBurke	2025-08-26 18:30:57 +00:00
Lucas Kabela	f0e0a6897e	type misc init and tools for dynamo (#161293 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/161293 Approved by: https://github.com/anijain2305	2025-08-26 17:38:49 +00:00
vishalgoyal316	d2bd55d8de	Typo correction in variable name inital_grad of Class TestFullyShardG… (#161501 ) Typo correction in variable name inital_grad of Class TestFullyShardGradientScaler implementation. Fixes #161480 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161501 Approved by: https://github.com/soulitzer	2025-08-26 17:16:42 +00:00
Yidi Wu	6598f00c18	[dynamo] auto lift unbacked symbol in tensor's storage_offset (#161199 ) ```python import torch torch._dynamo.config.capture_scalar_outputs = True class M(torch.nn.Module): def forward(self, idx, x): u0 = idx.item() x0 = x.select(0, u0) def fn(): return x0.sin() return torch.cond(x0.sum() > 0, fn, fn) m = M() out = torch.compile(m, fullgraph=True)(torch.tensor(0, dtype=torch.int64, device="cuda"), torch.randn(3, 3, device="cuda")) print(out) ``` Before the PR, we didn't track the storage_offset symbol of a tensor. After https://github.com/pytorch/pytorch/pull/157605, we create an unbacked_symint for storage_offset for the result of select. So when we try to lift the free basic symbols of x0 while speculating fn, we find a free symbol that's not bound to a proxy. This PR tracks the symbols of storage_offset and associates them with a proxy using torch.ops.aten.storage_offset. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161199 Approved by: https://github.com/zou3519 ghstack dependencies: #161198	2025-08-26 17:06:54 +00:00
Yidi Wu	ba6ce66698	[dynamo] lift backed symint output of item() (#161198 ) Before the change in this PR, we have an error for the following code ```python import torch torch._dynamo.config.capture_scalar_outputs = True class M(torch.nn.Module): def forward(self, idx, x): u0 = idx.item() x0 = x.select(0, u0) def fn(): return x0.sin() return torch.cond(x0.sum() > 0, fn, fn) m = M() out = torch.compile(m, fullgraph=True)(torch.tensor(0, dtype=torch.int64), torch.randn(3, 3)) ``` The error occurs when we speculate fn and try to lift the symbol of x0.storage_offset() but find that the symbol doesn't have a source associated with it. What really happens is that, when the input tensor is a scalar tensor of int type and resides on CPU, we have a shortcut that creates a normal symint when .item() is called; see https://github.com/pytorch/pytorch/pull/126245. However, previously, we only tracked the unbacked symint output of an operation because we believed all backed symints must have a source associated with them and have already been lifted as inputs at the top level. Now this invariant no longer holds, so we end up with an error saying the symbol doesn't have a source (because only inputs and symbols derived from inputs have a source, and the result of .item() doesn't have a source). In this PR, we start to also track the normal symint with the proxy that created it (i.e. in this case the proxy .item()). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161198 Approved by: https://github.com/zou3519	2025-08-26 17:06:54 +00:00
PaulZhang12	ca9fe0107e	[Inductor] Update Outer Reduction Heuristic (#159093 ) Update outer reduction heuristics for significant speedups. HuggingFace: <img width="572" height="705" alt="Screenshot 2025-08-20 at 12 44 51 AM" src="https://github.com/user-attachments/assets/4872a23b-d136-423a-b2e6-187895bccba1" /> Average ~20% speedup on a kernel by kernel basis TorchBench: <img width="572" height="705" alt="Screenshot 2025-08-20 at 12 45 10 AM" src="https://github.com/user-attachments/assets/b8357b6d-6107-4104-b906-292a17d14d48" /> Average ~40% speedup on a kernel by kernel basis <img width="1705" height="729" alt="Screenshot 2025-08-21 at 5 50 32 PM" src="https://github.com/user-attachments/assets/a9715a2b-9e6c-4b33-ba9f-7870dc561e31" /> Differential Revision: [D80835998](https://our.internmc.facebook.com/intern/diff/D80835998) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159093 Approved by: https://github.com/jansel	2025-08-26 16:12:07 +00:00
AmdSampsa	f9df4ec2af	SDPA skip logic for ROCm (#160522 ) Skips some test for flex and eff attention if they are not supported by the hardware Pull Request resolved: https://github.com/pytorch/pytorch/pull/160522 Approved by: https://github.com/drisspg, https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-26 15:51:07 +00:00
Catherine Lee	a72803f1e3	[ez][CI] Give the linux check job a name that isn't linux-job (#161413 ) Reason: The default name is linux-job, which gets put in the linux category on HUD, but this isn't really a linux-related job. Renaming it like this will make it go into the "other" category on HUD. Other options: Change the grouping code in test-infra Pull Request resolved: https://github.com/pytorch/pytorch/pull/161413 Approved by: https://github.com/huydhn, https://github.com/seemethere	2025-08-26 15:18:35 +00:00
Jeff Daily	10e67f5ec3	forward fix #161102 (#161465 ) PR #161102 caused tf32 to be the default precision for flex attention. This PR forward-fixes the broken logic and restores ROCm MI200 CI flex attention test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161465 Approved by: https://github.com/jeffdaily, https://github.com/eqy Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-26 15:11:54 +00:00
PyTorch MergeBot	818ba434c7	Revert "Ensure large tensor int32 -> int64 indexing is enabled (#157767 )" This reverts commit fc69c2bc67672c3b2d0c62c1821895f09288f1c0. Reverted https://github.com/pytorch/pytorch/pull/157767 on behalf of https://github.com/atalman due to internal failure, sorry will revert ([comment](https://github.com/pytorch/pytorch/pull/157767#issuecomment-3224341111))	2025-08-26 14:12:06 +00:00
Ting Lu	ae8d319fd4	Update NVSHMEM to 3.3.24 and fix download link (#161321 ) https://github.com/pytorch/pytorch/issues/159779 Update NVSHMEM 3.3.24 for [PyTorch CUDA13 Binary Cannot Be Built with SM_75 with NVSHMEM](https://github.com/pytorch/pytorch/issues/160980) Enabled back sm_75 for NVSHMEM Fixed the NVSHMEM download link for the issue with 3.3.20 download in issue - [[CD] nvshem-3.3.9 wheels for aarch64 is not manylinux2_28 compliant](https://github.com/pytorch/pytorch/issues/160425) Todo: Should also enable back build ARM with NVSHMEM since it is compatible with manylinux2_28 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161321 Approved by: https://github.com/Skylion007, https://github.com/atalman	2025-08-26 13:26:18 +00:00
PyTorch MergeBot	e795450a35	Revert "[dynamo] Refactor convert_frame.compile_frame to be self contained function. [5/n] (#160900 )" This reverts commit 447d34b5f80fb7350f79decd855cb599cab39083. Reverted https://github.com/pytorch/pytorch/pull/160900 on behalf of https://github.com/atalman due to reverting since can't land existing diff internally, will need to reland it ([comment](https://github.com/pytorch/pytorch/pull/160900#issuecomment-3224029031))	2025-08-26 12:45:59 +00:00
David Berard	8c506e6310	[easy][test] Add repeat_interleave opinfo that exercises binary search fusion (#161445 ) This adds a configuration that would have caught the need for https://github.com/pytorch/pytorch/pull/159961 when https://github.com/pytorch/pytorch/pull/158462 was landed. Notably: * the test has output_size kwarg specified * the input is 1D plus a size-1 dimension (otherwise, if there are non-size-1 dimensions, then the fusion won't occur) Differential Revision: [D80981715](https://our.internmc.facebook.com/intern/diff/D80981715) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161445 Approved by: https://github.com/eellison, https://github.com/v0i0	2025-08-26 12:32:24 +00:00
PyTorch MergeBot	4a1aca11c2	Revert "[inductor] structured-log graph execution order + test (#160448 )" This reverts commit 995397d47a0e27394ee1010f158e181eb304100a. Reverted https://github.com/pytorch/pytorch/pull/160448 on behalf of https://github.com/atalman due to internal failure please see associated diff ([comment](https://github.com/pytorch/pytorch/pull/160448#issuecomment-3223939035))	2025-08-26 12:20:37 +00:00
Chuanhao Zhuge	e9d42b3880	[small][muon] Use addmm for Newton–Schulz orthogonalization (#161379 ) A performance optimization. Using `torch.addmm`, which fuses `matrix multiply + scale + add` into one op. Benchmark In a QWEN-like 0.5B model training we observed average `optimizer.step()` latency speedup: matmul ~44.5 ms -> addmm ~27.4 ms: a 1.62× speedup. matmul <img width="1403" height="600" alt="Screenshot 2025-08-24 at 3 15 37 PM" src="https://github.com/user-attachments/assets/a77a68d4-da3c-473a-97f0-e6ef0a3b46d9" /> addmm <img width="1426" height="602" alt="Screenshot 2025-08-24 at 3 13 42 PM" src="https://github.com/user-attachments/assets/e493af36-44d3-4026-9f7c-fd0f9cdbc7e5" /> Testing End-to-end training: We used a training script that pre-trains a QWEN-like model on `openwebtext-100k` dataset. We trained for one epoch and the resulting loss curves show consistency between normal matmul and addmm. <img width="1035" height="434" alt="Screenshot 2025-08-24 at 2 56 21 PM" src="https://github.com/user-attachments/assets/b96b13e3-0a01-4908-853c-d917b41f3d75" /> Unit test: ```python # dummy model and data model0 = Linear(10, 10, bias=False) model1 = copy.deepcopy(model0) inputs = torch.randn(8, 10) targets = torch.randn(8, 10) loss = MSELoss() lr = 1e-3 wd = 0.1 momentum = 0.95 opt_ref_muon = Muon( params=model0.parameters(), lr=lr, weight_decay=wd, momentum=momentum, nesterov=nesterov, adjust_lr_fn="original", ) opt_exp_muon = Muon( params=model1.parameters(), lr=lr, weight_decay=wd, momentum=momentum, nesterov=nesterov, adjust_lr_fn="original", use_addmm=True, ) out_ref = model0(inputs) loss_ref = loss(out_ref, targets) opt_ref_muon.zero_grad() loss_ref.backward() opt_ref_muon.step() out_exp = model1(inputs) loss_exp = loss(out_exp, targets) opt_exp_muon.zero_grad() loss_exp.backward() opt_exp_muon.step() for p_ref, p_exp in zip(model0.parameters(), model1.parameters()): torch.testing.assert_close(p_ref, p_exp) ``` shows numeric difference, but this is expected 
on bf16 precision: ``` Mismatched elements: 96 / 100 (96.0%) Greatest absolute difference: 8.985400199890137e-05 at index (1, 9) (up to 1e-06 allowed) Greatest relative difference: 0.007370449136942625 at index (0, 6) (up to 1e-05 allowed) ``` ~~Introduced a flag that allows users to opt in, as there are numerical differences relative to the original implementation.~~ Update: since `addmm` fuses the math ops, there are fewer intermediate roundings and is therefore more numerically accurate compared to the original form. Based on this, we opt to make `addmm` the default and only option. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161379 Approved by: https://github.com/janeyx99	2025-08-26 09:17:28 +00:00
Tsung-Hsien Lee	8cfc119491	[pytorch] Simplify codes using `std::all_of()` for `_check_tensors_share_device_and_dtype()` (#161411 ) Summary: These two nested loops of checks could be simplified with `std::all_of()` to make it more compact. Test Plan: OSS CI & tests Rollback Plan: Differential Revision: D80946082 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161411 Approved by: https://github.com/Skylion007, https://github.com/cyyever	2025-08-26 08:56:24 +00:00
Tsung-Hsien Lee	e7e270a33a	[pytorch] Merge two nested if statement checks into one (#161387 ) Summary: This reduces the code indentation level by one. Test Plan: OSS CI & tests Rollback Plan: Differential Revision: D80915357 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161387 Approved by: https://github.com/janeyx99	2025-08-26 08:45:36 +00:00
Nikhil Patel	6aef9f3a69	[Inductor][Tritonparse] Call `jit_post_compile_hook` within Inductor Triton Kernel compile path (#161443 ) Summary: Since Inductor skips JIT compilation for Triton kernels, we need to manually invoke `knobs.runtime.jit_post_compile_hook` if one exists. Here, we do this to enable Tritonparse to extract launch metadata from Inductor launched kernels. We can control whether or not Inductor will run the hook with a new `TORCHINDUCTOR_RUN_JIT_POST_COMPILE_HOOK=1 ` config variable. Reviewed By: davidberard98 Differential Revision: D80624932 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161443 Approved by: https://github.com/FindHao	2025-08-26 06:24:42 +00:00
Xilun Wu	7376111d59	[BE] fix compute_global_tensor_shape test (#161441 ) Fixes #161154 Test `pytest test/distributed/tensor/test_utils.py -s -k test_compute_global_tensor_shape_1D` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161441 Approved by: https://github.com/kwen2501	2025-08-26 03:22:29 +00:00
PyTorch MergeBot	92ab184824	Revert "[Inductor] Prune configs that require more shared memory than the hardware limit (#161040 )" This reverts commit b2e06e0194c3fa8f7578a1b48751cc027394fb67. Reverted https://github.com/pytorch/pytorch/pull/161040 on behalf of https://github.com/jeffdaily due to still failing on rocm, see https://hud.pytorch.org/failure?name=rocm%20%2F%20linux-jammy-rocm-py3.10%20%2F%20test%20(default%2C%203%2C%206%2C%20linux.rocm.gpu.2)&jobName=undefined&failureCaptures=inductor%2Ftest_triton_heuristics.py%3A%3ATestTritonHeuristics%3A%3Atest_prune_configs_over_shared_memory_limit_do_pruning_True ([comment](https://github.com/pytorch/pytorch/pull/161040#issuecomment-3222430129))	2025-08-26 03:15:32 +00:00
Zesheng Zong	8c442e4fd3	Fix LBFGS warning convert a tensor with requires_grad=True to a scalar (#160389 ) Fixes #160197 ## Test Result ```python In [1]: import warnings ...: warnings.simplefilter('error') ...: import torch ...: print(torch.__version__) ...: a, b = torch.rand((2, 32, 32)) ...: a.requires_grad_() ...: optimizer = torch.optim.LBFGS([a]) ...: loss_fn = lambda x, y: (x-y).pow(2).mean() ...: ...: def closure(): ...: optimizer.zero_grad() ...: loss = loss_fn(a, b) ...: loss.backward() ...: return loss ...: ...: for i in range(100): ...: optimizer.step(closure) ...: print(i, loss_fn(a, b)) ...: 2.9.0a0+gitf33f3f8 0 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 1 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 2 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 3 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 4 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 5 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 6 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 7 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 8 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 9 tensor(5.8066e-11, grad_fn=<MeanBackward0>) 10 tensor(5.8066e-11, grad_fn=<MeanBackward0>) ... ``` ```bash pytest test/test_optim.py -vv ... 
test/test_optim.py::TestOptimRenewedCUDA::test_tensor_lr_num_dim_2_NAdam_cuda_float32 PASSED [2.7192s] [ 99%] test/test_optim.py::TestOptimRenewedCUDA::test_tensor_lr_num_dim_2_RAdam_cuda_float32 PASSED [2.5370s] [ 99%] test/test_optim.py::TestOptimRenewedCUDA::test_tensor_lr_num_dim_2_RMSprop_cuda_float32 PASSED [2.0190s] [ 99%] test/test_optim.py::TestOptimRenewedCUDA::test_tensor_lr_num_dim_2_Rprop_cuda_float32 PASSED [1.8554s] [ 99%] test/test_optim.py::TestOptimRenewedCUDA::test_tensor_lr_num_dim_2_SGD_cuda_float32 PASSED [2.0433s] [ 99%] test/test_optim.py::TestOptimRenewedCUDA::test_tensor_lr_num_dim_2_SparseAdam_cuda_float32 PASSED [1.1788s] [100%] ================== 1471 passed, 242 skipped in 2440.52s (0:40:40) ============ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160389 Approved by: https://github.com/janeyx99 Co-authored-by: albanD <desmaison.alban@gmail.com>	2025-08-26 03:07:47 +00:00
angelayi	e34b6a0103	Add meta for add.Scalar (#161332 ) Fixes https://github.com/pytorch/pytorch/issues/161076 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161332 Approved by: https://github.com/Skylion007	2025-08-26 02:26:51 +00:00
RajeshvShiyal	f795e92802	space added between type and checking for typechecking (#161352 ) space added between type and checking for "typechecking" Fixes #161282 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161352 Approved by: https://github.com/malfet	2025-08-26 02:07:33 +00:00
Huy Do	becd6cd744	Increase timeout value when pushing to ghcr.io (#161444 ) Seeing this timing out a lot in trunk now https://github.com/pytorch/pytorch/actions/runs/17165552358/job/48705069047. The benchmark image is the largest one we have on CI, so it's probably over the 30-minute limit. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161444 Approved by: https://github.com/atalman	2025-08-26 01:51:16 +00:00
FFFrog	ec21cafd85	[OpenReg] Refactor and Optimize the OpenReg for Preparation of Docs (#159640 ) As the title stated. Changes: - Fixed a bug where abs_stub could not be triggered - Refactor registration to prepare for documentation - Add meta, fallback for openreg Pull Request resolved: https://github.com/pytorch/pytorch/pull/159640 Approved by: https://github.com/albanD	2025-08-26 01:44:21 +00:00
PyTorch MergeBot	908b0ccb1f	Revert "Increase timeout value when pushing to ghcr.io (#161444 )" This reverts commit b9e9e92817fd7d1a778f074105603efb07e05004. Reverted https://github.com/pytorch/pytorch/pull/161444 on behalf of https://github.com/huydhn due to Reland this to generate a different hash value for the benchmark Docker image ([comment](https://github.com/pytorch/pytorch/pull/161444#issuecomment-3222257119))	2025-08-26 01:41:59 +00:00
amdfaa	85adf80cf1	Disable inductor/test_flex_attention.py (#161450 ) Currently inductor/test_flex_attention.py is causing rocm pytorch mi250 shard 1 to go over the timeout limit. This PR is for disabling that test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161450 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-26 01:28:51 +00:00
Benjamin Glass	74c4c758af	[cpp_wrapper] Swap to new PyBind11 simple GIL header (#161063 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161063 Approved by: https://github.com/Skylion007 ghstack dependencies: #160754	2025-08-26 01:21:18 +00:00
Benjamin Glass	660b0b8128	Update pybind11 submodule to 3.0.1 (#160754 ) Upgrade to PyBind11 v3. This allows us to strip out our own (possibly broken?) handling of the C++ ABI when building extensions, in favor of the more-complete PyBind11 internal handling. Fixes a few test failures due to https://github.com/pybind/pybind11/issues/5774, which effectively makes the `__qualname__` attribute of functions platform-dependent. Test plan: CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/160754 Approved by: https://github.com/Skylion007	2025-08-26 01:21:18 +00:00
Yiming Zhou	089ad1d88b	[1/n][export] Refactor PT2 Archive weight saving and loading (#160394 ) Summary: We split the refactoring in two parts for forward compatibility concerns First, we land the deserialization (loading part) Then, we land the serialization (saving part) Save weights and constants as individual files in PT2 archive. Each weight/constant will be saved as raw bytes, unless it is a custom object (TorchBind object) or a non-fake tensor subclass, for these two special cases we still save them using pickle. The metadata of saved tensors along with the file name will be saved as `PayloadMeta`. The mapping from FQN to `PayloadMeta` will be saved as `PayloadConfig` under `WEIGHTS_CONFIG_FORMAT` and `CONTANTS_CONFIG_FORMAT` This changes the serialization in python side when calling `torch.export.save()`. For deserialization in python `torch.export.load()`, we make it BC-safe by allowing loading legacy format weights/constants. For deserialization in C++ `torch/nativert/ModelRunner.cpp`, we make this a BC breaking change as currently the OSS ModelRunner API is not being used. The file structure ``` ├── archive_format ├── archive_version ├── byteorder ├── .data │ ├── serialization_id │ └── version ├── data │ ├── sample_inputs │ │ └── model.pt │ ├── constants │ │ ├── tensor_0 │ │ ├── tensor_1 │ │ └── model_constants_config.json │ └── weights │ ├── weight_0 │ ├── weight_1 │ ├── weight_2 │ ├── weight_3 │ └── model_weights_config.json └── models └── model.json ``` Test Plan: CI Rollback Plan: Differential Revision: D80035490 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160394 Approved by: https://github.com/SherlockNoMad	2025-08-26 01:15:42 +00:00
William Wen	67d31f6b28	[dynamo, nested graph breaks] prevent excessive recompilations (#159786 ) Nested continuation function code objects are now unique w.r.t. stack trace below (and including) the current code object. Without this change, e.g. in the added test, `f3` would be recompiled on the second graph break. Followup: we can skip guards on continuation functions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159786 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971, #159281, #144516, #159329, #159678, #159817, #160138	2025-08-26 00:58:38 +00:00
William Wen	ac6316caaa	[dynamo, nested graph breaks] clean up comments and codegen (#160138 ) Fix comments to reflect that we no longer codegen cells to be sent to resume function as inputs - they are instead codegen'd after the unsupported instruction in order to build resume functions that are closures. Also simplify some codegen. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160138 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971, #159281, #144516, #159329, #159678, #159817	2025-08-26 00:58:38 +00:00
William Wen	ef0ef6f93f	[dynamo, nested graph breaks] support nested closures (#159817 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159817 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971, #159281, #144516, #159329, #159678	2025-08-26 00:58:28 +00:00
William Wen	02fa5bf6d8	[dynamo, nested graph breaks] support nested graph breaks x context managers (#159678 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159678 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971, #159281, #144516, #159329	2025-08-26 00:58:18 +00:00
William Wen	8dab6d4c41	[dynamo, nested graph breaks] support very simple nested graph breaks (#159329 ) e.g. this graph breaks once now: ```python import torch torch._dynamo.config.nested_graph_breaks = True def inner(x): x = x + 1 torch._dynamo.graph_break() return x + 2 @torch.compile(backend="eager") def outer(x): return inner(x) print(outer(torch.ones(3))) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159329 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971, #159281, #144516	2025-08-26 00:58:07 +00:00
William Wen	9a756c2d71	[dynamo, nested graph breaks] add nested graph break tests (#144516 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/144516 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971, #159281	2025-08-26 00:57:58 +00:00
William Wen	504a6445a4	[dynamo, nested graph breaks] use CALL_FUNCTION_EX when calling resume function (#159281 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159281 Approved by: https://github.com/anijain2305 ghstack dependencies: #157971	2025-08-26 00:57:48 +00:00
William Wen	2df9b437e3	[dynamo, nested graph breaks] implement new resume frame stack/locals/cell layout convention (#157971 ) The comments/conventions are not exactly correct here, as the implementation at this PR is partial. They will be fixed in #160138. No tests added, since there shouldn't be any overall semantic changes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157971 Approved by: https://github.com/anijain2305	2025-08-26 00:57:39 +00:00
rzou	4e19c1906a	Get Inductor periodic CI green (#161297 ) I'll file hi-pri issues for the things that need looking into. Test Plan: - wait for CI Pull Request resolved: https://github.com/pytorch/pytorch/pull/161297 Approved by: https://github.com/angelayi	2025-08-26 00:49:49 +00:00
Nikhil Patel	332fa5b388	[Inductor][Triton] Fix SCALING_ROWWISE misclassification for scalar scales (#160450 ) Summary: In `tuned_scaled_mm()`, we unsqueeze any scalar scale from [] -> [1, 1]. Later, when we are determining how to set the `SCALING_ROWWISE` kernel attribute, we check whether the scale has 2 dimensions. However, since we previously unsqueezed any scalar scales, this will always evaluate to True. Test Plan: Run the following tests in test/inductor/test_fp8.py: test_tensorwise_scaling_tma_template test_rowwise_scaling_tma_template Rollback Plan: Differential Revision: D80108117 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160450 Approved by: https://github.com/eellison	2025-08-26 00:24:55 +00:00
Huy Do	b9e9e92817	Increase timeout value when pushing to ghcr.io (#161444 ) Seeing this timing out a lot in trunk now https://github.com/pytorch/pytorch/actions/runs/17165552358/job/48705069047. The benchmark image is the largest one we have on CI, so it's probably over the 30-minute limit. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161444 Approved by: https://github.com/atalman	2025-08-25 23:52:59 +00:00
Tsung-Hsien Lee	e6aa7287f8	[pytorch] Leverage `unordered_map.try_emplace()` to simplify code (#161388 ) Summary: Because [`unordered_map.try_emplace()`](https://en.cppreference.com/w/cpp/container/unordered_map/try_emplace.html) does not invoke the value's constructor if the key already exists, this matches the previous behavior of checking the key's existence first and then instantiating the value. Test Plan: OSS CI & tests Rollback Plan: Differential Revision: D80916349 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161388 Approved by: https://github.com/janeyx99	2025-08-25 23:33:59 +00:00
atalman	94b9569c4a	Forward fix periodic vision build (#161408 ) Trying to forward fix: https://github.com/pytorch/pytorch/issues/161358 use SM 80 architecture by default Pull Request resolved: https://github.com/pytorch/pytorch/pull/161408 Approved by: https://github.com/zou3519, https://github.com/huydhn Co-authored-by: Huy Do <huydhn@gmail.com>	2025-08-25 23:28:22 +00:00
morrison-turnansky	2cf7ac2fb7	Issue 160495 inductor complex float (#160736 ) Avoiding calling tensor.view(tensor.real.dtype) when tensor.ndim =0 fixes the issue. Called a reshape. Fixes #160495 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160736 Approved by: https://github.com/ngimel	2025-08-25 23:23:13 +00:00
zhxchen17	447d34b5f8	[dynamo] Refactor convert_frame.compile_frame to be self contained function. [5/n] (#160900 ) convert_frame.compile_frame used to take a callback transform function which will capture the frame object it has, but the frame information is not passed directly into compile_frame function. This PR changes the signature of compile_frame so that frame information is directly passed in the function without taking a callback. This makes it easier to build fullgraph capture API on top of compile_frame. @exported-using-ghexport Differential Revision: [D80469801](https://our.internmc.facebook.com/intern/diff/D80469801/) Differential Revision: [D80469801](https://our.internmc.facebook.com/intern/diff/D80469801) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160900 Approved by: https://github.com/tugsbayasgalan, https://github.com/anijain2305	2025-08-25 23:16:21 +00:00
Wenyuan Chi	b2e06e0194	[Inductor] Prune configs that require more shared memory than the hardware limit (#161040 ) Summary: This diff removes configs that require more shared memory than the hardware limit, which causes the following compilation error: ``` No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 327680 Hardware limit:232448 Reducing block sizes or `num_stages` may help. ``` Test Plan: ``` buck2 test mode/dev-nosan fbcode//caffe2/test/inductor:max_autotune -- test_max_autotune_prune_choices -v 1,stderr ``` Rollback Plan: Differential Revision: D80594562 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161040 Approved by: https://github.com/eellison	2025-08-25 23:09:09 +00:00
drisspg	fc69c2bc67	Ensure large tensor int32 -> int64 indexing is enabled (#157767 ) Fixes: #https://github.com/pytorch/pytorch/issues/157446 I think that this delta is worth the switch form block-ptrs especially since they are deprecated ## Perf Summary A is nightly B is this diff, so `negative` means this diff improves perf TOP 5 differences <img width="805" height="754" alt="Screenshot 2025-08-24 at 5 49 49 PM" src="https://github.com/user-attachments/assets/aa359cdf-ee9a-427d-be72-1b9aef6f3115" /> <details> <summary><strong>Full perf table (click to expand)</strong></summary> \| attn_type \| dtype \| shape(B,Hq,M,Hkv,N,D) \| TFlops Version A \| TFlops Version B \| \| --- \| --- \| --- \| --- \| --- \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 258.38834144791923 \| 258.6353685004612 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 142.2192450677751 \| 140.12393320464972 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 122.32683823617003 \| 118.51603755647925 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 142.48556906165314 \| 137.24259849208627 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 64) \| 86.59814488695922 \| 84.59431398586257 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 288.52679758135764 \| 292.9174195871856 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 172.25541683643277 \| 172.94326459828508 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 164.40864610599826 \| 165.035129576335 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 176.54876886433945 \| 175.08057670028145 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 16, 1024, 128) \| 125.22491679812626 \| 121.06201152859151 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 339.11952481874283 \| 339.0132835601695 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 227.58583240284406 \| 
228.21824999409597 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 185.98569659868966 \| 182.32850843255093 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 188.9495725191772 \| 180.31385312481657 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 64) \| 106.25789530994302 \| 106.55084959448476 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 357.6430536888533 \| 363.30843452247274 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 262.3241154406613 \| 265.73250045488 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 249.30498953911416 \| 249.35928192833785 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 224.74126243851808 \| 223.71776504077988 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 16, 2048, 128) \| 168.26977014013707 \| 165.47991483333809 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 382.8178701785897 \| 384.34752965862685 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 308.1449710013853 \| 311.0653716044644 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 251.96365252505072 \| 243.92283557225903 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 226.69316232745368 \| 215.22769268913356 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 64) \| 153.34142545296405 \| 151.9312673939401 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 396.0998000753126 \| 398.35036286102473 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 333.5198415274966 \| 344.6354466169716 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 310.5955933379696 \| 305.66347819546 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 260.4012412689896 \| 259.758666997307 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 16, 4096, 128) \| 234.13034252182635 \| 227.61676497283614 \| \| noop \| torch.bfloat16 \| 
(2, 16, 8192, 16, 8192, 64) \| 396.17615538477196 \| 401.1419104525502 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 359.98648311998414 \| 360.8285563463094 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 291.97720707257736 \| 281.41694809965253 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 250.1703628419691 \| 238.556760291579 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 64) \| 199.50782826294306 \| 191.52327358439223 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 411.0632004785396 \| 413.6362648405517 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 382.9404387613185 \| 397.74886235657607 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 357.0998545146633 \| 350.5115200772392 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 281.8033924428203 \| 281.98601309215843 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 16, 8192, 128) \| 282.56595134222135 \| 277.4565795466672 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 408.89838018149516 \| 405.14531386840076 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 396.07662058160264 \| 393.4598228299578 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 317.8822887267849 \| 304.754931401036 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 265.8801304948243 \| 254.22961974295112 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 64) \| 227.87390579965614 \| 222.19481980110393 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 427.36821778477025 \| 431.3766620314935 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 410.67994346825 \| 423.4666944003808 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 381.1968748374038 \| 381.77668006420424 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 
292.5540046358546 \| 296.5439130720502 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 16, 16384, 128) \| 321.04573768858114 \| 310.7423616656888 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 427.46148866769903 \| 426.162091037068 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 419.75580537687347 \| 421.88640120274334 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 337.3208051798903 \| 327.4912454675092 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 276.5638854539581 \| 262.988360558083 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 64) \| 250.82791326036886 \| 245.07367032501736 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 435.8055824506086 \| 441.8803729460534 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 432.02638235921006 \| 450.33161016596273 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 402.25525939224883 \| 393.8564689669916 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 297.5337286675904 \| 297.0131881135074 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 16, 32768, 128) \| 343.8697037899545 \| 329.8194073407783 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 267.58912366821056 \| 256.91606054118375 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 150.81723692609629 \| 146.32172267858743 \| \| alibi \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 129.51029293209245 \| 122.72144394093334 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 147.627656359087 \| 141.68956350566188 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 64) \| 87.55100546003591 \| 84.91293287692788 \| \| noop \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 299.5931492743986 \| 305.884253766691 \| \| causal \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 179.39026367843837 \| 181.64741311605096 \| \| 
alibi \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 173.93547669282367 \| 173.23972950980564 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 185.90234171599252 \| 182.80844545446686 \| \| document_mask \| torch.bfloat16 \| (2, 16, 1024, 4, 1024, 128) \| 128.08176696266082 \| 123.27722685662111 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 340.50674552770664 \| 338.9071088484576 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 225.4438318650432 \| 230.22899884832975 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 194.15123248528312 \| 185.02793973094865 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 200.74289714108176 \| 191.76606719670647 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 64) \| 107.03564946728423 \| 106.82432377861258 \| \| noop \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 371.31799283918406 \| 379.7555394732925 \| \| causal \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 275.97762744310455 \| 276.71106853992995 \| \| alibi \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 261.6648679783462 \| 259.4127232060398 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 237.03108223577615 \| 233.92710216149527 \| \| document_mask \| torch.bfloat16 \| (2, 16, 2048, 4, 2048, 128) \| 172.13926800371152 \| 168.74390922407585 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 381.50199487767276 \| 383.9043681999597 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 307.9748883093411 \| 312.2403515462001 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 251.11319684705438 \| 243.17870127827277 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 236.3253127246763 \| 223.81250201769552 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 64) \| 154.55693991756874 \| 153.11360584987685 \| \| noop \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 
407.11400078586615 \| 413.53709886086557 \| \| causal \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 348.1705797722622 \| 360.09771155957367 \| \| alibi \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 321.8593280850388 \| 318.2882327401255 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 270.089032013835 \| 268.767323026064 \| \| document_mask \| torch.bfloat16 \| (2, 16, 4096, 4, 4096, 128) \| 238.07324557907788 \| 228.09842078362692 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 399.8172853171901 \| 401.0954526332136 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 363.4387330438581 \| 364.13111024232677 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 294.1752429133857 \| 283.7235663368415 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 256.8389394007649 \| 246.91771015606483 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 64) \| 199.3378564292656 \| 192.40439590901758 \| \| noop \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 425.5150965556111 \| 430.8190098707553 \| \| causal \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 396.00437184073013 \| 411.3873625655787 \| \| alibi \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 369.92803661607815 \| 361.43244467343663 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 293.4277354412933 \| 295.2529537595746 \| \| document_mask \| torch.bfloat16 \| (2, 16, 8192, 4, 8192, 128) \| 288.0208673072841 \| 281.51896404878863 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 408.3005367220567 \| 408.96116482298913 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 396.90095962766304 \| 396.87385456176486 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 319.0534576137999 \| 302.50950358107764 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 270.3334977708081 \| 258.8506349486557 \| \| document_mask \| 
torch.bfloat16 \| (2, 16, 16384, 4, 16384, 64) \| 227.46824134365394 \| 222.23759438128766 \| \| noop \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 438.24247309479694 \| 437.7975163205371 \| \| causal \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 428.34012029699227 \| 433.3215899950434 \| \| alibi \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 386.52672049728875 \| 388.26216893354984 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 302.71976814728083 \| 302.3574867306459 \| \| document_mask \| torch.bfloat16 \| (2, 16, 16384, 4, 16384, 128) \| 327.39760662780986 \| 308.6348428844912 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 423.31308678262695 \| 426.6306972137279 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 412.6983690923106 \| 419.4961977664297 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 337.41003544742273 \| 324.2155049126126 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 278.7755890910794 \| 265.9194286636502 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 64) \| 251.55678254755364 \| 244.8843180141462 \| \| noop \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 452.5930781172308 \| 457.7117122300742 \| \| causal \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 445.05676260348116 \| 463.9304535499636 \| \| alibi \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 415.78302138389415 \| 406.29229555271456 \| \| sliding_window \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 308.0311067300895 \| 304.91354721414314 \| \| document_mask \| torch.bfloat16 \| (2, 16, 32768, 4, 32768, 128) \| 351.43943626809335 \| 329.4476923070317 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 295.1801525813241 \| 291.36521287398904 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 183.23250549178067 \| 182.35421238887605 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 
151.56832453117747 \| 151.3422139154794 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 171.02111935180432 \| 160.72516856727913 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 64) \| 74.05765122783826 \| 74.5885345035243 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 314.3587394591763 \| 319.2938677773619 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 224.57002084153177 \| 225.48868542008177 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 216.00964804143052 \| 215.39576159953486 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 216.1174237618258 \| 214.28437413525663 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 16, 1024, 128) \| 121.08920423648368 \| 119.55813661872644 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 362.2193857281911 \| 360.05005804275936 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 279.8840217430121 \| 279.5437918286659 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 227.76617121021982 \| 222.8655938229316 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 215.43141176970562 \| 207.71852284994702 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 64) \| 121.35588364218539 \| 121.20636565046884 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 365.1545280898012 \| 373.37585444987326 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 304.360119952975 \| 309.1247297936263 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 287.2603904544586 \| 289.25547903162595 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 257.9852675272418 \| 257.59069234098115 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 16, 2048, 128) \| 188.35158496670232 \| 184.24683960154857 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 389.9744911369211 \| 388.43466897254166 \| \| 
causal \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 345.9228295166513 \| 342.63034895210126 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 279.56334658247437 \| 271.2724375402088 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 245.66477202810066 \| 233.49688207371258 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 64) \| 170.3270720653187 \| 166.23863845657382 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 400.0041140827554 \| 402.11182445396497 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 363.64641830327434 \| 375.9288663364792 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 341.5776139573363 \| 335.1160003213424 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 281.1811770268521 \| 280.21438270014005 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 16, 4096, 128) \| 247.78716118997716 \| 245.3269825179633 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 403.794126680488 \| 405.2353919019577 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 387.079178426863 \| 385.1461762057035 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 309.7847188173431 \| 298.0443968374749 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 262.4721750159666 \| 250.81679725428586 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 64) \| 205.70866004479979 \| 202.9620839129557 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 413.380982988662 \| 418.40270594263103 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 398.450064800682 \| 409.6794973994029 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 372.26297458194466 \| 364.44415106552196 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) \| 293.0818569905912 \| 292.85172400643984 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 16, 8192, 128) 
\| 296.46717085592087 \| 285.76362010612763 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 419.3186786037592 \| 426.08801580934437 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 408.1648467766632 \| 409.4122254207817 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 329.24396020457345 \| 313.5200995121138 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 274.61257504571876 \| 255.7801815432177 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 64) \| 232.63806001220684 \| 230.03020843492314 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 435.0785891054788 \| 440.39101804225345 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 424.86925312752817 \| 435.18898057396825 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 393.000417896268 \| 395.11543361225256 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 297.7755459218185 \| 300.7208114715287 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 16, 16384, 128) \| 331.71570861760534 \| 318.07127352552885 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 424.58602747137405 \| 425.84897078470715 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 422.66607285025725 \| 423.5524945535485 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 344.8625760048626 \| 331.6793888458635 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 282.0787281511649 \| 263.7895634445868 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 64) \| 252.7301927385177 \| 245.41844170037427 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 437.0658069164588 \| 442.9101960063628 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 433.13788271434646 \| 452.3873572709863 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 404.0959191546953 \| 
396.7077863894884 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 300.45502211883206 \| 301.3439134717943 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 16, 32768, 128) \| 344.11003202413934 \| 330.8897663350314 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 298.4364205341705 \| 291.6793556507056 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 187.6382133139633 \| 191.05409897308772 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 156.55822078636112 \| 154.178925976516 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 173.47765221825162 \| 169.30862508068464 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 64) \| 74.5885345035243 \| 74.52689061607104 \| \| noop \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 323.12233826013045 \| 328.53889207933514 \| \| causal \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 236.75872140126316 \| 235.8378325547398 \| \| alibi \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 227.17836523816675 \| 226.75357076139966 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 224.07209453308036 \| 224.07209453308036 \| \| document_mask \| torch.bfloat16 \| (4, 16, 1024, 4, 1024, 128) \| 122.85572156047981 \| 121.11642183704716 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 361.3123326658092 \| 360.71014086458337 \| \| causal \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 281.5287983927017 \| 281.94301754758345 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 232.7456696285686 \| 226.50976826432776 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 221.5612361744038 \| 214.96188822837055 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 64) \| 121.38311528944315 \| 120.85441868178513 \| \| noop \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 380.2579019244734 \| 389.2520157863988 \| \| causal \| torch.bfloat16 \| (4, 16, 
2048, 4, 2048, 128) \| 316.95230660496924 \| 317.87597790618906 \| \| alibi \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 301.07968126657323 \| 298.02424098422983 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 267.2240756921594 \| 267.16353549228154 \| \| document_mask \| torch.bfloat16 \| (4, 16, 2048, 4, 2048, 128) \| 189.82761622494257 \| 186.736450261963 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 389.88665375406805 \| 387.9125133037077 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 348.70619958684887 \| 346.6750499749774 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 280.5472989906087 \| 271.22300822012187 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 250.02397620165968 \| 241.22532776331445 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 64) \| 171.67817496107645 \| 166.95679280483972 \| \| noop \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 412.626880230807 \| 417.60238657950777 \| \| causal \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 374.8829313933945 \| 389.4448546468815 \| \| alibi \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 353.20410434172436 \| 345.7072490717473 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 292.51045924209586 \| 291.66621022138287 \| \| document_mask \| torch.bfloat16 \| (4, 16, 4096, 4, 4096, 128) \| 251.6264062063495 \| 248.45110052911542 \| \| noop \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 404.0155784550126 \| 401.90546837237514 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 384.4389015599863 \| 386.9684324594344 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 313.3731284132225 \| 298.17074251037894 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 264.19199737284265 \| 252.8982463999916 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 64) \| 207.03696315185684 \| 202.86697323136772 \| 
\| noop \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 428.2436763312506 \| 433.45005568619536 \| \| causal \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 411.8516531869893 \| 428.2753623461049 \| \| alibi \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 384.9095037182509 \| 372.90888743000744 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 303.2438915629836 \| 302.05095952914337 \| \| document_mask \| torch.bfloat16 \| (4, 16, 8192, 4, 8192, 128) \| 301.8689122735564 \| 285.0363190513223 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 423.13592231504805 \| 420.3991500185611 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 407.44527331585493 \| 408.5064370765247 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 330.50050996167414 \| 316.8763979925965 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 274.6833786307413 \| 259.86098862141324 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 64) \| 232.24019584158367 \| 226.52040268160232 \| \| noop \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 444.4596314237808 \| 455.99558915752266 \| \| causal \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 437.4245561244369 \| 455.98275147271966 \| \| alibi \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 397.3350686877605 \| 397.88875599028063 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 308.53809114394545 \| 307.1359822042007 \| \| document_mask \| torch.bfloat16 \| (4, 16, 16384, 4, 16384, 128) \| 331.32379843423774 \| 316.85293191675646 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 422.4622274366379 \| 425.0407156418684 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 420.9547052783101 \| 430.33779243510276 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 345.50265346504085 \| 332.094855328957 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 4, 
32768, 64) \| 280.81715528243365 \| 264.6543640282054 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 64) \| 252.25635200421783 \| 245.46235499490305 \| \| noop \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 452.5524207341139 \| 461.7512032176736 \| \| causal \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 445.2316469907137 \| 464.4523799578466 \| \| alibi \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 416.87264016717023 \| 409.17124592157046 \| \| sliding_window \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 309.42579489389846 \| 307.9734464665731 \| \| document_mask \| torch.bfloat16 \| (4, 16, 32768, 4, 32768, 128) \| 350.50782004300623 \| 330.98959545427294 \| </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157767 Approved by: https://github.com/Skylion007	2025-08-25 22:51:00 +00:00
Michael Lazos	adecb0c9e8	[Cutlass-EVT] Fix buffer size issues (#161335 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161335 Approved by: https://github.com/henrylhtsang ghstack dependencies: #161398	2025-08-25 22:08:30 +00:00
Michael Lazos	d57c79e609	[Cutlass] Fix regression from f7ad69f (#161398 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161398 Approved by: https://github.com/henrylhtsang	2025-08-25 22:08:30 +00:00
atalman	1a566c4909	Remove Python 3.9 nightly builds (#161427 ) Please see https://github.com/pytorch/pytorch/issues/161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161427 Approved by: https://github.com/huydhn	2025-08-25 22:05:40 +00:00
Michael Lazos	37a34022b5	[Pattern Matcher] improve error msg (#161423 ) Updates pattern matcher error message Pull Request resolved: https://github.com/pytorch/pytorch/pull/161423 Approved by: https://github.com/mengluy0125, https://github.com/masnesral	2025-08-25 21:48:54 +00:00
Huy Do	763053dc53	Always run OIDC auth on B200 to be able to upload artifacts to S3 (#161436 ) Reported by @drisspg , in its current form, the OIDC auth step wasn't run when the previous test step failed. We need this to always run to be able to upload artifacts to S3. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161436 Approved by: https://github.com/nWEIdia, https://github.com/drisspg	2025-08-25 21:05:20 +00:00
Daniel Galvez	cf94cadbee	[CUDAGraph] Add getter for cuda graph exec (#161294 ) This is far simpler than #155164 since we never destroy the cudaGraphExec_t. The request comes from TRT-LLM specifically. The motivation is that some power users would like to mutate specific kernel parameters via APIs like `cudaGraphExec*SetParams` after a cuda graph has been instantiated. For example, a common request has been to be able to change the sequence length of attention kernels, after having captured a graph for the largest possible sequence length. It turns out that the host overhead you eliminate via cuda graphs in LLM inference ends up causing an increase in computation time when you size your kernels to the maximum possible sequence length (which I believe is done in both TRT-LLM and vLLM). Attention is the most problematic kernel because its computation time is quadratic in the sequence length, rather than linear. This can work if your attention kernel can work for arbitrary shapes (this is not the case for all attention implementations! Many of them specialize with templates), and you have a persistent kernel that allocates only as many blocks as you have SM's (so you don't have to figure out how many blocks to allocate for a specific sequence length). Using a conditional SWITCH node is a better generic approach to this problem, but that requires more infrastructure work. Note that this requires knowledge of the exact location of the value in your kernel's parameter buffer to mutate. It won't work with arbitrary stream capture code whose kernels you don't know before hand. So I expect this code path to be rarely used. Testing: ``` pytest -s -k raw_graph_exec test/test_cuda.py ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161294 Approved by: https://github.com/ngimel, https://github.com/BoyuanFeng, https://github.com/eellison, https://github.com/eqy	2025-08-25 20:57:37 +00:00
Sandeep Narendranath Karjala	995397d47a	[inductor] structured-log graph execution order + test (#160448 ) Summary: - Emit a structured trace per compiled graph execution to reconstruct execution order in TLParse. - Adds debug.log_graph_execution(name) called from `CompiledFxGraph.__call__`, producing an artifact named inductor_graph_execution with payload {"graph": "graph_<id>"}. Testing: - Add inline test to verify structure and output Pull Request resolved: https://github.com/pytorch/pytorch/pull/160448 Approved by: https://github.com/xmfan	2025-08-25 20:12:18 +00:00
Chen	ffa1ce7650	Fix the parity of original and exported module parameters (#160600 ) ## Problem Fixing parameter mismatch issue during torch.export with strict mode (see "How to reproduce the issue" section below): When there are two attribute mapping to the same tensor, the strict mode will 1. Have a standard param buffer table to standardize the name (bug happens [here](`f861dc1826/torch/export/_trace.py (L356)`)! when 2 parameter have same id(param), the latter name will overwrite the previous name) 2. [Update](`f861dc1826/torch/export/_trace.py (L1481)`) exported signature with updated standard FQN (problematic) 3. When getting exported_program.module(), it will call [_unlift_exported_program_lifted_states](`f861dc1826/torch/export/exported_program.py (L1297)`) to recover attribute from exported signature where the parameter name is defined and standardized Then the named_parameter of this module will have overwritten name instead of original name ## How to reproduce the issue? reproduce issue shared by @taotaohuang001 torch version: 2.8.0 ```python import torch from torch import nn # ---- Toy model with embedding weight sharing (aliasing) ---- class Toy(nn.Module): def __init__(self): super().__init__() self.embedding_layers = nn.ModuleDict() tbl = nn.Embedding(100, 8) self.embedding_layers["ActorId"] = tbl # Alias: reuse the SAME module instance for another feature self.embedding_layers["RootActorId"] = self.embedding_layers["ActorId"] self.proj = nn.Linear(16, 1) def forward(self, feats: dict[str, torch.Tensor]): e1 = self.embedding_layers["ActorId"](feats["ActorId"]) e2 = self.embedding_layers["RootActorId"](feats["RootActorId"]) return self.proj(torch.cat([e1, e2], dim=-1)) torch.manual_seed(0) m = Toy().eval() # Show pre-export parameter names (canonicalized; shared weight appears once) print("PRE-EXPORT named_parameters:") print([name for name, _ in m.named_parameters()]) # Sanity: the two feature names point to the same weight object w1 = 
m.embedding_layers["ActorId"].weight w2 = m.embedding_layers["RootActorId"].weight print("PRE-EXPORT alias -> same object:", w1 is w2, "\| same storage:", w1.data_ptr() == w2.data_ptr()) # Example inputs (dict structure will be captured by export) ex_in = { "ActorId": torch.randint(0, 100, (4,)), "RootActorId": torch.randint(0, 100, (4,)), } # ---- Export (in memory) and materialize the runnable module ---- ep = torch.export.export(m, (ex_in,), strict=True) gm = ep.module() # GraphModule with new (canonical) parameter names print("\nPOST-EXPORT named_parameters (GraphModule):") post_names = [name for name, _ in gm.named_parameters()] print(post_names) # Prove alias persists after export: run fwd/bwd and check a single grad tensor exists out = gm(ex_in).sum() out.backward() # Find the embedding weight in the exported module by shape (100, 8) emb_names = [name for name, p in gm.named_parameters() if p.shape == torch.Size([100, 8])] print("\nEmbedding param (post-export) canonical name:", emb_names[0] if emb_names else "<not found>") # Show that only one grad exists for the shared table for name, p in gm.named_parameters(): if p.grad is not None and p.shape == torch.Size([100, 8]): print("Grad present on shared embedding weight:", name, "\| grad shape:", tuple(p.grad.shape)) break ``` And you will see parameters are different before and after export ``` PRE-EXPORT named_parameters: ['embedding_layers.ActorId.weight', 'proj.weight', 'proj.bias'] PRE-EXPORT alias -> same object: True \| same storage: True POST-EXPORT named_parameters (GraphModule): ['embedding_layers.RootActorId.weight', 'proj.weight', 'proj.bias'] Embedding param (post-export) canonical name: embedding_layers.RootActorId.weight Grad present on shared embedding weight: embedding_layers.RootActorId.weight \| grad shape: (100, 8) ``` ## Solution Fixing this issue by making sure latter named parameter will not overwrite the `param_buffer_table` when original model's named parameter already maps to certain 
parameter. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160600 Approved by: https://github.com/angelayi	2025-08-25 19:40:06 +00:00
PyTorch MergeBot	3e210f90c2	Revert "[dynamo] Refactor convert_frame.compile_frame to be self contained function. [5/n] (#160900 )" This reverts commit 1113e7de30da95973c1eac7921601f9a0e94f2db. Reverted https://github.com/pytorch/pytorch/pull/160900 on behalf of https://github.com/atalman due to executorch failure ([comment](https://github.com/pytorch/pytorch/pull/160900#issuecomment-3221372096))	2025-08-25 18:56:18 +00:00
Scott Wolchok	660b5656a4	Inline is_read_only_alias_match in _correct_storage_aliasing (#161285 ) Drives down the overhead of return_and_correct_storage_aliasing slightly. Hopefully you'll agree it doesn't compromise readability. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161285 Approved by: https://github.com/wconstab ghstack dependencies: #161231, #161234, #161235, #161240, #161284	2025-08-25 18:35:21 +00:00
Scott Wolchok	0e0bb4f1fd	Remove unnecessary len() call in _correct_storage_aliasing.is_read_only_alias_match (#161284 ) Containers are truthy iff they're non-empty. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161284 Approved by: https://github.com/Skylion007, https://github.com/wconstab ghstack dependencies: #161231, #161234, #161235, #161240	2025-08-25 18:35:21 +00:00
Scott Wolchok	b048f0e189	Improve efficiency of _python_dispatch.return_and_correct_aliasing (#161240 ) get_write_alias() call count reduction explained briefly in code comment. We don't need to check write_aliases against None in the final outs_to_return calculation because we just did that check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161240 Approved by: https://github.com/wconstab ghstack dependencies: #161231, #161234, #161235	2025-08-25 18:35:21 +00:00
Scott Wolchok	c35538d3c5	Minor cleanup of DeviceMesh.__eq__ (#161235 ) `self is other` means the same thing as `id(self) == id(other)`, but it's one operator instead of 3. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161235 Approved by: https://github.com/wconstab, https://github.com/zpcore, https://github.com/fduwjj ghstack dependencies: #161231, #161234	2025-08-25 18:35:21 +00:00
Scott Wolchok	cfafd98c53	Use comparison key in OpSchema to avoid duplicate work between `__hash__` and `__eq__` (#161234 ) The performance cost of `dict` lookups keyed by `OpSchema` is a significant minority of DTensor overhead. With this change we shave a net ~1% off the total running time of the benchmark from #160580, as measured by using cProfile and comparing cumulative time spent in propagate + OpSchema's `__post_init__`. (`__post_init__` grew from 2.5% to 6.4% (+3.9%) and propagate shrank from 12.5% to 7.8% (-4.7%)). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161234 Approved by: https://github.com/wconstab ghstack dependencies: #161231	2025-08-25 18:35:21 +00:00
Scott Wolchok	5d6434b132	Fix OpSchema equality check (#161231 ) `__eq__` didn't compare lists of DTensorSpec, but `__hash__` did (and it looks like attention was paid to hash, so I made comparison follow suit). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161231 Approved by: https://github.com/wconstab, https://github.com/XilunWu, https://github.com/zpcore	2025-08-25 18:35:21 +00:00
xinan.lin	2f0de0ff93	[Inductor] Update Intel Triton for PyTorch 2.9. (#161050 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161050 Approved by: https://github.com/anmyachev, https://github.com/EikanWang, https://github.com/jansel	2025-08-25 17:18:19 +00:00
angelayi	c081481bbe	[aoti-fx] Output OpOverload fallbacks (#161195 ) Updates the inductor-wrapper-fxir code to use the kernel.op_overload when generating extern kernel calls. This way we can keep the IR consistent with using ATen ops. TODO: we're also inserting torch.empty_strided calls -- need to turn this into aten too Pull Request resolved: https://github.com/pytorch/pytorch/pull/161195 Approved by: https://github.com/blaine-rister	2025-08-25 17:03:05 +00:00
PyTorch MergeBot	df571ae7ad	Revert "Fix conv exhaustive autotuning and expand Exhaustive test coverage (#159387 )" This reverts commit 3ea6cc8c2d443d6104159d50e8328c144f6caa39. Reverted https://github.com/pytorch/pytorch/pull/159387 on behalf of https://github.com/jeffdaily due to breaks ROCm, AttributeError: 'torch._C._CudaDeviceProperties' object has no attribute 'shared_memory_per_block_optin' ([comment](https://github.com/pytorch/pytorch/pull/159387#issuecomment-3220989480))	2025-08-25 16:50:03 +00:00
Animesh Jain	9e1c954134	[dynamo] Pass requires_grad to nn.Parameter construction (#161364 ) Fixes https://github.com/pytorch/pytorch/issues/161191 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161364 Approved by: https://github.com/Skylion007, https://github.com/StrongerXi	2025-08-25 16:49:28 +00:00
Tom Ritchford	83283ce7f5	docstring_linter: Fix #151692 and other issues (#156596 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/156596 Approved by: https://github.com/eellison	2025-08-25 16:04:14 +00:00
Hashem Hashemi	ab8d60f4c8	[ROCm] Unroll loads in global_reduce (#161181 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161181 Approved by: https://github.com/jeffdaily, https://github.com/pruthvistony Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-25 15:45:49 +00:00
Xuehai Pan	af3265d20f	[BE][CI] fix `pkg=<pin>` to `pkg==<pin>` in pip requirement specs (#160811 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160811 Approved by: https://github.com/seemethere	2025-08-25 15:31:21 +00:00
Eddie Yan	f391afe9bf	[cuDNN][convolution] remove redundant conv3d 64bit test (#161177 ) turns out it's the same as ``` @onlyCUDA @largeTensorTest("40GB") @largeTensorTest("24GB", "cpu") @tf32_on_and_off(0.005) def test_conv3d_64bit_indexing(self, device): x = torch.rand(1, 32, 512, 512, 256) m = torch.nn.Conv3d(32, 1, kernel_size=1, padding=0, stride=1, bias=False) yref = m(x) y = m.to(device=device)(x.to(device=device)) self.assertEqual(yref, y) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161177 Approved by: https://github.com/Skylion007	2025-08-25 15:01:05 +00:00
zhxchen17	1113e7de30	[dynamo] Refactor convert_frame.compile_frame to be self contained function. [5/n] (#160900 ) convert_frame.compile_frame used to take a callback transform function which will capture the frame object it has, but the frame information is not passed directly into compile_frame function. This PR changes the signature of compile_frame so that frame information is directly passed in the function without taking a callback. This makes it easier to build fullgraph capture API on top of compile_frame. @exported-using-ghexport Differential Revision: [D80469801](https://our.internmc.facebook.com/intern/diff/D80469801/) Differential Revision: [D80469801](https://our.internmc.facebook.com/intern/diff/D80469801) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160900 Approved by: https://github.com/tugsbayasgalan, https://github.com/anijain2305	2025-08-25 14:53:54 +00:00
PyTorch MergeBot	40c0e700a4	Revert "[AMD] Fix AMD User Defined Kernel Autotune (#160671 )" This reverts commit 431846a6323c6f1d02da49e311ac694324f386f4. Reverted https://github.com/pytorch/pytorch/pull/160671 on behalf of https://github.com/atalman due to new test is failing: inductor/test_aot_inductor.py::AOTInductorTestABICompatibleGpu::test_rocm_triton_autotuning_cuda [GH job link](https://github.com/pytorch/pytorch/actions/runs/17172795679/job/48725235301) [HUD commit link](`431846a632`) ([comment](https://github.com/pytorch/pytorch/pull/160671#issuecomment-3220442141))	2025-08-25 14:07:48 +00:00
zeshengzong	510825e5fe	Optimize `dynamo` typing (#147499 ) Optimize dynamo methods type annotation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/147499 Approved by: https://github.com/anijain2305	2025-08-25 13:20:45 +00:00
PyTorch MergeBot	ab7787fb82	Revert "[inductor] Windows inductor use intel-openmp. (#160258 )" This reverts commit 41673110cd7c5960824cc74a6fcaeda1a8bc7a23. Reverted https://github.com/pytorch/pytorch/pull/160258 on behalf of https://github.com/malfet due to Reverting to fix https://github.com/pytorch/pytorch/issues/160898 and https://github.com/pytorch/pytorch/issues/160962 ([comment](https://github.com/pytorch/pytorch/pull/160258#issuecomment-3220158145))	2025-08-25 12:57:47 +00:00
PyTorch MergeBot	1eccfb157a	Revert "[BE] Remove intel-openmp dependency in setup.py (#160976 )" This reverts commit e4839470470168648dee5997f57347bb8541ea2b. Reverted https://github.com/pytorch/pytorch/pull/160976 on behalf of https://github.com/malfet due to This PR is doing something strange ([comment](https://github.com/pytorch/pytorch/pull/160976#issuecomment-3220120462))	2025-08-25 12:46:12 +00:00
Raman Kumar	4651aaac47	Fix typo: 'complext' (#160335 ) minor fix for a typo: `complext` to `complex` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160335 Approved by: https://github.com/Skylion007	2025-08-25 10:37:59 +00:00
Liang Wang	037c43d3b2	[tgif] fix getattr_recursive with ModuleList (#161204 ) Summary: This change updates `getattr_recursive` to handle qualnames with ModuleList that contain digit indices, for example, `op_instances.1.value_model.feature_weights` Test Plan: TBA Rollback Plan: Reviewed By: jiayisuse Differential Revision: D80503985 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161204 Approved by: https://github.com/jiayisuse	2025-08-25 10:08:47 +00:00
Dmitry Rogozhkin	eb5549a431	xpu: fix cpp_extension compatibility with oneapi dpc++ 2025.2 compiler (#161012 ) Intel oneapi DPC++ compiler has changed (fixed) parsing of the `-fsycl-host-compiler-options` option with respect to the treatment of arguments with escaped quotes. This commit adds `if` branches depending on the compiler version. Fixes: https://github.com/intel/torch-xpu-ops/issues/1938 CC: @chuanqi129 @EikanWang @guangyey Pull Request resolved: https://github.com/pytorch/pytorch/pull/161012 Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-25 09:29:53 +00:00
FFFrog	56ebed627a	[OpenReg] Add OSX/Windows Support for OpenReg (#159441 ) As the title states. Changes: - Abstract platform-specific APIs - Add OSX/Windows support - Set default symbol visibility to "hidden" Co-authored-by: @can-gaa-hou Original PR: https://github.com/pytorch/pytorch/pull/159029 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159441 Approved by: https://github.com/albanD Co-authored-by: jiahaochen666 <jiahaochen535@gmail.com>	2025-08-25 08:03:27 +00:00
Liao, Wei	80df27a612	port distributed pipeline test files for Intel GPU (#159033 ) In this PR we port all distributed pipeline test files. We enable Intel GPU with the following methods, trying our best to keep the original code style: 1. instantiate_device_type_tests() 2. use "torch.accelerator.current_accelerator()" to determine the accelerator backend 3. use "requires_accelerator_dist_backend()" to replace requires_nccl() 4. use "get_default_backend_for_device()" to get the backend 5. enable XPU for some test paths Pull Request resolved: https://github.com/pytorch/pytorch/pull/159033 Approved by: https://github.com/guangyey, https://github.com/kwen2501	2025-08-25 05:24:27 +00:00
Will Constable	e3d68dfae2	[DTensor] Make default RNG semantics match user-passed generator (#160482 ) Previously, DTensor kept its own copy of the generator state after the first time a random operator was called on a DTensor. This copy would evolve independently from the generator outside of DTensor. After adding support for users to pass a specific generator into random operators (e.g. `uniform_(..., generator=)`), it was determined (in discussion on #159991) to change the semantics so that any random operations performed on DTensor would evolve the state of the publicly visible generators (either the default one or user-passed one). The upsides are (1) it is now possible to call torch.manual_seed() at any point in the program and have a consistent effect on DTensor, (2) DTensor ops have an observable effect on the generator. The downside is that users are now responsible for seeding their generator before using DTensor, ensuring all ranks use the same seed. Fixes #159991 confirmed docs rendered OK <img width="897" height="414" alt="image" src="https://github.com/user-attachments/assets/c082f0f0-5447-47aa-834f-65342eb237cd" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160482 Approved by: https://github.com/wanchaol	2025-08-25 04:21:19 +00:00
Natalia Gimelshein	726dce3c94	[nccl symm mem] don't use arg for mempool, correctly use symmetric registration in hooks (#161238 ) Per title Pull Request resolved: https://github.com/pytorch/pytorch/pull/161238 Approved by: https://github.com/kwen2501, https://github.com/syed-ahmed	2025-08-25 03:09:32 +00:00
Chuanhao Zhuge	74280d0913	[muon] Introduce Muon optimizer to PyTorch (#160213 ) A single-device version of Muon. Algorithm refers Keller Jordan's [Muon blogpost](https://kellerjordan.github.io/posts/muon/), and optionally incorporates [Moonshot's](https://github.com/MoonshotAI/Moonlight/blob/master/Moonlight.pdf) learning rate adjustment strategy. This implementation maintains a minimalist API and is consistent with other optimizer conventions. PyTorch team prefers to handle parameter filtering at a higher level, with the Muon optimizer performing only the msign computation for orthogonalization on all parameters it receives. Users are responsible for grouping parameters for different optimizers as needed. An example usage is shown below, and a more detailed example will be added to the [PyTorch examples](https://github.com/pytorch/examples) directory. Usage ```python model = MyModelForCausalLM # filter out your params manually muon_params = [...] adamw_params = [...] muon = Muon( params = muon_params lr=lr, wd=wd, ) adamw = AdamW( params = adamw_params lr=lr, wd=wd, ) # in training loop loss = model(input) loss.backward() muon.step() adamw.step() muon.zero_grad() adamw.zero_grad() ``` ~~Additional usage~~ ~~Users are also able to pass in self-defined `msign` function for orthogonalization, and learning rate adjustment function. Interface defined below:~~ ```python ~~AdjustLrFn: TypeAlias = Callable[[float, torch.Size], float]~~ ~~MsignFn: TypeAlias = Callable[[Tensor, BaseMsignFnConfig], Tensor]~~ ``` As discussed with team and in comment, we prefer to make the interface simpler and cleaner, thus we removed the callback interface, and canonicalize the original NS algorithm for Muon. The only configs available to users are `ns_steps`, `coefficients`, and `eps`, configurable through kwargs. By default, we use 5-step Newton-Schulz, with coefficients proposed by [Keller](https://kellerjordan.github.io/posts/muon/). 
We use LR adjustment proposed by [Moonshot](https://github.com/MoonshotAI/Moonlight/blob/master/Moonlight.pdf), which grafts learning rate from AdamW. Testing ~~1. Unit tests: the newly introduced Muon is covered in `test/test_optim.py`. We updated the test cases to pass named parameters to the optimizer under test. Additionally, we introduced a new test case to verify that when the user provides an empty FQN list, Muon correctly falls back to AdamW behavior.~~ As discussed, in order not to complicate the codebase, we prefer not to include reference implementation into PyTorch. We also updated the interface so we don't need to test the FQN based filtering. Muon is covered by the existing `test_optim.py` unit test. 2. End-to-end test: we added a training script that pre-trains a QWEN-like model on `openwebtext-100k` dataset. We trained for one epoch and the resulting loss curve is compared against the Moonshot implementation to confirm behavioral consistency. <img width="1102" height="472" alt="Screenshot 2025-07-29 at 1 04 12 AM" src="https://github.com/user-attachments/assets/ceab0733-497d-4070-8032-02ae7995c64c" /> Numerics We evaluate our implementation with existing implementation to confirm numerical consistency. As discussed, our implementation closely follows the algorithm described in [Keller's post](https://kellerjordan.github.io/posts/muon/), while incorporating the learning rate adjustment from [Moonlight](https://github.com/MoonshotAI/Moonlight/blob/master/Moonlight.pdf). This captures a key insight that allows users to reuse hyper-parameters tuned for `adamW`, making Muon a drop-in swap. As expected, the numerics difference mainly comes from `adjust_lr`, a max of ~5% relative diff in an example unit test setup below. 
```python # dummy model and data model0 = Linear(10, 10, bias=False) model1 = copy.deepcopy(model0) inputs = torch.randn(8, 10) targets = torch.randn(8, 10) loss = MSELoss() lr = 1e-3 wd = 0.1 momentum = 0.95 opt_ref_muon = KellySingleDeviceMuon( params=model0.parameters(), lr=lr, weight_decay=wd, momentum=momentum, ) opt_exp_muon = Muon( params=model1.parameters(), lr=lr, weight_decay=wd, momentum=momentum, ) out_ref = model0(inputs) loss_ref = loss(out_ref, targets) opt_ref_muon.zero_grad() loss_ref.backward() opt_ref_muon.step() out_exp = model1(inputs) loss_exp = loss(out_exp, targets) opt_exp_muon.zero_grad() loss_exp.backward() opt_exp_muon.step() for p_ref, p_exp in zip(model0.parameters(), model1.parameters()): torch.testing.assert_close(p_ref, p_exp) ``` As explained above, including this `adjust_lr` is preferable. This is validated by an e2e training runs on training a qwen-2-like 0.5b model, where the curves show that training with `adjust_lr` converges more effectively than without. <img width="1179" height="464" alt="Screenshot 2025-08-18 at 10 12 33 AM" src="https://github.com/user-attachments/assets/e797d3da-c2f0-4187-b99e-5d48b7437c3c" /> Performance Training for one epoch of openwebtext-100k on eight H100 GPUs with DDP: - adamw_ddp finishes in 13.12 min - pytorch_muon_ddp finishes in 13.45 min Muon runs ~20s slower compared to AdamW. Assuming no other changes, Muon is 2.5% slower than AdamW. AdamW: Optimizer.step() takes ~13.5 ms, step time ~930 ms <img width="726" height="590" alt="Screenshot 2025-07-29 at 1 56 14 AM" src="https://github.com/user-attachments/assets/ebcd7e1c-d129-4b20-9396-39f568edf03d" /> Muon: Optimizer.step() takes ~54 ms, step time ~960 ms <img width="751" height="597" alt="Screenshot 2025-07-29 at 2 02 20 AM" src="https://github.com/user-attachments/assets/72f5b904-ebd5-4502-a6ff-d3e9e5a6da81" /> Note We restrict the implementation to accept only 2D parameters. 
An alternative approach is to allow parameters with more than two dimensions and apply orthogonalization over the last two dimensions. We opt not to go with this approach as it can be error-prone. For example, with a kernel shaped `[in_channel, height, width, out_channel]`, applying orthogonalization to the last two dimensions is not meaningful. Since Muon is designed to operate orthogonalization on 2D matrices, preserving this assumption keeps the implementation clean and sound. Next Steps 1. Add `MuP` 2. Open-source optimized triton kernel for symmetric matmul. A preliminary benchmark found 1.23x - 1.48x speedup on small - large (n = 256 -> 16384) matrices. 3. Open-source unsharded Muon co-designed with FSDP2. **** Pull Request resolved: https://github.com/pytorch/pytorch/pull/160213 Approved by: https://github.com/janeyx99	2025-08-24 08:03:04 +00:00
Ting Lu	1de4540449	Use -compress-mode=size for CUDA 13 build for binary size reduction (#161316 ) https://github.com/pytorch/pytorch/issues/159779 CUDA 13 added the support for --compress-mode flag for nvcc across all drivers of CUDA 13.X toolkits, enabling the possibility to use --compress-mode=size for significant size reduction (~71% less for CUDA Math APIs for example). https://developer.nvidia.com/blog/whats-new-and-important-in-cuda-toolkit-13-0/ Why we have to add for CUDA 13 only, quote from @ptrblck : Any usage of --compress-mode=size/balance will drop the support of older CUDA drivers and will bump the min. driver requirement to CUDA 12.4. https://github.com/pytorch/pytorch/pull/157791#issuecomment-3058027353 Default for CUDA 13 will be --compress-mode=balance which gives smaller binaries than LZ4 speed mode used in previous CUDA versions. Related - https://github.com/pytorch/pytorch/pull/157791 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161316 Approved by: https://github.com/nWEIdia, https://github.com/Skylion007	2025-08-24 03:28:29 +00:00
Aidyn-A	3e5b021f21	[ATen][CPU][Sparse] Use Third-Party Eigen for sparse add and addmm (#155357 ) This pull request adds the following ops for sparse matrices using Eigen library: ```python add(a_csr, b_csr) add(a_csc, b_csc) addmm(c_csr, a_csr, b_csr) addmm(c_csr, a_csr, b_csc) addmm(c_csr, a_csc, b_csc) addmm(c_csr, a_csc, b_csr) addmm(c_csc, a_csr, b_csr) addmm(c_csc, a_csr, b_csc) addmm(c_csc, a_csc, b_csc) addmm(c_csc, a_csc, b_csr) ``` Currently, the operations for sparse matrices on CPU are available through MKL only. The non-existence of MKL on `aarch64` causes the unavailability of these ops on any machines with ARM based CPUs, including Apple Silicon, AWS Graviton and NVIDIA Grace. This PR addresses this issue by using Eigen as a backend for the above ops. This is a re-factored version of my previous PR #101814. The main difference from the old one is that this does not enable Eigen by default. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155357 Approved by: https://github.com/pearu, https://github.com/eqy Co-authored-by: Eli Uriegas <eliuriegas@meta.com>	2025-08-23 19:03:55 +00:00
Nikita Shulga	4acdbb8311	[MPS] Fix index_copy for strided indices (#161333 ) By passing strides to strided variant of the tensor Fixes https://github.com/pytorch/pytorch/issues/160993 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161333 Approved by: https://github.com/huydhn, https://github.com/wdvr ghstack dependencies: #161206, #161267	2025-08-23 14:38:57 +00:00
PyTorch MergeBot	f912c93344	Revert "Move non inductor workflows to Python 3.9 -> 3.10 (#161182 )" This reverts commit e20f6d798606f3245686e950c43635bbe526232d. Reverted https://github.com/pytorch/pytorch/pull/161182 on behalf of https://github.com/zou3519 due to broke dynamo_wrapped tests, those are a bit finicky to fix (there is probably more than one failure!) ([comment](https://github.com/pytorch/pytorch/pull/161182#issuecomment-3216953097))	2025-08-23 13:00:42 +00:00
Paul de Supinski	33346b5814	Support NUMA Binding for Callable Entrypoints, Take 2 (#161183 ) # Context In #160163, we added support for NUMA binding for `Callable` entrypoints to `elastic_launch`. This requires special consideration, because they go through a different path to spawn subprocesses compared to `str` entrypoints, a path which does not provide a straightforward way to utilize `numactl` CLI. See #160006 for a full description of the challenges. Although #160163 worked in initial local experiments, we ran into some linker errors in other environments when we tried to call `numactl`. This appeared to be due to interactions with how the `LD_PRELOAD` environment variable was being set. # This PR On further thought, the most straightforward, foolproof solution here is to use [the trick that @d4l3k suggested.](https://github.com/pytorch/pytorch/issues/160006#issuecomment-3162018836) Specifically, for each local rank `i`: 1. The parent process sets its own CPU affinity to what local rank `i`'s should be. 2. Then, the parent spawns the subprocess for local rank `i`. 3. Finally, the parent resets its own CPU affinity to what it was originally. There were other solutions that would work just for `Callable` entrypoints, but I believe this is the simplest one that can work for both `str` and `Callable`, and it's pretty simple. This required a bit of refactoring: 1. Turn all the `_get_.*_numactl_options` into functions which return a set of logical CPUs to bind to, rather than options like `--cpunodebind=0`. 2. Instead of wrapping commands with `numactl`, use `os.sched_setaffinity` to bind to the CPUs from (1.). 3. Put this all inside a context manager which encapsulates applying and restoring the bindings in the parent process. 4. 
Use the context manager for both `str` and `Callable` paths # Test Plan ## Automated `$ pytest test/test_numa_binding.py` ## Manual See [doc.](https://docs.google.com/document/d/1vxD-OKYBTT27jbBwtW9iz9g0tNM0u-i0tiTJg_ieQA8/edit?tab=t.0) Meta only, but TLDR tried out every combination of `str`, `Callable`, binding disabled, and binding enabled on the same model and saw 2x SM utilization for binding enabled. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161183 Approved by: https://github.com/d4l3k	2025-08-23 07:23:22 +00:00
Chong Gu	431846a632	[AMD] Fix AMD User Defined Kernel Autotune (#160671 ) Summary: AMD specific kwargs need to be removed from the guard, otherwise a keyerror will be raised when executing the kernel. Test Plan: ``` buck2 run mode/opt-amd-gpu -m rocm641 -c fbcode.split-dwarf=true -c fbcode.use_link_groups=true -c fbcode.enable_gpu_sections=true //hpc/new/models/feed/benchmark:feed_lower_benchmark -- --load=manifold://ads_storage_fblearner/tree/user/facebook/fblearner/predictor/894698382/0/gpu_lowering/new_input8 --skip-eager --skip-flop-estimation --sync-mode=0 --lower-backend=AOT_INDUCTOR ``` can succeed after this change. Rollback Plan: Differential Revision: D80285441 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160671 Approved by: https://github.com/muchulee8	2025-08-23 07:23:09 +00:00
Malay Bag	cd31be28ec	Reland D80238201: [Torch.Export] Add flat arg paths in error message (#160919 ) Summary: [The diff was reverted due to CLA error, in the process of retrieving account] Previous error message ``` RuntimeError: Expected input at args.<unknown location>.shape[0] to be equal to 4096, but got 7680. If you meant for this dimension to be dynamic, please re-export and specify dynamic_shapes (e.g. with Dim.DYNAMIC) ``` New error message ``` RuntimeError: Expected input at args.[0].supervision_input.weight.shape[0] to be equal to 4096, but got 7680. If you meant for this dimension to be dynamic, please re-export and specify dynamic_shapes (e.g. with Dim.DYNAMIC) ``` Test Plan: ``` buck test mode/opt apf/rec/ir/tests:ir_export_deserialize_test ``` https://www.internalfb.com/intern/testinfra/testrun/4785074906254375 ``` buck run mode/opt caffe2/test:test_export -- -r unflatten ``` ``` Ran 413 tests in 208.414s OK (skipped=1, expected failures=13) ``` Rollback Plan: Differential Revision: D80487367 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160919 Approved by: https://github.com/angelayi	2025-08-23 07:20:58 +00:00
PyTorch MergeBot	710514a2a5	Revert "Enable output padding when only outermost dim is dynamic (#159404 )" This reverts commit f15ada5c6fad97a7dcbfa4673f067b6942dda640. Reverted https://github.com/pytorch/pytorch/pull/159404 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/159404#issuecomment-3216517032))	2025-08-23 07:17:30 +00:00
Xu Han	22df59efc0	[inductor] add MSVC language pack check. (#161298 ) Check MSVC's language pack: https://github.com/pytorch/pytorch/issues/157673#issuecomment-3051682766 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161298 Approved by: https://github.com/angelayi	2025-08-23 07:06:48 +00:00
Angel Li	3a4140bf8e	[FlexAttention] fixing learnable bias assertion error in inductor (#161170 ) Users encountered unexpected behaviour when using FlexAttention with learnable biases, including assertion errors (#157677) We traced the root cause to the registration of subgraph buffers—this caused inconsistencies in the naming and ultimately incorrect retrieval later on. This problem only arose if the model was compiled as a whole (ie using @torch.compile) since only then would there be naming conflicts. In this PR, we register the buffers with the base graph to solve this issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161170 Approved by: https://github.com/drisspg	2025-08-23 06:24:22 +00:00
Yang Wang	6443ea337d	enable more tests (#161192 ) Enable more vllm test against pytorch main, add schedule to run the test every 12 hours. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161192 Approved by: https://github.com/huydhn	2025-08-23 06:01:22 +00:00
Justin Chu	36ac916929	[ONNX] Fix lower opset version support in dynamo=True (#161056 ) After we switched to constructing the registry with the specified opset version in dynamo=True, support for opset<18 was broken because there would be no torchlib ops registered for these opsets. I updated the registry creation logic to always use opset 18 if the requested opset is lower, and use the version converter (as designed) to target those opsets. This requires onnxscript>=0.4 (https://github.com/pytorch/pytorch/pull/161312) Fixes https://github.com/onnx/onnx/issues/7235 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161056 Approved by: https://github.com/titaiwangms	2025-08-23 05:04:36 +00:00
PyTorch UpdateBot	7131bfab89	[vllm hash update] update the pinned vllm hash (#161227 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161227 Approved by: https://github.com/pytorchbot	2025-08-23 04:25:16 +00:00
PyTorch UpdateBot	ac8d9418ae	[audio hash update] update the pinned audio hash (#161331 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161331 Approved by: https://github.com/pytorchbot	2025-08-23 04:21:03 +00:00
Justin Chu	38a492d40d	[ONNX] Remove unused _onnx_supported_ops (#161322 ) Signed-off-by: Justin Chu <justinchuby@users.noreply.github.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161322 Approved by: https://github.com/titaiwangms	2025-08-23 02:42:25 +00:00
Kurt Mohler	394728bab2	[MPS] Update `avg_pool3d` kernel to use `opmath_t` (#161071 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161071 Approved by: https://github.com/Skylion007, https://github.com/malfet ghstack dependencies: #161011	2025-08-23 02:36:22 +00:00
Kurt Mohler	121afd6a8f	[MPS] Update `avg_pool2d` to use Metal kernel when `ceil_mode=True` (#161011 ) Fixes #160743 The MPS impl of `avg_pool2d` seems to only give incorrect results when `ceil_mode=True`. I wrote a performance measurement script (`0ee6e58643/avg_pool_mps/perf_2d.py`) which tests a bunch of different cases and also marks the cases where MPS and CPU results do not match. I found that if I update `avg_pool2d` to use the new Metal kernel in all cases, that fixes all the mismatches, but it also decreases performance for some of the `ceil_mode=False` cases. So I opted to only run the new Metal kernel when `ceil_mode=True`, which does not significantly decrease performance in any of the cases tested. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161011 Approved by: https://github.com/malfet	2025-08-23 02:36:22 +00:00
Blaine Burton Rister	d228a776e9	[Inductor-FX] Support Tensorbox outputs (#161245 ) # Problem The FX converter previously supported graph outputs which were `StorageBox`, but not `TensorBox`. The latter seems to show up in certain cases when the output is a slice/view of the input. # Fix This PR generalizes the code to handle `MutableBox` instead of `StorageBox` specifically. # Test Added a CI test exposing the issue. The test case was found by intentionally breaking `TensorBox(ReinterpretView` support in https://github.com/pytorch/pytorch/pull/161258. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161245 Approved by: https://github.com/angelayi	2025-08-23 02:04:13 +00:00
can-gaa-hou	cee72119b2	[Test] Adding a testcase for constant_pad_nd (#161259 ) Fixes #161066 This PR adds a simple testcase for constant_pad_nd on MPS as mentioned in https://github.com/pytorch/pytorch/pull/161149#issuecomment-3211701274 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161259 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-23 01:00:50 +00:00
PyTorch MergeBot	47d267364c	Revert "[SymmMem] Support rendezvous on slice of a tensor (#160825 )" This reverts commit 9d9cc9897ac44a1a8df38211b03d8342a8af48c3. Reverted https://github.com/pytorch/pytorch/pull/160825 on behalf of https://github.com/kwen2501 due to Change of course; use storage_ptr as key ([comment](https://github.com/pytorch/pytorch/pull/160825#issuecomment-3215951048))	2025-08-22 23:41:55 +00:00
Justin Chu	0d9da384ef	Bump onnxscript to 0.4.0 in CI (#161312 ) Use onnxscript apis for torch 2.9. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161312 Approved by: https://github.com/titaiwangms, https://github.com/malfet	2025-08-22 23:23:08 +00:00
Aaron Pollack	f521e82a4e	Update pyrefly config for better codenav (#161200 ) This fixes behavior in codenav by switching from `replace_imports_with_any` to `ignore-missing-imports` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161200 Approved by: https://github.com/aorenste, https://github.com/albanD	2025-08-22 23:05:07 +00:00
Ivan Zaitsev	bcfe1b2d71	Add initial bc-linter configuration (#161319 ) Preparation for https://github.com/pytorch/test-infra/pull/7016 Currently merging this PR is a noop change for PyTorch repo (bc-linter is not looking at the config yet). Pull Request resolved: https://github.com/pytorch/pytorch/pull/161319 Approved by: https://github.com/seemethere, https://github.com/ZainRizvi	2025-08-22 22:54:25 +00:00
Justin Chu	419a2dbf5f	[ONNX] Remove enable_fake_mode and exporter_legacy (#161222 ) Remove enable_fake_mode and exporter_legacy entirely. Even though this is bc breaking, `enable_fake_mode` is no longer compatible with the latest version of transformers, and so it is no longer useful. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161222 Approved by: https://github.com/titaiwangms	2025-08-22 22:15:27 +00:00
Shivam Raikundalia	3373b074f5	[Profiler] Add GC Events to Python Stack Tracer (#161209 ) Summary: Adds Python Garbage Collection to Kineto Traces and Profiler FunctionEvents. Create custom cpp callback in profiler_python.cpp. Then define a python function with cpp and register that callback for all python garbage collection. We don't worry about thread safety in this case because we are only doing init/teardown for main thread while holding GIL. Currently we are hiding this behind experimental config because python tracing tends to be unstable especially when adding any new feature. If this is found to not add too much overhead we can set this to on by default. NOTE: To enable this you need both with_stack=True and the experimental config on! Test Plan: Ran trace with GC induced and saw it on trace Also added a test Rollback Plan: Differential Revision: D80491146 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161209 Approved by: https://github.com/ngimel	2025-08-22 22:11:25 +00:00
Nikita Shulga	c8bb0e4720	[MPS] Fix `index_copy` for scalars (#161267 ) By `squeezing the input` when copying into scalar tensor from a 1d one And enable `test_index_copy_scalars_mps` Fixes https://github.com/pytorch/pytorch/issues/160737 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161267 Approved by: https://github.com/manuelcandales, https://github.com/Skylion007, https://github.com/dcci ghstack dependencies: #161206	2025-08-22 21:45:34 +00:00
Rob Timpe	4c36c8a994	[dynamo] Support method calls on complex ConstantVariables (#161122 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161122 Approved by: https://github.com/mlazos, https://github.com/guilhermeleobas	2025-08-22 21:40:03 +00:00
Yiming Zhou	9d882fd9ff	[benchmark] Add torchscript jit.trace to benchmark option (#161223 ) For comparing NativeRT and TorchScript. We add `torchscript-jit-trace` as an option in the benchmark. With this option, we can trace a model and run inference with the traced module using the TorchScript interpreter ``` python ./benchmarks/dynamo/huggingface.py --performance --inference --torchscript-jit-trace python ./benchmarks/dynamo/timm_models.py --performance --inference --torchscript-jit-trace python ./benchmarks/dynamo/torchbench.py --performance --inference --torchscript-jit-trace ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161223 Approved by: https://github.com/huydhn	2025-08-22 21:38:28 +00:00
Eddie Yan	2835cc5e91	[cuDNN] head dim > 128 works on H100 again in cuDNN SDPA? (#161210 ) reference: https://github.com/pytorch/torchtitan/pull/1610 9.10 only for now, we would want to hold off on upgrading to either cuDNN frontend 1.14+/cuDNN 9.11+ due to some head-dim > 128 handling issues Pull Request resolved: https://github.com/pytorch/pytorch/pull/161210 Approved by: https://github.com/Skylion007	2025-08-22 21:21:53 +00:00
PyTorch MergeBot	3f1a97a99c	Revert "[dynamic shapes] unbacked-safe slicing (#157944 )" This reverts commit 44549c7146bd6c4166f97e856037babe1b7f4f49. Reverted https://github.com/pytorch/pytorch/pull/157944 on behalf of https://github.com/pianpwk due to this PR & internal diff landed out of sync, just reverted internal with D80720654, will revert this & reland as codev ([comment](https://github.com/pytorch/pytorch/pull/157944#issuecomment-3215610135))	2025-08-22 20:48:46 +00:00
PyTorch MergeBot	981ac533c6	Revert "Close some sources of fake tensor leakages (#159923 )" This reverts commit 5afa4187dfe1e99278f8e372ec09102d5b937572. Reverted https://github.com/pytorch/pytorch/pull/159923 on behalf of https://github.com/zou3519 due to broke aoti test in inductor periodic ([comment](https://github.com/pytorch/pytorch/pull/159923#issuecomment-3215580688))	2025-08-22 20:42:50 +00:00
Gabriel Ferns	3ea6cc8c2d	Fix conv exhaustive autotuning and expand Exhaustive test coverage (#159387 ) Conv exhaustive currently throws an error, and I think it's worth adding tests to the other ops too in order to prevent regression in exhaustive. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159387 Approved by: https://github.com/coconutruben	2025-08-22 20:06:09 +00:00
PyTorch MergeBot	2c0650a00a	Revert "[BE][inductor] tl.dot(..., allow_tf32=...) -> tl.dot(..., input_precision=...) (#160711 )" This reverts commit 8dbe7f99bd707ee28ae12ecb9cab54e1785bf13e. Reverted https://github.com/pytorch/pytorch/pull/160711 on behalf of https://github.com/davidberard98 due to internal failure - T235384144 - I'll revert while I investigate. ([comment](https://github.com/pytorch/pytorch/pull/160711#issuecomment-3215343200))	2025-08-22 19:10:35 +00:00
PyTorch MergeBot	eba1ad09e4	Revert "[SymmMem] Support rendezvous on view of a tensor (#160925 )" This reverts commit 9d7cecdd6c44c5421d341bcc359be4097ea9a2f5. Reverted https://github.com/pytorch/pytorch/pull/160925 on behalf of https://github.com/kwen2501 due to Change of course: use storage ptr as symm mem keys as in the old days and force no_split in MemPool ([comment](https://github.com/pytorch/pytorch/pull/160925#issuecomment-3215315717))	2025-08-22 18:59:25 +00:00
Wang, Chuanqi	a43480d19c	[CD] Enable triton xpu Windows build for Python 3.14 (#161255 ) Follow #159869 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161255 Approved by: https://github.com/atalman	2025-08-22 18:39:31 +00:00
Xu Han	17b0263e86	[inductor] fix march=native pass to Windows CC. (#161264 ) fix march=native pass to Windows CC. <img width="593" height="218" alt="image" src="https://github.com/user-attachments/assets/1caedffa-d9be-43d9-9ce2-590c055980cd" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161264 Approved by: https://github.com/angelayi	2025-08-22 18:38:51 +00:00
Xu Han	97200c9711	[inductor] Add get page_size support for Windows. (#161273 ) `resource` doesn't work on Windows, as it is a Unix-specific package, as seen in https://docs.python.org/2/library/resource.html Use the Windows system API to get page_size. Locally tested: <img width="467" height="433" alt="image" src="https://github.com/user-attachments/assets/47a39060-3aea-46c3-bd8e-35a39413c51f" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161273 Approved by: https://github.com/angelayi	2025-08-22 18:36:14 +00:00
PyTorch MergeBot	1d458e2947	Revert "[Inductor] Update Outer Reduction Heuristic (#159093 )" This reverts commit f085f299584b06a2a7d8855eda2a411313e782ad. Reverted https://github.com/pytorch/pytorch/pull/159093 on behalf of https://github.com/seemethere due to this fails internal tests, see D80630416 for more info ([comment](https://github.com/pytorch/pytorch/pull/159093#issuecomment-3215263317))	2025-08-22 18:35:36 +00:00
Yidi Wu	266784ec6a	remove old while_loop_schema_gen test (#161202 ) Fixes https://github.com/pytorch/pytorch/issues/141202. This test is flaky for mysterious reasons and we have created a new way of creating schemas for hops. So delete the test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161202 Approved by: https://github.com/zou3519	2025-08-22 18:22:29 +00:00
Jeff Daily	25df65afd8	[ROCm] revamp HIPCachingAllocatorMasqueradingAsCUDA (#161221 ) HIPAllocatorMasqueradingAsCUDA and HIPCachingAllocatorMasqueradingAsCUDA are now proper complete wrappers of HIPAllocator and HIPCachingAllocator, respectively. HIPAllocatorMasqueradingAsCUDA now subclasses HIPAllocator instead of Allocator. This fixes usability of hipify replacing c10::cuda::CUDACachingAllocator::get() where callers expect a CUDAAllocator to be returned but were getting a very thin Allocator shim instead. This also fixes using cudagraph trees with torch compile. The hip:0 device was not being replaced by the cuda:0 device in all methods. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161221 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-22 18:13:12 +00:00
atalman	e20f6d7986	Move non inductor workflows to Python 3.9 -> 3.10 (#161182 ) Related to: https://github.com/pytorch/pytorch/issues/161167 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161182 Approved by: https://github.com/malfet, https://github.com/huydhn	2025-08-22 16:48:43 +00:00
Nikita Shulga	c2390087c3	[MPS] Fix index_select for scalar_types (#161206 ) By copy-n-pasting logic from `index_select_out_cpu` (and `_cuda`), where essentially the resizing is done inside the op, which also fixes faulty logic for scalars Pull Request resolved: https://github.com/pytorch/pytorch/pull/161206 Approved by: https://github.com/manuelcandales	2025-08-22 16:45:35 +00:00
zeshengzong	f09458c2e1	Enable `test/test_numpy_interop.py` config in mypy (#158556 ) ## Test Result ```bash lintrunner --take MYPY test/test_numpy_interop.py Warning: Could not find a lintrunner config at: '.lintrunner.private.toml'. Continuing without using configuration file. ok No lint issues. ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158556 Approved by: https://github.com/soulitzer	2025-08-22 16:18:58 +00:00
Jithun Nair	7fcdd8d6af	Use ROCm MI325 runners for trunk.yml (#161184 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161184 Approved by: https://github.com/jeffdaily	2025-08-22 16:18:55 +00:00
PyTorch MergeBot	c7a77470c5	Revert "[DTensor] Make default RNG semantics match user-passed generator (#160482 )" This reverts commit d1faf2ef0476eb60b42c057baee9af0f48ae849a. Reverted https://github.com/pytorch/pytorch/pull/160482 on behalf of https://github.com/jeffdaily due to failing cuda and rocm jobs ([comment](https://github.com/pytorch/pytorch/pull/160482#issuecomment-3214694297))	2025-08-22 15:04:28 +00:00
Rex Zhang	ce467df5d1	rm platform args xplat/langtech/mobile/BUCK (#161018 ) Differential Revision: D80460691 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161018 Approved by: https://github.com/drisspg	2025-08-22 14:47:36 +00:00
IvanKobzarev	db44de4c0d	[inductor] Estimate peak memory allocfree and applying to reordering collectives (#160113 ) 1. Applying @eellison idea from https://github.com/pytorch/pytorch/pull/146562#discussion_r2059363672 for estimate_peak_memory: ``` """ Alternative version of estimate_peak_memory, that respects the fact, that every SchedulerNode has multiple phases: 1. alloc ( outputs ) 2. run_kernel 3. dealloc last_use buffers estimate_peak_memory collapses memory into one value: size_alloc - size_free While peak memory happens after alloc. Duplicating the code to not migrate all callsites at once, In future usages of estimate_peak_memory will migrate to this version. """ ``` - Applying this in `reorder_communication_preserving_peak_memory` pass. 2. Buffers during reordering can change deallocation point, if candidate and group to swap both are users of the f_input_buf and group contains last_use_snode. - Addressing this tracking the last_use_snode for each buffer and recomputing current memory respecting the change in size_free (group_node after reordering is not the last user of the buffer and its size_free -= buffer_size, while candidate becomes the last user and candidate.size_free += buffer_size). 4. Adding env var `PYTORCH_REORDER_COLLECTIVES_LIMIT` for ablation to limit number of collectives to reorder. What is after this PR: Iterative recomputation of memory estimations matches full memory estimations. Active memory is not regressing a lot, but reserved memory is significantly regressed. Investigation and fix of "reserved" memory will be in following PRs. 
BASELINE (bucketing AG and RS): active: 32Gb reserved: 34Gb ``` [rank0]:[titan] 2025-08-11 11:28:36,798 - root - INFO - step: 1 loss: 12.2722 grad_norm: 4.2192 active_memory: 24.66GiB(25.96%) reserved_memory: 25.38GiB(26.72%) tps: 99 tflops: 5.71 mfu: 0.58% [rank0]:[titan] 2025-08-11 11:28:38,640 - root - INFO - step: 2 loss: 13.1738 grad_norm: 50.5566 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 4,448 tflops: 257.63 mfu: 26.05% [rank0]:[titan] 2025-08-11 11:28:40,029 - root - INFO - step: 3 loss: 15.6866 grad_norm: 80.0862 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,900 tflops: 341.72 mfu: 34.55% [rank0]:[titan] 2025-08-11 11:28:41,423 - root - INFO - step: 4 loss: 13.4853 grad_norm: 7.8538 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,881 tflops: 340.57 mfu: 34.44% [rank0]:[titan] 2025-08-11 11:28:42,820 - root - INFO - step: 5 loss: 16.1191 grad_norm: 53.2481 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,867 tflops: 339.77 mfu: 34.35% ``` REORDER: active: 32Gb reserved: 36Gb ``` [rank0]:[titan] 2025-08-11 11:34:32,772 - root - INFO - step: 1 loss: 12.2490 grad_norm: 4.1944 active_memory: 24.66GiB(25.96%) reserved_memory: 26.81GiB(28.22%) tps: 85 tflops: 4.90 mfu: 0.50% [rank0]:[titan] 2025-08-11 11:34:35,329 - root - INFO - step: 2 loss: 13.1427 grad_norm: 39.5942 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 3,205 tflops: 185.61 mfu: 18.77% [rank0]:[titan] 2025-08-11 11:34:36,770 - root - INFO - step: 3 loss: 14.6084 grad_norm: 51.0743 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,688 tflops: 329.44 mfu: 33.31% [rank0]:[titan] 2025-08-11 11:34:38,197 - root - INFO - step: 4 loss: 13.6181 grad_norm: 8.1122 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,744 tflops: 332.68 mfu: 33.64% [rank0]:[titan] 2025-08-11 11:34:39,821 - root - INFO - step: 5 loss: 15.8913 grad_norm: 59.8510 
active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,046 tflops: 292.22 mfu: 29.55% ``` REORDER + SINK_WAITS_ITERATIVE: active: 35Gb reserved: 41Gb ``` [rank0]:[titan] 2025-08-11 11:31:36,119 - root - INFO - step: 1 loss: 12.2646 grad_norm: 4.1282 active_memory: 27.60GiB(29.05%) reserved_memory: 32.49GiB(34.20%) tps: 173 tflops: 10.00 mfu: 1.01% [rank0]:[titan] 2025-08-11 11:31:37,452 - root - INFO - step: 2 loss: 13.2353 grad_norm: 42.4234 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,152 tflops: 356.26 mfu: 36.02% [rank0]:[titan] 2025-08-11 11:31:38,780 - root - INFO - step: 3 loss: 13.8205 grad_norm: 24.0156 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,169 tflops: 357.29 mfu: 36.13% [rank0]:[titan] 2025-08-11 11:31:40,106 - root - INFO - step: 4 loss: 13.1033 grad_norm: 9.1167 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,183 tflops: 358.10 mfu: 36.21% [rank0]:[titan] 2025-08-11 11:31:41,443 - root - INFO - step: 5 loss: 16.3530 grad_norm: 51.8118 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,130 tflops: 355.03 mfu: 35.90% ``` Differential Revision: [D80718143](https://our.internmc.facebook.com/intern/diff/D80718143) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160113 Approved by: https://github.com/wconstab, https://github.com/eellison Co-authored-by: eellison <elias.ellison@gmail.com>	2025-08-22 14:19:57 +00:00
PyTorch MergeBot	639b8cc51d	Revert "cd: Add no-cache for test binaries (#149218 )" This reverts commit 523bffd38856dc9fca36bddded64f74822a6e1a2. Reverted https://github.com/pytorch/pytorch/pull/149218 on behalf of https://github.com/atalman due to Let's not use no-cache flags on test binaries ([comment](https://github.com/pytorch/pytorch/pull/149218#issuecomment-3214338844))	2025-08-22 13:14:23 +00:00
Ting Lu	49ff884b1e	Add CUDA 13.0 x86 builds (#160956 ) https://github.com/pytorch/pytorch/issues/159779 CUDA 13.0.0 NVSHMEM 3.3.20 CUDNN 9.12.0.46 Adding x86 linux builds for CUDA 13. Adding libtorch docker. Package naming changed for CUDA 13 (removed postfix -cu13 for some packages). Preparation checklist: 1. Update index https://download.pytorch.org/whl/nightly/cu130 with pypi packages 2. Update packaging name based on https://pypi.org/project/cuda-toolkit/ metadata Pull Request resolved: https://github.com/pytorch/pytorch/pull/160956 Approved by: https://github.com/atalman Co-authored-by: atalman <atalman@fb.com>	2025-08-22 11:31:09 +00:00
Ting Lu	a68f63e331	Add Windows CUDA 13 build and magma script (#161073 ) Add magma build 13.0 for Windows Add cuda_install.bat 13.0 for Windows build https://github.com/pytorch/pytorch/issues/159779 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161073 Approved by: https://github.com/atalman Co-authored-by: Andrey Talman <atalman@fb.com>	2025-08-22 11:24:25 +00:00
Tom Ritchford	774b4befa1	[BE] [dynamo] Simplify two methods in ConstDictVariable (#159361 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159361 Approved by: https://github.com/anijain2305	2025-08-22 11:11:30 +00:00
FFFrog	2beffb3311	Refactoring TensorImpl by using constexpr and std::is_same_v (#161043 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161043 Approved by: https://github.com/Skylion007	2025-08-22 10:49:49 +00:00
frost-intel	9b4adc4db7	[fr] [xpu] Add FlightRecorder support for ProcessGroupXCCL (#158568 ) Adds support for FlightRecorder in ProcessGroupXCCL. See https://github.com/intel/torch-xpu-ops/pull/1867 for XCCL implementation and more details. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158568 Approved by: https://github.com/guangyey, https://github.com/fduwjj	2025-08-22 09:03:35 +00:00
Arsh Zahed	9e491f753e	[dynamo] Remove extra if statement in builder _wrap (#161215 ) Removes a redundant if statement. Does not impact logic so no test changes needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161215 Approved by: https://github.com/StrongerXi	2025-08-22 08:56:06 +00:00
Yu, Guangye	373e25c2eb	Disable background threads for XPU host allocator (#161242 ) # Motivation https://github.com/pytorch/pytorch/pull/160505 enables background threads for XPU host allocator. However, it will hang on Windows during program exit. Now disable it until we narrow down the issue. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161242 Approved by: https://github.com/EikanWang	2025-08-22 08:40:13 +00:00
IvanKobzarev	595987d28d	[bucketing] allow convert_element_type after fsdp reduce_scatter (#161159 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161159 Approved by: https://github.com/eellison	2025-08-22 06:41:50 +00:00
Xu Han	c4670e40c9	[inductor] remove Windows unsupported build options. (#161197 ) Changes: 1. Math related build option is not supported by msvc, skip them on Windows. 2. Move all math related build option to `_get_ffast_math_flags` function. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161197 Approved by: https://github.com/jansel	2025-08-22 06:23:43 +00:00
Xu Han	9b3ebd25ac	[inductor] Enable max compatible to msvc for oneAPI headers. (#161196 ) Enable max compatible to msvc for oneAPI headers. The key context is `The /permissive- option is compatible with almost all of the header files from the latest Windows Kits` from https://learn.microsoft.com/en-us/cpp/build/reference/permissive-standards-conformance?view=msvc-170 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161196 Approved by: https://github.com/jansel	2025-08-22 06:23:26 +00:00
zeshengzong	f8bd85827d	Optimize `zero_grad` description (#161239 ) Optimize [zero_grad doc](https://docs.pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html) format and description. ## Test Result ### Before <img width="996" height="534" alt="image" src="https://github.com/user-attachments/assets/e1db973c-57e8-4525-90e7-0500cde2263d" /> ### After <img width="890" height="496" alt="image" src="https://github.com/user-attachments/assets/5579c4fb-a857-4030-9303-34770083d1a5" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161239 Approved by: https://github.com/janeyx99	2025-08-22 06:18:25 +00:00
Huy Do	bc7eaa0d8a	[BE] Remove the default TORCH_CUDA_ARCH_LIST in CI Docker image (#161137 ) This doesn't make sense to have this default to Maxwell, which is too old. All other places in CI/CD needs to overwrite this value. IMO, it makes more sense to not set this at all and let CI/CD jobs set it for their own use cases instead. This is partly responsible for the build failure in https://github.com/pytorch/pytorch/issues/160988 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161137 Approved by: https://github.com/msaroufim	2025-08-22 06:03:11 +00:00
Yang Wang	0dea191ff7	[VLLM TEST]setup test workflow (#160583 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160583 Approved by: https://github.com/huydhn, https://github.com/atalman	2025-08-22 05:38:39 +00:00
Simon Fan	8aad3a60ce	[dynamo] propagate tensor metadata on Tensor.__setitem__(tensor) (#161036 ) Fixes silent incorrectness for autograd function tracing, where we rely on FakeTensor metadata (requires_grad) to determine whether to HOP or not: `5ee464db5c/torch/_dynamo/variables/misc.py (L671)` Stared at this with @anijain2305 yesterday, `Tensor.__setitem__` can update tensor metadata, and we can just run the fake prop and extract the output metadata from the updated FakeTensor. FIXES https://github.com/pytorch/pytorch/issues/160901 It should also be the root cause behind the issue in https://github.com/pytorch/torchtitan/pull/1604 @bdhirsh @ruisizhang123 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161036 Approved by: https://github.com/anijain2305 ghstack dependencies: #160805	2025-08-22 04:43:22 +00:00
PyTorch UpdateBot	c7fb031706	[audio hash update] update the pinned audio hash (#161226 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161226 Approved by: https://github.com/pytorchbot	2025-08-22 04:22:08 +00:00
Yiming Zhou	c60dea5261	[export] Allow tempfile._TemporaryFileWrapper in package_pt2 (#161203 ) Summary: We use tempfile.NamedTemporaryFile to create a temporary pt2 file in `test_nativert.py` However, it is not recognized as an allowed file format and a warning will be thrown. Test Plan: CI Rollback Plan: Differential Revision: D80740916 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161203 Approved by: https://github.com/angelayi	2025-08-22 04:10:35 +00:00
Phoslight	bf8431ba06	[inductor][cpu] Fix double-offset issue in `GEMM_TEMPLATE` (#159233 ) Fixes #158076 Basically, the gemm template generates code like ``` cpp_CppMicroGemmRef_micro_gemm<static_cast<bool>(false), static_cast<bool>(false)>( &(X[static_cast<int64_t>(k_start + 196LLm_start + 38416LLks_b_index)]), &(W[static_cast<int64_t>(200704000LL + n_start + 80LLk_start + 15680LLks_b_index)]), &(local_acc_buf[static_cast<int64_t>(Nrnci + ((-1LL)Nrnc))]), static_cast<int64_t>(m_end + ((-1LL)m_start)), static_cast<int64_t>(Nr), static_cast<int64_t>(k_end + ((-1LL)k_start)), static_cast<int64_t>(196LL), static_cast<int64_t>(80LL), static_cast<int64_t>(Nc_blocksNr) ); ``` However, when the input tensor W has a storage offset, this results in a double offset issue. That is, the resulting pointer is `2 * 200704000LL` away from `W.storage().data_ptr()`, which causes an out-of-bounds access. The storage offset of `W` is introduced by [this patch](https://github.com/pytorch/pytorch/pull/136421/files), but I think it's a reasonable fix. So `cpp_gemm_template.py` should handle input matrices with storage offsets properly. I think a good way to fix this issue is to create a new matrix that has no storage offset. When `should_block_weights` is true, `block_weight()` creates a clean new matrix, so that branch is not affected by this issue. BTW I've also examined the FX IRs generated by `torch.compile()`, as well as the generated python module, and they are correct. The newly-added test in `test_cpu_select_algorithm.py` can reproduce the issue. With this patch, the crash is fixed. It also resolves the crash reported in #158076. I ran CPU tests in `test_cpu_select_algorithm.py`, but many of them are skipped due to MKL and AMX. I'd be appreciated if someone can help verify the test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159233 Approved by: https://github.com/leslie-fang-intel, https://github.com/swolchok	2025-08-22 03:47:28 +00:00
Jovian Anthony Jaison	2fdd4f918c	Log exception_stack_trace to dynamo_compile (#161096 ) Note: Adding unit test for this is tricky as having errors in the specific unit test would cause test_utils.py to crash all together. Tested as follows: 1. Added x = 1/0 after guarded_code = compile_inner(code, one_graph, hooks, transform) in convert_frame.py 2. Printed exception_stack_trace and got: ['Traceback (most recent call last):\n File "/data/users/jovian/pytorch/torch/_dynamo/convert_frame.py", line 1207, in _compile\n x = 1/0\n ~^~\nZeroDivisionError: division by zero\n'] Pull Request resolved: https://github.com/pytorch/pytorch/pull/161096 Approved by: https://github.com/c00w	2025-08-22 03:29:15 +00:00
Scott Todd	31a41daff4	[ROCm][Windows] Include native_transformers srcs to fix link errors. (#160373 ) Following up on https://github.com/pytorch/pytorch/pull/152951#discussion_r2267714825, this removes a few lines added in that pull request, fixing link errors like ``` [7019/7028] Linking CXX shared library bin\torch_hip.dll FAILED: [code=4294967295] bin/torch_hip.dll lib/torch_hip.lib C:\Windows\system32\cmd.exe /C "cd . && D:\projects\TheRock\external-builds\pytorch\3.12.venv\Lib\site-packages\cmake\data\bin\cmake.exe -E vs_link_dll --msvc-ver=1942 --intdir=caffe2\CMakeFiles\torch_hip.dir --rc=C:\PROGRA~2\WI3CF2~1\10\bin\100261~1.0\x64\rc.exe --mt=C:\PROGRA~2\MICROS~2\2022\BUILDT~1\VC\Tools\Llvm\x64\bin\llvm-mt.exe --manifests -- D:\projects\TheRock\external-builds\pytorch\3.12.venv\Lib\site-packages\_rocm_sdk_devel\lib\llvm\bin\lld-link.exe /nologo @CMakeFiles\torch_hip.rsp /out:bin\torch_hip.dll /implib:lib\torch_hip.lib /pdb:bin\torch_hip.pdb /dll /version:0.0 /machine:x64 /ignore:4049 /ignore:4217 /ignore:4099 /INCREMENTAL:NO && cd ." 
LINK: command "D:\projects\TheRock\external-builds\pytorch\3.12.venv\Lib\site-packages\_rocm_sdk_devel\lib\llvm\bin\lld-link.exe /nologo @CMakeFiles\torch_hip.rsp /out:bin\torch_hip.dll /implib:lib\torch_hip.lib /pdb:bin\torch_hip.pdb /dll /version:0.0 /machine:x64 /ignore:4049 /ignore:4217 /ignore:4099 /INCREMENTAL:NO /MANIFEST:EMBED,ID=2" failed (exit code 1) with the following output: lld-link: error: undefined symbol: __declspec(dllimport) class std::tuple<class at::Tensor, class at::Tensor, class at::Tensor> __cdecl at::native::transform_bias_rescale_qkv_cuda(class at::Tensor const &, class at::Tensor const &, __int64) >>> referenced by caffe2\CMakeFiles\torch_hip.dir\__\aten\src\ATen\RegisterCUDA_0.cpp.obj:(class std::tuple<class at::Tensor, class at::Tensor, class at::Tensor> __cdecl at::`anonymous namespace'::`anonymous namespace'::wrapper_CUDA___transform_bias_rescale_qkv(class 0xE9BF7323::Tensor const &, class 0xE9BF7323::Tensor const &, __int64)) >>> referenced by caffe2\CMakeFiles\torch_hip.dir\__\aten\src\ATen\RegisterNestedTensorCUDA_0.cpp.obj:(class std::tuple<class at::Tensor, class at::Tensor, class at::Tensor> __cdecl at::`anonymous namespace'::`anonymous namespace'::wrapper_NestedTensorCUDA___transform_bias_rescale_qkv(class 0xEFEB5304::Tensor const &, class 0xEFEB5304::Tensor const &, __int64)) ``` The `native_transformers_hip_hip` and `native_transformers_hip_cpp` sources are okay to define (and are required) even if accelerated versions of these operations are not available. I've tested downstream builds of torch with ROCm on native Windows via https://github.com/ROCm/TheRock both with and without aotriton and these changes were needed for the build to succeed in both cases. I have _not_ tested Linux, WSL, or with the HIP SDK. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160373 Approved by: https://github.com/alugorey, https://github.com/jeffdaily	2025-08-22 01:43:25 +00:00
Jane Xu	cc791d5857	Quick fix to headers in stable/tensor_inl.h (#161168 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161168 Approved by: https://github.com/mikaylagawarecki, https://github.com/Skylion007	2025-08-22 01:27:44 +00:00
Yiming Zhou	be2e6b3158	[export] Remove unused Model, tensor_paths, constant_paths (#161185 ) Summary: Removed `Model`, it's not being used anywhere so it's safe. Removed `tensor_paths` and `constant_paths` fields in `ExportedProgram` - BC: when the current deserializer loads a previously serialized EP (that comes with empty `tensor_paths` and `constant_paths`), it will just ignore those two fields - FC: when the old deserializer loads a newly serialized EP (that doesn't come with `tensor_paths` and `constant_paths`), it will also ignore those two fields in `_dict_to_dataclass()` Differential Revision: D80725094 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161185 Approved by: https://github.com/SherlockNoMad	2025-08-22 01:07:01 +00:00
eellison	a85711d565	Avoid making node a successor/predecessor of itself (#161205 ) This fixes an assertion we were running into in the memory planning about not having an acyclic graph. The repro is very long so hard to make local test of, but fixes repro I am looking at. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161205 Approved by: https://github.com/IvanKobzarev, https://github.com/bdhirsh	2025-08-22 00:30:29 +00:00
dolpm	ff4f5dd8ed	[nativert] oss layout planner tests (#160942 ) Summary: att - changed one of the tests to get rid of torcharrow dep. Test Plan: ``` buck2 test //caffe2/test/cpp/nativert:layout_planner_tests Tests finished: Pass 15. Fail 0. Fatal 0. Skip 0. Build failure 0 ``` Rollback Plan: Reviewed By: SherlockNoMad Differential Revision: D80108549 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160942 Approved by: https://github.com/georgiaphillips, https://github.com/henryoier	2025-08-22 00:26:25 +00:00
Ankita George	46429be723	[DCP][HF] Add option to parallelize reads in HF Storage Reader (#160205 ) Parallelize reading of data behind thread_count argument to HFStorageReader Test plan: ensure existing tests pass and run a job successfully with these changes Differential Revision: [D79478188](https://our.internmc.facebook.com/intern/diff/D79478188/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160205 Approved by: https://github.com/meetv18	2025-08-21 23:58:02 +00:00
dependabot[bot]	f5bf5147ad	Bump uv from 0.8.4 to 0.8.6 in /.ci/lumen_cli (#161212 ) Bumps [uv](https://github.com/astral-sh/uv) from 0.8.4 to 0.8.6. - [Release notes](https://github.com/astral-sh/uv/releases) - [Changelog](https://github.com/astral-sh/uv/blob/main/CHANGELOG.md) - [Commits](https://github.com/astral-sh/uv/compare/0.8.4...0.8.6) --- updated-dependencies: - dependency-name: uv dependency-version: 0.8.6 dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-08-21 15:54:34 -07:00
PyTorch MergeBot	fc0683b1e7	Revert "[ATen][CPU][Sparse] Use Third-Party Eigen for sparse add and addmm (#155357 )" This reverts commit ce048de608180fa88335e5821070472539968b54. Reverted https://github.com/pytorch/pytorch/pull/155357 on behalf of https://github.com/seemethere due to This is causing buck builds to fail since we didn't add the definition of AT_USE_EIGEN_SPARSE in the buckbuild.bzl file, will follow-up and re-land this. ([comment](https://github.com/pytorch/pytorch/pull/155357#issuecomment-3212270510))	2025-08-21 22:38:40 +00:00
Nikita Shulga	cb57953215	[BE] Enable `test_index_put_accumulate_duplicate_indices` on MPS (#161201 ) By changing dtype to float if device is MPS Note: for some reason test runs much longer on MPS than on CPU ``` % python ../test/test_indexing.py -v -k test_index_put_accumulate_duplicate_indices_mps test_index_put_accumulate_duplicate_indices_mps (__main__.TestIndexingMPS.test_index_put_accumulate_duplicate_indices_mps) ... ok ---------------------------------------------------------------------- Ran 1 test in 9.139s OK ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161201 Approved by: https://github.com/dcci	2025-08-21 22:05:42 +00:00
PaulZhang12	f085f29958	[Inductor] Update Outer Reduction Heuristic (#159093 ) Update outer reduction heuristics for significant speedups. HuggingFace: <img width="572" height="705" alt="Screenshot 2025-08-20 at 12 44 51 AM" src="https://github.com/user-attachments/assets/4872a23b-d136-423a-b2e6-187895bccba1" /> Average ~20% speedup on a kernel by kernel basis TorchBench: <img width="572" height="705" alt="Screenshot 2025-08-20 at 12 45 10 AM" src="https://github.com/user-attachments/assets/b8357b6d-6107-4104-b906-292a17d14d48" /> Average ~40% speedup on a kernel by kernel basis <img width="1705" height="729" alt="Screenshot 2025-08-21 at 5 50 32 PM" src="https://github.com/user-attachments/assets/a9715a2b-9e6c-4b33-ba9f-7870dc561e31" /> Differential Revision: [D80630416](https://our.internmc.facebook.com/intern/diff/D80630416) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159093 Approved by: https://github.com/jansel	2025-08-21 22:02:49 +00:00
Will Constable	d1faf2ef04	[DTensor] Make default RNG semantics match user-passed generator (#160482 ) Previously, DTensor kept its own copy of the generator state after the first time a random operator was called on a DTensor. This copy would evolve independently from the generator outside of DTensor. After adding support for users to pass a specific generator into random operators (e.g. `uniform_(..., generator=)`), it was determined (in discussion on #159991) to change the semantics so that any random operations performed on DTensor would evolve the state of the publicly visible generators (either the default one or user-passed one). The upsides are (1) it is now possible to call torch.manual_seed() at any point in the program and have a consistent effect on DTensor, (2) DTensor ops have an observable effect on the generator. The downside is that users are now responsible for seeding their generator before using DTensor, ensuring all ranks use the same seed. Fixes #159991 confirmed docs rendered OK <img width="897" height="414" alt="image" src="https://github.com/user-attachments/assets/c082f0f0-5447-47aa-834f-65342eb237cd" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160482 Approved by: https://github.com/wanchaol	2025-08-21 22:02:16 +00:00
Yang Wang	cc2b65a91a	[VLLM]setup test cli logics (#160361 ) setup vllm test logics. 1. install wheels generated from previous build stage 2. generate and install vllm test pkg list on run time based on the torch wheels in the instance 3. run test based on the pre-defined test plan notice the test-plan format is temporary for some basic vllm testing Pull Request resolved: https://github.com/pytorch/pytorch/pull/160361 Approved by: https://github.com/atalman, https://github.com/huydhn	2025-08-21 21:59:41 +00:00
Gabriel Ferns	67fc16c744	Add profiler analysis flag to combine multiple profiles into one (#161145 ) Combine multiple profiles into one: ``` python profile_analysis.py --combine <file1> <file2> ... <out> ``` This only works well if they have different pids, like from different programs in a distributed run. <img width="1521" height="465" alt="combining_multiple_profiles" src="https://github.com/user-attachments/assets/aba7112b-e9a9-4075-b82b-a4e4408384da" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/161145 Approved by: https://github.com/xmfan	2025-08-21 21:36:58 +00:00
Ankita George	fb241d0a44	[dcp][hf] Fix multi-rank consolidation for no files to process case (#160660 ) Summary: In the consolidate_safetensors_files_on_every_rank method, where we use multiple ranks to combine sharded safetensors files, if there are more ranks in the world size than there are safetensors files to consolidate, then some ranks don't have to do any work. When I tested, this case wasn't caught, and there was an extra barrier call, causing issues for the ranks that had no work to do. They should wait at the end, as do the ranks with work. Test Plan: tested this case on a job e2e added a unit test Rollback Plan: Differential Revision: D80273616 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160660 Approved by: https://github.com/sibuachu	2025-08-21 21:18:03 +00:00
Jagadish Krishnamoorthy	d2b8c0d431	forward fix of #152198 (#161166 ) torch._inductor.virtualized.OpsValue instances do not have a shape attribute. This breaks the fp8 test on ROCm. Add the OpsValue class to the todo list. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161166 Approved by: https://github.com/jeffdaily	2025-08-21 21:09:48 +00:00
can-gaa-hou	e25ee0290e	Fix constant_pad_nd_mps bug when pad is empty (#161149 ) Fixes #161066 There is a size check here, which causes the error. `8ce81bcee1/aten/src/ATen/native/mps/operations/Pad.mm (L39-L40)` If the argument `pad` is empty, it will return the cloned tensor on CPU. `8ce81bcee1/aten/src/ATen/native/PadNd.cpp (L43-L64)` Therefore, this PR fixes the empty padding argument error by checking the size first and returning a cloned tensor immediately if the padding size is 0. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161149 Approved by: https://github.com/malfet	2025-08-21 20:45:26 +00:00
Animesh Jain	5805c4210b	[invoke_subgraph][inductor] Thread graphsafe rng input states for hops (#160713 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160713 Approved by: https://github.com/eellison	2025-08-21 20:41:29 +00:00
Xu Han	db38c44ad6	[inductor] add libraries_dirs for level_zero (#161146 ) Changes: 1. change set `include_dirs` to append value. 2. add append `libraries_dirs` for level_zero. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161146 Approved by: https://github.com/angelayi	2025-08-21 19:55:12 +00:00
Xu Han	1e3fe78a10	[inductor] disable min/max macro on Windows. (#161133 ) Disable min/max macro on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161133 Approved by: https://github.com/angelayi	2025-08-21 19:52:56 +00:00
Tsung-Hsien Lee	a445b41e4f	[pytorch] Simplify PyTorch `foreach_*` API restrictions check (#161039 ) Summary: C++ polymorphism and component reuse help us reduce the amount of boilerplate code here. Test Plan: CI & tests Rollback Plan: Differential Revision: D80594353 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161039 Approved by: https://github.com/janeyx99	2025-08-21 19:50:02 +00:00
Tsung-Hsien Lee	801851086d	[pytorch] Invoke `vector.reserve()` consistently for non-inplace foreach operations (#161128 ) Summary: The `reserve()` method is used to pre-allocate memory for the result vector before adding elements to it. This is an optimization that makes sense for several reasons: 1. Performance improvement: By pre-allocating memory for the exact number of elements needed, it avoids multiple reallocations and memory copies that would occur as the vector grows dynamically. 2. Memory efficiency: It ensures that the vector allocates exactly the amount of memory needed, no more and no less, which is efficient when we know the final size in advance. 3. Reduced overhead: Each reallocation typically involves: - Allocating a new, larger block of memory - Copying all existing elements to the new location - Destroying the old elements - Deallocating the old memory block - Consistent performance: Without reservation, vector growth typically follows a geometric progression (like 1, 2, 4, 8, 16...), which can lead to unpredictable performance spikes when reallocation occurs. Test Plan: OSS CI & tests Rollback Plan: Differential Revision: D80674453 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161128 Approved by: https://github.com/Skylion007	2025-08-21 19:43:11 +00:00
dolpm	958f9ca88e	[nativert] oss static kernel tests (#161087 ) Summary: att - should be no-op Test Plan: buck2 test //caffe2/test/cpp/nativert:static_kernel_ops_tests Tests finished: Pass 24. Fail 0. Fatal 0. Skip 0. Build failure 0 Rollback Plan: Differential Revision: D80216488 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161087 Approved by: https://github.com/georgiaphillips, https://github.com/henryoier	2025-08-21 19:42:21 +00:00
James Wu	9668210302	Allow bypasses for Precompile when guards, etc. cannot be serialized (#160902 ) This adds a new function `bypass_package` and `CompilePackage.bypass_current_entry()`. This allows us to safely bypass if there are models with unserializable or incompatible parts. When we encounter something incompatible, we'll raise a bypass and ignore that particular code in DynamoCodeEntry. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160902 Approved by: https://github.com/zhxchen17	2025-08-21 18:20:42 +00:00
Huy Do	3f5a8e2003	Fix torchaudio build when TORCH_CUDA_ARCH_LIST is not set (#161084 ) Fixes https://github.com/pytorch/pytorch/issues/160988. The root cause can be found in the same issue. This fix ensures that when reuse old wheel is on and `torchaudio` wheel is not there, the inductor test job can still rebuild the wheel it needs Pull Request resolved: https://github.com/pytorch/pytorch/pull/161084 Approved by: https://github.com/malfet, https://github.com/zou3519	2025-08-21 17:38:32 +00:00
Angela Yi	3dacaf0e1e	[aoti-fx] Add meta["val"] metadata (#161019 ) Summary: Added a `_set_node_metadata_hook` which automatically adds node.meta["val"] to every new node that gets created under this context. Test Plan: ` buck2 test //mtia/host_runtime/afg/tests:test_dynamic_shapes_advanced_ops` https://www.internalfb.com/buck2/866439a2-2ba6-42d1-8e43-508d60456e2e `buck2 test //mtia/host_runtime/afg/tests:test_dynamic_shapes_basic_ops` https://www.internalfb.com/intern/testinfra/testrun/11540474149662857 Rollback Plan: Differential Revision: D80579336 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161019 Approved by: https://github.com/blaine-rister	2025-08-21 16:45:41 +00:00
PyTorch MergeBot	a6401cb5aa	Revert "flip the list-as-tuple behavior for short lists (#160794 )" This reverts commit febfc3ec03004116dfd6d504e6853ff02a1dd6e0. Reverted https://github.com/pytorch/pytorch/pull/160794 on behalf of https://github.com/seemethere due to This is failing internal tests, see D80671241 ([comment](https://github.com/pytorch/pytorch/pull/160794#issuecomment-3211314867))	2025-08-21 16:33:30 +00:00
PyTorch MergeBot	7006fd0c88	Revert "[inductor] Estimate peak memory allocfree and applying to reordering collectives (#160113 )" This reverts commit 517d38d3406abbba35d0694bff259a698cad3ec9. Reverted https://github.com/pytorch/pytorch/pull/160113 on behalf of https://github.com/IvanKobzarev due to Segment tree starts failing on trunk even ciflows/trunk passed on PR ([comment](https://github.com/pytorch/pytorch/pull/160113#issuecomment-3211286092))	2025-08-21 16:22:44 +00:00
IvanKobzarev	517d38d340	[inductor] Estimate peak memory allocfree and applying to reordering collectives (#160113 ) 1. Applying @eellison idea from https://github.com/pytorch/pytorch/pull/146562#discussion_r2059363672 for estimate_peak_memory: ``` """ Alternative version of estimate_peak_memory, that respects the fact, that every SchedulerNode has multiple phases: 1. alloc ( outputs ) 2. run_kernel 3. dealloc last_use buffers estimate_peak_memory collapses memory into one value: size_alloc - size_free While peak memory happens after alloc. Duplicating the code to not migrate all callsites at once, In future usages of estimate_peak_memory will migrate to this version. """ ``` - Applying this in `reorder_communication_preserving_peak_memory` pass. 2. Buffers during reordering can change deallocation point, if candidate and group to swap both are users of the f_input_buf and group contains last_use_snode. - Addressing this tracking the last_use_snode for each buffer and recomputing current memory respecting the change in size_free (group_node after reordering is not the last user of the buffer and its size_free -= buffer_size, while candidate becomes the last user and candidate.size_free += buffer_size). 4. Adding env var `PYTORCH_REORDER_COLLECTIVES_LIMIT` for ablation to limit number of collectives to reorder. What is after this PR: Iterative recomputation of memory estimations matches full memory estimations. Active memory is not regressing a lot, but reserved memory is significantly regressed. Investigation and fix of "reserved" memory will be in following PRs. 
BASELINE (bucketing AG and RS): active: 32Gb reserved: 34Gb ``` [rank0]:[titan] 2025-08-11 11:28:36,798 - root - INFO - step: 1 loss: 12.2722 grad_norm: 4.2192 active_memory: 24.66GiB(25.96%) reserved_memory: 25.38GiB(26.72%) tps: 99 tflops: 5.71 mfu: 0.58% [rank0]:[titan] 2025-08-11 11:28:38,640 - root - INFO - step: 2 loss: 13.1738 grad_norm: 50.5566 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 4,448 tflops: 257.63 mfu: 26.05% [rank0]:[titan] 2025-08-11 11:28:40,029 - root - INFO - step: 3 loss: 15.6866 grad_norm: 80.0862 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,900 tflops: 341.72 mfu: 34.55% [rank0]:[titan] 2025-08-11 11:28:41,423 - root - INFO - step: 4 loss: 13.4853 grad_norm: 7.8538 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,881 tflops: 340.57 mfu: 34.44% [rank0]:[titan] 2025-08-11 11:28:42,820 - root - INFO - step: 5 loss: 16.1191 grad_norm: 53.2481 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,867 tflops: 339.77 mfu: 34.35% ``` REORDER: active: 32Gb reserved: 36Gb ``` [rank0]:[titan] 2025-08-11 11:34:32,772 - root - INFO - step: 1 loss: 12.2490 grad_norm: 4.1944 active_memory: 24.66GiB(25.96%) reserved_memory: 26.81GiB(28.22%) tps: 85 tflops: 4.90 mfu: 0.50% [rank0]:[titan] 2025-08-11 11:34:35,329 - root - INFO - step: 2 loss: 13.1427 grad_norm: 39.5942 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 3,205 tflops: 185.61 mfu: 18.77% [rank0]:[titan] 2025-08-11 11:34:36,770 - root - INFO - step: 3 loss: 14.6084 grad_norm: 51.0743 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,688 tflops: 329.44 mfu: 33.31% [rank0]:[titan] 2025-08-11 11:34:38,197 - root - INFO - step: 4 loss: 13.6181 grad_norm: 8.1122 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,744 tflops: 332.68 mfu: 33.64% [rank0]:[titan] 2025-08-11 11:34:39,821 - root - INFO - step: 5 loss: 15.8913 grad_norm: 59.8510 
active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,046 tflops: 292.22 mfu: 29.55% ``` REORDER + SINK_WAITS_ITERATIVE: active: 35Gb reserved: 41Gb ``` [rank0]:[titan] 2025-08-11 11:31:36,119 - root - INFO - step: 1 loss: 12.2646 grad_norm: 4.1282 active_memory: 27.60GiB(29.05%) reserved_memory: 32.49GiB(34.20%) tps: 173 tflops: 10.00 mfu: 1.01% [rank0]:[titan] 2025-08-11 11:31:37,452 - root - INFO - step: 2 loss: 13.2353 grad_norm: 42.4234 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,152 tflops: 356.26 mfu: 36.02% [rank0]:[titan] 2025-08-11 11:31:38,780 - root - INFO - step: 3 loss: 13.8205 grad_norm: 24.0156 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,169 tflops: 357.29 mfu: 36.13% [rank0]:[titan] 2025-08-11 11:31:40,106 - root - INFO - step: 4 loss: 13.1033 grad_norm: 9.1167 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,183 tflops: 358.10 mfu: 36.21% [rank0]:[titan] 2025-08-11 11:31:41,443 - root - INFO - step: 5 loss: 16.3530 grad_norm: 51.8118 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,130 tflops: 355.03 mfu: 35.90% ``` Differential Revision: [D79886535](https://our.internmc.facebook.com/intern/diff/D79886535) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160113 Approved by: https://github.com/wconstab, https://github.com/eellison Co-authored-by: eellison <elias.ellison@gmail.com>	2025-08-21 15:45:06 +00:00
Andy Lugo	3caddd4daa	[ROCm] SDPA fix mem fault when dropout is enabled (#154864 ) Fixes issue that exhibited a device side memory access fault due to incorrect tensor life management Pull Request resolved: https://github.com/pytorch/pytorch/pull/154864 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-21 14:23:13 +00:00
Kaichao You	18271148d3	[dist] expose unsafe_get_ptr for dist.ProcessGroupNCCL.NCCLConfig (#161136 ) expose the pointer so that we can create the `ncclConfig_t` object from pytorch and use it elsewhere. this is useful to control the nccl communicator parameters for multiple nccl communicators. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161136 Approved by: https://github.com/kwen2501	2025-08-21 10:47:03 +00:00
Xia, Weiwen	a941d7ffe5	[Quant][CPU] Avoid NaN in fp8 output of qlinear and qconv (#160957 ) Summary When output dtype is fp8, oneDNN does not ensure intermediate results in the range of [-448, 448] before converting to fp8. So, we may get NaN in the output, which is a disaster for inference. This PR fixes this issue by clamping the intermediate results by oneDNN's post-op clip. Test plan ``` pytest -sv test/quantization/core/test_quantized_op.py -k "q and fp8" ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160957 Approved by: https://github.com/Valentine233, https://github.com/CaoE	2025-08-21 08:36:21 +00:00
PyTorch MergeBot	acb00d3ccf	Revert "Fix torchaudio build when TORCH_CUDA_ARCH_LIST is not set (#161084 )" This reverts commit cfdaaaaa26d7f34427ba941569eca46f02f79f3e. Reverted https://github.com/pytorch/pytorch/pull/161084 on behalf of https://github.com/huydhn due to My mistake in not checking for nvidia-smi availability ([comment](https://github.com/pytorch/pytorch/pull/161084#issuecomment-3209498435))	2025-08-21 08:17:04 +00:00
PyTorch MergeBot	bd5857a1d6	Revert "[inductor] Estimate peak memory allocfree and applying to reordering collectives (#160113 )" This reverts commit 9d18bf01b1661d227f6af41ac07a1e9ef20a9e1a. Reverted https://github.com/pytorch/pytorch/pull/160113 on behalf of https://github.com/huydhn due to Sorry for reverting your change, but lots of failures showing up after this lands ([comment](https://github.com/pytorch/pytorch/pull/160113#issuecomment-3209487237))	2025-08-21 08:13:33 +00:00
CaoE	23b033452f	[Inductor][CPP] Fix layout for local buf in outer loop fusion (#160857 ) Fixes #159154 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160857 Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel	2025-08-21 06:00:04 +00:00
Dylan Maloy	2f50ae7d20	[nativert] make runtime const folding aware of run_const_graph (#160760 ) Summary: it's possible that we have foldable nodes that use things that will be folded by run_const_graph Test Plan: CI Rollback Plan: Differential Revision: D80355542 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160760 Approved by: https://github.com/SherlockNoMad	2025-08-21 05:22:03 +00:00
IvanKobzarev	9d18bf01b1	[inductor] Estimate peak memory allocfree and applying to reordering collectives (#160113 ) 1. Applying @eellison idea from https://github.com/pytorch/pytorch/pull/146562#discussion_r2059363672 for estimate_peak_memory: ``` """ Alternative version of estimate_peak_memory, that respects the fact, that every SchedulerNode has multiple phases: 1. alloc ( outputs ) 2. run_kernel 3. dealloc last_use buffers estimate_peak_memory collapses memory into one value: size_alloc - size_free While peak memory happens after alloc. Duplicating the code to not migrate all callsites at once, In future usages of estimate_peak_memory will migrate to this version. """ ``` - Applying this in `reorder_communication_preserving_peak_memory` pass. 2. Buffers during reordering can change deallocation point, if candidate and group to swap both are users of the f_input_buf and group contains last_use_snode. - Addressing this tracking the last_use_snode for each buffer and recomputing current memory respecting the change in size_free (group_node after reordering is not the last user of the buffer and its size_free -= buffer_size, while candidate becomes the last user and candidate.size_free += buffer_size). 4. Adding env var `PYTORCH_REORDER_COLLECTIVES_LIMIT` for ablation to limit number of collectives to reorder. What is after this PR: Iterative recomputation of memory estimations matches full memory estimations. Active memory is not regressing a lot, but reserved memory is significantly regressed. Investigation and fix of "reserved" memory will be in following PRs. 
BASELINE (bucketing AG and RS): active: 32Gb reserved: 34Gb ``` [rank0]:[titan] 2025-08-11 11:28:36,798 - root - INFO - step: 1 loss: 12.2722 grad_norm: 4.2192 active_memory: 24.66GiB(25.96%) reserved_memory: 25.38GiB(26.72%) tps: 99 tflops: 5.71 mfu: 0.58% [rank0]:[titan] 2025-08-11 11:28:38,640 - root - INFO - step: 2 loss: 13.1738 grad_norm: 50.5566 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 4,448 tflops: 257.63 mfu: 26.05% [rank0]:[titan] 2025-08-11 11:28:40,029 - root - INFO - step: 3 loss: 15.6866 grad_norm: 80.0862 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,900 tflops: 341.72 mfu: 34.55% [rank0]:[titan] 2025-08-11 11:28:41,423 - root - INFO - step: 4 loss: 13.4853 grad_norm: 7.8538 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,881 tflops: 340.57 mfu: 34.44% [rank0]:[titan] 2025-08-11 11:28:42,820 - root - INFO - step: 5 loss: 16.1191 grad_norm: 53.2481 active_memory: 32.14GiB(33.83%) reserved_memory: 34.21GiB(36.01%) tps: 5,867 tflops: 339.77 mfu: 34.35% ``` REORDER: active: 32Gb reserved: 36Gb ``` [rank0]:[titan] 2025-08-11 11:34:32,772 - root - INFO - step: 1 loss: 12.2490 grad_norm: 4.1944 active_memory: 24.66GiB(25.96%) reserved_memory: 26.81GiB(28.22%) tps: 85 tflops: 4.90 mfu: 0.50% [rank0]:[titan] 2025-08-11 11:34:35,329 - root - INFO - step: 2 loss: 13.1427 grad_norm: 39.5942 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 3,205 tflops: 185.61 mfu: 18.77% [rank0]:[titan] 2025-08-11 11:34:36,770 - root - INFO - step: 3 loss: 14.6084 grad_norm: 51.0743 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,688 tflops: 329.44 mfu: 33.31% [rank0]:[titan] 2025-08-11 11:34:38,197 - root - INFO - step: 4 loss: 13.6181 grad_norm: 8.1122 active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,744 tflops: 332.68 mfu: 33.64% [rank0]:[titan] 2025-08-11 11:34:39,821 - root - INFO - step: 5 loss: 15.8913 grad_norm: 59.8510 
active_memory: 32.14GiB(33.83%) reserved_memory: 36.40GiB(38.31%) tps: 5,046 tflops: 292.22 mfu: 29.55% ``` REORDER + SINK_WAITS_ITERATIVE: active: 35Gb reserved: 41Gb ``` [rank0]:[titan] 2025-08-11 11:31:36,119 - root - INFO - step: 1 loss: 12.2646 grad_norm: 4.1282 active_memory: 27.60GiB(29.05%) reserved_memory: 32.49GiB(34.20%) tps: 173 tflops: 10.00 mfu: 1.01% [rank0]:[titan] 2025-08-11 11:31:37,452 - root - INFO - step: 2 loss: 13.2353 grad_norm: 42.4234 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,152 tflops: 356.26 mfu: 36.02% [rank0]:[titan] 2025-08-11 11:31:38,780 - root - INFO - step: 3 loss: 13.8205 grad_norm: 24.0156 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,169 tflops: 357.29 mfu: 36.13% [rank0]:[titan] 2025-08-11 11:31:40,106 - root - INFO - step: 4 loss: 13.1033 grad_norm: 9.1167 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,183 tflops: 358.10 mfu: 36.21% [rank0]:[titan] 2025-08-11 11:31:41,443 - root - INFO - step: 5 loss: 16.3530 grad_norm: 51.8118 active_memory: 35.08GiB(36.92%) reserved_memory: 41.62GiB(43.80%) tps: 6,130 tflops: 355.03 mfu: 35.90% ``` Differential Revision: [D79886535](https://our.internmc.facebook.com/intern/diff/D79886535) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160113 Approved by: https://github.com/wconstab, https://github.com/eellison Co-authored-by: eellison <elias.ellison@gmail.com>	2025-08-21 05:19:38 +00:00
dolpm	67b98da1b2	[nativert] oss static kernel test utils (#161086 ) Summary: att - should be a no-op Test Plan: ci Rollback Plan: Differential Revision: D80214768 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161086 Approved by: https://github.com/georgiaphillips	2025-08-21 04:49:06 +00:00
PyTorch UpdateBot	b0420d2438	[vllm hash update] update the pinned vllm hash (#161121 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161121 Approved by: https://github.com/pytorchbot	2025-08-21 04:21:09 +00:00
PyTorch UpdateBot	6096d277c5	[audio hash update] update the pinned audio hash (#161021 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161021 Approved by: https://github.com/pytorchbot	2025-08-21 04:20:56 +00:00
Huy Do	cfdaaaaa26	Fix torchaudio build when TORCH_CUDA_ARCH_LIST is not set (#161084 ) Fixes https://github.com/pytorch/pytorch/issues/160988. The root cause can be found in the same issue. This fix ensures that when reuse old wheel is on and `torchaudio` wheel is not there, the inductor test job can still rebuild the wheel it needs Pull Request resolved: https://github.com/pytorch/pytorch/pull/161084 Approved by: https://github.com/malfet, https://github.com/zou3519	2025-08-21 03:47:15 +00:00
Eddie Yan	117f11adb4	[FlexAttention][TF32] Handle uninitialized `torch.backends.cuda.matmul.fp32_precision` (#161102 ) For https://github.com/pytorch/pytorch/issues/161022 The warning says the old API will be deprecated in 2.9+ anyway, leaving it up to the author of #125888 to decide on initialization behavior then Pull Request resolved: https://github.com/pytorch/pytorch/pull/161102 Approved by: https://github.com/ngimel, https://github.com/drisspg, https://github.com/BoyuanFeng	2025-08-21 03:36:52 +00:00
Rohit Manav	a154c2093c	remove redundant installation (#160634 ) Fixes #160302 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160634 Approved by: https://github.com/sekyondaMeta, https://github.com/malfet	2025-08-21 03:31:12 +00:00
Xia, Weiwen	39862acb2e	[CPU][Inductor] improve performance of A16W4 GEMM template (#159127 ) Summary This PR improves performance of A16W4 GEMM template by removing boundary check of prefetch in the kernel code. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159127 Approved by: https://github.com/CaoE	2025-08-21 03:16:26 +00:00
bobrenjc93	9a41570199	[rfc] add hint_override kwarg to mark_dynamic (#161007 ) The motivation for this change can be seen through the following example: ``` import torch GPU_TYPE = "cuda" @torch.compile def no_override(x): return x.sum(dim=0) @torch.compile def override(x): return x.sum(dim=0) x_small = torch.randn(4096, 512, device=GPU_TYPE) no_override(x_small) torch._dynamo.decorators.mark_dynamic(x_small, 0, hint_override=4096 * 1000) override(x_small) ``` Previously, when reductions were split, codegen relied only on the first observed shape. With a small input, this resulted in a small split size: ``` def triton_red_fused_sum_0(in_ptr0, out_ptr0, ks0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): xnumel = 16384 rnumel = r0_numel ``` With the new scheme, inductor honors hint_override during codegen, producing larger and more appropriate split sizes: ``` def triton_red_fused_sum_0(in_ptr0, out_ptr0, ks0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): xnumel = 1024000 rnumel = r0_numel ``` This addresses a broader problem with dynamism: performance and numerics previously depended on whichever shape was seen first. For example: ``` f(s0) -> f(s2) f(s1) -> f(s2) ``` could generate different kernels. With the new approach, an explicit override pins the chosen configuration: ``` f(s0, hint_override=s0) -> f(s2) f(s1, hint_override=s0) -> f(s2) ``` ensuring consistent kernel generation regardless of input order. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161007 Approved by: https://github.com/jansel	2025-08-21 02:22:52 +00:00
PyTorch MergeBot	f9875166a9	Revert "[FSDP][Collectives] skipping reduce_scatter when world size is 1 (#160136 )" This reverts commit 3d126e17e0c2630031e7a359d6a6fd1dbe52c4f7. Reverted https://github.com/pytorch/pytorch/pull/160136 on behalf of https://github.com/jithunnair-amd due to Sorry, but looks like this broke ROCm distributed CI ([comment](https://github.com/pytorch/pytorch/pull/160136#issuecomment-3208632921))	2025-08-21 01:34:19 +00:00
PyTorch MergeBot	6b5be1f4a0	Revert "[FSDP][Replicate] replicate tests for param registration and input device movements (#160147 )" This reverts commit a3a82e3da85a53afc4bbf3d75bd3d3dcc2e06645. Reverted https://github.com/pytorch/pytorch/pull/160147 on behalf of https://github.com/jithunnair-amd due to Sorry, but looks like this broke ROCm distributed CI ([comment](https://github.com/pytorch/pytorch/pull/160136#issuecomment-3208632921))	2025-08-21 01:34:19 +00:00
Huamin Li	0924304e72	[AOTI] Add a new config cpp.use_constexpr_for_int_array (#160927 ) Summary: Default True so same as before, but make it configurable Differential Revision: D80185094 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160927 Approved by: https://github.com/henryoier	2025-08-21 01:16:27 +00:00
Natalia Gimelshein	d875d3ca1e	don't try to set lazy module loading env var (#161103 ) This is not needed on drivers >=525, and in DriverAPI::get() we are initializing the context anyway, so setting the environment variable after that is beside the point. As a result of calling DriverAPI::get on systems that don't have GPUs available (e.g. due to CUDA_VISIBLE_DEVICES=""), people were getting confusing errors. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161103 Approved by: https://github.com/eqy, https://github.com/malfet	2025-08-21 01:06:51 +00:00
Yuxuan Chen	a825557ed5	Workaround ATen SFINAE under libc++ (#161101 ) The existing logic here to workaround dealing with SFINAE under Microsoft platforms also applies to libc++ platforms. It appears that nvcc reports ambiguity in overload resolution for `pow_`. This seems like a nvcc limitation. ``` fbcode/caffe2/aten/src/ATen/native/cuda/Pow.cuh(42): error: more than one instance of overloaded function "pow" matches the argument list: function template "std::__2::enable_if<<expression>, std::__2::__promote<_A1, _A2, void>>::type::type pow(_A1, _A2) noexcept" (declared at line 848 of fbcode/third-party-buck/platform010-libcxx/build/libcxx/include/c++/v1/math.h) function template "std::__2::enable_if<<expression>, std::__2::__promote<_Tp, _Up, void>>::type pow(_Tp, _Up) noexcept" (declared at line 11308 of fbcode/third-party-buck/platform010/build/cuda/12.4/bin/..//include/crt/math_functions.h) argument types are: (double, float) return ::pow(base, exp); ^ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161101 Approved by: https://github.com/malfet	2025-08-21 00:55:58 +00:00
Nikita Shulga	3e3e83418d	[BE] Move indexing tests to test_indexing (#160994 ) Which enables them on MPS device - xfail all `test_index_reduce` on MPS, as op is not implemented - xfail all `test_index_copy` on MPS due to the silent correctness problems, see https://github.com/pytorch/pytorch/issues/160993 - Fixed hard crash in `index_fill` and replaced `skipIfMPS` with `expectedFailureMPS` - Created issue for the lack of deterministic algorithms for MPS backend Pull Request resolved: https://github.com/pytorch/pytorch/pull/160994 Approved by: https://github.com/manuelcandales ghstack dependencies: #160850, #160889, #160926	2025-08-21 00:42:55 +00:00
Jazlyn Li	667245dc60	TritonKernel.inductor_meta_common() -> self.inductor_meta_common() (#160895 ) Summary: use `self.inductor_meta_common()` to call the static method, since the custom subclasses may overwrite the method to be an instance method Test Plan: ``` caffe2/test/inductor:select_algorithm -- test_finalized_subclass_hooks ``` Rollback Plan: Differential Revision: D80375351 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160895 Approved by: https://github.com/eellison, https://github.com/blaine-rister	2025-08-21 00:22:51 +00:00
Grant	54c2b66592	Replace _device_t with torch.types.Device in torch/cpu/__init__.py (#161031 ) Fixes #152952 Replace `_device_t` with `torch.types.Device` in `torch/cpu/__init__.py`. Did basic smoke test by running tests that `import torch.cpu` including `test/distributed/test_c10d_functional_native.py` and `test/test_decomp.py`. Based this PR off of #152935 which is referenced in the main issue. (also, this is my first contribution but I followed the contributing guide closely) Pull Request resolved: https://github.com/pytorch/pytorch/pull/161031 Approved by: https://github.com/janeyx99	2025-08-21 00:22:43 +00:00
Xu Han	be87f22dfb	[inductor] Enable updated __cplusplus macro (#161064 ) Intel oneAPI has some headers that depend on the `__cplusplus` macro. This PR enables the updated `__cplusplus` macro for MSVC. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161064 Approved by: https://github.com/angelayi	2025-08-21 00:17:08 +00:00
Xu Han	2a7a7ad711	[inductor] add level zero for xpu (#161061 ) Add level zero for Inductor xpu on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161061 Approved by: https://github.com/angelayi	2025-08-21 00:14:15 +00:00
Teja Rao	7e6ce41555	[dcp_poc] add async checkpointing tests (#161034 ) Summary: add tests for async checkpointer for the experimental checkpointer Test Plan: tests Rollback Plan: Differential Revision: D80590461 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161034 Approved by: https://github.com/pradeepfn	2025-08-21 00:08:53 +00:00
Ben Niu	4ed3184dee	Conditionally enable ACL for bmm_out_or_baddbmm_ (#161065 ) Summary: Similar to #ifdef checks added in addmm_impl_cpu_ to conditionally enable ACL, we add the same checks in bmm_out_or_baddbmm_. This essentially disables ACL for bmm_out_or_baddbmm_ and enables ArmPL, which seems to be performing better. Test Plan: AR SL Rollback Plan: Reviewed By: Nicoshev Differential Revision: D80494623 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161065 Approved by: https://github.com/q10	2025-08-20 23:32:25 +00:00
Pian Pawakapan	44549c7146	[dynamic shapes] unbacked-safe slicing (#157944 ) Generates new unbacked symbols for slice output size & storage offset, when appropriate semantics are unclear. Teaches inductor to codegen the slice with flexible semantics. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157944 Approved by: https://github.com/laithsakka	2025-08-20 22:52:56 +00:00
Natalia Gimelshein	febfc3ec03	flip the list-as-tuple behavior for short lists (#160794 ) Per title, previously we started throwing noisy warnings, but given how popular this pattern was in our test suite decided to leave it as warning, not as silent behavior change for one release. Now `treatSequenceAsTuple` would return `true` in the only case where the sequence was indeed a tuple, so no need for a special function anymore. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160794 Approved by: https://github.com/albanD	2025-08-20 22:40:42 +00:00
Tugsbayasgalan (Tugsuu) Manlaibaatar	5afa4187df	Close some sources of fake tensor leakages (#159923 ) Differential Revision: D79694055 Couple of fixes: 1. When we run into an operation we didn't proxy, we end up emitting fake constants. We detect this and error using the FQN of the lifted constant 2. Previous attribute mutation detection logic in non-strict didn't account for nested module structure. This fixes silent incorrectness issue of exporting esm and qwen in non-strict 3. We modify yolov3 to fix the previous silent incorrect behaviour When upgrading torchbench pin, opacus_cifar10 seems to not run on eager anymore. I verified this by pushing a temporary PR on master with new pin. So i added it to expect_fail list. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159923 Approved by: https://github.com/avikchaudhuri	2025-08-20 22:24:23 +00:00
Mikayla Gawarecki	30384abcb1	Decrease number of bytes used by uninitialized tokens_ in KernelFunction (#160764 ) std::unique_ptr to decrease bytes from 24 to 8 Since std::unique_ptr is not copyable this required defining the copy / copy assignment constructors. Which made me realize we shouldn't be copying `tokens_` in those. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160764 Approved by: https://github.com/albanD	2025-08-20 21:33:27 +00:00
Ethan Wee	16e811e0b5	[CI] remove tb-nightly (#160996 ) Removing tb-nightly because we found issues when importing tensorboard as having both tb-nightly and tensorboard causes issues when pip would report 2.18.0 (pinned tensorboard) but importing in a python shell would report 2.13.XXX. This mismatch causes issues when running tests in a numpy2.X environment. e.g. ``` /var/lib/jenkins/pytorch# PYTORCH_TEST_WITH_ROCM=1 python test/test_monitor.py TestMonitorTensorboard.test_event_handler /opt/venv/lib/python3.12/site-packages/redis/connection.py:77: UserWarning: redis-py works best with hiredis. Please consider installing warnings.warn(msg) /opt/venv/lib/python3.12/site-packages/google/protobuf/internal/well_known_types.py:91: DeprecationWarning: datetime.datetime.utcfromtimestamp() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.fromtimestamp(timestamp, datetime.UTC). _EPOCH_DATETIME_NAIVE = datetime.datetime.utcfromtimestamp(0) E ====================================================================== ERROR: test_event_handler (__main__.TestMonitorTensorboard.test_event_handler) ---------------------------------------------------------------------- Traceback (most recent call last): File "/var/lib/jenkins/pytorch/test/test_monitor.py", line 116, in setUp from tensorboard.backend.event_processing import ( File "/opt/venv/lib/python3.12/site-packages/tensorboard/backend/event_processing/plugin_event_multiplexer.py", line 25, in <module> from tensorboard.backend.event_processing import ( File "/opt/venv/lib/python3.12/site-packages/tensorboard/backend/event_processing/plugin_event_accumulator.py", line 25, in <module> from tensorboard.backend.event_processing import event_file_loader File "/opt/venv/lib/python3.12/site-packages/tensorboard/backend/event_processing/event_file_loader.py", line 21, in <module> from tensorboard import dataclass_compat File 
"/opt/venv/lib/python3.12/site-packages/tensorboard/dataclass_compat.py", line 33, in <module> from tensorboard.plugins.hparams import metadata as hparams_metadata File "/opt/venv/lib/python3.12/site-packages/tensorboard/plugins/hparams/metadata.py", line 32, in <module> NULL_TENSOR = tensor_util.make_tensor_proto( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/venv/lib/python3.12/site-packages/tensorboard/util/tensor_util.py", line 405, in make_tensor_proto numpy_dtype = dtypes.as_dtype(nparray.dtype) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/venv/lib/python3.12/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py", line 677, in as_dtype if type_value.type == np.string_ or type_value.type == np.unicode_: ^^^^^^^^^^ File "/opt/venv/lib/python3.12/site-packages/numpy/__init__.py", line 400, in __getattr__ raise AttributeError( AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use `np.bytes_` instead. ---------------------------------------------------------------------- Ran 1 test in 0.355s FAILED (errors=1) ``` After removing tb-nightly and ensuring that tensorboard 2.18.0 is the only tensoboard in the env: ``` root@rocm-framework-47:/var/lib/jenkins/pytorch# PYTORCH_TEST_WITH_ROCM=1 python test/test_monitor.py TestMonitorTensorboard.test_event_handler . ---------------------------------------------------------------------- Ran 1 test in 0.409s OK ``` ``` >>> import tensorboard >>> print(tensorboard.__version__) 2.13.0a20230426 ``` ```:/# pip show tensorboard Name: tensorboard Version: 2.18.0 Summary: TensorBoard lets you watch Tensors Flow Home-page: https://github.com/tensorflow/tensorboard Author: Google Inc. 
Author-email: packages@tensorflow.org License: Apache 2.0 Location: /opt/venv/lib/python3.12/site-packages Requires: absl-py, grpcio, markdown, numpy, packaging, protobuf, setuptools, six, tensorboard-data-server, werkzeug Required-by: ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160996 Approved by: https://github.com/huydhn	2025-08-20 21:25:58 +00:00
Tsung-Hsien Lee	19c70c2f3d	[pytorch] Faster and safer lambda expression capture in `has_integral_tensor()` (#161042 ) Summary: `includeBool` is already a small value type (i.e., `bool`, 1 byte) that's passed by value to the function, so capturing it by reference (4 or 8 bytes depending on the system) is unnecessary and could potentially lead to dangling reference issues if the lambda outlives the original variable. Capturing by value is more efficient for small types and safer. Test Plan: OSS CI & tests Rollback Plan: Differential Revision: D80595698 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161042 Approved by: https://github.com/Skylion007	2025-08-20 20:59:41 +00:00
Will Feng	8047cde0f3	Try to fix Inductor CI periodic tests (#160932 ) - hf_Reformer: this one starts failing due to increased graph breaks due to transformers pin bump (#159291). We can likely just bump the expected graph break count. - dla102: this one starts timing out on 8/13 Wed between commit 6e8865f and ee1b041. But based on the PT2 dashboard, this model actually doesn't have compile time or runtime regression. Will try to bump up the timeout and see if it can work. - hf_BigBird: this one has its accuracy status improved since today. Will update hf_BigBird accuracy status. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160932 Approved by: https://github.com/zou3519, https://github.com/huydhn, https://github.com/malfet	2025-08-20 20:36:46 +00:00
Dmitry Nikolaev	24e7f3c21c	[ROCm] fix large tensor sort on MI350 (#161054 ) Currently std::min -> ::min does not work as expected on ROCm when input values are >= 2147483648. Replace `std::min` with a ternary statement. Also, `std::min` can be replaced by the explicitly typed `std::min<int64_t>`. Fixes on ROCm: test_sort_and_select.py::TestSortAndSelectCUDA::test_sort_large_cuda_float16 error: RuntimeError: Cannot sort dimension of length 8192. Similar PR to fix large tensors on ROCm: https://github.com/pytorch/pytorch/pull/130994 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161054 Approved by: https://github.com/jeffdaily	2025-08-20 19:58:01 +00:00
Nikita Shulga	e1a64b75ff	[CD] Delete full builds (#161075 ) As they are no longer needed for Colab, see https://github.com/googlecolab/colabtools/issues/5508#issuecomment-3200871941 and [<img width="896" height="128" alt="image" src="https://github.com/user-attachments/assets/a287393c-bde7-4e10-99bf-2e0d66346efe" /> ](https://colab.research.google.com/drive/1YJ5Y0xsApXSewM1cQwWQ_AS3A77vytgq) Fixes https://github.com/pytorch/pytorch/issues/160972 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161075 Approved by: https://github.com/atalman	2025-08-20 19:40:15 +00:00
eellison	b708966201	Fix bucketing introducing cycles (#160967 ) We were just looking at direct arguments, but not transitive dependencies. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160967 Approved by: https://github.com/IvanKobzarev	2025-08-20 19:38:46 +00:00
Tugsbayasgalan (Tugsuu) Manlaibaatar	dbef606631	Add support for tracing vmap in pre-dispatch export (#154650 ) Summary: The ONNX team and the recent transformers upgrade ran into this error, and we also ran into it during our export benchmarking. This diff makes it possible to trace through the vmap implementation in pre-dispatch IR. Note that we don't support serializing functorch ops in pre-dispatch IR and in the future, we should desugar them to post-grad ops. The implementation strategy is: 1. We add python wrappers around vmap APIs so that we attach a custom torch function handler that is only on during non-strict export. The reason is we don't want to add this to the default torch_function handler because it will break BC. 2. Some dynamo changes to make sure it picks up the new python wrapper APIs. The reason is when we do strict export, we need to re-materialize these APIs in pre-dispatch IR from torch IR. We can avoid this by special casing in dynamo for export to proxy different API calls, but I feel that is too much chaos because you need to be able to proxy 2 different variants of the same vmap API. Test Plan: CI Differential Revision: D75623875 Pull Request resolved: https://github.com/pytorch/pytorch/pull/154650 Approved by: https://github.com/ezyang, https://github.com/zou3519	2025-08-20 19:31:07 +00:00
Ruben Rodriguez Buchillon	c5cb255625	[inductor][mm] fix tma issue (#161025 ) # why - head is broken # what - the template for experimental API is broken - the test assumes not experimental API # testing ``` python3 -bb -m pytest test/inductor/test_max_autotune.py::TestMaxAutotune::test_max_autotune_regular_mm_persistent_tma_strided_a_transposed_True_b_transposed_False_dynamic_True -v ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161025 Approved by: https://github.com/PaulZhang12	2025-08-20 18:52:38 +00:00
redwrasse	957b170d8e	Fix SVD forward-mode AD multiplication priority (#161027 ) Multiplication order priority for the SVD JVP appears to have been the opposite of the optimal one. Results from a crude CPU benchmark on my laptop for random matrices of various ratios: ``` Performance Results Table \| Test Case \| Matrix Size \| Aspect Ratio \| Before JVP (ms) \| After JVP (ms) \| Change (ms) \| % Change \| Status \| \|----------------------------------\|-------------\|--------------\|-----------------\|----------------\|-------------\|----------\|---------------------\| \| Tall matrix (10:1 ratio) \| 1000×100 \| 10:1 tall \| 3.13 \| 3.24 \| +0.11 \| -3.5% \| ❌ Regression \| \| Tall matrix (10:1 ratio, larger) \| 2000×200 \| 10:1 tall \| 15.72 \| 14.66 \| -1.06 \| +6.7% \| ✅ Improvement \| \| Tall matrix (10:1 ratio, large) \| 5000×500 \| 10:1 tall \| 105.97 \| 101.84 \| -4.13 \| +3.9% \| ✅ Improvement \| \| Wide matrix (1:10 ratio) \| 100×1000 \| 1:10 wide \| 5.90 \| 4.64 \| -1.26 \| +21.4% \| ✅ Major Improvement \| \| Wide matrix (1:10 ratio, larger) \| 200×2000 \| 1:10 wide \| 18.29 \| 17.78 \| -0.51 \| +2.8% \| ✅ Improvement \| \| Wide matrix (1:10 ratio, large) \| 500×5000 \| 1:10 wide \| 137.40 \| 128.70 \| -8.70 \| +6.3% \| ✅ Improvement \| \| Square matrix (baseline) \| 1000×1000 \| 1:1 square \| 116.16 \| 106.09 \| -10.07 \| +8.7% \| ✅ Improvement \| \| Square matrix (larger baseline) \| 2000×2000 \| 1:1 square \| 714.30 \| 673.23 \| -41.07 \| +5.7% \| ✅ Improvement \| ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161027 Approved by: https://github.com/soulitzer	2025-08-20 18:47:11 +00:00
Jovian Anthony Jaison	c02e26bf31	Fix filename showing up as ints in dynamo_compile stack_trace column. (#160916 ) Test plan: $ python -m test_utils Note: Another way is adding the actual file_name to from_traceback, but since it's referenced in multiple places and may have associated tests this seems safer. Lmk if changes are needed @c00w Pull Request resolved: https://github.com/pytorch/pytorch/pull/160916 Approved by: https://github.com/c00w, https://github.com/masnesral	2025-08-20 18:38:38 +00:00
eqy	c74e5f6061	[CUDA] Bump tolerances for `test_baddmm` (#159915 ) Only one mismatch out of the entire result tensor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159915 Approved by: https://github.com/nWEIdia, https://github.com/drisspg	2025-08-20 18:05:51 +00:00
dolpm	1471b20cb3	add static dispatch kernel registration to open source (#160439 ) Summary: static dispatch registry should be moved to open source. the rest can maintain internally for now, since delegates will all go through ET hop. Test Plan: spot checked existing tests and didn't see any missing registrations Differential Revision: D80099377 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160439 Approved by: https://github.com/SherlockNoMad, https://github.com/zhxchen17	2025-08-20 17:58:00 +00:00
Kevin Yin	b2632e7982	Fix error message for fsdp_pre_all_gather (#160817 ) See: `20e40492b0/test/distributed/_composable/fsdp/test_fully_shard_extensions.py (L97-L104)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160817 Approved by: https://github.com/weifengpy, https://github.com/H-Huang	2025-08-20 17:43:57 +00:00
zhxchen17	5255e65c01	[dynamo] Refactor convert_frame to remove usage of nonlocal tracer output return. [4/n] (#160899 ) Today convert_frame is implemented like the following: ``` def _compile(): tracer_output = None def transform(): nonlocal tracer_output ... def _compile_inner(): transform(...) compile_inner(...) ``` The code is using unconventional nonlocal variable as the return value. This is not ideal for 2 reasons: 1. Reasoning about the code, especially together with error handling code becomes harder. 2. more importantly, this makes it harder to extract out common code pieces into a shared library because everything must depend on a central global state. In this diff we remove the usage of nonlocal return and just use the conventional function return to output the compilation data. Differential Revision: [D80461258](https://our.internmc.facebook.com/intern/diff/D80461258/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160899 Approved by: https://github.com/tugsbayasgalan ghstack dependencies: #160814, #160815, #160855	2025-08-20 17:37:26 +00:00
zhxchen17	9e050b6339	[dynamo] Refactor convert_frame._compile_inner to return compiled bytecode + output graph. [3/n] (#160855 ) We are refactoring dynamo code for convert frame so that we can have modularized pieces sharable between different compiler frontends (e.g. torch.compile, precompile and torch.export). This PR adds a new helper function compile_frame() which takes a bytecode and a transform function and return compiled bytecode + output graph as DynamoOutput type. Differential Revision: [D80430802](https://our.internmc.facebook.com/intern/diff/D80430802/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160855 Approved by: https://github.com/tugsbayasgalan ghstack dependencies: #160814, #160815	2025-08-20 17:37:26 +00:00
eellison	b3e215b864	Trigger h100 on test_max_autotune, mm, grouped_mm changes (#160678 ) Following @henrylhtsang 's pr here: https://github.com/pytorch/pytorch/pull/160656 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160678 Approved by: https://github.com/henrylhtsang, https://github.com/ngimel	2025-08-20 16:56:30 +00:00
Wang, Chuanqi	e483947047	[BE] Remove intel-openmp dependency in setup.py (#160976 ) Fixes #160962 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160976 Approved by: https://github.com/xuhancn, https://github.com/atalman	2025-08-20 16:33:16 +00:00
Angel Li	8e17709055	FlexDecode not guarding on GQA groups correctly (#160904 ) Addressing #151359 Updates flex_decode dispatch to use flex attention rather than flex decode if number of groups is not a power of 2 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160904 Approved by: https://github.com/drisspg	2025-08-20 16:32:16 +00:00
Isuru Fernando	e631557518	Fix meta function for aten.complex (#160894 ) Closes https://github.com/pytorch/pytorch/issues/160882 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160894 Approved by: https://github.com/mlazos	2025-08-20 16:30:04 +00:00
Charlie West-Taylor	7f201baf41	Allow exposing more functions during initial template expansion (#159554 ) Also adds a `_register_hook` utility, and documents & type annotates `PartialRender`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159554 Approved by: https://github.com/laithsakka, https://github.com/kundaMwiza	2025-08-20 16:08:55 +00:00
Aidyn-A	ce048de608	[ATen][CPU][Sparse] Use Third-Party Eigen for sparse add and addmm (#155357 ) This pull request adds the following ops for sparse matrices using Eigen library: ```python add(a_csr, b_csr) add(a_csc, b_csc) addmm(c_csr, a_csr, b_csr) addmm(c_csr, a_csr, b_csc) addmm(c_csr, a_csc, b_csc) addmm(c_csr, a_csc, b_csr) addmm(c_csc, a_csr, b_csr) addmm(c_csc, a_csr, b_csc) addmm(c_csc, a_csc, b_csc) addmm(c_csc, a_csc, b_csr) ``` Currently, the operations for sparse matrices on CPU are available through MKL only. The non-existence of MKL on `aarch64` causes the unavailability of these ops on any machines with ARM based CPUs, including Apple Silicon, AWS Graviton and NVIDIA Grace. This PR addresses this issue by using Eigen as a backend for the above ops. This is a re-factored version of my previous PR #101814. The main difference with the old one, this does not enable Eigen by default. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155357 Approved by: https://github.com/pearu, https://github.com/eqy	2025-08-20 15:44:54 +00:00
PyTorch MergeBot	90ea9ccefe	Revert "[rfc] add hint_override kwarg to mark_dynamic (#161007 )" This reverts commit 0533ff2ccba7e77622ac3c6758f1032bdc10feff. Reverted https://github.com/pytorch/pytorch/pull/161007 on behalf of https://github.com/jeffdaily due to failing on both cuda and rocm ([comment](https://github.com/pytorch/pytorch/pull/161007#issuecomment-3206893756))	2025-08-20 15:31:33 +00:00
PyTorch MergeBot	6ea4be1e2e	Revert "[dynamic shapes] unbacked-safe slicing (#157944 )" This reverts commit 2f0cba934de7094a66c6ce68f5e937254f23142a. Reverted https://github.com/pytorch/pytorch/pull/157944 on behalf of https://github.com/seemethere due to This is blocking internal sync due to merge conflicts ([comment](https://github.com/pytorch/pytorch/pull/157944#issuecomment-3206833193))	2025-08-20 15:16:45 +00:00
Joshua Su	a818fa77e3	Back out "Deprecate overleap functions in CUDAAllocatorConfig, use AcceleratorAllocatorConfig instead (#156165 )" (#160999 ) Summary: reverting this diff since it caused S551328. Please see D80217492 for details. Test Plan: NA Rollback Plan: Differential Revision: D80553314 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160999 Approved by: https://github.com/izaitsevfb, https://github.com/jingsh	2025-08-20 15:04:36 +00:00
Mwiza Kunda	5ee464db5c	[inductor] Fix descriptor broadcasting for singleton dimensions (#160310 ) This fixes the case when an input / output contains both zero strides and singleton dimensions. In this case the broadcasting dimensions generated for the descriptor need to ignore dimensions that have zero strides with size 1, otherwise the determination of which dimensions to broadcast will fail. As an example, consider the following store instruction: ``` name=buf1 index=x2 + 192y0 + 64y1 valule=TritonCSEVariable('tmp7') params = BlockParameters( shape=[3, 4, 1, 1, 64], block_shape=[((YBLOCK + 3)//4), Min(4, YBLOCK), 1, 1, XBLOCK], strides=[64, 192, 0, 0, 1], offsets=[(yoffset//4), ModularIndexing(yoffset, 1, 4), 0, 0, xoffset] ) broadcasting_dims=[False, False, True, True, False] broadcast_shape=[((YBLOCK + 3)//4), Min(4, YBLOCK), XBLOCK] ``` Because `len(self.broadcasting_dims) != len(self.broadcast_shape)`, dim3 is incorrectly marked as a broadcast dimension when the pre-broadcast shape is computed in `codegen_broadcast_and_reshape`. ``` 9 pre_broadcast_shape = [ 280 sympy.S.One if is_broadcasting else dim 281 for dim, is_broadcasting in zip( 282 -> self.broadcast_shape, self.broadcasting_dims 283 ) 284 ] ``` The pre_broadcast_shape is now wrong: `[((YBLOCK + 3)//4), Min(4, YBLOCK), 1]` Triton throws the following error: `reshape() cannot change total number of elements in tensor` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160310 Approved by: https://github.com/blaine-rister	2025-08-20 09:48:58 +00:00
bobrenjc93	0533ff2ccb	[rfc] add hint_override kwarg to mark_dynamic (#161007 ) The motivation for this change can be seen through the following example: ``` import torch GPU_TYPE = "cuda" @torch.compile def no_override(x): return x.sum(dim=0) @torch.compile def override(x): return x.sum(dim=0) x_small = torch.randn(4096, 512, device=GPU_TYPE) no_override(x_small) torch._dynamo.decorators.mark_dynamic(x_small, 0, hint_override=4096 * 1000) override(x_small) ``` Previously, when reductions were split, codegen relied only on the first observed shape. With a small input, this resulted in a small split size: ``` def triton_per_fused_sum_1(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr): xnumel = 512 r0_numel = 32 ``` With the new scheme, inductor honors hint_override during codegen, producing larger and more appropriate split sizes: ``` def triton_red_fused_sum_0(in_ptr0, out_ptr0, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr): xnumel = 16384 r0_numel = 128 ``` This addresses a broader problem with dynamism: performance and numerics previously depended on whichever shape was seen first. For example: ``` f(s0) -> f(s2) f(s1) -> f(s2) ``` could generate different kernels. With the new approach, an explicit override pins the chosen configuration: ``` f(s0, hint_override=s0) -> f(s2) f(s1, hint_override=s0) -> f(s2) ``` ensuring consistent kernel generation regardless of input order. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161007 Approved by: https://github.com/jansel	2025-08-20 07:51:09 +00:00
Nick Riasanovsky	a9fabeb012	[BE] Fix old TMA API in persistent matmul template (#161030 ) Summary: Fixes a bug introduced by https://github.com/pytorch/pytorch/pull/159407 Test Plan: NA Rollback Plan: Differential Revision: D80588320 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161030 Approved by: https://github.com/adamomainz, https://github.com/NikhilAPatel, https://github.com/nmacchioni, https://github.com/aakhundov	2025-08-20 05:53:57 +00:00
FFFrog	0f801a510f	Using std::vector or c10::SmallVector instead of CArray (#160959 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160959 Approved by: https://github.com/Skylion007	2025-08-20 05:32:29 +00:00
dolpm	576a0e64ed	[nativert] ensure that moveable outputs are set in other executionframe ctor (#161005 ) Summary: so we use this constructor in HigherOrderKernel. problems arise in the loop condition, where it's possible for an output from the prev. iteration to be an input to the next. so the Output(N) of a kernel may be the Input(M) to a kernel in the next iteration. Thus, if the output value is reset (via. fastresizetozero) or overwritten by a prev. kernel before it is to be used, we have major major issues. we need to enforce that outputs are moved, not copied, to ensure this doesn't happen. Test Plan: buck2 test //caffe2/test:test_export --local-only -- test_while_loop_tensor_constant_idx_cpp_runtime_nonstrict Rollback Plan: Differential Revision: D80565374 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161005 Approved by: https://github.com/SherlockNoMad	2025-08-20 05:05:32 +00:00
Menglu Yu	a3fe1ced40	[Optimus][decompose_mm] Fix BooleanAtom corner case (#160987 ) Summary: We observe a case where the BooleanAtom does not support regular sum op for bool exp, thus we fix it by using bool() Rollback Plan: Differential Revision: D80550876 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160987 Approved by: https://github.com/Yuzhen11, https://github.com/mlazos	2025-08-20 04:36:12 +00:00
PyTorch UpdateBot	7e4bfa74ea	[vllm hash update] update the pinned vllm hash (#161020 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161020 Approved by: https://github.com/pytorchbot	2025-08-20 04:15:50 +00:00
Teja Rao	d8fcb2a4ac	[dcp_poc] Fix parameter order in distributed checkpoint API to use path-first for consistency (#160986 ) Summary: This commit standardizes the parameter order across PyTorch's experimental distributed checkpoint (DCP) API, changing all checkpoint operations from (state_dict, path) to (path, state_dict) for consistency with standard file I/O patterns. Test Plan: sandcastle tests Rollback Plan: Differential Revision: D80549014 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160986 Approved by: https://github.com/pradeepfn	2025-08-20 04:09:18 +00:00
Sandeep Narendranath Karjala	2b62ef7420	Add kernel information JSON generation for AOTI packages (#160540 ) Summary: Build on D80031559. Generate kernel_information.json in AOTI compiled artifacts by combining stack traces and node mappings from provenance tracking. This implementation delivers exactly what Zoomer team requested: 1. Core Function: `create_kernel_information_json()` in debug.py combines 3 data sources: - `_inductor_kernel_stack_trace` → `stack_traces` field - `_inductor_triton_kernel_to_post_grad_node_info` → `post_grad_nodes` field - `_inductor_post_to_pre_grad_nodes["postToPre"]` → `pre_grad_nodes` field 2. AOTI Integration: codecache.py writes `kernel_information.json` to pt2 packages when both AOTI packaging and provenance tracking are enabled. 3. Test Coverage: TestKernelInformationAOTI class validates: - JSON file creation in AOTI packages using zipfile - Exact format compliance - Proper disabling without provenance tracking Output Format (exact specification): ```json { "triton_kernel_name_1": { "stack_traces": [str, str, ...], "post_grad_nodes": [str, str, ...], "pre_grad_nodes": [str, str, ...] } } ``` Test Plan: ``` buck test fbcode//caffe2/test/inductor:provenance_tracing -- TestKernelInformationAOTI ``` Manual validation: ```python import torch model = torch.nn.Linear(10, 1) with torch._inductor.config.patch("aot_inductor.package", True): with torch._inductor.config.patch("trace.basic_provenance_tracking", True): # AOTI compilation should generate kernel_information.json compiled = torch.export.export(model, (torch.randn(1, 10),)) ``` --- Rollback Plan: Differential Revision: D80139160 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160540 Approved by: https://github.com/yushangdi	2025-08-20 02:33:45 +00:00
Lucas Kabela	54cc63b467	[BE][Dynamo] Type coverage for symbolic_convert (#160922 ) As part of better engineering, we add type coverage to `dynamo/symbolic_convert.py`, which is the main work engine of dynamo for emulating python bytecode. Running ``` mypy torch/_dynamo/symbolic_convert.py --linecount-report /tmp/coverage_log ``` \| -------- \| Lines Annotated \| Lines Total \| % lines covered \| Funcs Annotated \| Funcs Total \| % funcs covered \| \| -------- \| ------- \| -------- \| ------- \| ------- \| ------- \| ------- \| \| Main \| 764 \| 4286 \| 17.83% \| 43 \| 241 \| 17.84% \| \| This PR \| 4322 \| 4322 \| 100.00% \| 241 \| 241 \| 100.00% \| \| Delta \| +3558 \| +36 \| +82.17% \| +198 \| 0 \| +82.16% \| Pull Request resolved: https://github.com/pytorch/pytorch/pull/160922 Approved by: https://github.com/StrongerXi	2025-08-20 01:24:31 +00:00
zhxchen17	599f639ddb	[dynamo] Refactor transform() so that instruction translator can be used as a tracing function. [2/n] (#160815 ) We are refactoring dynamo code for convert frame so that we can have modularized pieces sharable between different compiler frontends (e.g. torch.compile, precompile and torch.export). This PR follows the last one which separates out the part to run instruction translator on a given frame and return a DynamoTracerOutput. The end result is a free function that runs instruction translator independently. A follow-up diff will wrap the low level function. Differential Revision: [D80388694](https://our.internmc.facebook.com/intern/diff/D80388694/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160815 Approved by: https://github.com/anijain2305 ghstack dependencies: #160814	2025-08-20 01:16:35 +00:00
Simon Fan	72e4786d16	[dynamo][dist] trace DeviceMesh's get_local_rank and get_rank as constants (#160805 ) Used in https://github.com/pytorch/torchtitan/pull/1555 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160805 Approved by: https://github.com/StrongerXi, https://github.com/mlazos	2025-08-20 01:12:24 +00:00
CaoE	371909cfd1	[Inductor][CPP] Add float16 support for CppMicroGemmAMX (#147368 ) Add float16 support for CppMicroGemmAMX for float16 gemm template. Float16 CppMicroGemmAMX needs a higher version of compiler, e.g., GCC 13. Pull Request resolved: https://github.com/pytorch/pytorch/pull/147368 Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel	2025-08-20 01:04:05 +00:00
Mikayla Gawarecki	78a8e6a671	Add new_empty (with dtype argument only) to torch::stable (#159508 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159508 Approved by: https://github.com/janeyx99 ghstack dependencies: #160557	2025-08-20 00:50:42 +00:00
Jagadish Krishnamoorthy	543896fcf3	test_matmul_cuda: Refine MX test skipping (#161009 ) Replace return unittest.skip with raise unittest.SkipTest to ensure that the test suite correctly reports skipped tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/161009 Approved by: https://github.com/jeffdaily	2025-08-20 00:47:45 +00:00
Anshul Sinha	a3a82e3da8	[FSDP][Replicate] replicate tests for param registration and input device movements (#160147 ) Summary: In order to ensure that replicate acts as intended (a specialized version of hsdp) we need to make sure that it can pass the same tests that fully_shard can for training. To this end, I have added three test cases, one to test input device movement and the other two to test parameter registration during the forward and backward pass of a model. Test Cases 1. pytest test/distributed/_composable/test_replicate_training.py -k test_root_move_forward_input_to_device 2. pytest test/distributed/_composable/test_replicate_training.py -k TestReplicateRegisteredParams Pull Request resolved: https://github.com/pytorch/pytorch/pull/160147 Approved by: https://github.com/weifengpy ghstack dependencies: #160135, #160136	2025-08-20 00:47:00 +00:00
Ke Wen	9d7cecdd6c	[SymmMem] Support rendezvous on view of a tensor (#160925 ) `tensor.view` shares the same `data_ptr()` as the original tensor, thus cannot serve as key to rendezvous' map (we want a 1:1 match between handle and tensor, thus need a unique key). @ezyang suggests using the raw `TensorImpl` of a tensor, for which `tensor.view` would have a different value than the original tensor. But the raw `TensorImpl` can be stumbled on again when a previous tensor gets deallocated and a new one allocated. For that reason, we'd also need to use a `weak_intrusive_ptr` to distinguish the two tensors, i.e. for the deallocated tensor, `weak_intrusive_ptr::expired()` would return true. Added `test_rendezvous_view` and `test_rendezvous_same`. Note: the view support has been added to NVSHMEM backend and NCCL backend. For CUDA backend, I have yet to investigate. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160925 Approved by: https://github.com/ngimel ghstack dependencies: #160825	2025-08-19 23:49:25 +00:00
Natalia Gimelshein	0d19541284	fabric detection - fix build on an old toolkit (#160984 ) Fixes #160960 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160984 Approved by: https://github.com/eqy	2025-08-19 23:43:36 +00:00
eqy	e836323a23	[FP8][cuBLAS][SM100] cuBLAS doesn't support rowwise-scaling on `sm100` (#160693 ) See also: https://docs.nvidia.com/cuda/cublas/#id93 Only tensor-wide scales and 1D scales with tiled layout are supported. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160693 Approved by: https://github.com/nWEIdia, https://github.com/Skylion007	2025-08-19 23:22:51 +00:00
Colin Peppler	512fc768e9	Add tlparse artifact for joint graph passes (for inference & non-freezing only) (#160589 ) Summary: Joint graph passes run several FX passes which can modify the graph before it hits Inductor. There's three usages of joint graph passes: - for inference & not freezing (we add structured loggings only for this) - for inference & freezing - for fw/bw split Rollback Plan: Reviewed By: yushangdi Differential Revision: D80130321 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160589 Approved by: https://github.com/yushangdi	2025-08-19 23:18:40 +00:00
Xilun Wu	a7b5955ea8	[ContextParallel] add Document Masking test (#160700 ) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #160700 Summary add test case to CP + FlexAttention for Document Masking Test `pytest test/distributed/tensor/test_attention.py -s -k test_ring_flex_attention_document_mask` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160700 Approved by: https://github.com/fegin	2025-08-19 23:03:18 +00:00
PyTorch MergeBot	e83825f91c	Revert "handling special case for pow(3) for GPU (#157537 )" This reverts commit 05e8fac4f374c4dbf0cd0e85e925e9112cf234a2. Reverted https://github.com/pytorch/pytorch/pull/157537 on behalf of https://github.com/malfet due to This is really really bad from performance point of view, wonder if any benchmarks will detect that ([comment](https://github.com/pytorch/pytorch/pull/157537#issuecomment-3202661810))	2025-08-19 22:57:45 +00:00
Pian Pawakapan	33c3794533	[dynamic shapes] use prims_common contiguity in create_example_tensors (#160933 ) Summary: forward fix T234739699 Test Plan: T234739699 Rollback Plan: Differential Revision: D80503451 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160933 Approved by: https://github.com/henrylhtsang	2025-08-19 22:43:13 +00:00
Jane Xu	8f766d6839	Add ScalarType -> shim conversion, add stable::Tensor.scalar_type (#160557 ) TL;DR: Moving to ScalarType in user extensions and removing deprecated dtypes. This change _modifies_ the from/to behavior between ScalarType and StableValue! Whereas before, user extensions could only in abstract pass around obfuscated dtypes appearing as int32_ts, now, users can confidently use torch::headeronly::ScalarType in their extensions for major scalar types. This PR enables ABI stability by adding a translation layer through the shim, so that even if the ScalarType enum values change in the future, user extensions need not fear. Then we add a Tensor scalar_type API which reuses the from/to logic to return to the user a nice ScalarType (vs an abstracted int32_t). I then changed the test to test the scalar_type API. This code change required some refactoring because of circular dependencies. ## BC Breaking note This commit is (narrowly) BC-breaking for unpopular dtypes: `quint`s, `qint`s, `Bits`, `dummy_uint`s, `dummy_int*`s, `Float8_e8m0fnu`, and `Float4_e2m1fn_x2` in the narrow use case where an extension retrieves a Tensor dtype of the above and passes it into `aoti_torch_call_dispatcher`. As of now, I believe there are 0 users of this use case, so the benefits of this change significantly justify BC-breaking this API. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160557 Approved by: https://github.com/mikaylagawarecki, https://github.com/malfet	2025-08-19 22:13:47 +00:00
Raman Kumar	05e8fac4f3	handling special case for pow(3) for GPU (#157537 ) follows #152373 Special case for pow(3): Similar to the [CPU kernel](`d27d36136c/aten/src/ATen/native/cpu/PowKernel.cpp (L64)`), added corresponding GPU code for numerical stability. issue #150951 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157537 Approved by: https://github.com/soulitzer	2025-08-19 21:57:08 +00:00
Zhengxu Chen	f90ccad165	[export] Relax FC requirement of serde.deserialize by allowing unknown fields. (#160918 ) Summary: Previously we passed all serialized data to dataclass ctors. Now we just loop over all the existing fields in the dataclass and fetch only the fields we need to run the ctor. This should help with the case when we are deserializing a buffer with a new field. Test Plan: CI Rollback Plan: Differential Revision: D80487716 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160918 Approved by: https://github.com/angelayi	2025-08-19 21:54:46 +00:00
Rob Timpe	35e4d97e04	[dynamo] Support builtin complex with constant args (#160799 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160799 Approved by: https://github.com/guilhermeleobas, https://github.com/mlazos	2025-08-19 20:38:54 +00:00
Jazlyn Li	66166cf1e7	preserve node meta to fix inductor generated kernel name for pattern matched graphs (#160542 ) Summary: When using inductor pattern matcher to replace graphs, the graph generated by replacement function can be missing `original_aten` metadata for the replaced nodes. This further results in inductor failing to generate a sensible kernel name, eg. `tri_poi_fused_0` , missing the aten op name. This diff attempts to fix that by allowing tracing the graph in replacement function with `preserve_node_meta`. Included this as an option to turn on in `pattern_matcher.fwd_only` function. Can confirm that with the fix, MTIA's pattern matcher replaced original graph with a node that has original_aten meta, and inductor generated kernel name has op name. Test Plan: added kernel_name check to afg_inductor_test silu test Rollback Plan: Differential Revision: D80183670 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160542 Approved by: https://github.com/eellison, https://github.com/bdhirsh	2025-08-19 20:32:17 +00:00
PyTorch MergeBot	eba20d2d74	Revert "[WIP] Merge Test (#160998 )" This reverts commit ef761c43538abae5bccc0c4b6ebaf42ff676db7a. Reverted https://github.com/pytorch/pytorch/pull/160998 on behalf of https://github.com/ZainRizvi due to Undoing test merge ([comment](https://github.com/pytorch/pytorch/pull/160998#issuecomment-3202125839))	2025-08-19 20:30:39 +00:00
John Stawinski	ef761c4353	[WIP] Merge Test (#160998 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160998 Approved by: https://github.com/ZainRizvi	2025-08-19 20:26:07 +00:00
Will Constable	1ea918caf9	[C10D] Make MultiProcContinuousTest less spammy (#160821 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160821 Approved by: https://github.com/fduwjj ghstack dependencies: #160892	2025-08-19 20:17:19 +00:00
Will Constable	779fc29c04	[C10D] Fix spelling of MultiProcContinuousTest (#160892 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160892 Approved by: https://github.com/fduwjj	2025-08-19 20:17:19 +00:00
Aaron Gokaslan	ed8bcccf31	[BE][Ez]: Update ruff to 0.12.9 (#160896 ) Updates ruff. Fixes false positives and other miscellaneous ruff linting and formatting fixes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160896 Approved by: https://github.com/zou3519	2025-08-19 19:56:24 +00:00
Ke Wen	9d9cc9897a	[SymmMem] Support rendezvous on slice of a tensor (#160825 ) When we search for a NVSHMEM allocation backing a tensor, don't limit it to an exact match between `tensor.data_ptr()` and `allocation.base_ptr`. Instead, test whether the former is within an allocation range, i.e. [base_ptr, base_ptr + size). This PR also squashed in original base PR #160795: Since (i) `handle = rendezvous(tensor)`, and (ii) we pass `handle->buffer_ptrs` to kernels, `handle` should carry the `data_ptr()` of tensor instead of the base address of a memory allocation (previous case). Pull Request resolved: https://github.com/pytorch/pytorch/pull/160825 Approved by: https://github.com/Skylion007, https://github.com/ngimel	2025-08-19 19:08:45 +00:00
Markus Hoehnerbach	65d21dae18	[inductor] dont reuse buffers if it affects peak (#145883 ) (#159530 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159530 Approved by: https://github.com/eellison	2025-08-19 19:02:56 +00:00
atalman	62db8ec391	windows python 3.14 nightly builds (#159869 ) Related to https://github.com/pytorch/pytorch/issues/156856 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159869 Approved by: https://github.com/malfet, https://github.com/williamwen42	2025-08-19 18:36:16 +00:00
Mengtian Xu	5dad5b4f57	[AIDIR] Revise the insight content (#160649 ) Summary: Make it more descriptive and understandable to the user. Rollback Plan: Differential Revision: D80218659 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160649 Approved by: https://github.com/jingsh	2025-08-19 18:04:49 +00:00
Huy Do	fab5dac734	Tweak dependabot to run inductor jobs (#160935 ) After https://github.com/pytorch/pytorch/pull/160635, I can see dependabot creating the PR to bump `transformers` version at https://github.com/pytorch/pytorch/pull/160807. This is a good start, but there are several tweaks we need: 1. Run inductor tests on the PR including one round of perf benchmark, which is always needed. So, we need `ciflow/inductor` label and a `pull_request` trigger for the benchmark 2. Per @anijain2305 feedback, we don't need to update patch version. So, I add a rule to ignore it. Again, we would need to test this out after this lands. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160935 Approved by: https://github.com/anijain2305	2025-08-19 17:56:07 +00:00
Nikita Shulga	a44a0d3671	[MPS] Fix index_add for complex + int64 (#160926 ) By re-using deterministic algorithm from `bbc7c03e93/aten/src/ATen/native/cuda/Indexing.cu (L1106-L1113)` Fixes https://github.com/pytorch/pytorch/issues/160845 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160926 Approved by: https://github.com/manuelcandales ghstack dependencies: #160850, #160889	2025-08-19 17:43:06 +00:00
Pian Pawakapan	2f0cba934d	[dynamic shapes] unbacked-safe slicing (#157944 ) Generates new unbacked symbols for slice output size & storage offset, when appropriate semantics are unclear. Teaches inductor to codegen the slice with flexible semantics. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157944 Approved by: https://github.com/laithsakka	2025-08-19 17:32:47 +00:00
Sam Anklesaria	0a5ab612dd	Port amax to stable ABI (#160214 ) To enable porting torchaudio to the stable ABI, we need the `amax` operation to be accessible. This PR ports the op and provides tests that it behaves correctly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160214 Approved by: https://github.com/mikaylagawarecki	2025-08-19 17:24:53 +00:00
Jeff Daily	1fbe230b0d	forward fix #160747 (#160981 ) broke rocm inductor tests Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/160981 Approved by: https://github.com/jeffdaily, https://github.com/Skylion007 Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-19 17:16:41 +00:00
PyTorch MergeBot	eddaaa6c2a	Revert "Recheck Autotune cache on Precompile serialization to prune compilation results (#158656 )" This reverts commit 664005662ad8c9aa1942015397048aa9ca14fd6d. Reverted https://github.com/pytorch/pytorch/pull/158656 on behalf of https://github.com/seemethere due to failing internal tests, see D80486843 ([comment](https://github.com/pytorch/pytorch/pull/158656#issuecomment-3201491561))	2025-08-19 16:53:20 +00:00
Richard Barnes	fecc5f6001	[codemod] Fix unused-local-typedef issue in caffe2/aten/src/ATen/native/cuda/CUDALoops.cuh +2 (#160944 ) Summary: LLVM has a warning `-Wunused-local-typedef` which we are enabling to remove unused code. This has the side-effect of making it easier to do refactors such as removing unnecessary includes. For questions/comments, contact r-barnes. - If you approve of this diff, please use the "Accept & Ship" button :-) Test Plan: Sandcastle Rollback Plan: Differential Revision: D80511128 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160944 Approved by: https://github.com/cyyever, https://github.com/Skylion007	2025-08-19 16:49:29 +00:00
Isuru Fernando	f305019377	[inductor] propagate shapes in CSEVariable (#152198 ) Fixes #149905 Pull Request resolved: https://github.com/pytorch/pytorch/pull/152198 Approved by: https://github.com/eellison	2025-08-19 16:46:38 +00:00
Tialo	50cfe76231	Update checkpoint warning to target PyTorch 2.9 (#160725 ) Follow-up to #160534. Fixes the docstrings and the warning in checkpoint_sequential, which presumably should have same deprecation notice Pull Request resolved: https://github.com/pytorch/pytorch/pull/160725 Approved by: https://github.com/soulitzer	2025-08-19 15:08:50 +00:00
James Wu	9225c61994	Move save guard error throwing to separate phase (#160662 ) This diff makes it so that the portion saving guards that can throw is completely separated from GuardBuilder, and instead in `serialize_guards`. This lets me add a try catch around it for caching precompile later. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160662 Approved by: https://github.com/zhxchen17	2025-08-19 14:46:43 +00:00
PyTorch MergeBot	e3ebf364e6	Revert "Use numpy 1.26.2 for Python 3.9 and 3.10 (#160836 )" This reverts commit 5d9653d90ee003173dd03f93e09fed236500ef06. Reverted https://github.com/pytorch/pytorch/pull/160836 on behalf of https://github.com/malfet due to It broke inductor tests by improving them ([comment](https://github.com/pytorch/pytorch/pull/160836#issuecomment-3200834103))	2025-08-19 13:46:53 +00:00
FFFrog	284b719005	Remove the unnecessary empty file (#160728 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160728 Approved by: https://github.com/Skylion007	2025-08-19 10:54:08 +00:00
FFFrog	daeb3a6094	Using std::make_unique<T>() instead of unique<T>(new T()) (#160723 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160723 Approved by: https://github.com/Skylion007	2025-08-19 10:25:47 +00:00
cyy	5d9653d90e	Use numpy 1.26.2 for Python 3.9 and 3.10 (#160836 ) Because numpy 1.22.4 had reached EOL 3 years ago. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160836 Approved by: https://github.com/malfet	2025-08-19 09:15:06 +00:00
Nick Riasanovsky	df60736410	[BE] [Inductor] Re-Land Support TMA before strict 3.4 cutoff (#160747 ) Summary: Inductor's 3.4 Triton release is the most common used variant of Triton, but if someone is working with an alternative version of Triton this may not match. This moves the version check from 3.4 Triton to any variant that has support for the TMA APIs. Test Plan: Testing the previously failing test `inductor/test_torchinductor_strided_blocks.py::TritonTensorDescriptorTestCUDA::test_welford_non_block_pointer_cuda` Rollback Plan: Differential Revision: D80348643 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160747 Approved by: https://github.com/NikhilAPatel	2025-08-19 07:32:55 +00:00
thenumberouscode	8f31aa97a3	[dynamo] [guard] Add caching for inside torch.compile.disable function to avoid unnecessary recompilation. (#160934 ) Fixes #157399 cherry pick of d6a5c03 @mlazos Pull Request resolved: https://github.com/pytorch/pytorch/pull/160934 Approved by: https://github.com/mlazos	2025-08-19 06:01:26 +00:00
Nikita Shulga	29afde2020	[CD] Build libtorch without nvshmem (#160910 ) It was done once for cuSparseLT in `f01d7105b1` , now it's nvShmem's time Fixes https://github.com/pytorch/pytorch/issues/160762 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160910 Approved by: https://github.com/Skylion007	2025-08-19 05:58:25 +00:00
David Berard	8dbe7f99bd	[BE][inductor] tl.dot(..., allow_tf32=...) -> tl.dot(..., input_precision=...) (#160711 ) allow_tf32 is deprecated. Also, this will make it easier to support tf32x3 (i.e. #160359). dashboard results on h100 show no change: [inference](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2011%20Aug%202025%2017%3A01%3A22%20GMT&stopTime=Mon%2C%2018%20Aug%202025%2017%3A01%3A22%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=gh/davidberard98/399/orig&lCommit=ce12d0fd751a733f22b5bdda00bd58d323e0a526&rBranch=main&rCommit=e444cd24d48b3a46f067974f2cc157f5ed27709f), [training](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2011%20Aug%202025%2017%3A01%3A22%20GMT&stopTime=Mon%2C%2018%20Aug%202025%2017%3A01%3A22%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(h100)&lBranch=gh/davidberard98/399/orig&lCommit=ce12d0fd751a733f22b5bdda00bd58d323e0a526&rBranch=main&rCommit=e444cd24d48b3a46f067974f2cc157f5ed27709f) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160711 Approved by: https://github.com/PaulZhang12, https://github.com/njriasan	2025-08-19 05:27:10 +00:00
PyTorch UpdateBot	1d46aa736f	[audio hash update] update the pinned audio hash (#160930 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160930 Approved by: https://github.com/pytorchbot	2025-08-19 04:22:55 +00:00
PyTorch UpdateBot	2cf69fe0e1	[vllm hash update] update the pinned vllm hash (#160929 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160929 Approved by: https://github.com/pytorchbot	2025-08-19 04:22:45 +00:00
dolpm	923bc46122	fix mul.Scalar with strided tensor (#160560 ) Summary: out variant has to be strided like self. since memory format isn't provided, this should be equivalent. Test Plan: prev. when we enable static dispatch this test would have numeric issues ``` buck2 test //caffe2/test:test_export -- test__scaled_dot_product_flash_attention_cpp_runtime_nonstrict --print-passing-details ``` Rollback Plan: Reviewed By: SherlockNoMad Differential Revision: D80191085 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160560 Approved by: https://github.com/SherlockNoMad	2025-08-19 04:15:12 +00:00
Paul de Supinski	58f9a3dd63	[ez] Only use default numa bindings if nproc == cuda device count (#160848 ) # Context Another fix to enable broad rollout of #149334. The implementation assumes that the trainer process with local rank `n` only uses device `cuda:n`. However, there are sometimes jobs with more than one GPU per process, in which case our assumption could be incorrect and actually lead to worse memory locality. # This PR As titled. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160848 Approved by: https://github.com/kiukchung	2025-08-19 02:50:01 +00:00
Will Feng	a391fa1c42	Make Inductor benchmarker more compatible with Triton do_bench (#160921 ) Common benchmark suites like TritonBench use `triton.testing.do_bench` for kernel timing measurement which is not always fair for all backends. E.g. it includes torch.compile Dynamo invocation overhead and hence doesn't reflect real-world model use case where Dynamo overhead is usually hidden. I also opened a PR to use this timing measurement function on TritonBench side: https://github.com/meta-pytorch/tritonbench/pull/333. But regardless of whether that PR can land, I think we should enhance Inductor benchmark_gpu to match do_bench features, to make it easier for people to migrate. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160921 Approved by: https://github.com/BoyuanFeng	2025-08-19 02:40:21 +00:00
Yidi Wu	209143ddeb	[while_loop][inductor] fix aliased inputs by cloning (#160668 ) [fx_graph_cse](https://github.com/pytorch/pytorch/blob/main/torch/_functorch/compile_utils.py#L46) is executed in min_cut partitioner which accidentally creates the aliasing for empty buffers and we could see the following graph node for joint graph with cmd: "pytest test/functorch/test_control_flow.py -k test_scan_multiple_layers_gradient_layers_2_device_cpu" ```python while_loop = torch.ops.higher_order.while_loop(while_loop_cond_graph_0_0, while_loop_body_graph_0_0, (full_default_4, empty_strided_default, full_default_2, full_default_3, full_default_2, full_default_3, full_default, full_default, rev, rev_1, rev_2, rev_3), (primals_4, primals_5, primals_6, primals_7)); ``` Notice the operands sequence "full_default_2, full_default_3, full_default_2, full_default_3, full_default, full_default", which indicates that the gradients of different layers now share the same buffer, which creates silent incorrectness. Fixes https://github.com/pytorch/pytorch/pull/158168. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160668 Approved by: https://github.com/zou3519 ghstack dependencies: #160548, #160374	2025-08-19 02:33:59 +00:00
Wang, Chuanqi	b1380f434d	[CD] Disable USE_MPI in XPU CI/CD wheel build (#159135 ) The XPU wheel build needs to source MPI for the distributed XCCL backend build, but it also enables USE_MPI by default. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159135 Approved by: https://github.com/malfet	2025-08-19 02:32:03 +00:00
mori360	e6e45e6ae8	[FSDP] Use post_reduce_stream.record_event() on hsdp+cpuoffload (#160481 ) Fixes https://github.com/pytorch/pytorch/issues/160291 `post_reduce_stream` is `all_reduce_stream` during HSDP, but CPU-GPU sync is hard coded to `reduce_scatter_stream` The hard-code could fail unit test on HSDP+CPU offload, add unit test here. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160481 Approved by: https://github.com/weifengpy	2025-08-19 02:20:14 +00:00
Anshul Sinha	3d126e17e0	[FSDP][Collectives] skipping reduce_scatter when world size is 1 (#160136 ) Summary: In its current state, FSDP collectives use cuda synchronizations and communication ops regardless of what the world size is. However, now that replicate will use FSDP, there will be instances where group size = 1 and these synchronizations and ops will be used needlessly. I have updated fsdp_collectives to skip reduce_scatter in the foreach_reduce API when world_size = 1. I have edited a test that uses CommDebugMode to verify that the reduce_scatter has been removed. I also edited an affected test which used 1-way FSDP by verifying and changing its assert statements for CommDebugMode. I have also added a test command. Test Cases 1. pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_single_worldsize1 2. pytest test/distributed/_composable/test_composability/test_2d_composability.py -k test_tp_with_fsdp_offloading Pull Request resolved: https://github.com/pytorch/pytorch/pull/160136 Approved by: https://github.com/weifengpy ghstack dependencies: #160135	2025-08-19 02:13:30 +00:00
Kevin Fu	8d15af2320	[PT2]: Allow None for wrapped_fbgemm_linear_fp16_weight (#160802 ) Summary: Currently the implementation of [fbgemm_linear_fp16_weight](https://www.internalfb.com/code/fbsource/[ffe8ba561cb6af33fde5b32c27411d6d3f4f2c70]/fbcode/caffe2/aten/src/ATen/native/QuantizedLinear.cpp?lines=477) does not allow None for `bias`, but it's actually a valid case and internally `fbgemm_linear_fp16_weight_fp32_activation` accept None bias as well. For BC reason, we can't directly change the function signature. So wrapping an empty tensor if bias is None to workaround it in Sigmoid. Test Plan: P1906210273 ``` MODEL_TYPE=dpa_product_first_ctr_model MODEL_ENTITY_ID=778442870 SNAPSHOT_ID=6 MODULE=user SUFFIX=.predictor.precompute.remote_request_only buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=Benchmark --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice="" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true --benchmarkNumIterations=10000 &> ~/logs/${MODEL_TYPE}/load_net_predictor_${MODEL_ENTITY_ID}_${SNAPSHOT_ID}_${MODULE} ``` Rollback Plan: Reviewed By: henryoier, hl475 Differential Revision: D80382652 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160802 Approved by: https://github.com/SherlockNoMad, https://github.com/henryoier	2025-08-19 01:46:53 +00:00
zhxchen17	e9209e0854	[dynamo] Refactor tracer logic in convert_frame so that it doesn't leak to outer layer. [1/n] (#160814 ) We are refactoring dynamo code for convert frame so that we can have modularized pieces sharable between different compiler frontends (e.g. torch.compile, precompile and torch.export). One incremental step we can take is to refactor out InstructionTranslator as a functional piece providing bytecode tracing. To separate out this part, we notice currently the tracer object is being passed around in the entire convert frame compile function. This is not very ideal because we want to build a boundary between the tracing and downstream compiler stack. Ideally, we should extract all the relevant information out of the tracer object and return a new data structure that is free of internal states of InstructionTranslator. Luckily, there aren't many data used from tracer, after tracing is finished. The major one is OutputGraph, other than that, we only need to record two boolean flags for error handling purposes. The new type we're adding is called DynamoTracerOutput, which contains all the information needed by torch.compile internal after symbolic convert is finished. To simplify the current PR, we leave out the part which reduce OutputGraph into a minimal set, since this can be done in a separate PR. Differential Revision: [D80388693](https://our.internmc.facebook.com/intern/diff/D80388693/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160814 Approved by: https://github.com/tugsbayasgalan	2025-08-19 01:46:24 +00:00
Pian Pawakapan	4cb31015f2	[dynamic shapes] prims_common non_overlapping_and_dense (#160462 ) Differential Revision: D80120333 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160462 Approved by: https://github.com/laithsakka	2025-08-19 01:35:28 +00:00
PyTorch MergeBot	5e98d9f9ba	Revert "[dynamic shapes] unbacked-safe slicing (#157944 )" This reverts commit 56218d85e2da09d9ede3809718ec989c2151632c. Reverted https://github.com/pytorch/pytorch/pull/157944 on behalf of https://github.com/huydhn due to Sorry for reverting your change but I think this is failing test_draft_export in trunk `56218d85e2` ([comment](https://github.com/pytorch/pytorch/pull/157944#issuecomment-3198874677))	2025-08-19 01:16:17 +00:00
Michael Lazos	5cf6567c1f	[Inductor] add cuda compile cmd to autotuning logging (#160906 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160906 Approved by: https://github.com/henrylhtsang	2025-08-19 01:14:46 +00:00
Shangdi Yu	41b3e80a55	Fix duplicated kernel name in kernel stack trace tracking (#160905 ) Summary: as title. When we have two kernels with the same name, the stack traces should be appended, not overwritten. Test Plan: ``` buck run mode/opt fbcode//caffe2/test/inductor:provenance_tracing ``` Rollback Plan: Differential Revision: D80472731 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160905 Approved by: https://github.com/angelayi	2025-08-19 01:14:34 +00:00
Ting Lu	b6852778ff	Add Magma build for CUDA 13.0 (#160770 ) Add magma build for CUDA 13.0 after almalinux docker is available https://github.com/pytorch/pytorch/issues/159779 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160770 Approved by: https://github.com/atalman Co-authored-by: Andrey Talman <atalman@fb.com> Co-authored-by: Wei Wang <weiwan@nvidia.com>	2025-08-19 01:10:00 +00:00
xinan.lin	1853f71b4f	[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#160403 ) Fixes #160243, Fixes #160244, Fixes #160245 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160403 Approved by: https://github.com/janeyx99	2025-08-19 00:54:51 +00:00
Lakshay Garg	bbc7c03e93	Fix UndefinedGrad::apply (#160572 ) The function incorrectly reserved space in the input parameter instead of the output parameter Pull Request resolved: https://github.com/pytorch/pytorch/pull/160572 Approved by: https://github.com/soulitzer	2025-08-19 00:15:51 +00:00
Justin Chu	dc200066cf	[ONNX] Use onnxruntime 1.22 in CI (#160924 ) Use onnxruntime 1.22 in CI to enable testing of newer opsets and IR versions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160924 Approved by: https://github.com/titaiwangms	2025-08-19 00:05:26 +00:00
Pian Pawakapan	56218d85e2	[dynamic shapes] unbacked-safe slicing (#157944 ) Generates new unbacked symbols for slice output size & storage offset, when appropriate semantics are unclear. Teaches inductor to codegen the slice with flexible semantics. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157944 Approved by: https://github.com/laithsakka	2025-08-18 22:38:16 +00:00
Natalia Gimelshein	0254646654	harden fabric checks for symmetric memory (#160790 ) Now we check only that fabric allocation succeeded, but sometimes we fail during export or import afterwards, with no recourse. Check the full cycle before attempting to allocate memory with the fabric. TODO: move it to c10/cuda so that it can be used from CUDACachingAllocator too Pull Request resolved: https://github.com/pytorch/pytorch/pull/160790 Approved by: https://github.com/Skylion007	2025-08-18 22:35:50 +00:00
dolpm	b439675ae2	[nativert] oss pass graph pass registration (#160859 ) Summary: att Test Plan: CI Rollback Plan: Differential Revision: D80368343 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160859 Approved by: https://github.com/georgiaphillips	2025-08-18 22:23:38 +00:00
PyTorch MergeBot	82c7a1eb4b	Revert "[ONNX] Default to dynamo export (#159646 )" This reverts commit 11b6ceb7b4f81ba02f88652136a93d685c399191. Reverted https://github.com/pytorch/pytorch/pull/159646 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/159646#issuecomment-3198507767))	2025-08-18 21:41:32 +00:00
Wei Wang	16ada80c61	[BE][CUDA][Distributed] Add require_exact_world_size() and a few distributed unit test fixes (#160803 ) 1. Add require_exact_world_size() 2. Decorate the test `test_new_subgroups_with_group_param` with this require_exact_world_size(4) as the test would fail with world_size of 8 when testing with 8xB200 runner. 3. Modify `test_new_subgroups_world_size_not_divisible_by_group_size` so that it will not fail due to 4 vs. 8 mismatch. Doing so makes the test pass with both 4-GPU runner and 8-GPU runner. Separating these changes out from B200 distributed runner PR #159323 Fixes https://github.com/pytorch/pytorch/issues/159987 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160803 Approved by: https://github.com/fduwjj	2025-08-18 21:15:33 +00:00
Klaus Zimmermann	c27d6df1ea	For sdists, replace symlink with copy for docs requirements (#157811 ) Before this change, there was the requirements file `.ci/docker/requirements-docs.txt` which was symlinked as `../.ci/docker/requirements-docs.txt` from `docs/requirements.txt` since #151796. In this situation, [because `.ci` is excluded from the source tarball](`3173616532/.github/workflows/create_release.yml (L67)`), we end up with a broken symlink, that additionally is [invalid in a Python source distribution](https://packaging.python.org/en/latest/specifications/source-distribution-format/#unpacking-without-the-data-filter). The broken symlink can be confirmed in [the rc sources](https://github.com/pytorch/pytorch/actions/runs/15892205745). ~After this change, there is still a single source of truth, which now is `docs/requirements.txt`, symlinked as `../docs/requirements.txt` from `.ci/docker/requirements-docs.txt`, which would also be invalid in a Python source distribution, but is not included in the tarball (see above). Additionally, the docs requirements that were missing from the previous tarball, are now actually included, allowing users to build the documentation again.~ @malfet clarified offline that there is a problem with the docs workflows because they use a cache with a key that includes the hash of the requirements document in the `.ci` folder, which now does no longer change when the requirements change. Hence, a different solution is needed~, though for now the problem remains~. The solution in this PR is simply to copy the actual document to replace the symlink just prior to creating the source distribution. This way, a single document needs to be maintained, git checkouts remain as they are, and the source distributions contain the before-missing document. A better solution may be implemented at a later stage with a better build system. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157811 Approved by: https://github.com/atalman	2025-08-18 21:10:44 +00:00
Mitchell, Frost	d910cb3b2d	[cpp][inductor] Fix crash on bmm when input is used twice. (#160087 ) Fixes #156412 For torch.bmm using CPP generated template code, when the input is used as both the first and second weights, the generated code will simplify so it only passes one input instead of 2. However, if the weights are being repacked and saved for more efficient data-loading patterns, then we need to save both inputs instead of just one. This PR fixes this issue. ## Test code: ```python import torch @torch.compile(mode="max-autotune") def my_function(x, y): return torch.bmm(x, x) # Test x = torch.randn(2, 3, 3) y = torch.randn(2, 3, 3) result = my_function(x, y) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160087 Approved by: https://github.com/guangyey, https://github.com/jansel	2025-08-18 20:34:14 +00:00
Ryan Guo	a1a555ed7b	[dynamo] Fix graph break on calling functions decorated with special context manager (#160703 ) As title. This is a follow-up of the previous patch, with the goal of supporting a new pattern that showed up in ComfyUI: `644b23ac0b/comfy/ops.py (L44)` Effectively, the semantics of calling a function decorated with a context manager is: ```python @ctx_manager(args) def f(x): ... f(x) # -----> with ctx_manager(args): f.__wrapped__(x) ``` Yes, a fresh context manager instance per invocation, see CPython source code: https://github.com/python/cpython/blob/3.12/Lib/contextlib.py#L119-L122 So Dynamo already 1. knows how to handle the `with ctx_manager(args)` syntax, and has special handling for a few torch native context managers, like `sdpa_kernel` in this patch. 2. can trace through a good chunk (at least the ones that matter in this case) of contextlib. This patch just lets Dynamo trace a bit more into contextlib, and then keep the torch-native special cases by moving their handling a bit down the stack, so that no additional logic is introduced -- it's only refactored. This also allows us to get rid of some `_sdpa_kernel_variadic` special handling, since now we will trace through its code, and it boils down to `sdpa_kernel` anyways. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160703 Approved by: https://github.com/guilhermeleobas, https://github.com/mlazos ghstack dependencies: #160684	2025-08-18 20:33:45 +00:00
Ryan Guo	72b559b2c8	[dynamo] Fix crash and silent incorrectness issues in `attention.sdpa_kernel` calls with kwargs (#160684 ) This patch fixes 2 issues, illustrated by the test cases added: 1. using `sdpa_kernel(backends=..., set_priority=...)` due to an internal assert that forgot to be updated after #147768. 2. forgetting to convert the `set_priority` VariableTracker back to a python constant so that its value is properly used by `sdpa_kernel`, also from #147768. I ran into (1) because ComfyUI had a recent update that actually uses this pattern `644b23ac0b/comfy/ops.py (L44)`, and then noticed (2), and fixed it conveniently. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160684 Approved by: https://github.com/mlazos	2025-08-18 20:33:45 +00:00
cyy	1f19003694	Use py3.10 for ONNX CI jobs (#160852 ) Use Python 3.10 for ONNX jobs because Python 3.9 is near EOL and further ONNX versions drop 3.9 support. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160852 Approved by: https://github.com/justinchuby, https://github.com/malfet	2025-08-18 19:37:47 +00:00
Shangdi Yu	4e90441133	Add signpost to provenance tracking error (#160755 ) Summary: As title, add signpost to better track error when computing provenance tracking related debugging information Test Plan: CI Rollback Plan: Differential Revision: D80292285 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160755 Approved by: https://github.com/angelayi	2025-08-18 19:17:47 +00:00
Xinya Zhang	bfcae7e1c1	[ROCm] Fix Sliding Window Attention in AOTriton integration code (#159773 ) AOTriton implements Sliding Window Attention (SWA) as a more generalized version of causal masks and also needs an atomic counter for dynamic workload allocation. Fixes #158308 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159773 Approved by: https://github.com/jeffdaily	2025-08-18 18:45:58 +00:00
Michael Lazos	01bba62e21	Remove unused test code (#160823 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160823 Approved by: https://github.com/Skylion007	2025-08-18 18:37:52 +00:00
angelayi	6ac9035a84	[aoti-fx] Dynamic shapes support (#160766 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160766 Approved by: https://github.com/jansel ghstack dependencies: #160765	2025-08-18 18:14:08 +00:00
angelayi	bab79824cb	[aoti-fx] Initial AOTInductor FX (#160765 ) Using the existing WrapperFxCodegen backend, this PR prototypes an AOT version of it which will directly return a graph module. How to use: ```python exported_gm = torch.export.export(model, inp, dynamic_shapes=dynamic_shapes).module() compiled_gm = torch._inductor.aot_compile( exported_gm, inp, options={"fx_wrapper": True, "compile_threads": 1} ) assert torch.allclose(model(inp), compiled_gm(inp)) ``` The motivation behind this is that backends like ExecuTorch/MTIA would like to use inductor's optimization technologies, but might have their own graph lowering pipelines so they might not want to use AOTI (which generates an so). Pull Request resolved: https://github.com/pytorch/pytorch/pull/160765 Approved by: https://github.com/jansel	2025-08-18 18:14:08 +00:00
Rob Timpe	162bf78df6	[dynamo] Support itertools.filterfalse (#160596 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160596 Approved by: https://github.com/guilhermeleobas	2025-08-18 18:07:57 +00:00
Michael Lazos	450517f346	[Dynamo][Hierarchical Compile] Flatten tuple inputs for regions (#158812 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158812 Approved by: https://github.com/anijain2305 ghstack dependencies: #158810, #158811	2025-08-18 18:03:11 +00:00
James Wu	664005662a	Recheck Autotune cache on Precompile serialization to prune compilation results (#158656 ) This PR rechecks the autotune cache on Precompile.serialize(), allowing us to ahead of time save autotune results for statically compiled triton kernels, so that warm start does not need to check the autotune cache. It has a few extra changes to make this work: ### Storing source code in TritonBundler - We now store the source_code for statically compiled triton kernels instead of the hash of the source code in TritonBundler, so that we can easily access their source code when rechecking the autotune cache on PrecompileContext.serialize. To make sure that this is not a huge space concern, I ran the entire hugging face benchmark on training. The total space of `/tmp/torchinductor_jjwu/fxgraph` before my change was 1185004 KB (1.18 GB). After my change, this increased to 1207312 KB (1.2 GB), for an increased storage cost of ~1.8%, which seems safe. - We now return early from recheck_autotune_cache if the number of triton kernels being compiled is 1, since there's no reason to check the cache at all in those cases. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158656 Approved by: https://github.com/zhxchen17	2025-08-18 17:55:10 +00:00
Sam Anklesaria	c0a1ae4404	Add `is_cpu` method to stable tensor type (#160212 ) Porting torchaudio to use the stable api requires the `is_cuda` and `dtype` functions. It would be more convenient if these were methods of the stable tensor class rather than utilities one needed to call from the C api. This PR adds them as methods, mirroring how `is_cuda` and `get_device` are already defined. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160212 Approved by: https://github.com/janeyx99	2025-08-18 17:42:43 +00:00
Nikita Shulga	b0071c65e2	[MPS] Fix error check for torch.var on scalar (#160889 ) Fixes https://github.com/pytorch/pytorch/issues/160738 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160889 Approved by: https://github.com/Skylion007 ghstack dependencies: #160850	2025-08-18 17:36:42 +00:00
Guilherme Leobas	c6333f7dae	Fixes for `collections.NamedTuple` (#159367 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159367 Approved by: https://github.com/mlazos ghstack dependencies: #159365, #159366, #159368, #159483, #159902, #159864, #159865	2025-08-18 17:32:59 +00:00
Ting Lu	87d6831b2e	Add CUDA installation script for CUDA 13 (#160201 ) Add the almalinux docker for building magma-cuda 13.0 https://github.com/pytorch/pytorch/issues/159779 Also fixed the NVSHMEM download link Pull Request resolved: https://github.com/pytorch/pytorch/pull/160201 Approved by: https://github.com/atalman Co-authored-by: Andrey Talman <atalman@fb.com>	2025-08-18 17:26:25 +00:00
James Wu	4014672b30	Replace guard_serialization_mode with save_guards, remove load cases (#160531 ) This PR replaces "guard_serialization_mode" into `save_guards`. All cases where we care about whether or not we're loading guards can be inferred automatically from the existing inputs. The only case that's special here is whether or not to check guards. We don't want to check guards on guard load in CheckFnManager, because these guards have already been checked on save. Therefore, we put the setting in OutputGraphGuardsState, so that when we save, we bypass the guards check. Because of this change, it is technically possible to do a load and a save in the same CheckFunctionManager.__init__() by passing all the necessary parts, and also passing `save_guards=True`. This should just work out of the box, but so far no callsites need it, so not super important. Next up, we'll work on removing save_guards from GuardBuilder, and putting it into its own phase. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160531 Approved by: https://github.com/zhxchen17	2025-08-18 17:04:17 +00:00
Peter Y. Yeh	e389a08dcd	AMD/ROCm OCP Micro-scaling Format (mx-fp8/mx-fp4) Support (#151360 ) - This pull request introduces support for the [OCP Micro-scaling (MX) format](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf), with a focus on compatibility with AMD ROCm 7.0 and the gfx950 architecture. This PR also establishes the foundation for enabling MX-FPX features in [TorchAO](https://github.com/pytorch/ao/issues/2229) on the AMD platform. - Validation (ROCm 7.0 + gfx950 required): `111 relevant tests passing.` > PYTORCH_TEST_WITH_ROCM=1 python test/test_matmul_cuda.py -k test_blockwise -v Co-author: @jagadish-amd — Thank you for the efforts leading validation on gfx950 with ROCm 7.0. ----------------------------------- This pull request introduces support for new scalar types and scaling methods, particularly for ROCm 7.0 and gfx950, and refines testing for these features. Key changes include adding constraints for matrix dimensions, enabling block-wise scaling, and updating tests to accommodate new data types. ### Support for new scalar types and scaling methods: * [`aten/src/ATen/cuda/CUDABlas.cpp`](diffhunk://#diff-74fcb26047c1df4024105d36ce22a36b77cf8cc93c28631d743e639b3d6066aeR1876-R1885): Added constraints for matrix dimensions when using `Float8_e8m0fnu` with block-wise scaling, ensuring dimensions are multiples of 32. Updated compatibility checks to support ROCm 7.0 for `Float8_e8m0fnu` and `Float8_e4m3fn`. [[1]](diffhunk://#diff-74fcb26047c1df4024105d36ce22a36b77cf8cc93c28631d743e639b3d6066aeR1876-R1885) [[2]](diffhunk://#diff-74fcb26047c1df4024105d36ce22a36b77cf8cc93c28631d743e639b3d6066aeL1913-R1934) * [`aten/src/ATen/native/cuda/Blas.cpp`](diffhunk://#diff-e8a569efee1e650172f120a0fdcda024fe3e4703a4ee3336425c8f685af6b3abR1276-R1290): Introduced block-wise scaling for `Float8_e8m0fnu`, with checks for ROCm 7.0 and GPU architecture `gfx950`. Added validation for supported scalar types and matrix dimensions. 
[[1]](diffhunk://#diff-e8a569efee1e650172f120a0fdcda024fe3e4703a4ee3336425c8f685af6b3abR1276-R1290) [[2]](diffhunk://#diff-e8a569efee1e650172f120a0fdcda024fe3e4703a4ee3336425c8f685af6b3abR1349-R1364) ### Updates to scalar type mappings: * [`aten/src/ATen/cuda/CUDADataType.h`](diffhunk://#diff-9188bb13b1a49f459141f5f9b875593d1c5ce2beb5ad711fdbaf5bc7089ec015L93-R93): Extended scalar type mappings to support `Float4_e2m1fn_x2` for ROCm 7.0. * [`aten/src/ATen/cuda/tunable/GemmHipblaslt.h`](diffhunk://#diff-bfa1a3b5d4bef1892bf50338775f3b0fd8cd31fc1868148f3968b98aefb68e3fR88-R96): Added a constexpr mapping for `Float4_e2m1fn_x2` based on ROCm version. ### Enhancements to testing(@jagadish-amd): * [`test/test_matmul_cuda.py`](diffhunk://#diff-3f31c52b48cfddf8f4617d809f7695b2e4a1c78656f8c4b5143a4b45d01fcf23R765-R766): Updated tests to include new scalar types (`Float4_e2m1fn_x2`) and recipes (`mxfp4`). Added logic to handle different scaling recipes and validate compatibility with ROCm and CUDA versions. [[1]](diffhunk://#diff-3f31c52b48cfddf8f4617d809f7695b2e4a1c78656f8c4b5143a4b45d01fcf23R765-R766) [[2]](diffhunk://#diff-3f31c52b48cfddf8f4617d809f7695b2e4a1c78656f8c4b5143a4b45d01fcf23L1331-R1356) F592e669L1353R1472) These changes improve compatibility with newer hardware and software versions, enhance functionality for matrix operations, and ensure robust testing for the added features. Pull Request resolved: https://github.com/pytorch/pytorch/pull/151360 Approved by: https://github.com/drisspg, https://github.com/malfet	2025-08-18 16:43:09 +00:00
Animesh Jain	f2be3dc8da	[dynamo][guards] Optimize module getattr access for inline flag (#160864 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160864 Approved by: https://github.com/Lucaskabela ghstack dependencies: #160863	2025-08-18 16:38:46 +00:00
Animesh Jain	b8ff0fd21b	[dynamo][guards] Remove long lines from TORCH_LOGS=guards (#160863 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160863 Approved by: https://github.com/Lucaskabela	2025-08-18 16:38:46 +00:00
Nikita Shulga	6b994c47ca	[MPS][BE] Fix unused vars in GridSampler (#160850 ) This fixes following warnings during the compilation of GridSampler.metal ``` /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/GridSampler.metal:22:23: warning: unused parameter 'input_sizes' [-Wunused-parameter] constant int32_t* input_sizes, ^ /Users/malfet/git/pytorch/pytorch/aten/src/ATen/native/mps/kernels/GridSampler.metal:24:23: warning: unused parameter 'grid_sizes' [-Wunused-parameter] constant int32_t* grid_sizes, ^ 2 warnings generated. ``` Introduced by https://github.com/pytorch/pytorch/pull/160541 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160850 Approved by: https://github.com/cyyever, https://github.com/Skylion007	2025-08-18 16:24:45 +00:00
angelayi	3c8c509a9c	[export] Fix custom ops in subgraphs (#160004 ) Fixes https://github.com/pytorch/pytorch/issues/159995 Currently there are two problems with extern kernels in subgraphs: 1. They don't get serialized to the extern kernel json file because we only look at the toplevel graph. 2. Since the scope of each extern_kernel list is within its own subgraph, the indices referencing the operator is messed up because each subgraph will start counting from 0. So, this PR moves the extern_kernels list to a global view (under virtualized) so that we can count the extern kernels across subgraphs and the toplevel graph. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160004 Approved by: https://github.com/ydwu4	2025-08-18 15:42:19 +00:00
Angela Yi	1091165826	[export] Update move_to_device_pass for to.device (#160528 ) Differential Revision: D80135455 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160528 Approved by: https://github.com/yushangdi	2025-08-18 15:41:48 +00:00
Scott Todd	d91a03f96a	[ROCm] Add HIPConfig.h to .gitignore like CUDAConfig.h. (#159805 ) This file is generated into the source directory by CMake just like `cuda/CUDAConfig.h`, so it seems appropriate to add it to `.gitignore` in the same place: `83ba3f1101/aten/src/ATen/CMakeLists.txt (L39-L47)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159805 Approved by: https://github.com/jeffdaily	2025-08-18 15:34:01 +00:00
Nichols A. Romero	0298ebc97a	[ROCm][inductor][dashboard] Add GPT2ForSequenceClassification to use_larger_multiplier_for_smaller_tensor list (#160001 ) GPT2ForSequenceClassification Hugging Face (HF) model fails on ROCm for bfloat16. The failure is numerically small. This PR adds this model to an exception list for small tensors. The exception list already includes two models. This increases the multiplier factor to 10.0 instead of 3 (default) for this model used in `torch/_dynamo/utils.py`. In the PR comment below, I include a short analysis of the numerics. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160001 Approved by: https://github.com/anijain2305, https://github.com/jataylo, https://github.com/jeffdaily	2025-08-18 15:33:30 +00:00
PyTorch UpdateBot	179511694c	Update slow tests (#160870 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160870 Approved by: https://github.com/pytorchbot	2025-08-18 11:53:41 +00:00
PyTorch UpdateBot	e7c3b77b22	[xla hash update] update the pinned xla hash (#160871 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160871 Approved by: https://github.com/pytorchbot	2025-08-18 11:50:47 +00:00
Sun, Jiayi	95e456fcc5	[inductor] pack linear for FP32 dynamic mode (#157542 ) Summary: Currently, Linear in FP32 dynamic mode(batch_size has free symbols) does not support weight prepacking since MKL Linear does not support dynamic mode. This PR uses oneDNN Linear to support Linear weight prepacking in FP32 dynamic mode. I tested the Inductor benchmark in FP32 dynamic mode on CPU using this PR, and saw ~8% improvement in timm_models geomean speedup, ~2% improvement in torchbench geomean speedup, and no change in huggingface. There are about 18 models with different degrees of performance improvement, among which BERT_pytorch, soft_actor_critic, BlenderbotForCausalLM, ElectraForCausalLM, crossvit_9_240, mobilevit_s, twins_pcpvt_base have more than 20% performance improvement. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157542 Approved by: https://github.com/CaoE, https://github.com/jansel	2025-08-18 10:18:46 +00:00
Sun, Jiayi	de744ca4b1	[Inductor] modify convert_to_reinterpret_view (#158914 ) Summary: Fix https://github.com/pytorch/pytorch/issues/159121, Modify the rules for freezing the layout of `x.unwrap_view()` in `convert_to_reinterpret_view`: relax the condition of `isinstance(x_unwrap_view, (ReinterpretView, Buffer))` to `isinstance(x_unwrap_view, (ReinterpretView, Buffer, MutableBox))`. Prefer channels last format according to how the format of `x_unwrap_view_fx_node` is set from eager. Example: ``` import torch import torch.nn as nn class M(nn.Module): def __init__(self): super(M, self).__init__() self.relu = torch.nn.ReLU() def forward(self, x): n, c, h, w = x.shape return self.relu(x).permute(0, 2, 3, 1).reshape( n, h * w, c ) model = M().eval() x = torch.randn(2, 32, 4, 4).to(memory_format=torch.channels_last) compiled_model = torch.compile(model) with torch.no_grad(): compiled_model(x) ``` Generated code: - before ``` cpp_fused_permute_relu_view_0 = async_compile.cpp_pybinding(['const float', 'float', 'float'], ''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const float in_ptr0, float* out_ptr0, float* out_ptr1) { { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L)) { for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(32L); x1+=static_cast<int64_t>(16L)) { for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(16L); x2+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(32L) && x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(16L))) { alignas(std::max(std::size_t(16), alignof(float))) float tmp0[1616]; transpose_mxn<float,static_cast<int64_t>(16),static_cast<int64_t>(16),false>(in_ptr0 + static_cast<int64_t>(x1 + 32Lx2 + 512Lx0), static_cast<int64_t>(32L), tmp0, static_cast<int64_t>(16)); for (long x1_inner = 0; x1_inner < static_cast<int64_t>(16); x1_inner++) { auto tmp1 = 
at::vec::Vectorized<float>::loadu(tmp0 + static_cast<int64_t>(16Lx1_inner), static_cast<int64_t>(16)); auto tmp2 = at::vec::clamp_min(tmp1, decltype(tmp1)(0)); tmp2.store(out_ptr0 + static_cast<int64_t>(x2 + 16Lx1 + 16Lx1_inner + 512Lx0)); } } } } } } } { #pragma GCC ivdep for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2L); x0+=static_cast<int64_t>(1L)) { for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(16L)) { for(int64_t x2=static_cast<int64_t>(0L); x2<static_cast<int64_t>(32L); x2+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x1 >= static_cast<int64_t>(0) && x1 < static_cast<int64_t>(16L) && x2 >= static_cast<int64_t>(0) && x2 < static_cast<int64_t>(32L))) { alignas(std::max(std::size_t(16), alignof(float))) float tmp0[1616]; transpose_mxn<float,static_cast<int64_t>(16),static_cast<int64_t>(16),false>(out_ptr0 + static_cast<int64_t>(x1 + 16Lx2 + 512Lx0), static_cast<int64_t>(16L), tmp0, static_cast<int64_t>(16)); for (long x1_inner = 0; x1_inner < static_cast<int64_t>(16); x1_inner++) { auto tmp1 = at::vec::Vectorized<float>::loadu(tmp0 + static_cast<int64_t>(16Lx1_inner), static_cast<int64_t>(16)); tmp1.store(out_ptr1 + static_cast<int64_t>(x2 + 32Lx1 + 32Lx1_inner + 512Lx0)); } } } } } } } } ''') async_compile.wait(globals()) del async_compile def call(args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (2, 32, 4, 4), (512, 1, 128, 32)) buf0 = empty_strided_cpu((2, 32, 4, 4), (512, 16, 4, 1), torch.float32) buf1 = empty_strided_cpu((2, 16, 32), (512, 32, 1), torch.float32) cpp_fused_permute_relu_view_0(arg0_1, buf0, buf1) del arg0_1 return (buf1, ) ``` - After ``` cpp_fused_relu_0 = async_compile.cpp_pybinding(['const float', 'float'], ''' #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const float* in_ptr0, float* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1024L); x0+=static_cast<int64_t>(16L)) { { if(C10_LIKELY(x0 >= 
static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1024L))) { auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); auto tmp1 = at::vec::clamp_min(tmp0, decltype(tmp0)(0)); tmp1.store(out_ptr0 + static_cast<int64_t>(x0)); } } } } } ''') async_compile.wait(globals()) del async_compile def call(args): arg0_1, = args args.clear() assert_size_stride(arg0_1, (2, 32, 4, 4), (512, 1, 128, 32)) buf0 = empty_strided_cpu((2, 32, 4, 4), (512, 1, 128, 32), torch.float32) cpp_fused_relu_0(arg0_1, buf0) del arg0_1 return (reinterpret_tensor(buf0, (2, 16, 32), (512, 32, 1), 0), ) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158914 Approved by: https://github.com/CaoE, https://github.com/jansel	2025-08-18 07:41:20 +00:00
PyTorch MergeBot	b82aa3df20	Revert "Remove guard_size_oblivious from default contiguity python check, and add aten.sym_is_contiguous. (#159197 )" This reverts commit e444cd24d48b3a46f067974f2cc157f5ed27709f. Reverted https://github.com/pytorch/pytorch/pull/159197 on behalf of https://github.com/laithsakka due to internal build failures ([comment](https://github.com/pytorch/pytorch/pull/159197#issuecomment-3195436668))	2025-08-18 07:22:13 +00:00
zhaoguoan	d8d589bd3a	Add build support for RISCV (#160172 ) In requirements.txt, do not install lintrunner on riscv64 Fixes #160170 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160172 Approved by: https://github.com/malfet	2025-08-18 05:29:34 +00:00
drisspg	3c6efd1380	Add cutedsl template support to compile (#160108 ) ## Summary Still figuring out what actually writing a template should look like, but lands alot of the base infra <img width="1267" height="262" alt="Screenshot 2025-08-16 at 10 22 12 PM" src="https://github.com/user-attachments/assets/229f8bfa-0cb4-4fb1-8530-f535e569d350" /> Test code: ```Python #!/usr/bin/env python3 """ Fixed CuteDSL template test with proper def_kernel usage. """ import torch import torch._inductor.config as config from torch._inductor.lowering import lowerings from torch._inductor.ir import TensorBox from torch._inductor.select_algorithm import autotune_select_algorithm from torch._inductor.codegen.cutedsl import CuteDSLTemplate def create_fixed_cutedsl_template(): """Create a properly structured CuteDSL template.""" def cutedsl_grid(M, N, meta): return (1,) # Part 1: Imports and kernel definition template_part1 = r""" import torch import cutlass import cutlass.cute as cute from cutlass.cute.runtime import from_dlpack @cute.kernel def {{kernel_name}}_kernel(gA: cute.Tensor, gB: cute.Tensor, gC: cute.Tensor): # Get thread and block indices tidx, _, _ = cute.arch.thread_idx() bidx, _, _ = cute.arch.block_idx() bdim, _, _ = cute.arch.block_dim() thread_idx = bidx * bdim + tidx m, n = gA.shape if thread_idx < m * n: mi = thread_idx // n ni = thread_idx % n if mi < m and ni < n: a_val = gA[mi, ni] b_val = gB[mi, ni] result = a_val + b_val gC[mi, ni] = a_val + b_val """ # Part 2: JIT wrapper function template_part2 = r""" @cute.jit def {{kernel_name}}_jit(mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor): m, n = mA.shape total_threads = m * n threads_per_block = 256 num_blocks = (total_threads + threads_per_block - 1) // threads_per_block kernel = {{kernel_name}}_kernel(mA, mB, mC) kernel.launch( grid=[num_blocks, 1, 1], block=[threads_per_block, 1, 1] ) """ # Part 3: Main kernel function template_part3 = r""" {{def_kernel("input_a", "input_b", "output_c")}} cute_a = 
from_dlpack(input_a, assumed_align=16) cute_b = from_dlpack(input_b, assumed_align=16) cute_c = from_dlpack(output_c, assumed_align=16) # Launch kernel {{kernel_name}}_jit(cute_a, cute_b, cute_c) return output_c """ # Combine all parts template = CuteDSLTemplate( name="fixed_add", grid=cutedsl_grid, source=template_part1 + template_part2 + template_part3 ) return template def fixed_cutedsl_lowering(a: TensorBox, b: TensorBox) -> TensorBox: """Fixed CuteDSL lowering.""" print(f"[FIXED] CuteDSL lowering: {a.get_size()} + {b.get_size()}") template = create_fixed_cutedsl_template() choices = [] error = template.maybe_append_choice( choices, input_nodes=[a.data, b.data], layout=a.get_layout() ) if error or not choices: print(f"[FIXED] Falling back: {error}") default_lowering = lowerings[torch.ops.aten.add.Tensor] return default_lowering(a, b) print(f"[FIXED] Using CuteDSL with {len(choices)} choices") result = autotune_select_algorithm( "fixed_cutedsl_add", choices, [a, b], a.get_layout(), ) return result def test_fixed_cutedsl(): """Test the fixed CuteDSL template.""" print("=" * 50) print("Fixed CuteDSL Template Test") print("=" * 50) original = lowerings.get(torch.ops.aten.add.Tensor, None) try: lowerings[torch.ops.aten.add.Tensor] = fixed_cutedsl_lowering def test_add(x, y): return x + y device = "cuda" if torch.cuda.is_available() else "cpu" x = torch.randn(128, 4, device=device, dtype=torch.float32) y = torch.randn(128, 4, device=device, dtype=torch.float32) print(f"[FIXED] Testing with {x.shape} tensors on {device}") compiled_fn = torch.compile(test_add, backend="inductor") result = compiled_fn(x, y) # Verify correctness expected = x + y if torch.allclose(result, expected, atol=1e-5): print("✅ [FIXED] Results match!") return True else: print("❌ [FIXED] Results don't match!") return False except Exception as e: print(f"❌ [FIXED] Failed: {e}") import traceback traceback.print_exc() return False finally: if original: lowerings[torch.ops.aten.add.Tensor] = original 
else: lowerings.pop(torch.ops.aten.add.Tensor, None) if __name__ == "__main__": success = test_fixed_cutedsl() print("🎉 Fixed test completed!" if success else "💥 Fixed test failed!") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160108 Approved by: https://github.com/mlazos	2025-08-18 04:37:15 +00:00
PyTorch UpdateBot	d18007a1d0	[vllm hash update] update the pinned vllm hash (#160847 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160847 Approved by: https://github.com/pytorchbot	2025-08-18 04:36:28 +00:00
dolpm	138413907a	[nativert] oss subgraph rewriter (#160780 ) Summary: att Test Plan: ci Rollback Plan: Differential Revision: D80367765 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160780 Approved by: https://github.com/SherlockNoMad, https://github.com/georgiaphillips	2025-08-18 04:25:05 +00:00
PyTorch MergeBot	3ced4f1e6c	Revert "Use numpy 1.26.2 for Python 3.9 and 3.10 (#160836 )" This reverts commit 7a68d02292fd7a430b55c5bce3268a33c7ec5055. Reverted https://github.com/pytorch/pytorch/pull/160836 on behalf of https://github.com/clee2000 due to broke some inductor jobs? Maybe just update the expected values? Not sure what the policy is for something like this [GH job link](https://github.com/pytorch/pytorch/actions/runs/17024529273/job/48262123844) [HUD commit link](`7a68d02292`) ([comment](https://github.com/pytorch/pytorch/pull/160836#issuecomment-3194953213))	2025-08-18 03:09:31 +00:00
Pian Pawakapan	075a2e6967	[PGO] add extra read/write keys (#160715 ) Differential Revision: D80321215 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160715 Approved by: https://github.com/bobrenjc93	2025-08-18 01:41:08 +00:00
cyy	7a68d02292	Use numpy 1.26.2 for Python 3.9 and 3.10 (#160836 ) Because numpy 1.22.4 had reached EOL 3 years ago. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160836 Approved by: https://github.com/malfet	2025-08-17 18:39:06 +00:00
James Wu	63e1b58a13	[easy] [Precompile] Refactor guards, improve typing (#160530 ) Purely a refactor, improve typing and get rid of some type errors. Make certain fields as nonnull, since in general it's not empty. The goal of this stack of PRs is to move the save/load logic of guard serialization into separate, flat phases, instead of being embedded in guard creation. This way, we can put a try/catch around it and fail safely if certain guards are not serializable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160530 Approved by: https://github.com/Lucaskabela, https://github.com/Skylion007	2025-08-17 17:54:55 +00:00
cyy	960c03daf6	Remove unused CONDA_CMAKE option (#160832 ) Remove CONDA_CMAKE from `.ci/docker/build.sh` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160832 Approved by: https://github.com/malfet	2025-08-17 17:08:42 +00:00
PyTorch MergeBot	04c7be903d	Revert "[BE] [Inductor] Re-Land Support TMA before strict 3.4 cutoff (#160747 )" This reverts commit 8f434545c2e48c858d8b0d06db8f9642d6a87ad0. Reverted https://github.com/pytorch/pytorch/pull/160747 on behalf of https://github.com/malfet due to Looks like this breaks rocm, see https://hud.pytorch.org/hud/pytorch/pytorch/main/1?per_page=50&name_filter=rocm%20%2F%20linux-jammy-rocm-py3.10 ([comment](https://github.com/pytorch/pytorch/pull/160747#issuecomment-3194417733))	2025-08-17 14:22:48 +00:00
Johnny	691d17a5c6	Update TensorPipe submodule (#160808 ) To a commit containing https://github.com/pytorch/tensorpipe/pull/464 that fixes compilation with CUDA-13 Fixes https://github.com/pytorch/pytorch/issues/160104 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160808 Approved by: https://github.com/nWEIdia, https://github.com/Skylion007, https://github.com/malfet	2025-08-17 14:11:41 +00:00
Sandeep Narendranath Karjala	c699668009	[inductor] TLParse tensor metadata logging + test (#160132 ) Summary: - Add TLParse artifact logging per op with output tensor shape, stride, and dtype for cross-rank aggregation. Testing: - Add test to verify structure and contents of tlparse artifact Pull Request resolved: https://github.com/pytorch/pytorch/pull/160132 Approved by: https://github.com/xmfan	2025-08-17 04:27:49 +00:00
PyTorch UpdateBot	0b56f3aed8	[vllm hash update] update the pinned vllm hash (#160831 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160831 Approved by: https://github.com/pytorchbot	2025-08-17 04:25:26 +00:00
Nick Riasanovsky	8f434545c2	[BE] [Inductor] Re-Land Support TMA before strict 3.4 cutoff (#160747 ) Summary: Inductor's 3.4 Triton release is the most commonly used variant of Triton, but if someone is working with an alternative version of Triton this may not match. This moves the version check from 3.4 Triton to any variant that has support for the TMA APIs. Test Plan: Testing the previously failing test `inductor/test_torchinductor_strided_blocks.py::TritonTensorDescriptorTestCUDA::test_welford_non_block_pointer_cuda` Rollback Plan: Differential Revision: D80348643 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160747 Approved by: https://github.com/NikhilAPatel	2025-08-17 00:35:12 +00:00
PyTorch MergeBot	26297c27e2	Revert "[inductor] TLParse tensor metadata logging + test (#160132 )" This reverts commit 2603e40be5fa4a66301e6654e34a82a67f2e4913. Reverted https://github.com/pytorch/pytorch/pull/160132 on behalf of https://github.com/clee2000 due to broke lint [GH job link](https://github.com/pytorch/pytorch/actions/runs/17010600949/job/48226137423) [HUD commit link](`2603e40be5`). landrace with another PR that changed some had_cuda related things ([comment](https://github.com/pytorch/pytorch/pull/160132#issuecomment-3193969792))	2025-08-16 23:47:03 +00:00
Guilherme Leobas	74871d4d46	[collections.abc] Ensure that binop calls works with UserDefinedObjects (#159865 ) Changes: (1) Replace UserDefinedSetVariable by UserDefinedObjectVariable in all binop calls Test plan: (1) The three tests from CPython `test_collections.py` ensures that Dynamo can trace through a dunder method (e.g. __add__, __ixor__, etc) defined in a user defined class Pull Request resolved: https://github.com/pytorch/pytorch/pull/159865 Approved by: https://github.com/mlazos ghstack dependencies: #159365, #159366, #159368, #159483, #159902, #159864	2025-08-16 20:44:40 +00:00
Guilherme Leobas	f019da2979	Implement `list(UserDefinedObject)` via `force_unpack_var_sequence` (#159864 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159864 Approved by: https://github.com/mlazos ghstack dependencies: #159365, #159366, #159368, #159483, #159902	2025-08-16 20:44:40 +00:00
Guilherme Leobas	f1bc843a5d	Wrap class definitions in `set_fullgraph(False)` in `test_collections` (#159902 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159902 Approved by: https://github.com/mlazos ghstack dependencies: #159365, #159366, #159368, #159483	2025-08-16 20:42:15 +00:00
Sandeep Narendranath Karjala	2603e40be5	[inductor] TLParse tensor metadata logging + test (#160132 ) Summary: - Add TLParse artifact logging per op with output tensor shape, stride, and dtype for cross-rank aggregation. Testing: - Add test to verify structure and contents of tlparse artifact Pull Request resolved: https://github.com/pytorch/pytorch/pull/160132 Approved by: https://github.com/xmfan ghstack dependencies: #160260	2025-08-16 16:37:18 +00:00
Xuehai Pan	8fe4b3f848	[BE][CI] move `MYPYSTRICT` linter from `lintrunner-noclang` to `lintrunner-mypy` (#160806 ) Like `MYPY`, linter `MYPYSTRICT` will need `--all-files` too. See also: - https://github.com/pytorch/pytorch/pull/160652#issuecomment-3193390813 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160806 Approved by: https://github.com/seemethere	2025-08-16 16:15:22 +00:00
Hai Zheng	cff6def7f4	[MTIA] add correct name for CFF in tlparse (#160599 ) Differential Revision: D80201622 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160599 Approved by: https://github.com/bdhirsh	2025-08-16 14:58:03 +00:00
Laith Sakka	e444cd24d4	Remove guard_size_oblivious from default contiguity python check, and add aten.sym_is_contiguous. (#159197 ) This might cause some new DDEs on call sites that do not use is_contiguous_or_false() or sym_is_contiguous() but want to find those call sites to handle this properly by calling is_contiguous_or_false() and not is_contiguous() explicitly when appropriate. I had to fix one issue after removing the implicit size oblivious reasoning. here is context we defined in this https://github.com/pytorch/pytorch/pull/157472 sym_is_contiguous to be the function computing contiguity for dynamic shapes in c++. It returns a symbolic expression that represents contiguity and guaranteed not to throw a DDE. when people call is_contiguous we do sym_is_contiguous().guard_bool() when people call is_contiguous_or_false we do sym_is_contiguous().guard_or_false() one issue not handled well was this path ``` c10::SymBool TensorImpl::sym_is_contiguous_custom( at::MemoryFormat memory_format) const { if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) { return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( this, memory_format); } return sym_is_contiguous_default(memory_format); } ``` namely if we call sym_is_contiguous_custom but we have matches_python_custom(SizesStridesPolicy::CustomStrides) return true , then we used to call is_contiguous(this, memory_format); This used to go through the load_pyobj_interpreter and end up calling the python is_contiguous call which used implicit size oblivious reasoning. once we removed that implicit size oblivious reasoning, the right thing we want is to call return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous(this, memory_format); otherwise we would get DDE even if the caller is doing sym_is_contiguous. so I had to define it for pyinterpreter, and then I had to override it for nested tensors. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159197 Approved by: https://github.com/ezyang	2025-08-16 09:15:58 +00:00
Huy Do	a84541c73f	Update transformers version automatically with Dependabot (#160635 ) My proposal here is to use GitHub Dependabot to make sure that `transformers` version used in CI are always up-to-date. To achieve this, this PR does 2 things: 1. Pin `transformers` version across all CI jobs to only one place at `.ci/docker/ci_commit_pins/huggingface.txt`. This file is now a regular pip requirements instead of a pinned commit text. There isn't any need to pin `transformers` to a specific commit and the file already refers to a stable version `v4.54.0` 2. Create `.github/dependabot.yml` to config the bot to update `transformers` automatically when there is a new version. Those labels will ensure that the right reviewers from torch.compile and Dev Infra are notified. I'm not sure how to test this out in PR, but it feels ok to land and test this in main. If this works, we should see a PR to update `v4.54.0` to the current latest `v4.55.0` ### Reference https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference Pull Request resolved: https://github.com/pytorch/pytorch/pull/160635 Approved by: https://github.com/ZainRizvi	2025-08-16 05:53:39 +00:00
Rohit Singh Rathaur	114813ca77	Fix mypy errors: PyTreeSpec inheritance (#160652 ) Fixes #160650. I added type ignore comment to `LeafSpec` class inheritance in `torch/utils/_cxx_pytree.py` to handle `PyTreeSpec` being marked as final in optree's type stubs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160652 Approved by: https://github.com/Skylion007	2025-08-16 05:14:11 +00:00
Justin Chu	11b6ceb7b4	[ONNX] Default to dynamo export (#159646 ) Set dynamo=True and enable fallback. 1. Implemented the compatible behavior where BytesIO objects as `f` is accepted 2. Update tests to explicitly set dynamo=False #151693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159646 Approved by: https://github.com/titaiwangms	2025-08-16 04:48:58 +00:00
Michael Lazos	fb7e60ba7a	[Dynamo][Hierarchical Compile] Flatten tuple outputs in graph dedupe pass (#158811 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158811 Approved by: https://github.com/anijain2305 ghstack dependencies: #158810	2025-08-16 04:45:31 +00:00
PyTorch UpdateBot	f89186e910	[audio hash update] update the pinned audio hash (#160797 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160797 Approved by: https://github.com/pytorchbot	2025-08-16 04:26:59 +00:00
PyTorch UpdateBot	10eb83734f	[vllm hash update] update the pinned vllm hash (#160699 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160699 Approved by: https://github.com/pytorchbot	2025-08-16 04:26:55 +00:00
Yang Wang	75ea93484c	[vllm test] add vllm.yml and additional package (#160698 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160698 Approved by: https://github.com/huydhn ghstack dependencies: #160116	2025-08-16 04:24:20 +00:00
Huy Do	45c2c7a5fc	Fix the wrong dataclasses_json monitoring dep MacOS test (#160796 ) Typo mistake. This should be `dataclasses_json` https://github.com/pytorch/pytorch/actions/runs/17000197828/job/48200676725#step:10:23 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160796 Approved by: https://github.com/yangw-dev	2025-08-16 04:00:31 +00:00
Shangdi Yu	b74c7cd335	Add kernel stack traces tlparse dump (#160608 ) (#160779 ) Summary: as title This is requested by the zoomer team so they can add stack trace information to profiler result. Test Plan: ``` buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r stack_traces ``` Rollback Plan: Differential Revision: D80050233 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160779 Approved by: https://github.com/angelayi	2025-08-16 03:12:38 +00:00
Scott Todd	b7ca502f29	[ROCm][Windows] Add hipcc compatibility flags to cpp_extension.py. (#159790 ) This is a similar change to https://github.com/pytorch/pytorch/pull/153986, this time adding flags to the hipcc command under `cpp_extension.py`. The `-Wno-ignored-attributes` flag in particular avoids about 200MB of warning spam when building torchvision, like these: ``` In file included from D:\b\vision_main\torchvision\csrc\ops\hip\deform_conv2d_kernel.hip:72: In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ATen.h:13: In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/Functions.h:386: In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ops/_sparse_softmax.h:21: D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ops/_sparse_softmax_ops.h:18:8: warning: __declspec attribute 'dllimport' is not supported [-Wignored-attributes] 18 \| struct TORCH_API _sparse_softmax_int { \| ^~~~~~~~~ D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\torch/headeronly/macros/Export.h:100:19: note: expanded from macro 'TORCH_API' 100 \| #define TORCH_API C10_IMPORT \| ^~~~~~~~~~ D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\torch/headeronly/macros/Export.h:53:31: note: expanded from macro 'C10_IMPORT' 53 \| #define C10_IMPORT __declspec(dllimport) \| ^~~~~~~~~ ``` The `-fms-extensions` flag just seems beneficial to include: https://clang.llvm.org/docs/MSVCCompatibility.html. See also this downstream issue where these changes were tested: https://github.com/ROCm/TheRock/issues/910. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159790 Approved by: https://github.com/jeffdaily	2025-08-16 02:20:49 +00:00
Nikita Shulga	7bd4cfaef4	[BE] Update nvshem dependency to 3.3.20 (#160458 ) Which is manylinux2_28 compatible, even on aarch64 platform archive contents and URL pattern changed quite drastically between 3.3.9 and 3.3.20, but hopefully it still works. Package `libnvshmem_host.so.3` into gigantic aarch64+CUDA wheel Should fix https://github.com/pytorch/pytorch/issues/160425 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160458 Approved by: https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/atalman, https://github.com/tinglvv	2025-08-16 02:00:57 +00:00
PyTorch MergeBot	c015e53d37	Revert "[BE] Update nvshem dependency to 3.3.20 (#160458 )" This reverts commit e0488d9f00865fb56c931580c80e099771c6285e. Reverted https://github.com/pytorch/pytorch/pull/160458 on behalf of https://github.com/wdvr due to need to rerun workflow generation (failing workflow-checks) ([comment](https://github.com/pytorch/pytorch/pull/160458#issuecomment-3193133706))	2025-08-16 01:47:42 +00:00
Laith Sakka	65dc4df74d	unify broadcast_shapes functions and avoid duplicates (#160251 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160251 Approved by: https://github.com/jingsh, https://github.com/ColinPeppler ghstack dependencies: #160250	2025-08-16 00:54:32 +00:00
Laith Sakka	c03809e8a5	guard_or_false cat ops (#160250 ) keep existing unbacked semantics unchanged, just use guard_or_false instead of guard_size_obl Pull Request resolved: https://github.com/pytorch/pytorch/pull/160250 Approved by: https://github.com/ColinPeppler, https://github.com/jingsh	2025-08-16 00:54:31 +00:00
Nikita Shulga	e0488d9f00	[BE] Update nvshem dependency to 3.3.20 (#160458 ) Which is manylinux2_28 compatible, even on aarch64 platform archive contents and URL pattern changed quite drastically between 3.3.9 and 3.3.20, but hopefully it still works. Package `libnvshmem_host.so.3` into gigantic aarch64+CUDA wheel Should fix https://github.com/pytorch/pytorch/issues/160425 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160458 Approved by: https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/atalman, https://github.com/tinglvv	2025-08-16 00:50:13 +00:00
Laith Sakka	f782c790df	migrate more simple gso checks (#160253 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160253 Approved by: https://github.com/bobrenjc93	2025-08-16 00:15:24 +00:00
atalman	16ce2c15fa	Add python 3.14 support to linux aarch64 builds (#160788 ) Related to https://github.com/pytorch/pytorch/issues/156856 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160788 Approved by: https://github.com/malfet	2025-08-16 00:03:21 +00:00
Andrey Talman	0d28d12b11	Fix typo packing libnvshmem into libtorch (#160778 ) Fix typo after https://github.com/pytorch/pytorch/pull/160465 Fixes: https://github.com/pytorch/pytorch/issues/160762 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160778 Approved by: https://github.com/Camyll, https://github.com/malfet, https://github.com/ZainRizvi, https://github.com/Skylion007	2025-08-15 23:43:02 +00:00
Edward Yang	838f22c57d	Do not incorrectly chain each of the strings as iterables (#160709 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160709 Approved by: https://github.com/Skylion007, https://github.com/fduwjj	2025-08-15 23:22:24 +00:00
eqy	387fe847ab	[cuDNN][SDPA] Introduce `TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1` (#155958 ) Opt-in for now, but basically uses the variable-sequence length/ragged path for the common case of BSHD layout to avoid recompiling for different sequence lengths. Built on top of #149282 Tested using a primitive fuzzer, seems at least as stable as default path (with recompilation) on B200 (50000+ cases tested without any failures) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155958 Approved by: https://github.com/drisspg	2025-08-15 21:59:18 +00:00
Mu-Chu Lee	40311e2ec1	[AOTInductor] ABI-Compatibility for RecordFunction. (#159842 ) Summary: Previously, our implementation for RecordFunction injects Aten into codegen, which is breaking the ABI contract for AOTInductor. C10::IValue is added to call the full record function. The extension of more profiling info will come in later PRs. Test Plan: Included in commit. Reviewers: Subscribers: Tasks: Tags: Differential Revision: [D79622071](https://our.internmc.facebook.com/intern/diff/D79622071) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159842 Approved by: https://github.com/desertfire	2025-08-15 21:45:47 +00:00
Yidi Wu	8ca8b6053c	[inductor][while_loop][be] improve the readability of output handling (#160374 ) The logic doesn't change but make it easier to read and change. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160374 Approved by: https://github.com/zou3519 ghstack dependencies: #160548	2025-08-15 20:13:12 +00:00
Yidi Wu	ff86509a06	[map] filter none gradients and add autograd inductor tests (#160548 ) Will filter the none outputs in autograd backward for other hops as follow ups Pull Request resolved: https://github.com/pytorch/pytorch/pull/160548 Approved by: https://github.com/zou3519	2025-08-15 20:13:12 +00:00
Shangdi Yu	fa75ba9303	Change IR node's stack traces to return a set of stack traces only (#160701 ) Summary: There can be excessive stack trace outputs in TORCH_LOGS="+inductor" when a single line of code corresponds to many post grad nodes, e.g. `self.multihead_attn(x, x, x)`, in that case, we'll see the same stack trace many times in the IR node, spamming the output log. So we change to return a set of stack traces. Test Plan: CI Rollback Plan: Differential Revision: D80310549 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160701 Approved by: https://github.com/angelayi	2025-08-15 19:31:59 +00:00
Guilherme Leobas	b78968b4d1	Support `next(iterator, default)` (#159483 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159483 Approved by: https://github.com/mlazos ghstack dependencies: #159365, #159366, #159368	2025-08-15 19:08:21 +00:00
Guilherme Leobas	e5621b4d8b	Fixes for `collections.Counter` (#159368 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159368 Approved by: https://github.com/mlazos ghstack dependencies: #159365, #159366	2025-08-15 19:08:21 +00:00
Guilherme Leobas	2542e71f3f	Change mutation type of `MutableMappingVariable` to `AttributeMutationNew` (#159366 ) Also add MutableMappingVariable to `call_or_` / `call_ior` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159366 Approved by: https://github.com/zou3519 ghstack dependencies: #159365	2025-08-15 19:08:21 +00:00
Guilherme Leobas	0242d40fa5	Enable trace through the collections module (#159365 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159365 Approved by: https://github.com/zou3519	2025-08-15 19:08:21 +00:00
atalman	17de899709	Add py3.14 to macos arm64 (#160593 ) Related to https://github.com/pytorch/pytorch/issues/156856 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160593 Approved by: https://github.com/malfet, https://github.com/Skylion007	2025-08-15 18:52:10 +00:00
Shangdi Yu	25d0d8b0a3	[inductor] Fix propagating torch.utils._sympy.functions.Identity in IndexPropagation (#155504 ) Fixes https://github.com/pytorch/pytorch/issues/160535 Index may contain `torch.utils._sympy.functions.Identity`. When we call `SymPyOps.index_expr`, if the value is a sympy.Expr with Identity, `TypedExpr(value, dtype)` will fail. So when we unwrap arguments, we expand the sympy expression to unwrap Identity. Test Plan: buck run @mode/dev-nosan //caffe2/test/inductor:test_aot_inductor -- -r test_sym_expr_indexing Rollback Plan: Differential Revision: D76308640 Pull Request resolved: https://github.com/pytorch/pytorch/pull/155504 Approved by: https://github.com/eellison	2025-08-15 18:38:23 +00:00
Liao, Wei	c6d697ff52	port 2 distributed pipeline test files for Intel GPU (#159140 ) it's another pr to port distributed pipeline test for Intel GPU, while the other pr is https://github.com/pytorch/pytorch/pull/159033. In this pr, we port two test files for Intel GPU We could enable Intel GPU with following methods and try the best to keep the original code styles: 1. instantiate_device_type_tests() 2. skip the case at xpu due to accuracy gap introduced by oneDNN non-deterministic Pull Request resolved: https://github.com/pytorch/pytorch/pull/159140 Approved by: https://github.com/guangyey, https://github.com/d4l3k, https://github.com/H-Huang	2025-08-15 18:29:50 +00:00
PyTorch MergeBot	30d2f98daa	Revert "[cutlass backend] re-add pip cutlass path (#160180 )" This reverts commit d556586448f3caab85673c7da0978fe31c7748f7. Reverted https://github.com/pytorch/pytorch/pull/160180 on behalf of https://github.com/atalman due to broke macos nightly ([comment](https://github.com/pytorch/pytorch/pull/160180#issuecomment-3192311552))	2025-08-15 18:00:41 +00:00
Xuan Zhang	8780d28c65	raise exception in case of errors in memory reordering (#160455 ) This PR introduces two checks in the memory reordering pass to catch graph issues before performing the reordering task. For situations not covered by these checks, the reordering pass might fail and an exception will be thrown in this case. This addresses issue -- https://github.com/pytorch/pytorch/issues/159568 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160455 Approved by: https://github.com/eellison	2025-08-15 17:31:55 +00:00
Yidi Wu	da8f48d88f	[associative_scan] support gen_schema for associative_scan (#158883 ) In-place mutation may create inter-loop dependency that breaks the parallelism we have for associative_scan so we ban input mutations. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158883 Approved by: https://github.com/zou3519 ghstack dependencies: #154193, #158965, #158863, #158864	2025-08-15 17:28:44 +00:00
Yidi Wu	cb9e2092a8	[scan] support gen_schema for scan (#158864 ) We don't want to allow scan's combine_fn to mutate its inputs. The semantic of the mutation can be confusing. For example: ```python def combine_fn(init, x): ``` If combine_fn mutates init, only first iteration mutates init, the rest of the iterations mutates the previous carry, which is an intermediate result. This is kind of a weird semantic because the only observable mutation is for init, which can be done outside of the combine_fn. If combine_fn mutates x, where x is a slice of scanned inputs (i.e. xs), this pattern is more meaningful but we've not seen any use case yet. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158864 Approved by: https://github.com/zou3519 ghstack dependencies: #154193, #158965, #158863	2025-08-15 17:28:44 +00:00
Yidi Wu	f6bf1573fc	[while_loop] support gen_schema for while_loop (#158863 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158863 Approved by: https://github.com/zou3519 ghstack dependencies: #154193, #158965	2025-08-15 17:28:34 +00:00
Yidi Wu	82a18423be	[BE] create an empty shape_env for check_input_alias_and_mutation_return_outputs (#158965 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158965 Approved by: https://github.com/zou3519 ghstack dependencies: #154193	2025-08-15 17:28:20 +00:00
Yidi Wu	3fe3c23d4e	[cond] support gen_schema for cond (#154193 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/154193 Approved by: https://github.com/zou3519	2025-08-15 17:28:13 +00:00
Prajesh Praveen Anchalia	052c441cf4	Add logging for when inbuilt_inline_nn_modules will help with ID_MATCH guard triggered recompiles (#160592 ) We add logging for when an ID_MATCH guard is added at a place where inbuilt_inline_nn_modules would inline it. This is done with the aim of tagging recompiles that could be avoided by setting inbuilt_inline_nn_modules flag. It will help us log and track the flag's adoption and potentially quantify savings in the number of recompiles. Differential Revision: D80075975 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160592 Approved by: https://github.com/anijain2305	2025-08-15 17:09:39 +00:00
Paul de Supinski	b26d2a9464	[ez] Make NUMA signpost parameters JSON serializable (#160710 ) # Context Broader context in #160163. In order for the _utils_internal version of signpost_event to do proper logging, its parameters argument needs to be json serializable. # This PR Convert `NumaOptions` to serializable form before inputting to `signpost_event`. # Test Plan ## Automated Added tests `$ pytest test/test_numa_binding.py`. ## Manual See [D80317206](https://www.internalfb.com/diff/D80317206). Pull Request resolved: https://github.com/pytorch/pytorch/pull/160710 Approved by: https://github.com/kiukchung	2025-08-15 16:52:43 +00:00
Kurt Mohler	6382302990	[MPS] Add `grid_sampler_3d` for MPS (#160541 ) This PR adds support for `grid_sampler_3d` for MPS with "bilinear" interpolation. NOTE: "nearest" interpolation is not yet supported Fixes #159882 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160541 Approved by: https://github.com/malfet	2025-08-15 16:19:25 +00:00
Catherine Lee	80dd05e31e	Disable flaky cpp test RecordDebugHandles.Basic (#160577 ) Test is flaky and sometimes hangs in CI Here's an example of the failure: https://github.com/pytorch/pytorch/actions/runs/16946153494/job/48027937663 ``` 2025-08-13T20:54:00.1223688Z ==================================== RERUNS ==================================== 2025-08-13T20:54:00.1224156Z ___________________________ RecordDebugHandles.Basic ___________________________ 2025-08-13T20:54:00.1224682Z [gw2] linux -- Python 3.13.5 /opt/conda/envs/py_3.13/bin/python3.13 2025-08-13T20:54:00.1225568Z Internal Error: calling /opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit for test RecordDebugHandles.Basic failed (returncode=-6): 2025-08-13T20:54:00.1226430Z CUDA not available. Disabling CUDA and MultiCUDA tests 2025-08-13T20:54:00.1226988Z Note: Google Test filter = RecordDebugHandles.Basic-_CUDA:_MultiCUDA 2025-08-13T20:54:00.1227450Z [==========] Running 1 test from 1 test suite. 2025-08-13T20:54:00.1227792Z [----------] Global test environment set-up. 2025-08-13T20:54:00.1228145Z [----------] 1 test from RecordDebugHandles 2025-08-13T20:54:00.1228492Z [ RUN ] RecordDebugHandles.Basic 2025-08-13T20:54:00.1228822Z [ OK ] RecordDebugHandles.Basic (1 ms) 2025-08-13T20:54:00.1229204Z [----------] 1 test from RecordDebugHandles (1 ms total) 2025-08-13T20:54:00.1229501Z 2025-08-13T20:54:00.1229666Z [----------] Global test environment tear-down 2025-08-13T20:54:00.1230033Z [==========] 1 test from 1 test suite ran. (1 ms total) 2025-08-13T20:54:00.1230355Z [ PASSED ] 1 test. 2025-08-13T20:54:00.1230727Z terminate called after throwing an instance of 'std::system_error' 2025-08-13T20:54:00.1231154Z what(): Invalid argument 2025-08-13T20:54:00.1231416Z unknown file:0: C++ failure 2025-08-13T20:54:00.1231788Z ------------------------------ Captured c++ call ------------------------------- 2025-08-13T20:54:00.1232262Z CUDA not available. 
Disabling CUDA and MultiCUDA tests 2025-08-13T20:54:00.1232745Z Note: Google Test filter = RecordDebugHandles.Basic-_CUDA:_MultiCUDA 2025-08-13T20:54:00.1233199Z [==========] Running 1 test from 1 test suite. 2025-08-13T20:54:00.1233557Z [----------] Global test environment set-up. 2025-08-13T20:54:00.1233915Z [----------] 1 test from RecordDebugHandles 2025-08-13T20:54:00.1234247Z [ RUN ] RecordDebugHandles.Basic 2025-08-13T20:54:00.1234590Z [ OK ] RecordDebugHandles.Basic (1 ms) 2025-08-13T20:54:00.1235020Z [----------] 1 test from RecordDebugHandles (1 ms total) 2025-08-13T20:54:00.1235304Z 2025-08-13T20:54:00.1235431Z [----------] Global test environment tear-down 2025-08-13T20:54:00.1235793Z [==========] 1 test from 1 test suite ran. (1 ms total) 2025-08-13T20:54:00.1236126Z [ PASSED ] 1 test. 2025-08-13T20:54:00.1236481Z terminate called after throwing an instance of 'std::system_error' 2025-08-13T20:54:00.1236906Z what(): Invalid argument 2025-08-13T20:54:00.1237287Z ___________________________ RecordDebugHandles.Basic ___________________________ 2025-08-13T20:54:00.1237800Z [gw2] linux -- Python 3.13.5 /opt/conda/envs/py_3.13/bin/python3.13 2025-08-13T20:54:00.1238686Z Internal Error: calling /opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit for test RecordDebugHandles.Basic failed (returncode=-6): 2025-08-13T20:54:00.1239551Z CUDA not available. Disabling CUDA and MultiCUDA tests 2025-08-13T20:54:00.1240048Z Note: Google Test filter = RecordDebugHandles.Basic-_CUDA:_MultiCUDA 2025-08-13T20:54:00.1240495Z [==========] Running 1 test from 1 test suite. 2025-08-13T20:54:00.1240848Z [----------] Global test environment set-up. 
2025-08-13T20:54:00.1241199Z [----------] 1 test from RecordDebugHandles 2025-08-13T20:54:00.1241542Z [ RUN ] RecordDebugHandles.Basic 2025-08-13T20:54:00.1241871Z [ OK ] RecordDebugHandles.Basic (1 ms) 2025-08-13T20:54:00.1242249Z [----------] 1 test from RecordDebugHandles (1 ms total) 2025-08-13T20:54:00.1242503Z 2025-08-13T20:54:00.1242641Z [----------] Global test environment tear-down 2025-08-13T20:54:00.1242993Z [==========] 1 test from 1 test suite ran. (19 ms total) 2025-08-13T20:54:00.1243329Z [ PASSED ] 1 test. 2025-08-13T20:54:00.1243697Z terminate called after throwing an instance of 'std::system_error' 2025-08-13T20:54:00.1244113Z what(): Invalid argument 2025-08-13T20:54:00.1244392Z unknown file:0: C++ failure 2025-08-13T20:54:00.1244759Z ------------------------------ Captured c++ call ------------------------------- 2025-08-13T20:54:00.1245235Z CUDA not available. Disabling CUDA and MultiCUDA tests 2025-08-13T20:54:00.1283768Z ============== 1 failed, 568 passed, 2 rerun in 115.57s (0:01:55) ============== ``` Here's an example of the hang: https://github.com/pytorch/pytorch/actions/runs/16942186826/job/48015238944 Logs aren't super helpful other than stating that it took a long time. 
Usually this file takes <2min to run ``` 2025-08-13T18:43:24.6586481Z [gw0] [ 97%] PASSED [1.4119s] ../../../../../opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit::PyTorch/LiteInterpreterDynamicTypeTestFixture::Conformance/8 2025-08-13T18:43:24.6587278Z [gw1] [ 97%] PASSED [1.4866s] ../../../../../opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit::PyTorch/LiteInterpreterDynamicTypeTestFixture::Conformance/9 Command took >30min, returning 124 2025-08-13T18:43:24.6587288Z 2025-08-13T18:43:24.6587632Z FINISHED PRINTING LOG FILE of cpp/test_jit 1/1 (test/test-reports/cpp.test_jit_1.1_c259e5a152845991_.log) 2025-08-13T18:43:24.6587639Z ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160577 Approved by: https://github.com/huydhn	2025-08-15 15:59:21 +00:00
PyTorch MergeBot	9df07ecfbe	Revert "[inductor] dont reuse buffers if it affects peak (#145883 ) (#159530 )" This reverts commit 3be70dc30e893b552fc0f23ca06cd8f7949b6d08. Reverted https://github.com/pytorch/pytorch/pull/159530 on behalf of https://github.com/clee2000 due to newly added test fail internally D80316528, probably just a targets change, but also imo the tests should probably go into a testcase class from common or inductor utils. While I'm pretty sure CI can run the globally defined ones, theres some CI related functionality that on the testcase class that CI benefits from ([comment](https://github.com/pytorch/pytorch/pull/159530#issuecomment-3191947506))	2025-08-15 15:49:04 +00:00
PyTorch MergeBot	846963fa9b	Revert "[Inductor] addmm + activation function fusion (#158137 )" This reverts commit b9d7de3a094598c3dc0dd52e57bce30eb684c9d8. Reverted https://github.com/pytorch/pytorch/pull/158137 on behalf of https://github.com/malfet due to Broke inductor torchbench, see `663da17b62/1` ([comment](https://github.com/pytorch/pytorch/pull/158137#issuecomment-3191841298))	2025-08-15 15:34:09 +00:00
chunhuanMeng	663da17b62	Update torch-xpu-ops commit pin (#160062 ) Update the torch-xpu-ops commit to [77cc792cd265179745d335579d233e6d4f9a2667](`77cc792cd2`), includes: - Ensures that the XPU cache is cleared before creating tensors during the test - Add unused variable warning - Fix test_linalg and test_torch issue with bf32_on_and_off updates - Fix deterministic indexing with broadcast - Fix dist.gather with noncontiguous tensor - Improve accuracy of index put deterministic kernel - Add generate file rely avoid build before generate - optimize embedding bag Fixes #160661 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160062 Approved by: https://github.com/EikanWang	2025-08-15 15:27:24 +00:00
Shiva Kaul	e299926f72	[ONNX] Fix doc typo for symbolic_multi_out (#160702 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160702 Approved by: https://github.com/justinchuby	2025-08-15 14:34:42 +00:00
Huy Do	bbd11c4f23	Uninstall torchao on MPS benchmark (#160724 ) Fixes https://github.com/pytorch/pytorch/issues/160689 The current torchao 0.12.0 doesn't work with transformers 4.54.0 and ends up with this error: ``` File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/transformers/models/albert/modeling_albert.py", line 37, in <module> from ...modeling_utils import PreTrainedModel File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/transformers/modeling_utils.py", line 51, in <module> from torchao.quantization import Int4WeightOnlyConfig File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/__init__.py", line 41, in <module> from torchao.quantization import ( File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/quantization/__init__.py", line 6, in <module> from .autoquant import ( File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/quantization/autoquant.py", line 11, in <module> from torchao.dtypes import ( File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/__init__.py", line 1, in <module> from . 
import affine_quantized_tensor_ops File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/affine_quantized_tensor_ops.py", line 38, in <module> from torchao.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout import ( File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/uintx/__init__.py", line 7, in <module> from .dyn_int8_act_int4_wei_cpu_layout import ( File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py", line 320, in <module> from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/fx_passes/__init__.py", line 2, in <module> from .int8_sdpa_fusion import _int8_sdpa_init File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/fx_passes/int8_sdpa_fusion.py", line 22, in <module> from ..int8_sdpa_lowering import register_int8_sdpa # noqa: F401 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/int8_sdpa_lowering.py", line 6, in <module> from torch._inductor.kernel.flex_attention import construct_strides, maybe_realize ModuleNotFoundError: No module named 'torch._inductor.kernel.flex_attention' ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160724 Approved by: https://github.com/malfet	2025-08-15 13:55:39 +00:00
Sherlock Huang	eaa5d9d3d3	Introduce OpInfo test for testing export on fake device (#160694 ) Summary: Prepare for the upcoming diffs for exporting on fake cuda device. Test Plan: test Rollback Plan: Differential Revision: D80304225 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160694 Approved by: https://github.com/dolpm	2025-08-15 07:26:28 +00:00
Colin Peppler	a7c75ae976	[dde] use sym_or when checking normalized shape in layer_norm (#160683 ) Use `sym_eq` to check equality on tuple of ints/symints ### DDE ``` torch._dynamo.exc.UserError: Could not guard on data-dependent expression Eq(u0, u1) (unhinted: Eq(u0, u1)). (Size-like symbols: u1, u0) Caused by: return torch.nn.functional.layer_norm( # test/inductor/test_unbacked_symints.py:527 in fn (_refs/__init__.py:3292 in native_layer_norm) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160683 Approved by: https://github.com/bobrenjc93	2025-08-15 06:56:00 +00:00
Pian Pawakapan	f7ad69f59c	[dynamic shapes] handle Max(*,1) for inductor layout contiguity (#160578 ) Differential Revision: D80214882 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160578 Approved by: https://github.com/ZixinYang, https://github.com/bobrenjc93	2025-08-15 06:10:18 +00:00
Wang, Chuanqi	4cae9cf2df	Update triton xpu commit to support python 3.14 (#160183 ) Follow PR #159725 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160183 Approved by: https://github.com/EikanWang, https://github.com/atalman	2025-08-15 05:41:17 +00:00
Yang Wang	7710800865	[3/3][ghstack][vllm ci build setup]vllm build workflow (#160116 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160116 Approved by: https://github.com/huydhn	2025-08-15 05:35:46 +00:00
Shangdi Yu	aa99e0958f	Separate provenance tracking to different levels (#160383 ) Summary: as title. We've received requests from various parties who are interested in turning on the provenance tracking by default. In this PR, we prepare to turn on part of the provenance tracking that doesn't have too much overhead by default. - Change `provenance_tracking` config to `provenance_tracking_level` - turn on the following provenance tracking by default when `basic_provenance_tracking`=True - `set_kernel_post_grad_provenance_tracing` for kernels, this adds a mapping between triton kernels and post_grad nodes - `dump_inductor_provenance_info` if we're dumping tlparse log - `get_graph_provenance_json` and dump `create_mapping_pre_post_grad_nodes`. This creates a mapping between pre_grad and post_grad nodes. Since we're not turning on the provenance tracking in GraphTransformObserver by default, the mapping here may be incomplete/limited. - add stack trace from post grad nodes to inductor IR nodes - add exception swallowing for all functions above Test Plan: CI Rollback Plan: Differential Revision: D80031559 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160383 Approved by: https://github.com/angelayi	2025-08-15 04:59:35 +00:00
PyTorch UpdateBot	3fc7a95176	[audio hash update] update the pinned audio hash (#160485 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160485 Approved by: https://github.com/pytorchbot	2025-08-15 04:27:49 +00:00
Kevin Fu	858fb80b9b	[PT2]: Add Static Dispatch Kernel for wrapped_fbgemm_linear_fp16_weight (#160451 ) Summary: Add static dispatch kernel for wrapped_fbgemm_linear_fp16_weight. This optimization should improve perf for all Ads DSNN models using Sigmoid. Test Plan: ``` MODEL_TYPE=dpa_product_first_ctr_model MODEL_ENTITY_ID=892669089 SNAPSHOT_ID=37 OTHER_MODEL_ENTITY_ID=892669089 OTHER_SNAPSHOT_ID=36 MODULES=(mix prepare_float_features object user) SUFFIXES=(.predictor.local .predictor.precompute.prepare_float_features .predictor.precompute.remote_object_only .predictor.precompute.remote_request_only) for i in "${!MODULES[@]}"; do MODULE=${MODULES[i]} SUFFIX=${SUFFIXES[i]} buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkAB --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --otherNetFile=/data/users/$USER/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice "" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true ``` Before: P1900475429 I0810 19:29:22.782902 2717337 load_net_predictor_lib.cpp:1807] Average latency A: 0.0843 ms I0810 19:29:22.782905 2717337 load_net_predictor_lib.cpp:1807] Average latency B: 0.0989 ms After: P1900825771 I0811 15:42:34.866408 2311279 load_net_predictor_lib.cpp:1807] Average latency A: 0.0854 ms I0811 15:42:34.866411 2311279 load_net_predictor_lib.cpp:1807] Average latency B: 0.092 ms Still has some regression but the gap is smaller... Rollback Plan: Reviewed By: henryoier, muchulee8 Differential Revision: D80042054 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160451 Approved by: https://github.com/henryoier	2025-08-15 04:06:17 +00:00
Kevin Fu	55061c9602	[PT2]: Add Static Dispatch Kernel for scale_gradient (#160454 ) Summary: Add Static Dispatch Kernel for scale_gradient Test Plan: ``` MODEL_TYPE=dpa_product_first_ctr_model MODEL_ENTITY_ID=892669089 SNAPSHOT_ID=37 OTHER_MODEL_ENTITY_ID=892669089 OTHER_SNAPSHOT_ID=36 MODULES=(mix prepare_float_features object user) SUFFIXES=(.predictor.local .predictor.precompute.prepare_float_features .predictor.precompute.remote_object_only .predictor.precompute.remote_request_only) for i in "${!MODULES[@]}"; do MODULE=${MODULES[i]} SUFFIX=${SUFFIXES[i]} buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkAB --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --otherNetFile=/data/users/$USER/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice "" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true ``` Rollback Plan: Reviewed By: henryoier Differential Revision: D80062244 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160454 Approved by: https://github.com/henryoier	2025-08-15 03:42:39 +00:00
Kevin Fu	214d04833a	[PT2]: Add Static Dispatch Kernel for fmod.Scalar (#160654 ) Summary: Add static dispatch for torch.ops.aten.fmod.Scalar. Found this missing in user/object nets for DSNN models. Test Plan: ``` MODEL_TYPE=dpa_product_first_ctr_model MODEL_ENTITY_ID=892669089 SNAPSHOT_ID=36 MODULE=user SUFFIX=.predictor.precompute.remote_request_only buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkByOp --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice="" --benchmarkEnableProfiling=true --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true --benchmarkNumIterations=1000 ``` Object tower: P1904347784 User tower: P1904348406 Rollback Plan: Differential Revision: D80238495 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160654 Approved by: https://github.com/henryoier	2025-08-15 03:11:48 +00:00
Johnny	9c5601ecc3	[NVIDIA] Refactor Family Blackwell Support codegen (#156176 ) With the legacy driver (nvgpu) used for CUDA 12.9, Thor was operating with SM 10.1. This changes to SM 11.0 when the newer driver model (OpenRM), which is intended for CUDA 13.0, is introduced. Thor 10.1 --> 11.0 Spark 12.1 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156176 Approved by: https://github.com/ezyang	2025-08-15 02:51:26 +00:00
Nikita Shulga	5b9ad951f8	[BE][Docker] Do not install `cuda:11.8` (#160695 ) As CUDA-11.8 binaries are no longer produced by CD Pull Request resolved: https://github.com/pytorch/pytorch/pull/160695 Approved by: https://github.com/huydhn	2025-08-15 02:23:04 +00:00
Lucas Kabela	4d5f92aa39	typing tvm.py (#160369 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160369 Approved by: https://github.com/Skylion007 ghstack dependencies: #160362, #160363, #160364, #160365, #160366, #160367, #160368	2025-08-15 02:09:31 +00:00
Lucas Kabela	39ca0ce0c8	Type backend torchxla (#160368 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160368 Approved by: https://github.com/Skylion007 ghstack dependencies: #160362, #160363, #160364, #160365, #160366, #160367	2025-08-15 02:09:31 +00:00
Lucas Kabela	d52bb67ac3	typing registry.py (#160367 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160367 Approved by: https://github.com/Skylion007 ghstack dependencies: #160362, #160363, #160364, #160365, #160366	2025-08-15 02:09:31 +00:00
Lucas Kabela	05b9b63fb6	typing inductor and placeholder backends (#160366 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160366 Approved by: https://github.com/Skylion007 ghstack dependencies: #160362, #160363, #160364, #160365	2025-08-15 02:09:31 +00:00
Lucas Kabela	453cfa5153	typing distributed.py (#160365 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160365 Approved by: https://github.com/StrongerXi ghstack dependencies: #160362, #160363, #160364	2025-08-15 02:09:31 +00:00
Lucas Kabela	9faca5f260	typing debugging.py (#160364 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160364 Approved by: https://github.com/Skylion007 ghstack dependencies: #160362, #160363	2025-08-15 02:09:31 +00:00
Lucas Kabela	6fe6dd9fdc	Type cudagraphs.py (#160363 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160363 Approved by: https://github.com/StrongerXi ghstack dependencies: #160362	2025-08-15 02:09:31 +00:00
Lucas Kabela	f82c7eed84	Typing for common.py (#160362 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160362 Approved by: https://github.com/Skylion007	2025-08-15 02:09:31 +00:00
Nick Riasanovsky	25ccc4716e	[Inductor] [Triton] Apply feedback to Enable padded stride support (#160614 ) Summary: Issue I noticed while fixing tests for TMA store. This triton.language.make_tensor_descriptor call hardcodes the shape information as the stride, which is not necessarily correct. In particular, it's legal to have a stride bigger than the shape (e.g. padded to a size). A good example of the usage of this would be to allocate a tensor to always be a multiple of 16 and just pad the result so TMA is legal. This is a redo of https://github.com/pytorch/pytorch/pull/160493 because I broke this accidentally trying to land internally first instead of merging through Github directly. Test Plan: Tested with `buck2 run mode/opt-split-dwarf mode/inplace -c fbcode.nvcc_arch=h100 caffe2/test/inductor:max_autotune 2>&1 \| tee ~/test_logs.log` and confirmed all max autotune tests passed. Rollback Plan: Differential Revision: D80224578 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160614 Approved by: https://github.com/eellison	2025-08-15 02:06:14 +00:00
Guilherme Leobas	d387a48c38	[generator] Raise `StopIteration(value)` with value from the return stmt (#157152 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157152 Approved by: https://github.com/zou3519 ghstack dependencies: #157148	2025-08-15 01:42:40 +00:00
Guilherme Leobas	831e85104a	[contextlib] Fixes for CPython contextlib tests (#157148 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/157148 Approved by: https://github.com/zou3519	2025-08-15 01:42:40 +00:00
David Berard	211c98859a	[inductor][triton] Update triton_builtin handling after triton # 7239 (#160658 ) https://github.com/triton-lang/triton/pull/7239 will search for a _semantic kwarg in the signature of the function before passing in this kwarg. To fix this in Inductor: 1. explicitly take a _semantic kwarg 2. remove the functools.wraps around the wrapper function, which was causing inspect.signature to return the signature of the wrapped function (instead of the signature of the wrapper, which does contain the _semantic arg) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160658 Approved by: https://github.com/PaulZhang12, https://github.com/njriasan	2025-08-15 00:39:24 +00:00
Kaichao You	dae7710bf2	[cuda][cupy] Improve cupy device placement when device is provided with explicit index (#158529 ) resubmit https://github.com/pytorch/pytorch/pull/158320 , fixing a potential bug when device index is not specified explicitly. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158529 Approved by: https://github.com/ezyang	2025-08-15 00:27:42 +00:00
ankushwahaRH	dc194a3096	Test multiprocessing spawn timing fix (#160672 ) Submitting PR to fix #160511. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160672 Approved by: https://github.com/mikaylagawarecki	2025-08-15 00:11:55 +00:00
Jeff Daily	4051b42c29	[ROCm] hipify needs specific header mappings (#160675 ) Fixes #160579. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160675 Approved by: https://github.com/ScottTodd, https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-15 00:09:04 +00:00
henrylhtsang	eb0eaa67e1	[BE][ci] Increase frequency of cutlass backend ci (#160656 ) * increase frequency from every 24 hours to every 12 hours * automatically enable it if cutlass backend files are touched. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160656 Approved by: https://github.com/eellison	2025-08-14 23:44:55 +00:00
henrylhtsang	98373e5ad2	[doc] AOTI debugging guide (#160430 ) Folded from https://discuss.pytorch.org/t/a-beginners-guide-to-debugging-aot-inductor-cuda-illegal-memory-access/222188 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160430 Approved by: https://github.com/angelayi	2025-08-14 23:42:17 +00:00
Michael Lazos	371eacb2ae	[Dynamo][Hierarchical Compile] Refactor for tuple flattening (#158810 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158810 Approved by: https://github.com/StrongerXi	2025-08-14 22:45:44 +00:00
PyTorch MergeBot	3650989e6e	Revert "[cutlass] fix dictionary iteration error (#160552 )" This reverts commit 29d20d49f0b7f4e362e1cefdcdc4b5659969312c. Reverted https://github.com/pytorch/pytorch/pull/160552 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160552#issuecomment-3189940880))	2025-08-14 21:41:28 +00:00
Markus Hoehnerbach	3be70dc30e	[inductor] dont reuse buffers if it affects peak (#145883 ) (#159530 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159530 Approved by: https://github.com/eellison	2025-08-14 21:14:36 +00:00
David Berard	47a1db823d	[triton_heuristics] Optimize the triton launcher in pt2 (#160000 ) Summary: (Original author: Xu Zhao. Commandeered by David to land this since it is relatively urgent) We observed ~10us PT2-Triton launch overhead regression after pin update. Before Triton pin-update: {F1980557238} After Triton pin-update: {F1980557240} The root cause is because https://github.com/pytorch/pytorch/pull/145051 adds `_get_args_with_constexprs` to the cubin launcher caller function, which is on the critical path. The motivation for `_get_args_with_constexprs` was that between triton 3.2 and triton 3.3, the convention for calling Triton kernels (at the level that non-static-cuda-launcher inductor integrates) changed. Previously, the callable did not take constexpr arguments as parameters; after 3.3, it does. With pointwise/reduction kernels, we don't know the constexpr values until after autotuning occurs; so `_get_args_with_constexprs` would inject constexprs into the arguments list before calling the Triton kernel. The fix (in this PR) is to instead inject the constexpr args into the launcher string - this avoids the cost of sorting/reordering arguments which previously occurred upon execution of each kernel. Note that the static_cuda_launcher.py does not require constants to be passed to the cubin launcher (`e96c7c4bb0/torch/_inductor/runtime/static_cuda_launcher.py (L220)`), there is no need to pass in constexprs to the generated launcher code. 
The new launcher code needs to work on three cases: - StaticallyLaunchedCudaKernel - triton.compile.CompiledKernel - AOTInductor Analysis: https://docs.google.com/document/d/1PHaSmx2w59K8qpjw5_qzKWShfEgptf_Zpv_DL7YxiWU/edit?tab=t.0 Test Plan: Before: ``` $ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs 1.893x ``` ``` $ buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency x_val nop_python_function-walltime nop_triton_kernel-walltime nop_triton_compiled_kernel_run-walltime nop_inductor_kernel-walltime nop_inductor_kernel_cudagraph-walltime ------- ------------------------------ ---------------------------- ----------------------------------------- ------------------------------ ---------------------------------------- 0 0.00760921 1.80298 0.623282 5.25024 0.203722 19 0.00799885 4.78223 1.00226 5.8213 0.239084 average 0.00780403 3.29261 0.812769 5.53577 0.221403 ``` After: ``` buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency x_val nop_python_function-walltime nop_triton_kernel-walltime nop_triton_compiled_kernel_run-walltime nop_inductor_kernel-walltime nop_inductor_kernel_cudagraph-walltime ------- ------------------------------ ---------------------------- ----------------------------------------- ------------------------------ ---------------------------------------- 0 0.00747067 1.92589 0.726509 4.35459 0.204205 19 0.00747823 7.36852 1.26241 6.28208 0.239278 average 0.00747445 4.6472 0.994459 5.31834 0.221741 ``` ``` $ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs 1.985x ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160000 Approved by: https://github.com/jansel, https://github.com/mlazos Co-authored-by: Xu Zhao <xzhao9@meta.com>	2025-08-14 21:04:08 +00:00
PyTorch MergeBot	eac2d9d695	Revert "appending the pythonpath (#160219 )" This reverts commit 1d80d697a269234b47ec7ede192faf3bb9b159e3. Reverted https://github.com/pytorch/pytorch/pull/160219 on behalf of https://github.com/clee2000 due to broke inductor? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16970222746/job/48108262003) [HUD commit link](`1d80d697a2`) ([comment](https://github.com/pytorch/pytorch/pull/160219#issuecomment-3189850381))	2025-08-14 20:58:14 +00:00
Lucas Kabela	3fe19a7a0a	[Test Fix] Delete dynamo skipfile for OpenMP test_one_thread (#160562 ) Fixes #120648 During issue scrubbing I could not repro these failing tests, so reenabling them to close out the issue ### Test Original repro command: ``` PYTORCH_TEST_WITH_DYNAMO=1 pytest test/test_openmp.py -v -k test_one_thread ``` Now results in ``` platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /home/lucaskabela/.conda/envs/pytorch-3.12/bin/python3.12 cachedir: .pytest_cache hypothesis profile 'default' rootdir: /home/lucaskabela/pytorch configfile: pytest.ini plugins: hypothesis-6.138.0 collected 2 items / 1 deselected / 1 selected Running 1 items in this shard test/test_openmp.py::TestOpenMP_ParallelFor::test_one_thread PASSED [3.6874s] [100%] ===================================================== 1 passed, 1 deselected in 6.07s ===================================================== ``` And: ``` PYTORCH_TEST_WITH_DYNAMO=1 python test/test_openmp.py TestOpenMP_ParallelFor.test_one_thread ``` ``` PYTORCH_TEST_WITH_DYNAMO=1 python test/test_sort_and_select.py TestSortAndSelectCPU.test_sort_overflow_cpu_int16 ``` Both result in: ``` . ---------------------------------------------------------------------- Ran 1 test in 0.003s ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160562 Approved by: https://github.com/zou3519	2025-08-14 20:55:59 +00:00
Dev Sashidhar	4a90dc0c1f	Update checkpoint warning to target PyTorch 2.9 (#160643 ) Fixes #160534 Updates the warning in torch.utils.checkpoint to state that starting in PyTorch 2.9, calling checkpoint without explicitly passing use_reentrant will raise an exception. Follows the guidance from the issue discussion. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160643 Approved by: https://github.com/soulitzer	2025-08-14 20:53:17 +00:00
Paul Zhang	1fc683cf17	[Inductor] Allow indexing a flexible layout for extract_input_node_reduction_ranges (#160645 ) Differential Revision: D79831747 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160645 Approved by: https://github.com/eellison	2025-08-14 20:43:35 +00:00
AaronWang04	b9d7de3a09	[Inductor] addmm + activation function fusion (#158137 ) PR implements a pass in post_grad to fuse activation(add + mm) This was previously done similarly here #106912 but was reverted for performance reasons. it was replaced with a pass that unfuses the activation and add from addmm/addmm_activation and let inductor handle the fusion. however since then cuBLAS team has made a lot of perf improvements on this, will update this post with more benchmarks but preliminary benchmark show good results perf dash board <img width="3371" height="1240" alt="Screenshot from 2025-08-07 13-41-35" src="https://github.com/user-attachments/assets/d44d6205-b33a-4a20-9f0f-d9db176b3738" /> Relu works with both training and inference but gelu only works with inference mode due to some fundamental limitations since gelu's derivative depends on input and relu's doesnt. don't think this is fixable with the current addmm_activation API Graph module before and after this pass Relu(addmm) ``` graph(): %primals_1 : [num_users=1] = placeholder[target=primals_1] %primals_2 : [num_users=2] = placeholder[target=primals_2] %primals_3 : [num_users=2] = placeholder[target=primals_3] %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%primals_1, %primals_3, %primals_2), kwargs = {}) %relu : [num_users=2] = call_function[target=torch.ops.aten.relu.default](args = (%addmm,), kwargs = {}) %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%relu, 0), kwargs = {}) %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%primals_3, [1, 0]), kwargs = {}) return (relu, primals_2, le, permute_1) graph(): %primals_1 : [num_users=1] = placeholder[target=primals_1] %primals_2 : [num_users=2] = placeholder[target=primals_2] %primals_3 : [num_users=2] = placeholder[target=primals_3] %_addmm_activation_default : [num_users=2] = call_function[target=torch.ops.aten._addmm_activation.default](args = 
(%primals_1, %primals_3, %primals_2), kwargs = {}) %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%_addmm_activation_default, 0), kwargs = {}) %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%primals_3, [1, 0]), kwargs = {}) return (_addmm_activation_default, primals_2, le, permute_1) ``` Gelu (addmm) ``` graph(): %arg0_1 : [num_users=1] = placeholder[target=arg0_1] %arg1_1 : [num_users=1] = placeholder[target=arg1_1] %arg2_1 : [num_users=1] = placeholder[target=arg2_1] %addmm : [num_users=4] = call_function[target=torch.ops.aten.addmm.default](args = (%arg0_1, %arg2_1, %arg1_1), kwargs = {}) %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%addmm, %addmm), kwargs = {}) %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %addmm), kwargs = {}) %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_1, 0.044715), kwargs = {}) %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%addmm, %mul_2), kwargs = {}) %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add, 0.7978845608028654), kwargs = {}) %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%addmm, 0.5), kwargs = {}) %tanh : [num_users=1] = call_function[target=torch.ops.aten.tanh.default](args = (%mul_3,), kwargs = {}) %add_1 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%tanh, 1), kwargs = {}) %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_4, %add_1), kwargs = {}) return (mul_5,) graph(): %arg0_1 : [num_users=1] = placeholder[target=arg0_1] %arg1_1 : [num_users=1] = placeholder[target=arg1_1] %arg2_1 : [num_users=1] = placeholder[target=arg2_1] %_addmm_activation_default : [num_users=1] = call_function[target=torch.ops.aten._addmm_activation.default](args = (%arg0_1, %arg2_1, %arg1_1), kwargs = 
{use_gelu: True}) return (_addmm_activation_default,) ``` Benchmark setup: NGC pytorch 25.06 container cublas version: 12.9.1.4 torch.compile ran with dynamic = False and max_autotune H100 ``` Testing with M=1024, N=1024, K=1024, dtype=bfloat16 ============================================================ Average Time per Iteration (cublas): 0.0107 ms Average Time per Iteration (torch compile): 0.0296 ms ============================================================ Testing with M=2048, N=2048, K=2048, dtype=bfloat16 ============================================================ Average Time per Iteration (cublas): 0.0262 ms Average Time per Iteration (torch compile): 0.0327 ms ============================================================ Testing with M=4096, N=4096, K=4096, dtype=bfloat16 ============================================================ Average Time per Iteration (cublas): 0.1763 ms Average Time per Iteration (torch compile): 0.2457 ms ============================================================ Testing with M=8192, N=8192, K=8192, dtype=bfloat16 ============================================================ Average Time per Iteration (cublas): 1.5280 ms Average Time per Iteration (torch compile): 1.9437 ms ``` A100 ``` ############################################################ Testing with dtype: float16 ############################################################ ============================================================ Testing with M=1024, N=1024, K=1024, dtype=float16 ============================================================ Average Time per Iteration (cublas): 0.0313 ms Average Time per Iteration (torch compile): 0.0643 ms ============================================================ Testing with M=2048, N=2048, K=2048, dtype=float16 ============================================================ Average Time per Iteration (cublas): 0.1149 ms Average Time per Iteration (torch compile): 0.1255 ms 
============================================================ Testing with M=4096, N=4096, K=4096, dtype=float16 ============================================================ Average Time per Iteration (cublas): 0.6297 ms Average Time per Iteration (torch compile): 0.7547 ms ============================================================ Testing with M=8192, N=8192, K=8192, dtype=float16 ============================================================ Average Time per Iteration (cublas): 4.3821 ms Average Time per Iteration (torch compile): 5.0740 ms ``` Script ```py import torch torch.manual_seed(0) warmup, numrun= 10, 100 sizes = [1024, 2048, 4096, 8192] dtypes = [torch.float16, torch.bfloat16, torch.float32] device = torch.device("cuda") for dtype in dtypes: dtype_name = str(dtype).split('.')[-1] print(f"\n{'#'*60}") print(f"Testing with dtype: {dtype_name}") print(f"{'#'*60}") for size in sizes: M, N, K = size, size, size print(f"\n{'='*60}") print(f"Testing with M={M}, N={N}, K={K}, dtype={dtype_name}") print(f"{'='*60}") A = torch.randn(M, K, device=device, dtype=dtype) B = torch.randn(K, N, device=device, dtype=dtype) C = torch.randn(M, device=device, dtype=dtype) def func1(): return torch._addmm_activation(C, A, B, use_gelu=True) def func2(): return torch.nn.functional.gelu(torch.add(C, torch.mm(A, B)), approximate="tanh") func2_compiled = torch.compile( func2, dynamic=False, options={ "force_disable_caches": True, "max_autotune": True, "max_autotune_gemm": True, "max_autotune_gemm_backends": "TRITON", "autotune_fallback_to_aten": False, } ) for _ in range(warmup): func1() torch.cuda.synchronize(device=device) start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) total_time_ms = 0.0 start_event.record() for _ in range(numrun): func1() end_event.record() torch.cuda.synchronize(device=device) total_time_ms += start_event.elapsed_time(end_event) avg_time_ms = total_time_ms / numrun print(f"Average Time per Iteration (cublas):\t 
{avg_time_ms:.4f} ms") for _ in range(warmup): func2_compiled() torch.cuda.synchronize(device=device) start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) total_time_ms = 0.0 start_event.record() for _ in range(numrun): func2_compiled() end_event.record() torch.cuda.synchronize(device=device) total_time_ms += start_event.elapsed_time(end_event) avg_time_ms = total_time_ms / numrun print(f"Average Time per Iteration (torch compile):\t {avg_time_ms:.4f} ms") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158137 Approved by: https://github.com/eellison	2025-08-14 20:41:38 +00:00
Guilherme Leobas	1028c5e2d5	[Dynamo] Add CPython default dict tests (#155263 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155263 Approved by: https://github.com/zou3519	2025-08-14 20:22:22 +00:00
vishalgoyal316	19b4283884	Typo correction in variable name uninitalized_val in resize() function (#160636 ) Fixes #160633 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160636 Approved by: https://github.com/mikaylagawarecki, https://github.com/Skylion007	2025-08-14 20:11:43 +00:00
Michael Lazos	8d6d324631	[Dynamo][Hierarchical-Compile] Don't allow node duplicates to be added (#160605 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160605 Approved by: https://github.com/StrongerXi	2025-08-14 20:02:10 +00:00
Alex Malyshev	fdfd69bb05	Set PYTHONHOME for inductor subprocesses using torch (#160008 ) This is needed for subprocesses that are trying to call back into torch functionality, i.e. anything that's also setting `PYTHONPATH`. If they're part of an application that bundles the Python runtime, then they should use the bundled runtime to keep their view of the world consistent. There are more `sys.executable` subprocesses in torch/ but it seems like they're fine. Previous PR at https://github.com/pytorch/pytorch/pull/159382, but was reverted because it caused macOS jobs on GitHub to timeout. What was happening was inductor subprocesses were scheduling C++ compilation tasks that were failing to find the Python.h header. This was because they were running in venvs and now trying to find the CPython headers inside the venv, where the headers do not exist. This PR gates the new behavior to internal builds only. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160008 Approved by: https://github.com/aorenste	2025-08-14 19:57:14 +00:00
Logan Thomas	0d3461bac0	DOC: update CrossEntropyLoss with note and example of incorrect target specification (#155649 ) Fixes #134771 Pull Request resolved: https://github.com/pytorch/pytorch/pull/155649 Approved by: https://github.com/mikaylagawarecki Co-authored-by: Svetlana Karslioglu <svekars@meta.com> Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>	2025-08-14 18:34:57 +00:00
Howard Huang	65053c03a3	[FR] Don't check incomplete ranks for printing (#160195 ) When just printing the ranks (`-j` option) we should skip the check for "incomplete ranks" since that doesn't affect the print Pull Request resolved: https://github.com/pytorch/pytorch/pull/160195 Approved by: https://github.com/fduwjj ghstack dependencies: #160097	2025-08-14 18:19:45 +00:00
Howard Huang	96f9fbe21a	Fix flight recorder for P2P ops (#160097 ) Fixes errors in debugging a trace as mentioned in https://docs.google.com/document/d/1EKVJYmW2hj_VsvDvnSggXhZzJyvMu9dA0iDJWOZAtjY/edit?tab=t.0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160097 Approved by: https://github.com/fduwjj	2025-08-14 18:19:45 +00:00
Thomas Germer	1c25871191	Allow torch.hub.load with unauthorized GITHUB_TOKEN (#159896 ) Allow torch.hub.load with unauthorized GITHUB_TOKEN `torch.hub.load` fails if a `GITHUB_TOKEN` with few permissions is set, as can be seen in the following example. Make sure that the model has not been cached before, for example with `rm ~/.cache/torch`. If the model has been downloaded already, it will not be downloaded again and the authorization error will not occur. ```python export GITHUB_TOKEN="" python >>> import torch >>> torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14') Traceback (most recent call last): File "<stdin>", line 1, in <module> File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 567, in load repo_or_dir = _get_cache_or_reload(repo_or_dir, force_reload, trust_repo, "load", ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 231, in _get_cache_or_reload _validate_not_a_forked_repo(repo_owner, repo_name, ref) File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 191, in _validate_not_a_forked_repo response = json.loads(_read_url(Request(url, headers=headers))) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/site-packages/torch/hub.py", line 174, in _read_url with urlopen(url) as r: ^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/urllib/request.py", line 215, in urlopen return opener.open(url, data, timeout) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/urllib/request.py", line 521, in open response = meth(req, response) ^^^^^^^^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/urllib/request.py", line 630, in http_response response = self.parent.error( ^^^^^^^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/urllib/request.py", line 559, in error return self._call_chain(args) ^^^^^^^^^^^^^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/urllib/request.py", line 492, in _call_chain result = func(args) 
^^^^^^^^^^^ File "~/miniconda3/lib/python3.12/urllib/request.py", line 639, in http_error_default raise HTTPError(req.full_url, code, msg, hdrs, fp) urllib.error.HTTPError: HTTP Error 401: Unauthorized ``` The cause of the error is that the function `_validate_not_a_forked_repo` in `hub.py` always uses `GITHUB_TOKEN` for authorization, even when downloading does not require authorization. `0ba09a6d34/torch/hub.py (L194)` This fix simply retries the download without the token in case of a failure. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159896 Approved by: https://github.com/albanD	2025-08-14 18:15:49 +00:00
Xilun Wu	6c05ea6475	[DTensor] add op support: aten.squeeze_.dim (#159532 ) Summary This PR enables in-place op `aten.squeeze_.dim` on DTensor with a change to DTensor dispatch logic: when processing in-place operator, we should assign `output_sharding.output_spec` back to the first argument. This is because the in-place op_call on `arg._local_tensor` could also shift the tensor meta. Test `pytest test/distributed/tensor/test_view_ops.py -s -k test_squeeze_` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159532 Approved by: https://github.com/zpcore	2025-08-14 18:01:19 +00:00
Howard Huang	5665dc9ab7	[PP] Allow larger world_size schedule tests (#160559 ) Update schedule tests to use `world_size=4`, changes needed: - Move some tests that require world_size=2 to new class - Move helper methods from class level to function level - Update some initialization to pass assert since gradients were super small. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160559 Approved by: https://github.com/wconstab ghstack dependencies: #159591, #160558	2025-08-14 17:41:58 +00:00
Howard Huang	2ff7c1c774	[PP] Rename _load_actions and validate (#160558 ) Rename method and add validation Pull Request resolved: https://github.com/pytorch/pytorch/pull/160558 Approved by: https://github.com/wconstab ghstack dependencies: #159591	2025-08-14 17:41:58 +00:00
Guilherme Leobas	3028fa6ce9	Wrap class definitions in `set_fullgraph(False)` in `test_list`/`tuple` (#160277 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160277 Approved by: https://github.com/zou3519 ghstack dependencies: #160216, #160217, #160276, #160278, #160330, #160331	2025-08-14 17:29:45 +00:00
Matthew Haddock	077cb38974	Add dtype checks in meta dispatch for various ordering ops (#159556 ) This adds data type checks for the unsupported bool and complex types for argmax/min topk, sort, minimum, maximum. As listed here: `0a99b026d6/torch/testing/_internal/common_methods_invocations.py (L21076)` Currently the ops will fail on CPU or CUDA calculation, rather than at meta dispatch stage as with for example max: `0a99b026d6/aten/src/ATen/native/TensorCompare.cpp (L285)` . This will catch it early. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159556 Approved by: https://github.com/janeyx99	2025-08-14 17:06:27 +00:00
Jovian Anthony Jaison	cd8d8c18f5	[pytorch][dynamo_compile] Log graph_node_shape to dynamo_compile (#160556 ) This PR adds the dynamo graph node shape logging to dynamo compile. Also added unit tests to check if correct graph node shape is being logged. Test Plan: $ python -m test_utils Ran 12 tests in 36.447s OK Note: Will merge after D80185628 lands. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160556 Approved by: https://github.com/masnesral, https://github.com/jingsh	2025-08-14 16:42:35 +00:00
Lucas Kabela	63654ba4c5	[BE][Dynamo] Type improvements in `_dynamo/utils` to generics (#159824 ) Follow up to #159580 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159824 Approved by: https://github.com/williamwen42	2025-08-14 16:06:50 +00:00
Ke Wen	7e27347fd3	[SymmMem] Check return of nvshmem_malloc (#160603 ) `nvshmem_malloc` returns a null pointer when allocation fails. We should check here. Otherwise, the nullptr can go down the road and into the device kernel, causing CUDA illegal memory access. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160603 Approved by: https://github.com/fduwjj, https://github.com/ngimel	2025-08-14 15:57:55 +00:00
Raman Kumar	1d80d697a2	appending the pythonpath (#160219 ) Fixes #160193 `PYTHONPATH=/torchbench` to `PYTHONPATH=/torchbench:$PYTHONPATH` in [pytorch/.ci/pytorch/test.sh](`b5fd7223b1/.ci/pytorch/test.sh (L1715)`) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160219 Approved by: https://github.com/malfet	2025-08-14 15:55:31 +00:00
Xinya Zhang	b6b74aed60	[ROCm] Support large inputs for coalesceValuesKernel (#158281 ) # Description `.coalesce` cannot handle large inputs on ROCM due to maximal grid size limit. This PR splits axis `X` into axes `X` and `Y`, and repurposes `Z` for original `Y` on ROCm to avoid such limitation. Confirmed the new approach can handle large inputs. Correctness needs validation. # Testing Command `python torch_spmv.py 22500000 272500000` ## Script `torch_spmv.py` ``` python import torch import argparse def parse_args(): parser = argparse.ArgumentParser( description="Sparse COO Matrix by Dense Vector Multiplication using PyTorch" ) parser.add_argument("n", type=int, help="Size of the NxN matrix") parser.add_argument("nnz", type=int, help="Number of non-zero entries") return parser.parse_args() def main(): args = parse_args() n = args.n nnz = args.nnz dtype = torch.float32 device = torch.device('cuda') # Generate random indices for the sparse matrix in COO format. torch.manual_seed(42) rows = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device) cols = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device) indices = torch.stack([rows, cols], dim=0) # Generate random values. values = torch.randn(nnz, dtype=torch.float32, device=device) # Create the sparse COO matrix and move it to the target device. sparse_matrix = torch.sparse_coo_tensor(indices, values, size=(n, n), dtype=torch.float32, device=device) sparse_matrix = sparse_matrix.coalesce() # Generate a random dense vector. dense_vector = torch.randn(n, dtype=torch.float32, device=device) # Perform sparse matrix - dense vector multiplication. # Using torch.sparse.mm which expects a 2D tensor for the vector. result = torch.sparse.mm(sparse_matrix, dense_vector.unsqueeze(1)).squeeze() # result = torch.mv(sparse_matrix, dense_vector) # Print the result. 
print("Result of the multiplication:") print(torch.sum(result)) if __name__ == "__main__": main() ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158281 Approved by: https://github.com/jeffdaily	2025-08-14 15:09:16 +00:00
Tugsbayasgalan Manlaibaatar	4a773e1e86	Warn when there is side effect in strict mode (#160060 ) Differential Revision: [D79784354](https://our.internmc.facebook.com/intern/diff/D79784354) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160060 Approved by: https://github.com/zhxchen17, https://github.com/StrongerXi	2025-08-14 14:59:44 +00:00
Howard Huang	198b5fd2d4	[PP] Add DualPipeV schedule (#159591 ) Added the DualPipeV schedule according to http://github.com/deepseek-ai/DualPipe/blob/main/dualpipe/dualpipev.py#L11 <img width="3633" height="486" alt="image" src="https://github.com/user-attachments/assets/4e843bb9-87cd-4d11-936c-7dfe8ee12f16" /> This schedule doesn't perform the actual "overlap" during execution, but provides the scaffolding and schedule definition we need to run it E2E in torchtitan. Supporting the overlapped operation will be worked on in following PRs. Tests: ```sh python test/distributed/pipelining/test_schedule_multiproc.py -k test_v_shape_schedules python test/distributed/pipelining/test_schedule.py -k test_pipeline_order_for_v_schedules ``` Also tested in TorchTitan and is running. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159591 Approved by: https://github.com/wconstab	2025-08-14 14:58:35 +00:00
blaine-rister	20bdabbb3c	[Dynamo] Fix MTIA dynamo backend by avoiding has_triton() at import time (#160604 ) # Summary MTIA's torch.compile tests were broken by D80037015. (For details, see internal task T234563969.) The root cause was that `has_triton` can change state after we call `torch.mtia.init()`, but it was used in a way that fixes Inductor's behavior at import time. (Note that `has_triton` is cached, and there's no opportunity to call `torch.mtia.init()` prior to `import torch`.) To fix this, we use `try: import triton` as opposed to `has_triton()` at the module level. # Test Plan See the internal diff. As a follow-up, we will add appropriate unit tests and/or CI hints so this type of issue can be caught at PR/diff time. Differential Revision: D80228000 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160604 Approved by: https://github.com/PaulZhang12, https://github.com/eellison	2025-08-14 14:54:49 +00:00
Alexander Grund	d556586448	[cutlass backend] re-add pip cutlass path (#160180 ) Revert #156651 to allow using the cutlass PIP package which is easier for users than the Git checkout or similar method. Also fix a bug where the PIP cutlass path wouldn't be available to subprocesses spawned during benchmarking for algorithm selection. Looks like the "spawn" method does not inherit the (potentially) already set up `config.cuda.cutlass_dir` so in the subprocess the include paths will still be set to `"../third_party/cutlass/"` leading to compilation failure due to missing headers. Ensure `try_import_cutlass` is called at that point, which due to caching is a no-op in most cases, so doesn't hurt. Change the logic to return `None` when cutlass isn't available returning more useful values for include paths, namely an empty list. This is in line with other inductor code which disables the CUTLASS backend when `try_import_cutlass` returns False Pull Request resolved: https://github.com/pytorch/pytorch/pull/160180 Approved by: https://github.com/henrylhtsang, https://github.com/mlazos	2025-08-14 14:48:31 +00:00
Isuru Fernando	781e9a7724	Fix meta for constant_pad_nd (#159878 ) Fixes https://github.com/pytorch/pytorch/issues/144187 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159878 Approved by: https://github.com/Skylion007, https://github.com/ezyang	2025-08-14 14:47:47 +00:00
atalman	e4de93f6a3	Add sm50 and sm60 back to windows builds (#160586 ) Addresses the issue reported in https://github.com/pytorch/pytorch/issues/160575 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160586 Approved by: https://github.com/malfet	2025-08-14 12:46:35 +00:00
Wang, Chuanqi	a5652407e4	[CI] Fix triton xpu build on Windows (#160442 ) Pin the ninja version to 1.11 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160442 Approved by: https://github.com/atalman	2025-08-14 12:43:49 +00:00
Laith Sakka	6f0f4e0c3e	reduce threshold to suggest changes to expected results (#160463 ) Since we increase threshold to 10% i would like suggestions to show up to update those +-2% instead of 3.3% now Pull Request resolved: https://github.com/pytorch/pytorch/pull/160463 Approved by: https://github.com/jamesjwu	2025-08-14 09:11:27 +00:00
fengqing.lu	db763b1717	[Intel GPU] Support SDPA backend selection and priority setting on XPU (#159464 ) Currently, SDPA on XPU uses its own `priority_order` instead of the one from the global context. Hence it does not support `with sdpa_kernel(order, set_priority=True)` with set_priority=True. This PR enables this feature. To make the default `priority_order` from the global context work for XPU, I also move the MATH backend to the lowest priority; otherwise `cudnn attention` and `overrideable attention` will never be selected. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159464 Approved by: https://github.com/guangyey, https://github.com/drisspg Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com> Co-authored-by: mayuyuace <qiming1.zhang@intel.com>	2025-08-14 08:55:31 +00:00
Phil Xiaojun Hu	089c4a1ba0	Fix wrong log file name in the docs of `torch.distributed.elastic.multiprocessing.start_processes()` (#160396 ) Fixes #160395 In https://docs.pytorch.org/docs/stable/elastic/multiprocessing.html#starting-multiple-workers and also in the code comment of the function[1], it was specified that: ``` For each process, the ``log_dir`` will contain: #. ``{local_rank}/error.json``: if the process failed, a file with the error info #. ``{local_rank}/stdout.json``: if ``redirect & STDOUT == STDOUT`` #. ``{local_rank}/stderr.json``: if ``redirect & STDERR == STDERR`` ``` While in code[2], the files are `stdout.log` and `stderr.log`, instead of the `.json` ones listed in the doc. [1]: https://github.com/pytorch/pytorch/blob/main/torch/distributed/elastic/multiprocessing/__init__.py#L144-L145 [2]: https://github.com/pytorch/pytorch/blob/main/torch/distributed/elastic/multiprocessing/api.py#L354-L357 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160396 Approved by: https://github.com/fduwjj	2025-08-14 08:24:07 +00:00
zpcore	97c8c98f8d	measure dispatch overhead (#160504 ) Reopen https://github.com/pytorch/pytorch/pull/159699 to merge to main. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160504 Approved by: https://github.com/wconstab	2025-08-14 06:13:53 +00:00
FFFrog	39aa3d1471	Remove the dead code in setup.py (#160515 ) The following line has no effect. `34ec5ed275/setup.py (L1205)` This code was originally introduced in this PR: `dd7cec680c`, and clang11 and later now support `-fstack-clash-protection`. Can we remove this line? @malfet Pull Request resolved: https://github.com/pytorch/pytorch/pull/160515 Approved by: https://github.com/isuruf, https://github.com/albanD	2025-08-14 06:02:11 +00:00
Yang Wang	639778b3ee	[2/3 step][ vllm ci build setup] Add vllm build logic and dockerfile (#160089 ) # set up vllm build logic - dockerfile: please note that the dockerfile introduced here is only temporary; once we migrate this file to vllm, we will fetch it directly from there - VllmBuildRunner: - implement logic to prepare and run vllm build with dockerfile - Pull Request resolved: https://github.com/pytorch/pytorch/pull/160089 Approved by: https://github.com/huydhn ghstack dependencies: #160043	2025-08-14 05:51:45 +00:00
Yang Wang	00d7d6f123	[1/3][ghstack] [vllm ci build setup ]setup lumen_cli (#160043 ) # Description set up torch_cli using argparse ## Details: - add vllm placeholder in the cli - add unittest for cli command see Readme.md to see how to run the cli Pull Request resolved: https://github.com/pytorch/pytorch/pull/160043 Approved by: https://github.com/huydhn	2025-08-14 05:51:45 +00:00
Jeff Daily	c6d78d4dbd	[ROCm] enable miopen channels last 3d for conv and batchnorm (#160529 ) miopen batchnorm for channels last is guarded by env var PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM similar to existing PYTORCH_MIOPEN_SUGGEST_NHWC for conv. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160529 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-14 05:30:19 +00:00
Boyuan Feng	2898d3f965	[Lowering] Add assertion msg to sym_size and sym_stride (#160591 ) Summary: Add assertion msg to sym_size and sym_stride lowering function. Test Plan: Will test in mast job. Rollback Plan: Differential Revision: D80187693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160591 Approved by: https://github.com/angelayi	2025-08-14 04:55:32 +00:00
PyTorch UpdateBot	34358f335d	[vllm hash update] update the pinned vllm hash (#160594 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160594 Approved by: https://github.com/pytorchbot	2025-08-14 04:21:28 +00:00
zeshengzong	fe3f5fe4ea	Optimize `min`, `max` gradient behavior description (#160312 ) Fixes #160273 ## Test Result <img width="897" height="593" alt="image" src="https://github.com/user-attachments/assets/6ebcdb2c-8a2c-4f0d-8195-656089e88325" /> <img width="985" height="653" alt="image" src="https://github.com/user-attachments/assets/606a7264-e223-4d2b-8c3f-f153ce43b208" /> <img width="903" height="607" alt="image" src="https://github.com/user-attachments/assets/0ae2f56f-820f-4194-b15c-a02a078c0487" /> <img width="903" height="607" alt="image" src="https://github.com/user-attachments/assets/79c38a17-45ac-4808-829f-d538178de36b" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160312 Approved by: https://github.com/ngimel	2025-08-14 04:18:49 +00:00
Aidyn-A	45ba7ecda8	Flex Attention heuristics: a Blackwell config (#160192 ) Fixes #160074 and more. This is the working config for B200 and RTX 5080. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160192 Approved by: https://github.com/drisspg	2025-08-14 03:47:02 +00:00
Tugsbayasgalan (Tugsuu) Manlaibaatar	194fcfcfbd	Add support for param mutation under inference mode (#159661 ) Summary: In HF model rwkv, we have parameter mutation under inference mode which should be safe. This PR does multiple things to make sure it works: 1. We execute global autograd mutation while tracing so that we can actually trace through parameter inplace mutation 2. Add support for parameter mutation under inference mode in AOTAutograd 3. Add support for parameter mutation under inference mode in export. Test Plan: test Rollback Plan: Differential Revision: D79460136 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159661 Approved by: https://github.com/ydwu4	2025-08-14 03:34:04 +00:00
Michael Lazos	29d20d49f0	[cutlass] fix dictionary iteration error (#160552 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160552 Approved by: https://github.com/henrylhtsang, https://github.com/jingsh	2025-08-14 03:23:46 +00:00
Guilherme Leobas	3faee0a631	Update nullcontext to return input args (#158776 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158776 Approved by: https://github.com/zou3519	2025-08-14 03:02:44 +00:00
Yu, Guangye	8cfaf51d4e	Generalize support of background thread in pinned allocator (#160505 ) # Motivation https://github.com/pytorch/pytorch/pull/135524 only introduces the support of background thread for CUDA, this PR intends to support it for other backend such as XPU as well. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160505 Approved by: https://github.com/albanD	2025-08-14 02:22:39 +00:00
Guilherme Leobas	af3cabc55d	Wrap class definitions in `set_fullgraph(False)` in `test_sort` (#160331 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160331 Approved by: https://github.com/zou3519 ghstack dependencies: #160216, #160217, #160276, #160278, #160330	2025-08-14 02:12:20 +00:00
Guilherme Leobas	74bbe7b4a3	Wrap class definitions in `set_fullgraph(False)` in `test_math`/`cmath` (#160330 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160330 Approved by: https://github.com/zou3519 ghstack dependencies: #160216, #160217, #160276, #160278	2025-08-14 02:12:20 +00:00
Guilherme Leobas	7bfc424a61	Wrap class definitions in `set_fullgraph(False)` in `test_iter` (#160278 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160278 Approved by: https://github.com/williamwen42, https://github.com/zou3519 ghstack dependencies: #160216, #160217, #160276	2025-08-14 02:12:20 +00:00
RajeshvShiyal	5ace061254	finfo eps doc fix (#160502 ) Existing documentation for torch.finfo().eps is as below: \| eps \| float \| The smallest representable number such that ``1.0 + eps != 1.0``. \| Proposed documentation for torch.finfo().eps is as below: \| eps \| float \| The difference between 1.0 and the next smallest representable float larger than 1.0. \| Fixes #160397 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160502 Approved by: https://github.com/ngimel	2025-08-14 01:49:35 +00:00
drisspg	15e49f6164	Factor out the strings to templates for better editor integration (#160357 ) # Summary More code motion, tldr is that install 'Better Jinja' in vscode and now you can get highlighting Before <img width="776" height="926" alt="Screenshot 2025-08-11 at 2 41 08 PM" src="https://github.com/user-attachments/assets/10868b31-f8ac-4cf5-99fe-19b8789ce06b" /> After: <img width="1184" height="1299" alt="Screenshot 2025-08-11 at 2 40 27 PM" src="https://github.com/user-attachments/assets/45203765-589e-4d76-8196-d895a2f2fbf6" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160357 Approved by: https://github.com/eellison	2025-08-14 01:07:53 +00:00
Laith Sakka	dd21c8a578	refresh expected results (#160537 ) Regression introduced by https://github.com/pytorch/pytorch/pull/160314. Not much worried about it, since it did not affect other inductor benchmarks; could not repro locally. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160537 Approved by: https://github.com/eellison	2025-08-14 00:56:14 +00:00
Nikita Shulga	a06ec54d40	[MPS] Add API to query GPU core count (#160414 ) Using good old IOKit to get `gpu-core-count` property from device implementing `AGXAccelerator` service Expose this one as `torch.backends.mps.get_core_count()` and make it accessible via `MpsInterface` to the inductor Test Plan: Run `python3 -c "import torch;print(torch.backends.mps.get_name(), torch.backends.mps.get_core_count())"` and compare it to `system_profiler SPDisplaysDataType\|head -n10` ``` % python3 -c "import torch;print(torch.backends.mps.get_name(), torch.backends.mps.get_core_count())" Apple M1 Pro 16 % system_profiler SPDisplaysDataType\|head -n10 Graphics/Displays: Apple M1 Pro: Chipset Model: Apple M1 Pro Type: GPU Bus: Built-In Total Number of Cores: 16 Vendor: Apple (0x106b) Metal Support: Metal 3 ``` This would significantly improve occupancy for torch.compile generated kernels Pull Request resolved: https://github.com/pytorch/pytorch/pull/160414 Approved by: https://github.com/dcci	2025-08-14 00:05:17 +00:00
Mikayla Gawarecki	50a8c11875	Add getCurrentDeviceIndex to torch::stable::accelerator (#160453 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160453 Approved by: https://github.com/janeyx99 ghstack dependencies: #159679	2025-08-13 23:42:24 +00:00
Mikayla Gawarecki	e4e4dbd2f8	Add beginnings of torch::stable::accelerator (#159679 ) Adds - `torch::stable::accelerator::DeviceGuard`: `std::unique_ptr` to `DeviceGuardOpaque` mostly copied from the below (but made generic) `50eac811a6/torch/csrc/inductor/aoti_runtime/utils_cuda.h (L30-L46)` - constructor `DeviceGuard(DeviceIndex)` (this matches aoti but differs from the actual c10 DeviceGuard constructor that takes in device) - `set_index(DeviceIndex)` - `torch::stable::accelerator::Stream`: `std::shared_ptr` to `StreamOpaque` - constructor `Stream(StreamHandle stream)` (similar to torch::stable::Tensor) - `id() -> StreamId` - `getCurrentStream(DeviceIndex device_index) -> stable::accelerator::Stream` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159679 Approved by: https://github.com/guangyey, https://github.com/janeyx99	2025-08-13 23:42:24 +00:00
Aidyn-A	d670304001	[ATen][CUDA] Use new CCCL API in v2.8 (#160554 ) Silences deprecation warnings like: ``` In file included from tmpxft_003a195d_00000000-6_Nonzero.cudafe1.stub.c:1: /tmp/tmpxft_003a195d_00000000-6_Nonzero.cudafe1.stub.c: At global scope: /tmp/tmpxft_003a195d_00000000-6_Nonzero.cudafe1.stub.c:243:219: warning: 'template<class ValueType, class OffsetT> class at_cuda_detail::cub::CountingInputIterator' is deprecated: Use thrust::counting_iterator instead [-Wdeprecated-declarations] 243 \| static void __device_stub__ZN2at6native43_GLOBAL__N__3cee4041_10_Nonzero_cu_cba1aaa011flag_kernelILi512ELi16EhEEvPKT1_PlPKllli( const _ZN3c104impl20ScalarTypeToCPPTypeTILNS_10ScalarTypeE0EEE __par0, int64_t __par1, const int64_t __par2, int64_t __par3, int64_t __par4, int __par5) { __cudaLaunchPrologue(6); __cudaSetupArgSimple(__par0, 0UL); __cudaSetupArgSimple(__par1, 8UL); __cudaSetupArgSimple(__par2, 16UL); __cudaSetupArgSimple(__par3, 24UL); __cudaSetupArgSimple(__par4, 32UL); __cudaSetupArgSimple(__par5, 40UL); __cudaLaunch(((char )((void ( )(const _ZN3c104impl20ScalarTypeToCPPTypeTILNS_10ScalarTypeE0EEE , int64_t , const int64_t , int64_t, int64_t, int))at::native::_NV_ANON_NAMESPACE::flag_kernel<(int)512, (int)16, unsigned char> ))); }namespace at{ \| ^~~~~~~~~~~~~~~~~~~~~ /usr/local/cuda-12.9/include/cub/iterator/counting_input_iterator.cuh:93:63: note: declared here 93 \| class CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator instead") CountingInputIterator \| ^~~~~~~~~~~~~~~~~~~~~ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160554 Approved by: https://github.com/ngimel, https://github.com/malfet, https://github.com/atalman	2025-08-13 23:15:53 +00:00
Sheng Fu	c5efc5c8a6	Fix unit test test_equivalent_template_code (#160432 ) Summary: Fix unit test test_equivalent_template_code https://github.com/pytorch/pytorch/pull/159920 treats ReinterpretView as a not-realized node when searching FX origin nodes for fused triton kernel. In test_equivalent_template_code, there is a transpose node (which is a ReinterpretView) before matmul. It was not in FX graph segment before PR 159920. FX origin nodes are used to define the name of triton kernel. That is the reason test_equivalent_template_code failed with PR 159920 since it uses hard-coded triton kernel name to check the result. The fix is to update the triton kernel name in the unit test. Test Plan: buck2 run mode/opt caffe2/test/inductor:benchmark_fusion -- caffe2.test.inductor.test_benchmark_fusion.BenchmarkMultiTemplateFusionCudaTest Rollback Plan: Differential Revision: D80101711 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160432 Approved by: https://github.com/clee2000	2025-08-13 23:14:51 +00:00
Will Constable	6da11d9aaf	[C10D] Add check_rng_sync util (#160283 ) Debugs RNG desync by checking the current state on each rank in the group and summarizing the differences if any are detected. Notes: - used allgather instead of gather since its simpler to do this SPMD rather than add conditional behavior, though I could be convinced we only want to log on rank0. Usage: `check_rng_sync(generator, group)` Prints something like this: (cuda): ``` [rank0]:E0808 ] Generator desync detected: [rank0]:E0808 ] Ranks (Seed, Offset) values [rank0]:E0808 ] ------- ----------------------- [rank0]:E0808 ] 0 (456, 0) [rank0]:E0808 ] 1 (123, 4) [rank0]:E0808 ] 2-3 (123, 0) ``` (cpu): ``` [rank2]:E0810 ] Generator desync detected: [rank2]:E0810 ] Ranks Generator State Hash values [rank2]:E0810 ] ------- ----------------------------- [rank2]:E0810 ] 0 7633364531954955665 [rank2]:E0810 ] 1 8807615394212033278 [rank2]:E0810 ] 2-3 -6150027303226666531 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160283 Approved by: https://github.com/ezyang	2025-08-13 23:05:29 +00:00
Markus Hoehnerbach	182efe31db	[inductor] add lowering for repeat_interleave.Tensor with output size specified (#147160 ) (#158462 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158462 Approved by: https://github.com/eellison	2025-08-13 22:54:18 +00:00
William Wen	1ea688f9a2	[dynamo] fix EXTENDED_ARG starts_line dropping bug (#160478 ) Fixes https://github.com/pytorch/pytorch/issues/160471 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160478 Approved by: https://github.com/Lucaskabela, https://github.com/billmguo	2025-08-13 22:27:40 +00:00
Isabella Ni	53e3949495	[MTIA-T][CFF] Pass backend parameter into GPU vertical pass file and pattern matcher (#160404 ) Summary: As titled Please see https://fb.workplace.com/groups/1075192433118967/posts/1735215827116621/?comment_id=1735220747116129&reply_comment_id=1735242997113904 Basically, for MTIA, we want mtia_afg to show up in the counters and backend, instead of Inductor. MTIA is not using inductor yet. Using env var TORCHINDUCTOR_PATTERN_MATCH_BACKEND to pass in the actual backend. The env var default value is "inductor", so nothing should break for GPU. Test Plan: Default is always "inductor", so existing test should not break. CI tests Rollback Plan: Differential Revision: D80069072 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160404 Approved by: https://github.com/BoyuanFeng	2025-08-13 22:24:27 +00:00
PyTorch MergeBot	33d9401866	Revert "[BE][Dynamo] Type improvements in `_dynamo/utils` to generics (#159824 )" This reverts commit 3ef2e1ef769582a82c6ddf150e9d11bf4bf1c44f. Reverted https://github.com/pytorch/pytorch/pull/159824 on behalf of https://github.com/clee2000 due to I think this broke dynamo/test_trace_rules.py::TraceRuleTests::test_almost_impossible_missing_name [GH job link](https://github.com/pytorch/pytorch/actions/runs/16948305999/job/48035192324) [HUD commit link](`3ef2e1ef76`) ([comment](https://github.com/pytorch/pytorch/pull/159824#issuecomment-3186003531))	2025-08-13 22:17:29 +00:00
Shangdi Yu	d1950d4bb5	Change IR node's stack trace to be computed lazily (#160487 ) Summary: When an IR node is an inherited class, post_init is called once for each super().__init__() call. To avoid duplicated calls, we make stack trace computation happen lazily. Test Plan: CI Rollback Plan: Differential Revision: D80137870 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160487 Approved by: https://github.com/angelayi	2025-08-13 21:41:25 +00:00
Mikayla Gawarecki	1196bb1c2e	Add utility to get computed kernel in torch.library (#158393 ) Adds `OperatorEntry::getComputedKernelForDispatchKey` which returns the KernelFunction corresponding to `OperatorEntry.dispatchTable_[dispatch_ix]` for a given dispatch key - Specifically it returns a `SafeKernelFunction` that holds a `KernelToken`. This `KernelToken` is registered to the `KernelFunction` in `OperatorEntry.kernels_` and will be invalidated when the `KernelFunction` is destructed (i.e. when the `AnnotatedKernel` that holds this `KernelFunction` is removed from `kernels_`, which happens when the corresponding impl is deregistered). - `SafeKernelFunction` can be called via `callBoxed`, the validity of the token will be checked before this happens - `SafeKernelFunction` is pybinded and `getComputedKernelForDispatchKey` is exposed to the frontend via `torch.library.get_kernel` Related to https://github.com/pytorch/pytorch/issues/155330 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158393 Approved by: https://github.com/albanD	2025-08-13 21:00:59 +00:00
henrylhtsang	e9eb2096a5	[cutlass backend] Allow bmm use cases when batch stride is 0 (#160356 ) Differential Revision: [D80035771](https://our.internmc.facebook.com/intern/diff/D80035771/) The motivation and the original change is to reduce the number of parameters we pass into the kernel, which was motivated by aesthetic reasons only. But seeing the need to use different batch stride, we should just pass in the batch stride. That would be a good long term fix. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160356 Approved by: https://github.com/mlazos	2025-08-13 20:52:24 +00:00
Lucas Kabela	3ef2e1ef76	[BE][Dynamo] Type improvements in `_dynamo/utils` to generics (#159824 ) Follow up to #159580 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159824 Approved by: https://github.com/williamwen42	2025-08-13 20:17:01 +00:00
Jithun Nair	4cde0acc0e	Make triton build ROCm library version-agnostic (#158408 ) Fixes maintenance of triton packaging script when library versions change from one ROCm version to next. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158408 Approved by: https://github.com/jeffdaily Co-authored-by: Ethan Wee <Ethan.Wee@amd.com>	2025-08-13 19:49:23 +00:00
Jerry Mannil	70ccdec44b	[ROCm] Improve reduction sum performance (#160466 ) * Use input vectorization for reduction_on_fastest_striding_dimension when dim0 >= 128 Reproducer: ``` import time import torch shapes = [ (5079670, 128) ] dims = [ (1) ] for i, shape in enumerate(shapes): x = torch.randn(shape, device='cuda', dtype=torch.float) for _ in range(10): w = torch.sum(x, dims[i]) torch.cuda.synchronize() print(w.size()) start_time = time.time() for _ in range(50): _ = torch.sum(x, dims[i]) torch.cuda.synchronize() end_time = time.time() mean_time = (end_time - start_time)/50 print(f"Avg time for shape {shape}: {mean_time * 1e6:.2f} us") ``` Before (MI300X): Avg time for shape (5079670, 128): 1629.99 us After (MI300X) Avg time for shape (5079670, 128): 1008.59 us Pull Request resolved: https://github.com/pytorch/pytorch/pull/160466 Approved by: https://github.com/petrex, https://github.com/jeffdaily	2025-08-13 18:46:58 +00:00
Nikita Shulga	db0b7f1cc9	[BE][CI] Adjust `error_inputs` for cat and complex (#160378 ) MPS backend does not support double, so errors should be different Pull Request resolved: https://github.com/pytorch/pytorch/pull/160378 Approved by: https://github.com/dcci	2025-08-13 18:35:06 +00:00
ILCSFNO	1c26c53851	Fix the Doc of `pivot` in `torch.lu` (#159617 ) Fixes #159616 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159617 Approved by: https://github.com/lezcano, https://github.com/jansel	2025-08-13 18:30:54 +00:00
Alexander Grund	adcca7d9a1	Do not rpath CUDA stubs folder in JIT generated code (#160179 ) `_transform_cuda_paths` intentionally includes the CUDA stubs folder. However this path must not be added to the rpath as otherwise any CUDA command will fail at runtime with > CUDA_ERROR_STUB_LIBRARY: "CUDA driver is a stub library" This results in e.g. non-descriptive errors like ``` cutlass_library/source/tools/util/include/cutlass/util/device_memory.h:67 cutlass::device_memory::allocate: cudaMalloc failed: bytes=4096 terminate called after throwing an instance of 'cutlass::cuda_exception' what(): std::exception ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160179 Approved by: https://github.com/jansel	2025-08-13 18:29:24 +00:00
Dmitry Nikolaev	01584d2a7d	[ROCm] remove extra transposes in NHWC convolutions on MIOpen (#160435 ) remove aten::contiguous for NHWC convolutions on ROCm Tests: - nn/test_convolution.py::TestConvolutionNNDeviceTypeCUDA::test_conv_cudnn_nhwc_cuda_float32 - nn/test_convolution.py::TestConvolutionNNDeviceTypeCUDA::test_conv_cudnn_nhwc_cuda_float16 Before: <img width="1255" height="228" alt="image" src="https://github.com/user-attachments/assets/b125ccab-00c2-4d3a-a341-4583e51d8d57" /> After: <img width="874" height="153" alt="image" src="https://github.com/user-attachments/assets/ec200754-3622-488e-8762-bff1c2d22818" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160435 Approved by: https://github.com/jeffdaily	2025-08-13 17:58:22 +00:00
ILCSFNO	87e6c4079d	Fix the Doc issue on the description of edge_order in torch.gradient() (#159130 ) Fixes #159129 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159130 Approved by: https://github.com/soulitzer	2025-08-13 16:48:47 +00:00
Nikita Shulga	7d87e358ac	Fix MPS conv3d autocast bias dtype mismatch (#160423 ) ## Summary - register conv3d with MPS autocast to ensure bias dtypes match under AMP - add regression test chaining two Conv3d layers on MPS autocast Written by Codex, see https://chatgpt.com/codex/tasks/task_e_689b64192df883278648935963d2776d Pull Request resolved: https://github.com/pytorch/pytorch/pull/160423 Approved by: https://github.com/dcci	2025-08-13 16:23:21 +00:00
Saurabh Mishra	6ee175195a	[DCP][OSS] Rank local checkpointing in DCP without collectives (#147758 ) Summary: DCP metadata collectives become prohibitively expensive as the job scale grows. This PR introduces rank-local checkpointing which basically saves and loads the checkpoint without any collective. The trade off for now is the dedupe and re-sharding. Support for these would be introduced soon. Differential Revision: D70112642 Pull Request resolved: https://github.com/pytorch/pytorch/pull/147758 Approved by: https://github.com/meetv18	2025-08-13 16:20:28 +00:00
zhangfei	db32b60662	[ci] Add riscv opt-int build (#143979 ) Hi, @malfet Based on the previous discussion: [RISCV CI support · Issue #141550 · pytorch/pytorch](https://github.com/pytorch/pytorch/issues/141550) I have cross-compiled PyTorch for the RISC-V architecture on x86_64 Ubuntu 24.04 and created a new PR for it. Could you please help review it? Pull Request resolved: https://github.com/pytorch/pytorch/pull/143979 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-13 16:12:02 +00:00
Paul Zhang	56c828bef9	Followup of #160002 , gracefully fail if Triton functions don't contain attributes (#160436 ) Summary: Fixes internal test failures of D80037015 Test Plan: CI Rollback Plan: Differential Revision: D80094187 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160436 Approved by: https://github.com/clee2000	2025-08-13 16:04:56 +00:00
Natalia Gimelshein	a2fd106d67	guard cuMulticastUnbind call (#160499 ) Fixes builds for old compilers Pull Request resolved: https://github.com/pytorch/pytorch/pull/160499 Approved by: https://github.com/Skylion007	2025-08-13 15:45:51 +00:00
PyTorch MergeBot	c656334120	Revert "Factor out the strings to templates for better editor integration (#160357 )" This reverts commit cbffde774557752cf20447d42d99ec6102673c31. Reverted https://github.com/pytorch/pytorch/pull/160357 on behalf of https://github.com/clee2000 due to broke a bunch of internal builds due to not being able to find the file No such file or directory: torch/_inductor/kernel/flex/templates/flex_decode.py.jinja D80145761, might need a buck targets change? ([comment](https://github.com/pytorch/pytorch/pull/160357#issuecomment-3184435581))	2025-08-13 15:40:50 +00:00
fduwjj	31c9ac4319	[c10d] Fix test test_nccl_user_buffer_registration (#160497 ) Fixed `test_nccl_user_buffer_registration ` due to https://github.com/pytorch/pytorch/pull/160145, somehow CI didn't capture it. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160497 Approved by: https://github.com/ngimel	2025-08-13 15:29:41 +00:00
Catherine Lee	deea71a90e	[ez][CI] Set timeout for linux-jammy-py3_13-clang12-test from 600min -> default val of 240 (#160500 ) 10 hours is very long Pull Request resolved: https://github.com/pytorch/pytorch/pull/160500 Approved by: https://github.com/huydhn	2025-08-13 15:14:24 +00:00
Svetlana Karslioglu	114a6c4043	Add placeholder for the User Guide (#159379 ) - Add pytorch_overview.md - Add pytorch_main_components.md - Reorganize top nav to have Get Started, User Guide, Reference API, Community, Tutorials - Move notes under user guide Pull Request resolved: https://github.com/pytorch/pytorch/pull/159379 Approved by: https://github.com/albanD Co-authored-by: sekyondaMeta <127536312+sekyondaMeta@users.noreply.github.com> Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-13 14:56:04 +00:00
libohao	ee1b0412b9	[1/N]Port 3 distributed/_tools test cases to Intel GPU (#159543 ) For [#114850](https://github.com/pytorch/pytorch/issues/114850), we will port distributed tests to Intel GPU. We could enable Intel GPU with following methods and try the best to keep the original code styles: 1. use "torch.accelerator.current_accelerator()" to determine the accelerator backend 2. enabled XPU for some test path 3. skip some test cases which Intel GPU does not support Pull Request resolved: https://github.com/pytorch/pytorch/pull/159543 Approved by: https://github.com/guangyey, https://github.com/d4l3k Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>	2025-08-13 12:49:01 +00:00
Han, Chao1	42e51cd4b3	Support ddp zero hook XCCL path (#159240 ) The XCCL backend does not have the https://github.com/pytorch/pytorch/issues/62300 issue, so add the xccl path here. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159240 Approved by: https://github.com/guangyey, https://github.com/Skylion007, https://github.com/EikanWang	2025-08-13 12:37:33 +00:00
Laith Sakka	96bd33b2de	Fix get_free_symbol_uses for several nodes (#160314 ) get_free_symbol_uses is used to know which unbacked symbols are used by a given node. Not having get_free_symbol_uses defined correctly leads to: - elimination of some nodes because no users are detected (see the added unit test) - incorrect topological sort. Fix get_free_symbol_uses for NopKernel, ConcatKernel, InputsKernel, and external kernels. ComputedBuffer with NonOwningLayout is an interesting case: when the layout is NonOwningLayout we need to access the actual view op's base layout and detect the symbols in it, because we use those symbols when we codegen the ComputedBuffer. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160314 Approved by: https://github.com/eellison	2025-08-13 12:28:29 +00:00
Michael Lazos	ecde76c764	[Hierarchical Compile] Sort all regions identically (#158814 ) Before we would topologically sort each region individually, this works well except if some nodes have no arguments, then their order may change. To rectify this, we sort the first region as the reference region and use that sort order to sort the remaining regions. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158814 Approved by: https://github.com/williamwen42	2025-08-13 11:55:23 +00:00
Michael Lazos	34ec5ed275	[Dynamo][Hierarchical Compile] Allow parameters to be propagated to submodules (#157979 ) Fixes issue with HF Gen AI models where we mark a param as static and a get_attr node gets put in the region. The effect of this is lifting get_attr nodes to be inputs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/157979 Approved by: https://github.com/williamwen42	2025-08-13 09:12:10 +00:00
PyTorch MergeBot	641ee74781	Revert "Add `label_smoothing` param in `nn.BCELoss` and `nn.BCEWithLogitsLoss` (#150282 )" This reverts commit f990490a23815ea6ee27e487c70ba2cf513ba43d. Reverted https://github.com/pytorch/pytorch/pull/150282 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/150282#issuecomment-3182844949))	2025-08-13 09:01:52 +00:00
Deng, Daisy	6e8865fbc1	port 3 distributed test to Intel GPU and unified some common functions (#158533 ) For https://github.com/pytorch/pytorch/issues/114850, we will port distributed tests to Intel GPU. We could enable Intel GPU with following methods and try the best to keep the original code styles: - instantiate_device_type_tests() - use "torch.accelerator.current_accelerator()" to determine the accelerator backend - enabled XPU for some test path - Unify some common code under torch/testing/_internal for multiple backend, for example: - requires_nccl_version - _dynamo_dist_per_rank_init - DynamoDistributedSingleProcTestCase - DistTestCases - FSDPTestMultiThread Pull Request resolved: https://github.com/pytorch/pytorch/pull/158533 Approved by: https://github.com/guangyey, https://github.com/d4l3k Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>	2025-08-13 08:13:23 +00:00
Edward Yang	9a06e6d031	[claude-code] Add top-level module doc for torch/distributed/tensor/_op_schema.py (#157804 ) Not sure how good the description is, seeking insight from maintainers. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157804 Approved by: https://github.com/wanchaol	2025-08-13 07:27:11 +00:00
Erxin Shang	6ea8376f84	Enable XPU for test_autograd_function.py (#160309 ) # Description Fixes #114850, we will port dynamo tests to Intel GPU We could enable Intel GPU with following methods and try the best to keep the original code styles: # Changes 1. Get device type from get_devtype() method. 2. Replace the requires_cuda_and_triton with requires_gpu. 3. Add HAS_XPU_AND_TRITON into the scope. # Notify Pull Request resolved: https://github.com/pytorch/pytorch/pull/160309 Approved by: https://github.com/guangyey, https://github.com/ezyang	2025-08-13 06:38:34 +00:00
FFFrog	8eee08d227	Replace TORCH_INTERNAL_ASSERT with TORCH_CHECK (#160411 ) As the title stated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160411 Approved by: https://github.com/ezyang	2025-08-13 06:31:10 +00:00
Masaki Kozuki	e497620260	Add `compile_id: Optional[CompileID]` to `torch._logging._internal.trace_structured_artifact` (#160440 ) Context: When writing a custom `torch.compile` backend, I quite frequently (ab)use `trace_structured_artifact` because I'm too lazy to customize tlparse (ref: `6d8b13c867`). I recently notice some of the artifacts I want to store are generated where CompileID cannot be correlated and `tlparse` html says > Sometimes, logs are made without a compile id. This makes it difficult to correlate related logs. This stack trie shows all places where log entries occurred without compile context; to fix, look an appropriate place in the stack where compile id should have been specified. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160440 Approved by: https://github.com/ezyang	2025-08-13 06:28:23 +00:00
kshitij12345	199e9abb6a	[fx] fix split_module with symint (#160093 ) Fixes https://github.com/pytorch/pytorch/issues/155220 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160093 Approved by: https://github.com/ezyang	2025-08-13 05:50:15 +00:00
PyTorch UpdateBot	685f15dbea	[vllm hash update] update the pinned vllm hash (#160484 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160484 Approved by: https://github.com/pytorchbot	2025-08-13 04:54:03 +00:00
Guilherme Leobas	85db508af5	Wrap class definitions in `set_fullgraph(False)` in `test_int`/`bool`/`float`/`complex` (#160276 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160276 Approved by: https://github.com/zou3519 ghstack dependencies: #160216, #160217	2025-08-13 04:53:03 +00:00
Guilherme Leobas	27156ec804	Wrap class definitions in `set_fullgraph(False)` in `test_operator` (#160217 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160217 Approved by: https://github.com/zou3519 ghstack dependencies: #160216	2025-08-13 04:53:03 +00:00
Guilherme Leobas	6746bc59df	Wrap class definitions in `set_fullgraph(False)` in `test_set` (#160216 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160216 Approved by: https://github.com/zou3519	2025-08-13 04:53:03 +00:00
Nikita Shulga	3008d985a8	[CD] Do not build pytorch with nvshmem on ARM (#160465 ) As nvshmem binary from 3.3.9 is not compatible with manylinux2_28, and 3.3.20 is not available for download yet. Also, package nvshmem binary into full wheel Fixes https://github.com/pytorch/pytorch/issues/160425 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160465 Approved by: https://github.com/atalman, https://github.com/huydhn	2025-08-13 04:10:43 +00:00
PyTorch MergeBot	652a6f5954	Revert "[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#160403 )" This reverts commit 5a9c4cfce42b9eb87da0de40c5633f083115c307. Reverted https://github.com/pytorch/pytorch/pull/160403 on behalf of https://github.com/malfet due to It indeed consistently broken inductor, see `118bc97b14/1` ([comment](https://github.com/pytorch/pytorch/pull/160403#issuecomment-3182101130))	2025-08-13 04:05:46 +00:00
Ankita George	118bc97b14	Write full tensors out at once in HF consolidation script (#159394 ) Not all storage systems support writing at random offsets. This PR changes the writes of the consolidation script to write each tensor to a buffer, and then write out the buffer, sequentially going through every tensor in the output file. This will also help in the case where the sharded files weren't just sharded in the row-wise dimension. The reason is because small writes are expensive and we were writing each write for every chunk that was the largest number of contiguous bytes in the final tensor, but this could be a small amount of bytes for col-wise sharding. Now the full tensor is needed for the write, making the number of small writes smaller. Differential Revision: [D78684452](https://our.internmc.facebook.com/intern/diff/D78684452/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159394 Approved by: https://github.com/saumishr ghstack dependencies: #159392, #159393	2025-08-13 03:51:16 +00:00
Nikita Shulga	305fa22393	[GHF] Remove `app { name databaseId}` query (#160494 ) From `PRCheckSuites` fragment, as it's causes security exception when used with new GITHUB_TOKEN, that will looks as follows ``` RuntimeError: GraphQL query fragment PRReviews on PullRequestReviewConnection { nodes { author { login } bodyText createdAt authorAssociation editor { login } databaseId url state } pageInfo { startCursor hasPreviousPage } } fragment PRCheckSuites on CheckSuiteConnection { edges { node { app { name databaseId } workflowRun { workflow { name databaseId } databaseId url } checkRuns(first: 50) { nodes { name conclusion detailsUrl databaseId title summary } pageInfo { endCursor hasNextPage } } conclusion } cursor } pageInfo { hasNextPage } } fragment CommitAuthors on PullRequestCommitConnection { nodes { commit { authors(first: 2) { nodes { user { login } email name } } oid } } pageInfo { endCursor hasNextPage } } query ($owner: String!, $name: String!, $number: Int!) { repository(owner: $owner, name: $name) { pullRequest(number: $number) { closed isCrossRepository author { login } title body headRefName headRepository { nameWithOwner } baseRefName baseRefOid baseRepository { nameWithOwner isPrivate defaultBranchRef { name } } mergeCommit { oid } commits_with_authors: commits(first: 100) { ...CommitAuthors totalCount } commits(last: 1) { nodes { commit { checkSuites(first: 10) { ...PRCheckSuites } status { contexts { context state targetUrl } } oid } } } changedFiles files(first: 100) { nodes { path } pageInfo { endCursor hasNextPage } } reviews(last: 100) { ...PRReviews } comments(last: 5) { nodes { bodyText createdAt author { login } authorAssociation editor { login } databaseId url } pageInfo { startCursor hasPreviousPage } } labels(first: 100) { edges { node { name } } } } } } , args {'name': 'pytorch', 'owner': 'pytorch', 'number': 159820} failed: [{'type': 'FORBIDDEN', 'path': ['repository', 'pullRequest', 'commits', 'nodes', 0, 'commit', 'checkSuites', 
'edges', 4, 'node', 'app'], 'extensions': {'saml_failure': False}, 'locations': [{'line': 26, 'column': 7}], 'message': 'Resource not accessible by integration'}] ``` But the same query works fine if executed using one's Personal Access Token Updated mocks file by running ``` sed -i -e s/a32a7ca3a2f6e2c9de07aef821b0111539758b4ac254f8a3432af32314f94876/8e262b0495bd934d39dda198d4c09144311c5ddd6cca6a227194bd48dbfe7201/ gql_mocks.json sed -i -e s/157add81c519f614388f3a67e287bdf4fbb1791e6d0bffe312e169d02ac2813f/28349cb4c891bbf85255fab2c33c770baf77c3e02b29ca9a0e4c6c97bed041db/ gql_mocks.json sed '/"app": {/,+3d' gql_mocks-orig.json >gql_mocks.json sed '/"app": null/d' gql_mocks-orig.json >gql_mocks.json ``` Undisable offending jobs Fixes https://github.com/pytorch/pytorch/issues/159894 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160494 Approved by: https://github.com/huydhn ghstack dependencies: #160490, #160492	2025-08-13 03:46:39 +00:00
Nikita Shulga	1151b40cbf	[BE] Filter unused mocks (#160492 ) Somebody checked in twice the number of mocks into the archive Filter them out by running following script ```python import json with open("gql_mocks-orig.json") as f: mocks = json.load(f) keys = list(mocks.keys()) good_shas = {'a32a7ca3a2f6e2c9de07aef821b0111539758b4ac254f8a3432af32314f94876', '157add81c519f614388f3a67e287bdf4fbb1791e6d0bffe312e169d02ac2813f', '4715ed05b382e572135c049664939f22f9b1249bc0c499ae278d655ad8cb598b', 'a91ab398f97fb43cbe6e0899980dad8ff7447457ea5a71bbc59f7702a9280eb5', 'e5130469b5373479776bfbccade8039ce4741b97873bb3bec4e279fed08602be', '5dc32efeb8306f03744f6804ef4b500882f2759f7ac17fdc9f123669bfe4805a', '0a34acb829d8aca9dd28a8ba388dfa52f6ecdde7e903ace1caabdcfaba87de98', '8b50878b010492fe64005cc4b4ed34ac5f6695ce093f06b0d8d5403b7787c2c0', '2877b3b1e8630ca4ae797b9d85d5673d25ca8488c01141e11ff55f4a1359fca7'} for k in keys: if any(sha in k for sha in good_shas): continue del mocks[k] with open("gql_mocks.json","w") as f: json.dump(mocks, f, indent=2) f.write("\n") ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160492 Approved by: https://github.com/huydhn ghstack dependencies: #160490	2025-08-13 03:46:39 +00:00
Nikita Shulga	d0f9785af3	[CI] Prevent accidental gql_mocks updates by test_trymerge (#160490 ) As they can no longer be fetched from GitHub, see https://github.com/pytorch/pytorch/issues/160489 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160490 Approved by: https://github.com/huydhn	2025-08-13 03:46:32 +00:00
Jerry Mannil	ba47821f52	[ROCm] Set thread_work_size to 16 for vectorized elementwise kernels for MI300X (#160444 ) * thread_work_size of 16 is giving better perf with many workloads for MI300X cherry-pick of `fb81400d34` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160444 Approved by: https://github.com/jeffdaily	2025-08-13 03:41:25 +00:00
Ankita George	2c5e10a5fc	Add new function consolidate_safetensors_files_on_every_rank for HF consolidation (#159393 ) Currently we are only using rank-0 for HF consolidation. But we should be able to use every rank to consolidate the sharded files, which will speed up the consolidation by Nx (where N is the number of ranks). Adding a new method consolidate_safetensors_files_on_every_rank to do this. Differential Revision: [D79000720](https://our.internmc.facebook.com/intern/diff/D79000720/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159393 Approved by: https://github.com/saumishr ghstack dependencies: #159392	2025-08-13 03:31:36 +00:00
Jane Xu	355462e127	Add stable Tensor get_device_index, use more stable DeviceIndex (#160143 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160143 Approved by: https://github.com/mikaylagawarecki	2025-08-13 03:27:10 +00:00
Xu Han	41673110cd	[inductor] Windows inductor use intel-openmp. (#160258 ) After some debug work, I found PyTorch torch_cpu.dll is using intel-openmp, but not MSVC openmp. So, switch Windows inductor to intel-openmp. It fixed: `c8205cb354/test/inductor/test_aot_inductor.py (L2405-L2408)` <img width="896" height="230" alt="image" src="https://github.com/user-attachments/assets/273b00f8-7dc1-43c9-9b7f-752e16355a80" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160258 Approved by: https://github.com/ezyang	2025-08-13 02:36:19 +00:00
Yu, Guangye	6be6d06295	Avoid potential deadlocks in host allocator (#159352 ) # Motivation This PR fixes a potential deadlock in the host allocator. When calling `event->record(stream)`, the `record_stream` implementation may acquire the Python GIL. In places such as `842cc77ab9/aten/src/ATen/cuda/CachingHostAllocator.cpp (L145-L151)`, and `842cc77ab9/aten/src/ATen/xpu/CachingHostAllocator.cpp (L22-L28)` `record_stream` is invoked while holding the allocator lock. To prevent deadlocks, we must ensure the locking order is: GIL → Allocator Lock. Reversing the order (Allocator Lock → GIL) can cause a deadlock. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159352 Approved by: https://github.com/cyyever, https://github.com/ezyang	2025-08-13 02:30:17 +00:00
nandesuka	f15ada5c6f	Enable output padding when only outermost dim is dynamic (#159404 ) Summary: When the shape of the output tensor has a dynamic outer most dim, the stride can still be padded to conform to configured alignment if required. Test Plan: CI Rollback Plan: Differential Revision: D79146886 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159404 Approved by: https://github.com/blaine-rister, https://github.com/eellison	2025-08-13 01:28:22 +00:00
Nikhil Patel	69a0a9aa7f	[Inductor][Triton] Pass GPUTarget param to updated make_ir function (#160422 ) Summary: A recent Triton commit changed `ASTSource.make_ir` to a 5-arg signature that includes a `GPUTarget`. We need to pass in this new argument. Test Plan: `buck2 test 'fbcode//mode/opt' -m ovr_config//triton:trunk fbcode//caffe2/test/inductor:test_inductor_cuda -- triton_kernel` Rollback Plan: Reviewed By: davidberard98 Differential Revision: D80069909 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160422 Approved by: https://github.com/davidberard98, https://github.com/mlazos	2025-08-13 01:27:57 +00:00
Nikita Shulga	32099961d5	[EZ] Delete CircleCI case (#160479 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160479 Approved by: https://github.com/izaitsevfb ghstack dependencies: #160477	2025-08-13 01:19:09 +00:00
Nikita Shulga	8d1cf52922	[EZ][BE] Remove unused `conda-env-macOS-ARM64` (#160477 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160477 Approved by: https://github.com/atalman	2025-08-12 23:41:25 +00:00
fduwjj	b1f43548ca	[c10d] Error out the case when registering symmetric memory without eager init (#160145 ) Instead of implicitly creating nccl comm inside mem pool registration for symmetric memory, we decide to error it out so that we only support eager init case when the nccl comm is already initiated. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160145 Approved by: https://github.com/kwen2501	2025-08-12 23:25:04 +00:00
Zain Rizvi	0d71ca2c46	[EZ] Replace `pytorch-labs` with `meta-pytorch` (#160459 ) This PR replaces all instances of 'pytorch-labs' with 'meta-pytorch' in this repository now that the 'pytorch-labs' org has been renamed to 'meta-pytorch' ## Changes Made - Replaced all occurrences of 'pytorch-labs' with 'meta-pytorch' - Only modified files with extensions: .py, .md, .sh, .rst, .cpp, .h, .txt, .yml - Skipped binary files and files larger than 1MB due to GitHub api payload limits in the script to cover all repos in this org. Will do a more manual second pass later to cover any larger files ## Files Modified This PR updates files that contained the target text. Generated by automated script on 2025-08-12T20:41:29.888681+00:00Z Pull Request resolved: https://github.com/pytorch/pytorch/pull/160459 Approved by: https://github.com/huydhn, https://github.com/clee2000, https://github.com/atalman, https://github.com/malfet	2025-08-12 22:44:25 +00:00
deedongala	5737372862	[CI] Switch ROCm MI300 GitHub Actions workflows from 2-GPU to 1-GPU runners (#158882 ) Updated .github/actionlint.yaml to replace linux.rocm.gpu.mi300.2 with linux.rocm.gpu.mi300.1 in the supported runner list Modified all affected workflows (inductor-perf-test-nightly-rocm.yml, inductor-periodic.yml, inductor-rocm-mi300.yml, and rocm-mi300.yml) to run jobs on 1-GPU MI300 runners instead of 2-GPU runners This should help increase available runners even with same number of CI nodes. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158882 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-12 22:42:40 +00:00
Isalia20	2e4e5ab4be	[MPS] Add mps keys to `indices` and `values` ops (#160223 ) enable indices and values on sparse mps Pull Request resolved: https://github.com/pytorch/pytorch/pull/160223 Approved by: https://github.com/malfet	2025-08-12 22:08:44 +00:00
Zhengxu Chen	16d15445f8	Fullgraph graph capture with dynamo. (#159749 ) Summary: Following up on Avik's doc https://docs.google.com/document/d/11RW0Bbkp1QwFbEu8rCNW5d7wUFaEkxbL0uLyqcc2jTk/edit?tab=t.0 We are experimenting with a new API which utilizes torch.compile(fullgraph=True) and intend to use it to replace the old dynamo.export() API. This PR adds a prototype for the API described in the doc. Test Plan: test_misc -- -k test_aot_capture Rollback Plan: Differential Revision: D79534608 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159749 Approved by: https://github.com/tugsbayasgalan	2025-08-12 22:06:18 +00:00
henrylhtsang	101276f81b	[BE] Save attributes for CppCompileError for pickleing (#160294 ) Differential Revision: [D79977408](https://our.internmc.facebook.com/intern/diff/D79977408/) Context: When testing cutlass backend and used autotune with subproc, sometimes I would see C++ compilation error (expected) followed by ``` Traceback (most recent call last): File "/torch/_inductor/autotune_process.py", line 175, in get result = TuningProcess.recv(self.read_pipe) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/torch/_inductor/autotune_process.py", line 99, in recv return pickle.load(read_pipe) ^^^^^^^^^^^^^^^^^^^^^^ TypeError: CppCompileError.__init__() missing 1 required positional argument: 'output' ``` which is unexpected. After asking claude, it seems > Now I can see the issue. The `CppCompileError` class requires two arguments: `cmd` (a list of strings) and `output` (a string). However, when exceptions are being pickled and unpickled across process boundaries, the pickling process might not be preserving the constructor arguments correctly. > > The problem is likely that when a `CppCompileError` is raised in the subprocess and then pickled/unpickled through the `recv` function, the unpickling process is trying to reconstruct the exception but doesn't have the required constructor arguments. > > The issue is clear now. The `CppCompileError` class doesn't have custom pickle methods (`__reduce__`, `__getstate__`, `__setstate__`), so when it's pickled and unpickled across process boundaries, Python's default pickling mechanism tries to reconstruct it but fails because it doesn't preserve the constructor arguments properly. > > The solution is to add a `__reduce__` method to the `CppCompileError` class to ensure it can be properly pickled and unpickled. Let me implement this fix: Adding these seem to help. 
fbcode repro: [D79977541](https://www.internalfb.com/diff/D79977541) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160294 Approved by: https://github.com/masnesral	2025-08-12 22:03:36 +00:00
drisspg	cbffde7745	Factor out the strings to templates for better editor integration (#160357 ) # Summary More code motion, tldr is that install 'Better Jinja' in vscode and now you can get highlighting Before <img width="776" height="926" alt="Screenshot 2025-08-11 at 2 41 08 PM" src="https://github.com/user-attachments/assets/10868b31-f8ac-4cf5-99fe-19b8789ce06b" /> After: <img width="1184" height="1299" alt="Screenshot 2025-08-11 at 2 40 27 PM" src="https://github.com/user-attachments/assets/45203765-589e-4d76-8196-d895a2f2fbf6" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160357 Approved by: https://github.com/eellison	2025-08-12 21:59:54 +00:00
David Berard	78a2fe1d42	[TorchScript] thread-safe ErrorReport::CallStack (#160386 ) Context: During jit.script, the TorchScript frontend maintains a callstack of Python frames, which is used to present the corresponding user code in case TorchScript errors. The callstack is maintained via ErrorReport::CallStack RAII guards. Before recursing into a function, an ErrorReport::CallStack guard is created and the CallStack guard pushes the frame information onto a thread_local callstack (a list of calls); and after exiting, the frame information is popped off the callstack. Note that the CallStack guards are also sometimes used in python via pybindings. The problem is that sometimes another thread can obtain a reference to the CallStack guard (if it's a Python CallStack guard). This means that the destructor for a CallStack guard can be called from a different thread than the constructor was called. When this happens, it causes a segfault. This PR makes the callstack vector thread-safe to access, and each CallStack guard will store a reference to the callstack vector onto which it pushed. When the CallStack guard is destructed, it pops off the appropriate callstack vector. Although this could potentially lead to mangled callstacks, it should prevent segfaults. Added a test `test_thread_safe_error_stacks` which segfaults prior to these changes, and no longer segfaults. Differential Revision: [D80054972](https://our.internmc.facebook.com/intern/diff/D80054972) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160386 Approved by: https://github.com/eellison	2025-08-12 21:59:04 +00:00
Ivan Zaitsev	f8f0414a59	fix cpp builder to avoid missing-source compile error (#160354 ) Summary: the condition ``` if config.is_fbcode() and (not self._aot_mode or self._use_relative_path): sources = [os.path.basename(i) for i in sources] ``` unintentionally (?) stripped paths even when use_relative_path was False (as long as aot_mode was False), breaking local tests that rely on absolute temp-file paths. Fixes internal issue: ``` FAILED (errors=1) CppCompileError: C++ compile error Command: /mnt/gvfs/third-party2/llvm-fb/0f1f083aa5508772f3db24bf4f697bc118ba0958/17/platform010/72a2ff8/bin/clang-17 czyi3nhzin5b3mc3376vmfnlbjobvjcghbvv4tatuazs3syqubay.cpp -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -Werror=ignored-optimization-argument -g -o /re_tmp/tmpsp58ya2h/zy/test_symbol.so Output: clang-17: error: no such file or directory: 'czyi3nhzin5b3mc3376vmfnlbjobvjcghbvv4tatuazs3syqubay.cpp' clang-17: error: no input files ``` Reviewed By: clee2000 Differential Revision: D80025417 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160354 Approved by: https://github.com/benjaminglass1, https://github.com/clee2000	2025-08-12 21:36:22 +00:00
Mikayla Gawarecki	4d419a7461	Add pad and narrow to torch/csrc/stable/ops.h (#159328 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159328 Approved by: https://github.com/janeyx99 ghstack dependencies: #159507	2025-08-12 21:29:49 +00:00
Mikayla Gawarecki	655137b678	Update torch::stable::Tensor() default constructor (#159507 ) Allows things like ```cpp Tensor cu_seqlens_q; if (...) { cu_seqlens_q = ... } ... ``` Also adds `torch::stable::Tensor.defined()` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159507 Approved by: https://github.com/janeyx99	2025-08-12 21:29:49 +00:00
Gheorghe-Teodor Bercea	f27232a213	[ROCm] Limit number of values per thread for reductions on three dimensions (#159652 ) In the current implementation of reductions in three dimensions for AMD GPUs the number of values per thread is unbounded and can end up being in the hundreds of thousands for certain tensors. This of course is bad for performance. This patch fixes this issue by increasing the parallelism and thus lowering the number of values per thread to reasonable limits, i.e. less than 2048 values per thread. The performance gains can be between 10x-17x for certain examples where the number of values per thread was originally very high. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159652 Approved by: https://github.com/jeffdaily	2025-08-12 21:15:56 +00:00
Anshul Sinha	c24ca7f4bf	[FSDP][Collectives] skipping allgather when world size is 1 (#160135 ) Summary: In its current state, FSDP collectives use CUDA synchronizations and communication ops regardless of what the world size is. However, now that replicate will use FSDP, there will be instances where group size = 1 and these synchronizations and ops will be used needlessly. I have updated fsdp_params group to skip the foreach_all_gather and foreach_all_gather_copy_out APIs when world_size = 1. I have created a test that uses CommDebugMode to verify that the all gather comm has been removed. I also edited an affected test which used 1-way FSDP by verifying and changing its assert statements for CommDebugMode. Below, I have included the link to the profile trace verifying these two APIs were skipped and two test commands. https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html#!/?url=https://interncache-all.fbcdn.net/manifold/perfetto_internal_traces/tree/shared_trace/anshulsi_f846ac3b-9467-4060-8e36-8cc3bc4449c3_devgpu263.prn2.facebook.com_652183.1753822140871934814.pt.trace.json Pull Request resolved: https://github.com/pytorch/pytorch/pull/160135 Approved by: https://github.com/weifengpy	2025-08-12 21:13:29 +00:00
AaronWang04	b4596895b9	[DTensor] Registers sharding rule for rms_norm (#159692 ) Reduces collective calls in the forward pass from 2 to 1 In #158716 I added the sharding rule for the backward pass but didn't add the forward pass as it didn't get dispatched. After #159324 this should get properly dispatched hence I am adding it now. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159692 Approved by: https://github.com/tianyu-l	2025-08-12 21:05:24 +00:00
xinan.lin	5a9c4cfce4	[Fix XPU CI][Inductor UT] Fix test cases broken by community. (#160403 ) Fixes #160243, Fixes #160244, Fixes #160245 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160403 Approved by: https://github.com/janeyx99	2025-08-12 21:02:44 +00:00
Chien-Lin Chen	a354fa91e2	added class or module info for functions blocked by weight-only load (#159935 ) Fixes #152985 In #152985, users are confused why weights-only load failed even though functions were registered in safe_globals. Because the error message doesn't make the critical failure reason clear, they couldn't figure out that only some functions are missing from safe_globals registration. This fix is to make that point more clear. Here's the new error message; the blocked function information follows the warning message after a line break to make it stand out. ``` _pickle.UnpicklingError: Weights only load failed. In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source. Please file an issue with the following so that we can make `weights_only=True` compatible with your use case: WeightsUnpickler error: Trying to call reduce for unrecognized function <built-in method _unpickle of type object at 0x641e8a57d1f0> which belongs to <class 'zoneinfo.ZoneInfo'> Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html. To execute this test, run the following from the base repo dir: python test/test_serialization.py TestSerialization.test_weights_only_with_safe_zoneinfo_unpickle_registration_success This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159935 Approved by: https://github.com/mikaylagawarecki	2025-08-12 20:52:25 +00:00
Ankita George	f95b58c284	Remove usage of fsspec in HF consolidation script (#159392 ) Moving towards just supporting local storage to take advantage of HF apis such as safe_open. This was already done in Storage component in https://github.com/pytorch/pytorch/pull/159405. This PR removes fsspec usages in consolidation script and relies on local storage only Differential Revision: [D78997975](https://our.internmc.facebook.com/intern/diff/D78997975/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159392 Approved by: https://github.com/sibuachu	2025-08-12 20:41:06 +00:00
albanD	8e6a313858	Add ownership token when needed on GradientEdge (#160098 ) We can avoid the token by introducing PyObject preservation for THPFunction. But I think it will be too much complexity given that this kind of issue is very rare. Happy to be talked into doing it though if someone really wants to. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160098 Approved by: https://github.com/ezyang, https://github.com/soulitzer	2025-08-12 20:14:18 +00:00
Paul de Supinski	7e91394955	Support NUMA Binding for Callable Entrypoints (#160163 ) # Context This is an extension of #149334. # This PR Add support for NUMA bindings with Callable entrypoints, such as `do_train` instead of `/usr/local/bin/python`. Most notably, we utilize a hack in order to force `Process.start()` to use custom NUMA bindings for each subprocess. Please search for `HACK:` in the code to see a description of the implementation we chose, and #160006 for discussion of alternatives and why this is necessary. Other changes: * Remove unnecessary `--preferred` option from all binding strategies. By default, Linux already allocates memory to the NUMA node local to the CPU which triggered the allocation. (See [MPOL_LOCAL](https://man7.org/linux/man-pages/man2/set_mempolicy.2.html).) * Refactor so that the main API is `maybe_wrap_command_with_numa_bindings`, which computes bindings for a single rank at a time, rather than `maybe_wrap_with_numa_bindings` which computed bindings for all ranks at once. This allowed for more code sharing between `Callable` and `str` entrypoints. # Test Plan ## Automated `$ pytest test/test_numa_binding.py` ## Manual Using [this benchmark,](https://gist.github.com/pdesupinski/bbe01ade455d86e989794f2c612e2d91), ran ``` $ PYTHONUNBUFFERED=1 LOGLEVEL=INFO perf stat -e ls_dmnd_fills_from_sys.dram_io_far,ls_dmnd_fills_from_sys.dram_io_near -- python -m torch.distributed.run --standalone --nproc-per-node=8 --numa-binding=node --run-path mlp_train.py 2>&1 \| tee node_callable.txt && PYTHONUNBUFFERED=1 LOGLEVEL=INFO perf stat -e ls_dmnd_fills_from_sys.dram_io_far,ls_dmnd_fills_from_sys.dram_io_near -- python -u -m torch.distributed.run --standalone --nproc-per-node=8 --run-path mlp_train.py 2>&1 \| tee none_callable.txt ``` and observed * 6.6% remote memory accesses with 'node' bindings * 11.6% remote without bindings I also ran similar with `str` entrypoints as before just to be sure it's still working. 
NOTE: [--run-path triggers the code to be run inside a `Callable`.](`017259f9c6/torch/distributed/run.py (L870)`) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160163 Approved by: https://github.com/d4l3k	2025-08-12 20:08:49 +00:00
Markus Hoehnerbach	89654db1ab	[inductor] fix triton bucketize mask propagation (#159961 ) See `6b414f56a4` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159961 Approved by: https://github.com/eellison	2025-08-12 19:59:32 +00:00
Natalia Gimelshein	2d0cdee394	move thread-local capture mode guard to include work.isStarted (#160398 ) Per title, should fix capture errors that happen because nccl watchdog races with capture start. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160398 Approved by: https://github.com/aorenste	2025-08-12 19:25:04 +00:00
eqy	9903ca4f70	[cuDNN][64-bit indexing] update conv depthwise 64bit indexing dispatch condition to match native kernel (#156140 ) The native kernel doesn't support batch splitting so the previous check wasn't aggressive enough in dispatching to cuDNN https://github.com/pytorch/pytorch/issues/155225 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156140 Approved by: https://github.com/ngimel, https://github.com/atalman	2025-08-12 18:07:41 +00:00
PyTorch MergeBot	f341077ce4	Revert "[ROCm] Support large inputs for coalesceValuesKernel (#158281 )" This reverts commit a7abf57aabec0ce686092e2d66e53ba185dbc56b. Reverted https://github.com/pytorch/pytorch/pull/158281 on behalf of https://github.com/clee2000 due to broke windows cuda build? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16915172288/job/47927141460) [HUD commit link](`a7abf57aab`). Not caught b/c PR didn't have ciflow/trunk ([comment](https://github.com/pytorch/pytorch/pull/158281#issuecomment-3180408766))	2025-08-12 17:57:57 +00:00
Edward Z. Yang	3cec82a7e9	Ensure outer aliasing on DTensor matches inner aliasing (#158954 ) Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/158954 Approved by: https://github.com/albanD, https://github.com/wconstab	2025-08-12 17:47:48 +00:00
Jerry Mannil	ee9f8ba11d	[ROCm] Use opportunistic fastatomics based on heuristics (#159430 ) * Opportunistic fast atomics work better with small sizes, since there is more chance of lanes doing atomics on the same address Co-author: @amd-hhashemi Reproducer: ``` import time import torch x = torch.randn((1_632_960, 128), device='cuda', dtype=torch.float) ind = torch.randint(0, x.size(0), size=(5_079_670,), device='cuda') src = torch.randn((5_079_670, 128), device='cuda', dtype=torch.float) for _ in range(20): x.index_add_(0, ind, src) start_time = time.time() for i in range(100): x.index_add_(0, ind, src) torch.cuda.synchronize() end_time = time.time() mean_time = (end_time - start_time)/100 print(f"Avg time for index_add_: {mean_time * 1e6:.2f} us") ``` Perf numbers: ``` Before: Avg time for index_add_: 25652.16 us After: Avg time for index_add_: 2675.15 us ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159430 Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily	2025-08-12 17:13:54 +00:00
David Berard	1f4057c11a	[inductor] remove no_x_dim (#159810 ) no_x_dim is used to indicate that a reduction operates on a single row, and data loaded for the reduction is 1-dimensional. no_x_dim was introduced in https://github.com/pytorch/pytorch/pull/102444 - in which there was bad perf in some reductions, and using 1D tensors fixed the perf issue. However, it appears that this perf issue no longer exists in current Triton versions. https://github.com/pytorch/pytorch/pull/118822 checked this, and we can also check this on H100 benchmarks (linked below). And another motivation for removing this behavior is that it enables larger loads, which we observe is necessary for good performance on certain shapes on Blackwell. H100 inference benchmarks: https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2004%20Aug%202025%2004%3A13%3A24%20GMT&stopTime=Mon%2C%2011%20Aug%202025%2004%3A13%3A24%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=gh/davidberard98/396/orig&lCommit=a6bcd4692fb39fa2fad260f290bff545d4425829&rBranch=main&rCommit=e96c7c4bb0f6aeae2ab3b6f040f7d67edbec199a H100 training benchmarks: https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2004%20Aug%202025%2004%3A13%3A24%20GMT&stopTime=Mon%2C%2011%20Aug%202025%2004%3A13%3A24%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(h100)&lBranch=gh/davidberard98/396/orig&lCommit=a6bcd4692fb39fa2fad260f290bff545d4425829&rBranch=main&rCommit=e96c7c4bb0f6aeae2ab3b6f040f7d67edbec199a Overall, the benchmarks show minimal change in performance. Differential Revision: [D79599286](https://our.internmc.facebook.com/intern/diff/D79599286) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159810 Approved by: https://github.com/ngimel, https://github.com/eellison	2025-08-12 17:10:31 +00:00
Jovian Anthony Jaison	94b91a8763	[redone][pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#160352 ) Summary: Writing torch.compile worker logs to dedicated_log_rank{RANK} if we're running on mast. ref: D79456310 (got reverted because of linter) Testing: Refer differential Revision: D79917440 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160352 Approved by: https://github.com/masnesral	2025-08-12 16:49:08 +00:00
Xinya Zhang	a7abf57aab	[ROCm] Support large inputs for coalesceValuesKernel (#158281 ) # Description `.coalesce` cannot handle large inputs on ROCM due to maximal grid size limit. This PR splits axis `X` into axes `X` and `Y`, and repurposes `Z` for original `Y` on ROCm to avoid such limitation. Confirmed the new approach can handle large inputs. Correctness needs validation. # Testing Command `python torch_spmv.py 22500000 272500000` ## Script `torch_spmv.py` ``` python import torch import argparse def parse_args(): parser = argparse.ArgumentParser( description="Sparse COO Matrix by Dense Vector Multiplication using PyTorch" ) parser.add_argument("n", type=int, help="Size of the NxN matrix") parser.add_argument("nnz", type=int, help="Number of non-zero entries") return parser.parse_args() def main(): args = parse_args() n = args.n nnz = args.nnz dtype = torch.float32 device = torch.device('cuda') # Generate random indices for the sparse matrix in COO format. torch.manual_seed(42) rows = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device) cols = torch.randint(0, n, (nnz,), dtype=torch.int64, device=device) indices = torch.stack([rows, cols], dim=0) # Generate random values. values = torch.randn(nnz, dtype=torch.float32, device=device) # Create the sparse COO matrix and move it to the target device. sparse_matrix = torch.sparse_coo_tensor(indices, values, size=(n, n), dtype=torch.float32, device=device) sparse_matrix = sparse_matrix.coalesce() # Generate a random dense vector. dense_vector = torch.randn(n, dtype=torch.float32, device=device) # Perform sparse matrix - dense vector multiplication. # Using torch.sparse.mm which expects a 2D tensor for the vector. result = torch.sparse.mm(sparse_matrix, dense_vector.unsqueeze(1)).squeeze() # result = torch.mv(sparse_matrix, dense_vector) # Print the result. 
print("Result of the multiplication:") print(torch.sum(result)) if __name__ == "__main__": main() ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158281 Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily	2025-08-12 16:42:55 +00:00
PyTorch MergeBot	f7b2f3314c	Revert "[triton_heuristics] Optimize the triton launcher in pt2 (#160000 )" This reverts commit d0e2240f680ea2a553f7ee8188f52482e130bfd0. Reverted https://github.com/pytorch/pytorch/pull/160000 on behalf of https://github.com/davidberard98 due to D80054972 failing with test_triton_kernel_2d_autotune_grad_False_dynamic_True_backend_inductor_grid_type_1_tdlp_1 ([comment](https://github.com/pytorch/pytorch/pull/160000#issuecomment-3180144676))	2025-08-12 16:33:02 +00:00
Jeff Daily	9d37c960a4	[ROCm][CI] use new benchmark image for dynamo (#160421 ) Follow-up to #160047 that separated the rocm image into default CI and benchmarks. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160421 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily <jeff.daily@amd.com>	2025-08-12 16:07:19 +00:00
PyTorch MergeBot	b219ca2a00	Revert "Update triton xpu commit to support python 3.14 (#160183 )" This reverts commit 7fbc22855c17741ae016992803b2e147a13aa22d. Reverted https://github.com/pytorch/pytorch/pull/160183 on behalf of https://github.com/clee2000 due to I'm not sure how, but it seems to have broken inductor/test_extension_backend.py::ExtensionBackendTests::test_open_device_registration [GH job link](https://github.com/pytorch/pytorch/actions/runs/16911267995/job/47917091939) [HUD commit link](`7fbc22855c`). Maybe because the docker build changed? Note to self: not bad TD ([comment](https://github.com/pytorch/pytorch/pull/160183#issuecomment-3179840160))	2025-08-12 15:29:19 +00:00
atalman	b7db86600a	Fix Tensor illustration, use permalinks for image embedding in Readme.md (#160416 ) Fixes Tensor illustration being broken on pypi.org. Also uses permalinks instead of links to images for embedding as per this suggestion of Alban: https://github.com/pytorch/pytorch/pull/160187#discussion_r2262978006 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160416 Approved by: https://github.com/malfet	2025-08-12 15:15:12 +00:00
James Wu	9708fcf92d	Account for triton kernel source code hidden in custom ops properly in AOTAutogradCache (#160120 ) This PR fixes a bug where user defined triton kernels hidden behind `triton_op` do not register source code changes. If a user only changes a triton kernel source_code, because triton kernels are hidden under the custom op, dynamo hasn't traced into them yet. This means at AOTAutograd time, we don't know the list of triton kernels that are defined by custom ops. This is an initial fix for the issue by parsing the AST of the custom op looking for triton kernels. This won't catch more degenerate cases if the custom op calls other custom ops/functions that then call triton kernels, and then the toplevel compiled graph doesn't know about it. To handle that, we'd have to trace through the custom op at dynamo time. This should handle 99% of cases, though. I added an expectedFailure test to show the limitation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160120 Approved by: https://github.com/zou3519	2025-08-12 14:11:06 +00:00
Wang, Chuanqi	a288b15ea9	[CI] Reduce XPU Windows build time (#159763 ) Reduce the time cost from 2.5 hours to about 1.5 hours. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159763 Approved by: https://github.com/EikanWang, https://github.com/atalman	2025-08-12 14:04:29 +00:00
Wang, Chuanqi	7fbc22855c	Update triton xpu commit to support python 3.14 (#160183 ) Follow PR #159725 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160183 Approved by: https://github.com/EikanWang, https://github.com/atalman	2025-08-12 14:02:36 +00:00
IvanKobzarev	f33ce40bc0	[bucketing] Bucket only adjacent collectives to prevent reordering (#159983 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159983 Approved by: https://github.com/wconstab, https://github.com/eellison	2025-08-12 11:57:00 +00:00
Animesh Jain	4d5b3f2d5a	[dynamo][guards] Install dict watchers for recursive dict tag optimization (#159796 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159796 Approved by: https://github.com/jansel	2025-08-12 09:49:11 +00:00
zeshengzong	f990490a23	Add `label_smoothing` param in `nn.BCELoss` and `nn.BCEWithLogitsLoss` (#150282 ) Fixes #91545 ## Changes - Add `label_smoothing` param and docs - Add test case for `label_smoothing` - Remove duplicate description in `nn.BCELoss` and `nn.BCEWithLogitsLoss` ## Test Result ```bash pytest -s test/test_nn.py -k test_bce ``` ![image](https://github.com/user-attachments/assets/30c0b7fe-fe49-4aa0-9b05-4d70403a7b05) ![image](https://github.com/user-attachments/assets/4fe3fd1c-54b8-4012-afd9-133ce9fb4964) ![image](https://github.com/user-attachments/assets/5cad019a-3a4c-475a-9fde-9c1acad5792d) Pull Request resolved: https://github.com/pytorch/pytorch/pull/150282 Approved by: https://github.com/cyyever, https://github.com/mikaylagawarecki	2025-08-12 09:37:03 +00:00
morrison-turnansky	b9003ed3d8	Dynamo Deep Dive Documentation Fix (#158860 ) changed SourceBuilder to VariableBuilder Fixes #158447 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158860 Approved by: https://github.com/mlazos	2025-08-12 08:53:33 +00:00
Laith Sakka	fea7e9dd37	extract shape in _view_has_unbacked_input (#160255 ) Summary: We were getting DDE on reshape still!! I looked deeper and found an issue in _view_has_unbacked_input, namely when input is [[,,]] it needs to be normalized to [..] Test Plan: existing tests. Rollback Plan: Differential Revision: D79951119 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160255 Approved by: https://github.com/bobrenjc93	2025-08-12 08:38:19 +00:00
Jovian Anthony Jaison	9a0f7a3bb0	[retry-land][pytorch][dynamo_compile] Log stack_trace to dynamo_compile (#160348 ) refer: https://github.com/pytorch/pytorch/pull/159655 Earlier pr failed on dynamo/test_utils.py::TestDynamoTimed::test_dynamo_timed. Updated test_dynamo_timed + re-ran locally to test. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160348 Approved by: https://github.com/masnesral	2025-08-12 06:24:54 +00:00
Animesh Jain	01bcf9a40d	Bump transformers pin (#159291 ) Trying to update hf pin. Benchmarking run to figure out issues <img width="1356" height="123" alt="image" src="https://github.com/user-attachments/assets/fbc435f3-a7cb-4280-9636-2ea6d15d7b6d" /> Retrying - https://github.com/pytorch/pytorch/pull/156118 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159291 Approved by: https://github.com/BoyuanFeng, https://github.com/huydhn Co-authored-by: Huy Do <huydhn@gmail.com>	2025-08-12 05:14:17 +00:00
Animesh Jain	8d3d1c8443	[dynamo] fixes to propagate tag safeness (#159807 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159807 Approved by: https://github.com/jansel	2025-08-12 04:50:13 +00:00
PyTorch UpdateBot	0f3b10b8ee	[audio hash update] update the pinned audio hash (#160384 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160384 Approved by: https://github.com/pytorchbot	2025-08-12 04:38:04 +00:00
Boyuan Feng	5f1010fbb3	[Graph Partition] Pass all OSS unit tests (#154667 ) Graph partition leads to 6.2% speedup on vision_maskrcnn, 5.8% speedup on yolov3. [P1819700563](https://www.internalfb.com/phabricator/paste/view/P1819700563), 39.5% speedup on speech_transformer inference [P1830602200](https://www.internalfb.com/phabricator/paste/view/P1830602200), 85% speedup on speech_transformer training [P1831115315](https://www.internalfb.com/phabricator/paste/view/P1831115315). Run the same diff on two days and both show speedup on average. [first TorchInductor Benchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2021%20Jul%202025%2016%3A37%3A55%20GMT&stopTime=Mon%2C%2028%20Jul%202025%2016%3A37%3A55%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=75ef90fe89b82c967362a2d40fdf1af047202bc2&rBranch=main&rCommit=abcb24f4de11f8fedf2c2c9ff53b6092ef42306d) <img width="1885" height="752" alt="image" src="https://github.com/user-attachments/assets/13bba9fc-5dbf-42ad-8558-d54f7e367b41" /> [second TorchInductorBenchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2023%20Jul%202025%2016%3A38%3A27%20GMT&stopTime=Wed%2C%2030%20Jul%202025%2016%3A38%3A27%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=66de27e29338c26b1be94733049868cb0309ea52&rBranch=main&rCommit=70d2e9ba455c3c910f6f95b24171c8eee7bc00bf) <img width="2513" height="1030" alt="image" src="https://github.com/user-attachments/assets/3a413dcb-2314-4292-919a-7ca181f9eeac" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/154667 Approved by: https://github.com/eellison	2025-08-12 04:37:58 +00:00
Nikita Shulga	edaa151d0d	[CI] Move CUDA tests to trunk workflow (#160379 ) Which is getting run before a PR is merged anyway, but 3X less frequently than the pull workflow according to [Flambeau](https://pytorchci.grafana.net/public-dashboards/1c571e79090443eaaa9811db71f8d23b) <img width="796" height="573" alt="image" src="https://github.com/user-attachments/assets/0235e610-4e1c-4be5-88bf-ea8278d1c656" /> I.e. that will probably result in a somewhat longer time to signal, but considering that the frequency of changes to eager PyTorch-on-CUDA has slowed down and Inductor changes are decorated with ciflow/inductor, this looks like an acceptable tradeoff to reduce costs Pull Request resolved: https://github.com/pytorch/pytorch/pull/160379 Approved by: https://github.com/izaitsevfb	2025-08-12 04:23:50 +00:00
rzou	10bc36fe84	Get tensor subclasses and torch.library.triton_op to dispatch correctly (#160341 ) Short-term fix for https://github.com/pytorch/pytorch/issues/160333 The problem is: 1) `triton_op` adds a decomposition for FunctionalTensorMode for this operation 2) Tensor Subclasses rely on FunctionalTensorMode's `__torch_dispatch__` returning NotImplemented. 3) `triton_op`'s FunctionalTensorMode decomposition takes precedence over FunctionalTensorMode's decomposition. The easy fix is to copy-paste the FunctionalTensorMode's NotImplemented return logic into the decomposition. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160341 Approved by: https://github.com/drisspg	2025-08-12 04:09:37 +00:00
PyTorch UpdateBot	32e5e2f596	[vllm hash update] update the pinned vllm hash (#160259 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160259 Approved by: https://github.com/pytorchbot	2025-08-12 04:04:53 +00:00
Scott Todd	bfc873d02e	[ROCm][Windows] Revert copying hipblaslt and rocblas dirs. (#159083 ) This reverts the changes from `b367e5f6a6`. This will also close https://github.com/pytorch/pytorch/pull/158922. Since `30387ab2e4`, ROCm is bootstrapped using the 'rocm' Python module which contains these files (see https://github.com/ROCm/TheRock/blob/main/docs/packaging/python_packaging.md), so they do not need to be bundled into torch/lib. There was also a bug in here - if `ROCM_DIR` is unset, the code crashes: ``` File "D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\setuptools\_distutils\dist.py", line 1002, in run_command cmd_obj.run() File "D:\b\pytorch_main\setup.py", line 853, in run rocm_dir_path = Path(os.environ["ROCM_DIR"]) ~~~~~~~~~~^^^^^^^^^^^^ File "<frozen os>", line 714, in __getitem__ KeyError: 'ROCM_DIR' ``` The code could have checked for `ROCM_PATH` too. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159083 Approved by: https://github.com/jeffdaily	2025-08-12 02:45:49 +00:00
Scott Todd	eed9dbf70f	[ROCm] Add torch/_rocm_init.py to .gitignore. (#159806 ) Follow-up to https://github.com/pytorch/pytorch/pull/155285. Build scripts like https://github.com/ROCm/TheRock/blob/main/external-builds/pytorch/build_prod_wheels.py generate this file with contents like: ```python def initialize(): import rocm_sdk rocm_sdk.initialize_process( preload_shortnames=['amd_comgr', 'amdhip64', 'hiprtc', 'hipblas', 'hipfft', 'hiprand', 'hipsparse', 'hipsolver', 'hipblaslt', 'miopen'], check_version='7.0.0rc20250804') ``` We may also have https://github.com/pytorch/pytorch/blob/main/tools/amd_build/build_amd.py do the same thing as more of that build support moves here into the upstream PyTorch repository itself (see https://github.com/pytorch/pytorch/issues/159520). This file is then loaded if present here: `a7f3bdf550/torch/__init__.py (L145-L157)` Given that the file is generated by build scripts, I think adding it to `.gitignore` makes sense, as that will prevent accidental check-ins and keep local history cleaner. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159806 Approved by: https://github.com/jeffdaily	2025-08-12 02:24:21 +00:00
Natalia Gimelshein	be53f609aa	fix retaining multimem in symmetric memory (#160343 ) fixes OOM in #160289 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160343 Approved by: https://github.com/eqy	2025-08-12 02:03:20 +00:00
Zain Rizvi	95210cc409	[BE] Isolate pre-push hook dependencies in dedicated virtual environment (#160048 ) This adds two changes: - Isolates pre-push hook dependencies into an isolated venv, so they no longer affect your system environment - Lets you manually run the pre-push lintrunner (including with lintrunner -a) by invoking `python scripts/lintrunner.py [-a]` (it's ugly, but better than nothing...for now) This is a follow up to: - https://github.com/pytorch/pytorch/pull/158389 ## Problem The current pre-push hook setup installs lintrunner and related dependencies globally, which makes developers nervous about system pollution and can cause version conflicts with existing installations. Also, if the pre-push lintrunner found errors, you had to hope your normal lintrunner could fix them (which wasn't always the case, e.g. if those errors only manifested in certain python versions) ## Key Changes: - Isolated Environment: Creates .git/hooks/linter/.venv/ with Python 3.9 (the python used in CI) and an isolated lintrunner installation - User-Friendly CLI: New python scripts/lintrunner.py wrapper allows developers to run lintrunner (including -a auto-fix) from any environment - Simplified Architecture: Eliminates pre-commit dependency entirely - uses direct git hooks File Changes: - scripts/setup_hooks.py: Rewritten to create isolated uv-managed virtual environment - scripts/lintrunner.py: New wrapper script with shared hash management logic - scripts/run_lintrunner.py: Removed (functionality merged into lintrunner.py) - .pre-commit-config.yaml: Removed (no longer needed) ## Usage: ``` # Setup (run once) python scripts/setup_hooks.py # Manual linting (works from any environment) python scripts/lintrunner.py # Check mode python scripts/lintrunner.py -a # Auto-fix mode # Git hooks work automatically git push # Runs lintrunner in isolated environment # Need to skip the pre-push hook? 
git push --no-verify ``` ## Benefits: - ✅ Zero global dependency installation - ✅ Per-repository isolation prevents version conflicts - ✅ Full lintrunner functionality is now accessible ## Implementation Notes: - Virtual env is kept in a dedicated dir in .git, to keep per-repo mechanics - lintrunner.py does not need to be invoked from a specific venv. It'll invoke the right venv itself. A minor bug: It tends to garble the lintrunner output a bit, like the screenshot below shows, but I haven't found a workaround so far and it remains understandable to users: <img width="241" height="154" alt="image" src="https://github.com/user-attachments/assets/9496f925-8524-4434-8486-dc579442d688" /> ## What's next? Features that could be added: - Check for lintrunner updates, auto-update if needed - Depending on dev response, this could be enabled by default for all pytorch/pytorch environments Pull Request resolved: https://github.com/pytorch/pytorch/pull/160048 Approved by: https://github.com/seemethere	2025-08-12 01:58:46 +00:00
Ramya Ramineni	7a974a88f2	[ROCm] Fix resource_strings.h (#159996 ) This PR fixes errors like the one below: ``` [rank7]: RuntimeError: /tmp/comgr-c3c81b/input/CompileSourceejOPx6:34:8: error: unknown type name 'uint64_t'; did you mean '__hip_internal::uint64_t'? [rank7]: 34 \| if(((uint64_t) t0.data) % (4 * sizeof(half)) != 0) flag_vec4 = false; ``` The following data types need to be defined in `torch/csrc/jit/codegen/fuser/cuda/resource_strings.h` for ROCm versions >= 7.0. ``` typedef unsigned char uint8_t; typedef signed char int8_t; typedef short int int16_t; typedef long long int int64_t; typedef unsigned long long int uint64_t; ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159996 Approved by: https://github.com/pruthvistony, https://github.com/Skylion007, https://github.com/jeffdaily	2025-08-12 01:58:02 +00:00
henrylhtsang	f3f159ff8c	[BE][cutlass backend] Reduce severity of log message for no cutlass config found (#160148 ) This is not really a problem. Sometimes we cannot find a cutlass config due to shape, e.g. when k is odd. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160148 Approved by: https://github.com/mlazos, https://github.com/Skylion007	2025-08-12 01:41:58 +00:00
henrylhtsang	b90feeac86	[BE][cutlass backend] Fix subproc addmm tests (#160295 ) Differential Revision: [D79977421](https://our.internmc.facebook.com/intern/diff/D79977421/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160295 Approved by: https://github.com/jingsh	2025-08-12 01:41:06 +00:00
Han, Xu	0d40ff3b49	[inductor] fix test_different_file_paths_local_pgo on Windows. (#160382 ) fix test_different_file_paths_local_pgo on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160382 Approved by: https://github.com/angelayi	2025-08-12 01:35:39 +00:00
Scott Todd	cae2b5e3d2	[ROCm][Windows] Enable USE_ROCM, disable USE_RCCL on Windows. (#159079 ) This allows setting `USE_ROCM` on Windows. A few other patches are still required to build (see https://github.com/ROCm/TheRock/issues/589), but we have instructions using open source code and rocm python packages available at https://github.com/ROCm/TheRock/tree/main/external-builds/pytorch#build-pytorch-with-rocm-support. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159079 Approved by: https://github.com/jeffdaily	2025-08-12 01:28:20 +00:00
Scott Todd	ee89cc7a0a	[ROCm][Windows] Fix LoadHIP handling of environment variable paths on Windows. (#159080 ) See https://cmake.org/cmake/help/latest/command/file.html#path-conversion. Paths stored in environment variables may use `/` or `\` (e.g. on Windows), while cmake-style paths always use `/`. This fixes configure errors like: ``` CMake Error at D:/b/pytorch_main/build/CMakeFiles/CMakeScratch/TryCompile-srhq07/CMakeLists.txt:2 (set): Syntax error in cmake code at D:/b/pytorch_main/build/CMakeFiles/CMakeScratch/TryCompile-srhq07/CMakeLists.txt:2 when parsing string D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\_rocm_sdk_devel/cmake/;D:/b/pytorch_main/cmake/Modules Invalid character escape '\p'. CMake Error at D:/projects/TheRock/external-builds/pytorch/.venv/Lib/site-packages/cmake/data/share/cmake-3.31/Modules/Internal/CheckSourceCompiles.cmake:108 (try_compile): Failed to configure test project build system. ``` (note the mixed usage of `\` and `/` in that string) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159080 Approved by: https://github.com/jeffdaily	2025-08-12 00:18:19 +00:00
Howard Huang	e63c2b21c1	[PP] Initialize P2P communicators on first step (#160210 ) We were hitting hangs in multi-node settings; initializing the NCCL communicators needed for batch P2P ops ahead of time fixes this. This change adds extra communication since it communicates a dummy tensor to the next and previous stage ranks. However, this cost is only paid on the first step, so it is negligible. Debug history: https://docs.google.com/document/d/1EKVJYmW2hj_VsvDvnSggXhZzJyvMu9dA0iDJWOZAtjY/edit?tab=t.0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160210 Approved by: https://github.com/wconstab	2025-08-11 23:46:58 +00:00
drisspg	3626ba711b	[FlexAttention] Swap from and to & for new triton (#160227 ) Fixes #158463 On B200 I am getting a bunch of error spew: ```Shell /tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline /tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: note: Pipeline failed while executing [`TritonGPUHoistTMEMAlloc` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.` Triton compilation failed: triton_tem_fused_zeros_1 def triton_tem_fused_zeros_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0): PRESCALE_QK : tl.constexpr = False ``` ```Shell 74 = arith.subi %170, %166 : i32 %175 = arith.muli %174, %c128_i32 : i32 %176 = arith.subi %175, %c64_i32 : i32 %177 = arith.extui %173 : i1 to i32 %178 = arith.muli %176, %177 : i32 %179 = arith.subi %c1_i32, %177 : i32 %180 = arith.muli %179, %c64_i32 : i32 %181 = arith.addi %178, %180 : i32 %182 = arith.muli %181, %c64_i32 : i32 %183 = tt.splat %182 : i32 -> tensor<64x64xi32> %184 = tt.addptr %arg19, %183 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %185 = tt.addptr %arg20, %183 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %186 = tt.splat %181 : i32 -> tensor<64xi32> %187 = arith.addi %arg21, %186 : tensor<64xi32> scf.yield %163, %184, %185, %187 : tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32> } %114 = tt.expand_dims %113#3 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %115 = arith.cmpi slt, %114, %cst_7 : tensor<1x64xi32> %116 = tt.broadcast %115 : tensor<1x64xi1> -> tensor<64x64xi1> %117 = tt.load %113#1, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>> %118 = tt.dot %46, %117, %cst_9, inputPrecision = tf32 : 
tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %119 = arith.mulf %118, %cst_13 : tensor<64x64xf32> %120 = arith.mulf %119, %cst_3 : tensor<64x64xf32> %121 = arith.select %116, %120, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32> %122 = arith.select %115, %cst_4, %cst_5 : tensor<1x64xi1>, tensor<1x64xi1> %123 = tt.broadcast %122 : tensor<1x64xi1> -> tensor<64x64xi1> %124 = arith.select %123, %121, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32> %125 = arith.mulf %124, %cst_2 : tensor<64x64xf32> %126 = tt.broadcast %61 : tensor<64x1xf32> -> tensor<64x64xf32> %127 = arith.subf %125, %126 : tensor<64x64xf32> %128 = math.exp2 %127 : tensor<64x64xf32> %129 = tt.load %113#2, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>> %130 = tt.dot %51, %129, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %131 = tt.expand_dims %55 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> %132 = tt.broadcast %131 : tensor<64x1xf32> -> tensor<64x64xf32> %133 = arith.subf %130, %132 : tensor<64x64xf32> %134 = arith.mulf %128, %133 : tensor<64x64xf32> %135 = arith.mulf %134, %cst_3 : tensor<64x64xf32> %136 = arith.select %116, %135, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32> %137 = arith.select %115, %122, %cst_5 : tensor<1x64xi1>, tensor<1x64xi1> %138 = tt.broadcast %137 : tensor<1x64xi1> -> tensor<64x64xi1> %139 = arith.select %138, %136, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32> %140 = arith.truncf %139 : tensor<64x64xf32> to tensor<64x64xf16> %141 = tt.trans %117 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %142 = tt.dot %140, %141, %113#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> scf.yield %142 : tensor<64x64xf32> } else { scf.yield %cst_9 : tensor<64x64xf32> } %84 = tt.addptr %arg13, %22 : !tt.ptr<i32>, i32 %85 = tt.load %84 : !tt.ptr<i32> %86 = arith.muli %85, %c128_i32 : i32 %87 = tt.addptr %arg12, %21 : !tt.ptr<i32>, i32 %88 = tt.load %87 : !tt.ptr<i32> %89 = tt.splat 
%86 : i32 -> tensor<64xi32> %90 = arith.addi %89, %14 : tensor<64xi32> %91 = tt.expand_dims %90 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %92 = arith.muli %91, %cst_11 : tensor<1x64xi32> %93 = tt.addptr %71, %92 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32> %94 = tt.broadcast %93 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %95 = tt.addptr %94, %74 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %96 = tt.addptr %76, %92 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32> %97 = tt.broadcast %96 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %98 = tt.addptr %97, %74 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %99 = arith.muli %88, %c2_i32 : i32 %100 = arith.minsi %99, %c4_i32 : i32 %101 = arith.cmpi sge, %100, %c1_i32 : i32 %102 = scf.if %101 -> (tensor<64x64xf32>) { %112 = arith.subi %100, %c1_i32 : i32 %113:4 = scf.for %arg17 = %c0_i32 to %112 step %c1_i32 iter_args(%arg18 = %83, %arg19 = %95, %arg20 = %98, %arg21 = %90) -> (tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>) : i32 { %137 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %138 = arith.cmpi slt, %137, %cst_7 : tensor<1x64xi32> %139 = tt.broadcast %138 : tensor<1x64xi1> -> tensor<64x64xi1> %140 = tt.load %arg19, %139, %cst_8 : tensor<64x64x!tt.ptr<f16>> %141 = tt.dot %46, %140, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %142 = arith.mulf %141, %cst_13 : tensor<64x64xf32> %143 = arith.mulf %142, %cst_3 : tensor<64x64xf32> %144 = arith.mulf %143, %cst_2 : tensor<64x64xf32> %145 = tt.broadcast %61 : tensor<64x1xf32> -> tensor<64x64xf32> %146 = arith.subf %144, %145 : tensor<64x64xf32> %147 = math.exp2 %146 : tensor<64x64xf32> %148 = tt.load %arg20, %139, %cst_8 : tensor<64x64x!tt.ptr<f16>> %149 = tt.dot %51, %148, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %150 = tt.expand_dims %55 {axis = 1 : i32} : 
tensor<64xf32> -> tensor<64x1xf32> %151 = tt.broadcast %150 : tensor<64x1xf32> -> tensor<64x64xf32> %152 = arith.subf %149, %151 : tensor<64x64xf32> %153 = arith.mulf %147, %152 : tensor<64x64xf32> %154 = arith.mulf %153, %cst_3 : tensor<64x64xf32> %155 = arith.truncf %154 : tensor<64x64xf32> to tensor<64x64xf16> %156 = tt.trans %140 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %157 = tt.dot %155, %156, %arg18, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %158 = arith.divsi %arg17, %c2_i32 : i32 %159 = tt.addptr %84, %158 : !tt.ptr<i32>, i32 %160 = tt.load %159 evictionPolicy = evict_last : !tt.ptr<i32> %161 = arith.addi %158, %c1_i32 : i32 %162 = arith.cmpi slt, %161, %88 : i32 %163 = tt.addptr %159, %c1_i32 : !tt.ptr<i32>, i32 %164 = tt.load %163, %162 evictionPolicy = evict_last : !tt.ptr<i32> %165 = arith.addi %arg17, %c1_i32 : i32 %166 = arith.remsi %165, %c2_i32 : i32 %167 = arith.cmpi eq, %166, %c0_i32 : i32 %168 = arith.subi %164, %160 : i32 %169 = arith.muli %168, %c128_i32 : i32 %170 = arith.subi %169, %c64_i32 : i32 %171 = arith.extui %167 : i1 to i32 %172 = arith.muli %170, %171 : i32 %173 = arith.subi %c1_i32, %171 : i32 %174 = arith.muli %173, %c64_i32 : i32 %175 = arith.addi %172, %174 : i32 %176 = arith.muli %175, %c64_i32 : i32 %177 = tt.splat %176 : i32 -> tensor<64x64xi32> %178 = tt.addptr %arg19, %177 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %179 = tt.addptr %arg20, %177 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %180 = tt.splat %175 : i32 -> tensor<64xi32> %181 = arith.addi %arg21, %180 : tensor<64xi32> scf.yield %157, %178, %179, %181 : tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32> } %114 = tt.expand_dims %113#3 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %115 = arith.cmpi slt, %114, %cst_7 : tensor<1x64xi32> %116 = tt.broadcast %115 : tensor<1x64xi1> -> tensor<64x64xi1> %117 = tt.load %113#1, %116, %cst_8 : 
tensor<64x64x!tt.ptr<f16>> %118 = tt.dot %46, %117, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %119 = arith.mulf %118, %cst_13 : tensor<64x64xf32> %120 = arith.mulf %119, %cst_3 : tensor<64x64xf32> %121 = arith.select %116, %120, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32> %122 = arith.mulf %121, %cst_2 : tensor<64x64xf32> %123 = tt.broadcast %61 : tensor<64x1xf32> -> tensor<64x64xf32> %124 = arith.subf %122, %123 : tensor<64x64xf32> %125 = math.exp2 %124 : tensor<64x64xf32> %126 = tt.load %113#2, %116, %cst_8 : tensor<64x64x!tt.ptr<f16>> %127 = tt.dot %51, %126, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %128 = tt.expand_dims %55 {axis = 1 : i32} : tensor<64xf32> -> tensor<64x1xf32> %129 = tt.broadcast %128 : tensor<64x1xf32> -> tensor<64x64xf32> %130 = arith.subf %127, %129 : tensor<64x64xf32> %131 = arith.mulf %125, %130 : tensor<64x64xf32> %132 = arith.mulf %131, %cst_3 : tensor<64x64xf32> %133 = arith.select %116, %132, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32> %134 = arith.truncf %133 : tensor<64x64xf32> to tensor<64x64xf16> %135 = tt.trans %117 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %136 = tt.dot %134, %135, %113#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> scf.yield %136 : tensor<64x64xf32> } else { scf.yield %83 : tensor<64x64xf32> } %103 = tt.splat %33 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>> %104 = tt.addptr %103, %37 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32> %105 = tt.broadcast %104 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %106 = tt.addptr %105, %42 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %107 = arith.mulf %102, %cst_13 : tensor<64x64xf32> %108 = arith.cmpi slt, %40, %cst_11 : tensor<1x64xi32> %109 = tt.broadcast %108 : tensor<1x64xi1> -> tensor<64x64xi1> %110 = arith.andi %45, %109 : tensor<64x64xi1> %111 = arith.truncf %107 : tensor<64x64xf32> to 
tensor<64x64xf16> tt.store %106, %111, %110 : tensor<64x64x!tt.ptr<f16>> } else { %16 = arith.divsi %0, %c2_i32 : i32 %17 = arith.muli %0, %c64_i32 : i32 %18 = tt.splat %17 : i32 -> tensor<64xi32> %19 = arith.addi %18, %14 : tensor<64xi32> %20 = tt.expand_dims %19 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %21 = arith.muli %20, %cst_14 : tensor<64x1xi32> %22 = tt.splat %11 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>> %23 = tt.addptr %22, %21 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32> %24 = tt.expand_dims %14 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %25 = tt.broadcast %23 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %26 = tt.broadcast %24 : tensor<1x64xi32> -> tensor<64x64xi32> %27 = tt.addptr %25, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %28 = arith.cmpi slt, %20, %cst_10 : tensor<64x1xi32> %29 = tt.broadcast %28 : tensor<64x1xi1> -> tensor<64x64xi1> %30 = tt.load %27, %29, %cst_8 : tensor<64x64x!tt.ptr<f16>> %31 = tt.splat %12 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>> %32 = tt.addptr %31, %21 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32> %33 = tt.broadcast %32 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %34 = tt.addptr %33, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %35 = tt.load %34, %29, %cst_8 : tensor<64x64x!tt.ptr<f16>> %36:2 = scf.for %arg17 = %c0_i32 to %c4_i32 step %c1_i32 iter_args(%arg18 = %cst_9, %arg19 = %cst_9) -> (tensor<64x64xf32>, tensor<64x64xf32>) : i32 { %55 = arith.muli %2, %c4_i32 : i32 %56 = arith.addi %55, %arg17 : i32 %57 = arith.muli %56, %c2048_i32 : i32 %58 = arith.muli %1, %c32768_i32 : i32 %59 = arith.addi %57, %58 : i32 %60 = arith.extsi %59 : i32 to i64 %61 = arith.muli %1, %c16_i32 : i32 %62 = arith.addi %61, %56 : i32 %63 = arith.muli %62, %c32_i32 : i32 %64 = arith.extsi %63 : i32 to i64 %65 = tt.addptr %arg0, %60 : !tt.ptr<f16>, i64 %66 = tt.addptr %arg5, %60 : !tt.ptr<f16>, i64 %67 = tt.addptr %arg3, %64 : !tt.ptr<f32>, i64 %68 = tt.addptr %arg4, %64 : 
!tt.ptr<f32>, i64 %69 = arith.remsi %56, %c16_i32 : i32 %70 = arith.muli %3, %c16_i32 : i32 %71 = arith.addi %70, %69 : i32 %72 = arith.muli %71, %c2_i32 : i32 %73 = arith.addi %72, %16 : i32 %74 = tt.addptr %arg11, %73 : !tt.ptr<i32>, i32 %75 = tt.load %74 : !tt.ptr<i32> %76 = arith.muli %75, %c128_i32 : i32 %77 = tt.addptr %arg10, %73 : !tt.ptr<i32>, i32 %78 = tt.load %77 : !tt.ptr<i32> %79 = tt.splat %76 : i32 -> tensor<64xi32> %80 = arith.addi %79, %14 : tensor<64xi32> %81 = tt.expand_dims %80 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %82 = arith.muli %81, %cst_11 : tensor<1x64xi32> %83 = tt.splat %65 : !tt.ptr<f16> -> tensor<1x64x!tt.ptr<f16>> %84 = tt.addptr %83, %82 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32> %85 = tt.expand_dims %14 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %86 = tt.broadcast %84 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %87 = tt.broadcast %85 : tensor<64x1xi32> -> tensor<64x64xi32> %88 = tt.addptr %86, %87 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %89 = tt.expand_dims %80 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %90 = arith.muli %89, %cst_14 : tensor<64x1xi32> %91 = tt.splat %66 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>> %92 = tt.addptr %91, %90 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32> %93 = tt.broadcast %92 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %94 = tt.addptr %93, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %95 = arith.muli %78, %c2_i32 : i32 %96 = arith.minsi %95, %c1_i32 : i32 %97 = arith.cmpi sge, %96, %c1_i32 : i32 %98:2 = scf.if %97 -> (tensor<64x64xf32>, tensor<64x64xf32>) { %120 = arith.subi %96, %c1_i32 : i32 %121:5 = scf.for %arg20 = %c0_i32 to %120 step %c1_i32 iter_args(%arg21 = %arg18, %arg22 = %arg19, %arg23 = %88, %arg24 = %94, %arg25 = %80) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>) : i32 { %167 = tt.expand_dims %arg25 {axis = 0 : i32} : tensor<64xi32> -> 
tensor<1x64xi32> %168 = arith.cmpi slt, %167, %cst_1 : tensor<1x64xi32> %169 = tt.broadcast %168 : tensor<1x64xi1> -> tensor<64x64xi1> %170 = tt.load %arg23, %169, %cst_8 : tensor<64x64x!tt.ptr<f16>> %171 = arith.cmpi slt, %arg25, %cst_17 : tensor<64xi32> %172 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %173 = tt.addptr %172, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %174 = tt.load %173, %171 : tensor<64x!tt.ptr<f32>> %175 = arith.cmpf oeq, %174, %cst_16 : tensor<64xf32> %176 = arith.select %175, %cst_15, %174 : tensor<64xi1>, tensor<64xf32> %177 = tt.dot %30, %170, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %178 = arith.mulf %177, %cst_13 : tensor<64x64xf32> %179 = arith.mulf %178, %cst_3 : tensor<64x64xf32> %180 = arith.mulf %179, %cst_2 : tensor<64x64xf32> %181 = tt.expand_dims %176 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %182 = tt.broadcast %181 : tensor<1x64xf32> -> tensor<64x64xf32> %183 = arith.subf %180, %182 : tensor<64x64xf32> %184 = math.exp2 %183 : tensor<64x64xf32> %185 = tt.expand_dims %arg25 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %186 = arith.cmpi slt, %185, %cst_12 : tensor<64x1xi32> %187 = tt.broadcast %186 : tensor<64x1xi1> -> tensor<64x64xi1> %188 = tt.load %arg24, %187, %cst_8 : tensor<64x64x!tt.ptr<f16>> %189 = arith.truncf %184 : tensor<64x64xf32> to tensor<64x64xf16> %190 = tt.dot %189, %188, %arg22, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %191 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %192 = tt.addptr %191, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %193 = tt.load %192, %171 : tensor<64x!tt.ptr<f32>> %194 = tt.trans %188 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %195 = tt.dot %35, %194, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %196 = tt.expand_dims %193 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> 
%197 = tt.broadcast %196 : tensor<1x64xf32> -> tensor<64x64xf32> %198 = arith.subf %195, %197 : tensor<64x64xf32> %199 = arith.mulf %184, %198 : tensor<64x64xf32> %200 = arith.mulf %199, %cst_3 : tensor<64x64xf32> %201 = arith.truncf %200 : tensor<64x64xf32> to tensor<64x64xf16> %202 = tt.trans %170 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %203 = tt.dot %201, %202, %arg21, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %204 = arith.divsi %arg20, %c2_i32 : i32 %205 = tt.addptr %74, %204 : !tt.ptr<i32>, i32 %206 = tt.load %205 evictionPolicy = evict_last : !tt.ptr<i32> %207 = arith.addi %204, %c1_i32 : i32 %208 = arith.cmpi slt, %207, %78 : i32 %209 = tt.addptr %205, %c1_i32 : !tt.ptr<i32>, i32 %210 = tt.load %209, %208 evictionPolicy = evict_last : !tt.ptr<i32> %211 = arith.addi %arg20, %c1_i32 : i32 %212 = arith.remsi %211, %c2_i32 : i32 %213 = arith.cmpi eq, %212, %c0_i32 : i32 %214 = arith.subi %210, %206 : i32 %215 = arith.muli %214, %c128_i32 : i32 %216 = arith.subi %215, %c64_i32 : i32 %217 = arith.extui %213 : i1 to i32 %218 = arith.muli %216, %217 : i32 %219 = arith.subi %c1_i32, %217 : i32 %220 = arith.muli %219, %c64_i32 : i32 %221 = arith.addi %218, %220 : i32 %222 = arith.muli %221, %c64_i32 : i32 %223 = tt.splat %222 : i32 -> tensor<64x64xi32> %224 = tt.addptr %arg23, %223 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %225 = tt.addptr %arg24, %223 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %226 = tt.splat %221 : i32 -> tensor<64xi32> %227 = arith.addi %arg25, %226 : tensor<64xi32> scf.yield %203, %190, %224, %225, %227 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32> } %122 = tt.expand_dims %121#4 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %123 = arith.cmpi slt, %122, %cst_1 : tensor<1x64xi32> %124 = tt.broadcast %123 : tensor<1x64xi1> -> tensor<64x64xi1> %125 = tt.load %121#2, %124, %cst_8 : 
tensor<64x64x!tt.ptr<f16>> %126 = arith.cmpi slt, %121#4, %cst_17 : tensor<64xi32> %127 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %128 = tt.addptr %127, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %129 = tt.load %128, %126 : tensor<64x!tt.ptr<f32>> %130 = arith.cmpf oeq, %129, %cst_16 : tensor<64xf32> %131 = arith.select %130, %cst_15, %129 : tensor<64xi1>, tensor<64xf32> %132 = tt.dot %30, %125, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %133 = arith.mulf %132, %cst_13 : tensor<64x64xf32> %134 = arith.mulf %133, %cst_3 : tensor<64x64xf32> %135 = arith.select %29, %134, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32> %136 = arith.select %28, %cst, %cst_0 : tensor<64x1xi1>, tensor<64x1xi1> %137 = tt.broadcast %136 : tensor<64x1xi1> -> tensor<64x64xi1> %138 = arith.select %137, %135, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32> %139 = arith.mulf %138, %cst_2 : tensor<64x64xf32> %140 = tt.expand_dims %131 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %141 = tt.broadcast %140 : tensor<1x64xf32> -> tensor<64x64xf32> %142 = arith.subf %139, %141 : tensor<64x64xf32> %143 = math.exp2 %142 : tensor<64x64xf32> %144 = tt.expand_dims %121#4 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %145 = arith.cmpi slt, %144, %cst_12 : tensor<64x1xi32> %146 = tt.broadcast %145 : tensor<64x1xi1> -> tensor<64x64xi1> %147 = tt.load %121#3, %146, %cst_8 : tensor<64x64x!tt.ptr<f16>> %148 = arith.truncf %143 : tensor<64x64xf32> to tensor<64x64xf16> %149 = tt.dot %148, %147, %121#1, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %150 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %151 = tt.addptr %150, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %152 = tt.load %151, %126 : tensor<64x!tt.ptr<f32>> %153 = tt.trans %147 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %154 = tt.dot %35, %153, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * 
tensor<64x64xf16> -> tensor<64x64xf32> %155 = tt.expand_dims %152 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %156 = tt.broadcast %155 : tensor<1x64xf32> -> tensor<64x64xf32> %157 = arith.subf %154, %156 : tensor<64x64xf32> %158 = arith.mulf %143, %157 : tensor<64x64xf32> %159 = arith.mulf %158, %cst_3 : tensor<64x64xf32> %160 = arith.select %29, %159, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32> %161 = arith.select %28, %136, %cst_0 : tensor<64x1xi1>, tensor<64x1xi1> %162 = tt.broadcast %161 : tensor<64x1xi1> -> tensor<64x64xi1> %163 = arith.select %162, %160, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32> %164 = arith.truncf %163 : tensor<64x64xf32> to tensor<64x64xf16> %165 = tt.trans %125 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %166 = tt.dot %164, %165, %121#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> scf.yield %166, %149 : tensor<64x64xf32>, tensor<64x64xf32> } else { scf.yield %arg18, %arg19 : tensor<64x64xf32>, tensor<64x64xf32> } %99 = tt.addptr %arg15, %73 : !tt.ptr<i32>, i32 %100 = tt.load %99 : !tt.ptr<i32> %101 = arith.muli %100, %c128_i32 : i32 %102 = tt.addptr %arg14, %73 : !tt.ptr<i32>, i32 %103 = tt.load %102 : !tt.ptr<i32> %104 = tt.splat %101 : i32 -> tensor<64xi32> %105 = arith.addi %104, %14 : tensor<64xi32> %106 = tt.expand_dims %105 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %107 = arith.muli %106, %cst_11 : tensor<1x64xi32> %108 = tt.addptr %83, %107 : tensor<1x64x!tt.ptr<f16>>, tensor<1x64xi32> %109 = tt.broadcast %108 : tensor<1x64x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %110 = tt.addptr %109, %87 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %111 = tt.expand_dims %105 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %112 = arith.muli %111, %cst_14 : tensor<64x1xi32> %113 = tt.addptr %91, %112 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32> %114 = tt.broadcast %113 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %115 = tt.addptr %114, 
%26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %116 = arith.muli %103, %c2_i32 : i32 %117 = arith.minsi %116, %c1_i32 : i32 %118 = arith.cmpi sge, %117, %c1_i32 : i32 %119:2 = scf.if %118 -> (tensor<64x64xf32>, tensor<64x64xf32>) { %120 = arith.subi %117, %c1_i32 : i32 %121:5 = scf.for %arg20 = %c0_i32 to %120 step %c1_i32 iter_args(%arg21 = %98#0, %arg22 = %98#1, %arg23 = %110, %arg24 = %115, %arg25 = %105) -> (tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32>) : i32 { %161 = tt.expand_dims %arg25 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %162 = arith.cmpi slt, %161, %cst_1 : tensor<1x64xi32> %163 = tt.broadcast %162 : tensor<1x64xi1> -> tensor<64x64xi1> %164 = tt.load %arg23, %163, %cst_8 : tensor<64x64x!tt.ptr<f16>> %165 = arith.cmpi slt, %arg25, %cst_17 : tensor<64xi32> %166 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %167 = tt.addptr %166, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %168 = tt.load %167, %165 : tensor<64x!tt.ptr<f32>> %169 = arith.cmpf oeq, %168, %cst_16 : tensor<64xf32> %170 = arith.select %169, %cst_15, %168 : tensor<64xi1>, tensor<64xf32> %171 = tt.dot %30, %164, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %172 = arith.mulf %171, %cst_13 : tensor<64x64xf32> %173 = arith.mulf %172, %cst_3 : tensor<64x64xf32> %174 = arith.mulf %173, %cst_2 : tensor<64x64xf32> %175 = tt.expand_dims %170 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %176 = tt.broadcast %175 : tensor<1x64xf32> -> tensor<64x64xf32> %177 = arith.subf %174, %176 : tensor<64x64xf32> %178 = math.exp2 %177 : tensor<64x64xf32> %179 = tt.expand_dims %arg25 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %180 = arith.cmpi slt, %179, %cst_12 : tensor<64x1xi32> %181 = tt.broadcast %180 : tensor<64x1xi1> -> tensor<64x64xi1> %182 = tt.load %arg24, %181, %cst_8 : tensor<64x64x!tt.ptr<f16>> %183 = arith.truncf %178 : tensor<64x64xf32> to 
tensor<64x64xf16> %184 = tt.dot %183, %182, %arg22, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %185 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %186 = tt.addptr %185, %arg25 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %187 = tt.load %186, %165 : tensor<64x!tt.ptr<f32>> %188 = tt.trans %182 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %189 = tt.dot %35, %188, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %190 = tt.expand_dims %187 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %191 = tt.broadcast %190 : tensor<1x64xf32> -> tensor<64x64xf32> %192 = arith.subf %189, %191 : tensor<64x64xf32> %193 = arith.mulf %178, %192 : tensor<64x64xf32> %194 = arith.mulf %193, %cst_3 : tensor<64x64xf32> %195 = arith.truncf %194 : tensor<64x64xf32> to tensor<64x64xf16> %196 = tt.trans %164 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %197 = tt.dot %195, %196, %arg21, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %198 = arith.divsi %arg20, %c2_i32 : i32 %199 = tt.addptr %99, %198 : !tt.ptr<i32>, i32 %200 = tt.load %199 evictionPolicy = evict_last : !tt.ptr<i32> %201 = arith.addi %198, %c1_i32 : i32 %202 = arith.cmpi slt, %201, %103 : i32 %203 = tt.addptr %199, %c1_i32 : !tt.ptr<i32>, i32 %204 = tt.load %203, %202 evictionPolicy = evict_last : !tt.ptr<i32> %205 = arith.addi %arg20, %c1_i32 : i32 %206 = arith.remsi %205, %c2_i32 : i32 %207 = arith.cmpi eq, %206, %c0_i32 : i32 %208 = arith.subi %204, %200 : i32 %209 = arith.muli %208, %c128_i32 : i32 %210 = arith.subi %209, %c64_i32 : i32 %211 = arith.extui %207 : i1 to i32 %212 = arith.muli %210, %211 : i32 %213 = arith.subi %c1_i32, %211 : i32 %214 = arith.muli %213, %c64_i32 : i32 %215 = arith.addi %212, %214 : i32 %216 = arith.muli %215, %c64_i32 : i32 %217 = tt.splat %216 : i32 -> tensor<64x64xi32> %218 = tt.addptr %arg23, %217 : 
tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %219 = tt.addptr %arg24, %217 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %220 = tt.splat %215 : i32 -> tensor<64xi32> %221 = arith.addi %arg25, %220 : tensor<64xi32> scf.yield %197, %184, %218, %219, %221 : tensor<64x64xf32>, tensor<64x64xf32>, tensor<64x64x!tt.ptr<f16>>, tensor<64x64x!tt.ptr<f16>>, tensor<64xi32> } %122 = tt.expand_dims %121#4 {axis = 0 : i32} : tensor<64xi32> -> tensor<1x64xi32> %123 = arith.cmpi slt, %122, %cst_1 : tensor<1x64xi32> %124 = tt.broadcast %123 : tensor<1x64xi1> -> tensor<64x64xi1> %125 = tt.load %121#2, %124, %cst_8 : tensor<64x64x!tt.ptr<f16>> %126 = arith.cmpi slt, %121#4, %cst_17 : tensor<64xi32> %127 = tt.splat %67 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %128 = tt.addptr %127, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %129 = tt.load %128, %126 : tensor<64x!tt.ptr<f32>> %130 = arith.cmpf oeq, %129, %cst_16 : tensor<64xf32> %131 = arith.select %130, %cst_15, %129 : tensor<64xi1>, tensor<64xf32> %132 = tt.dot %30, %125, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %133 = arith.mulf %132, %cst_13 : tensor<64x64xf32> %134 = arith.mulf %133, %cst_3 : tensor<64x64xf32> %135 = arith.select %29, %134, %cst_6 : tensor<64x64xi1>, tensor<64x64xf32> %136 = arith.mulf %135, %cst_2 : tensor<64x64xf32> %137 = tt.expand_dims %131 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %138 = tt.broadcast %137 : tensor<1x64xf32> -> tensor<64x64xf32> %139 = arith.subf %136, %138 : tensor<64x64xf32> %140 = math.exp2 %139 : tensor<64x64xf32> %141 = tt.expand_dims %121#4 {axis = 1 : i32} : tensor<64xi32> -> tensor<64x1xi32> %142 = arith.cmpi slt, %141, %cst_12 : tensor<64x1xi32> %143 = tt.broadcast %142 : tensor<64x1xi1> -> tensor<64x64xi1> %144 = tt.load %121#3, %143, %cst_8 : tensor<64x64x!tt.ptr<f16>> %145 = arith.truncf %140 : tensor<64x64xf32> to tensor<64x64xf16> %146 = tt.dot %145, %144, %121#1, inputPrecision = tf32 : tensor<64x64xf16> * 
tensor<64x64xf16> -> tensor<64x64xf32> %147 = tt.splat %68 : !tt.ptr<f32> -> tensor<64x!tt.ptr<f32>> %148 = tt.addptr %147, %121#4 : tensor<64x!tt.ptr<f32>>, tensor<64xi32> %149 = tt.load %148, %126 : tensor<64x!tt.ptr<f32>> %150 = tt.trans %144 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %151 = tt.dot %35, %150, %cst_9, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> %152 = tt.expand_dims %149 {axis = 0 : i32} : tensor<64xf32> -> tensor<1x64xf32> %153 = tt.broadcast %152 : tensor<1x64xf32> -> tensor<64x64xf32> %154 = arith.subf %151, %153 : tensor<64x64xf32> %155 = arith.mulf %140, %154 : tensor<64x64xf32> %156 = arith.mulf %155, %cst_3 : tensor<64x64xf32> %157 = arith.select %29, %156, %cst_9 : tensor<64x64xi1>, tensor<64x64xf32> %158 = arith.truncf %157 : tensor<64x64xf32> to tensor<64x64xf16> %159 = tt.trans %125 {order = array<i32: 1, 0>} : tensor<64x64xf16> -> tensor<64x64xf16> %160 = tt.dot %158, %159, %121#0, inputPrecision = tf32 : tensor<64x64xf16> * tensor<64x64xf16> -> tensor<64x64xf32> scf.yield %160, %146 : tensor<64x64xf32>, tensor<64x64xf32> } else { scf.yield %98#0, %98#1 : tensor<64x64xf32>, tensor<64x64xf32> } scf.yield %119#0, %119#1 : tensor<64x64xf32>, tensor<64x64xf32> } %37 = tt.splat %13 : !tt.ptr<f16> -> tensor<64x1x!tt.ptr<f16>> %38 = tt.addptr %37, %21 : tensor<64x1x!tt.ptr<f16>>, tensor<64x1xi32> %39 = tt.broadcast %38 : tensor<64x1x!tt.ptr<f16>> -> tensor<64x64x!tt.ptr<f16>> %40 = tt.addptr %39, %26 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %41 = arith.cmpi slt, %24, %cst_11 : tensor<1x64xi32> %42 = tt.broadcast %41 : tensor<1x64xi1> -> tensor<64x64xi1> %43 = arith.andi %29, %42 : tensor<64x64xi1> %44 = arith.truncf %36#1 : tensor<64x64xf32> to tensor<64x64xf16> tt.store %40, %44, %43 : tensor<64x64x!tt.ptr<f16>> %45 = arith.mulf %36#0, %cst_13 : tensor<64x64xf32> %46 = tt.broadcast %21 : tensor<64x1xi32> -> tensor<64x64xi32> %47 = arith.addi %26, %46 : tensor<64x64xi32> 
%48 = tt.splat %4 : i32 -> tensor<64x64xi32> %49 = arith.addi %47, %48 : tensor<64x64xi32> %50 = tt.splat %8 : i32 -> tensor<64x64xi32> %51 = arith.addi %49, %50 : tensor<64x64xi32> %52 = tt.splat %arg16 : !tt.ptr<f16> -> tensor<64x64x!tt.ptr<f16>> %53 = tt.addptr %52, %51 : tensor<64x64x!tt.ptr<f16>>, tensor<64x64xi32> %54 = arith.truncf %45 : tensor<64x64xf32> to tensor<64x64xf16> tt.store %53, %54, %29 : tensor<64x64x!tt.ptr<f16>> } tt.return } } {-# external_resources: { mlir_reproducer: { pipeline: "builtin.module(convert-triton-to-tritongpu{enable-source-remat=false num-ctas=1 num-warps=4 target=cuda:100 threads-per-warp=32}, tritongpu-coalesce, tritongpu-F32DotTC, triton-nvidia-gpu-plan-cta, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritongpu-accelerate-matmul, tritongpu-remove-layout-conversions, tritongpu-optimize-dot-operands{hoist-layout-conversion=true}, triton-nvidia-optimize-descriptor-encoding, triton-loop-aware-cse, tritongpu-fuse-nested-loops, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-licm, tritongpu-optimize-accumulator-init, tritongpu-hoist-tmem-alloc, tritongpu-promote-lhs-to-tmem, tritongpu-assign-latencies{num-stages=3}, tritongpu-schedule-loops, tritongpu-automatic-warp-specialization{num-stages=3}, tritongpu-pipeline{dump-intermediate-steps=false num-stages=3}, tritongpu-combine-tensor-select-and-if, triton-nvidia-gpu-remove-tmem-tokens, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-loop-aware-cse, tritongpu-prefetch, tritongpu-optimize-dot-operands{hoist-layout-conversion=true}, tritongpu-coalesce-async-copy, triton-nvidia-optimize-tmem-layouts, tritongpu-remove-layout-conversions, triton-nvidia-interleave-tmem, tritongpu-reduce-data-duplication, tritongpu-reorder-instructions, triton-loop-aware-cse, symbol-dce, triton-nvidia-tma-lowering, 
triton-nvidia-gpu-fence-insertion{compute-capability=90}, sccp, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true})", disable_threading: false, verify_each: true } } #-} /tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: error: Failures have been detected while processing an MLIR pass pipeline /tmp/tmp0yiz3c94/p4/cp4ahrfnz4obsvzgftux7dg3aszopks2jljnoaz3eowlooi2scem.py:18:0: note: Pipeline failed while executing [`TritonGPUHoistTMEMAlloc` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.` Triton compilation failed: triton_tem_fused_zeros_1 def triton_tem_fused_zeros_1(arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0): PRESCALE_QK : tl.constexpr = False ROWS_GUARANTEED_SAFE : tl.constexpr = False BLOCKS_ARE_CONTIGUOUS : tl.constexpr = False WRITE_DQ : tl.constexpr = True OUTPUT_LOGSUMEXP : tl.constexpr = True FLOAT32_PRECISION : tl.constexpr = 'tf32' IS_DIVISIBLE : tl.constexpr = False SM_SCALE : tl.constexpr = 0.125 GQA_SHARED_HEADS : tl.constexpr = 4 HAS_FULL_BLOCKS : tl.constexpr = True QK_HEAD_DIM : tl.constexpr = 64 QK_HEAD_DIM_ROUNDED : tl.constexpr = 64 V_HEAD_DIM : tl.constexpr = 64 V_HEAD_DIM_ROUNDED : tl.constexpr = 64 SAFE_HEAD_DIM : tl.constexpr = True BLOCK_M1 : tl.constexpr = 64 BLOCK_N1 : tl.constexpr = 64 BLOCK_M2 : tl.constexpr = 64 BLOCK_N2 : tl.constexpr = 64 SPARSE_Q_BLOCK_SIZE : tl.constexpr = 128 SPARSE_KV_BLOCK_SIZE : tl.constexpr = 128 Q = arg_Q K = arg_K V = arg_V LSE = arg_LSE DELTA = arg_DELTA DO = arg_DO DQ = arg_DQ DV = arg_DV KV_NUM_BLKS = arg_KV_NUM_BLKS KV_IDX = arg_KV_IDX Q_NUM_BLKS = arg_Q_NUM_BLKS Q_IDX = arg_Q_IDX FULL_KV_NUM_BLKS = arg_FULL_KV_NUM_BLKS FULL_KV_IDX = arg_FULL_KV_IDX FULL_Q_NUM_BLKS = 
arg_FULL_Q_NUM_BLKS FULL_Q_IDX = arg_FULL_Q_IDX # Sub notation for this kernel: # # Q: Query, K: Key, V: Value # LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype) # DELTA: Precomputed sum(OUTDO, axis=-1) # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value # DK: Derivative of Key, is the written to via the store_output call due to some limitations with # inductor codegen # M: Number of queries, N: Number of keys/values # QK_HEAD_DIM: The dimension of the query and key embeddings # V_HEAD_DIM: The dimension of the value embeddings # z: Batch size, h: Number of heads, m: Number of queries or keys/values, d: Head dim # GQA_SHARED_HEADS: number of query heads sharing one kv head in GQA setups. # (Modifiable) Performance tuning options # BLOCK_M1: when calculating DK & DV, iterate over BLOCK_M1 across the seqlen dim of Q in each thread block. # BLOCK_N1: when calculating DK & DV, the thread block size across the seqlen dim of K/V. # BLOCK_M2: when calculating DQ, the thread block size across the seqlen dim of Q. # BLOCK_N2: when calculating DQ, iterate over BLOCK_N2 across the seqlen dim of K/V in each thread block. # # The following FULL_ and PARTIAL_* is defined in the block sparse mask grid, rather than the thread block grid. # KV_NUM_BLKS: The number of KV blocks (that may or may not require masking) for each query. # KV_IDX: The indices of KV blocks (that may or may not require masking) for each query. # Q_NUM_BLKS: The number of Q blocks (that may or may not require masking) for each query. # Q_IDX: The indices of Q blocks (that may or may not require masking) for each query. # FULL_KV_NUM_BLKS: The number of fully unmasked KV blocks (so we don't need masking) for each query. # FULL_KV_IDX: The indices of fully unmasked KV blocks (so we don't need masking) for each query. # FULL_Q_NUM_BLKS: The number of fully unmasked Q blocks (so we don't need masking) for each query. 
# FULL_Q_IDX: The indices of fully unmasked Q blocks (so we don't need masking) for each query. # The below are kernel options that can be applied for certain score_mods, # or involve a numerics vs. perf tradeoff # PRESCALE_QK: Whether to pre-scale QK by 1/sqrt(d) and change of base. Has # about 20% more numerical error, but slightly faster. # Define strides of inputs stride_qz, stride_qh, stride_qm, stride_qd = 32768, 2048, 64, 1 stride_kz, stride_kh, stride_kn, stride_kd = 65536, 16384, 64, 1 stride_vz, stride_vh, stride_vn, stride_vd = 65536, 16384, 64, 1 stride_doz, stride_doh, stride_dom, stride_dod = 32768, 2048, 64, 1 stride_dqz, stride_dqh, stride_dqm, stride_dqd = 32768, 2048, 64, 1 stride_dvz, stride_dvh, stride_dvm, stride_dvd = 65536, 16384, 64, 1 ZQ = 2 HQ = 16 HKV = 4 Q_LEN = 32 ZKV = 2 KV_LEN = 256 MATMUL_PRECISION = Q.dtype.element_ty pid = tl.program_id(0) NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1) NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2) off_zq = tl.program_id(1) # q batch idx off_hkv = tl.program_id(2) # kv head idx off_zkv = off_zq % ZKV # kv batch idx SPARSE_Z = 2 SPARSE_HQ = 16 sparse_idx_z = off_zq % SPARSE_Z k_adj = (stride_kh * off_hkv + stride_kz * off_zkv).to(tl.int64) v_adj = (stride_vh * off_hkv + stride_vz * off_zkv).to(tl.int64) # first compute broadcasted dv of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM] # then reduce to dv of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM] dv_adj = (stride_dvh * off_hkv + stride_dvz * off_zq).to(tl.int64) # offset K, V, DV pointers for batch/kv-head K += k_adj V += v_adj DV += dv_adj RCP_LN2 = 1.44269504 offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED) offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED) if pid >= NUM_KV_BLOCKS: off_pid = pid - NUM_KV_BLOCKS # THIS BLOCK DOES DQ SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M2) SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N2) off_hq2 = off_pid // NUM_Q_BLOCKS + off_hkv * GQA_SHARED_HEADS start_m2_block = off_pid % NUM_Q_BLOCKS off_pid_mask = start_m2_block // 
SPARSE_Q_MULTIPLE stride_kv_num_blks_h = 1 stride_kv_idx_h = 2 stride_kv_idx_m = 2 sparse_idx_hq2 = off_hq2 % SPARSE_HQ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq2 sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + off_pid_mask sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + off_pid_mask * stride_kv_idx_m # noqa: B950 # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads. q_adj2 = (stride_qh * off_hq2 + stride_qz * off_zq).to(tl.int64) do_adj2 = (stride_doh * off_hq2 + stride_doz * off_zq).to(tl.int64) dq_adj2 = (stride_dqh * off_hq2 + stride_dqz * off_zq).to(tl.int64) off_chz2 = ((off_zq * HQ + off_hq2) * Q_LEN).to(tl.int64) Q2 = Q + q_adj2 DO2 = DO + do_adj2 # TODO: This does not work if DQ is not the same layout as Q (for example, # if Q is broadcasted) DQ2 = DQ + dq_adj2 LSE2 = LSE + off_chz2 DELTA2 = DELTA + off_chz2 # dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM], dtype=tl.float32) dq = tl.zeros([BLOCK_M2, QK_HEAD_DIM_ROUNDED], dtype=tl.float32) start_m2 = start_m2_block * BLOCK_M2 offs_m2 = start_m2 + tl.arange(0, BLOCK_M2) # load Q and do: they stay in SRAM throughout the inner loop. q = load_checked_2d(Q2, offs_m2, offs_k, stride_qm, stride_qd, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM) do = load_checked_2d(DO2, offs_m2, offs_v, stride_dom, stride_dod, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, V_HEAD_DIM) if PRESCALE_QK: q = (q * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) if IS_DIVISIBLE: Di = tl.load(DELTA2 + offs_m2) lse = tl.load(LSE2 + offs_m2) else: Di = tl.load(DELTA2 + offs_m2, mask=offs_m2 < Q_LEN) lse = tl.load(LSE2 + offs_m2, mask=offs_m2 < Q_LEN) lse = tl.where(lse == -float("inf"), 0.0, lse) lse = lse[:, None] # ~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # KV_IDX and KV_NUM_BLKS are always contiguous. 
kv_indices = KV_IDX + sparse_kv_idx_offset kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading sparse_kv_num_blocks = tl.load(KV_NUM_BLKS + sparse_kv_num_blks_offset) offs_n2 = kv_start + tl.arange(0, BLOCK_N2) dq = bwd_dq_inner( arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, K, V, dq, q, do, Di, lse, off_zq, off_hq2, offs_m2, offs_n2, stride_kn, stride_kd, stride_vn, stride_vd, kv_indices, sparse_kv_num_blocks, MATMUL_PRECISION, IS_FULL_BLOCKS=False, ) if HAS_FULL_BLOCKS: # ~~~~~~~~~~~ partial unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FULL_KV_IDX and FULL_KV_NUM_BLKS are always contiguous. kv_indices = FULL_KV_IDX + sparse_kv_idx_offset kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading sparse_kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset) offs_n2 = kv_start + tl.arange(0, BLOCK_N2) dq = bwd_dq_inner( arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, K, V, dq, q, do, Di, lse, off_zq, off_hq2, offs_m2, offs_n2, stride_kn, stride_kd, stride_vn, stride_vd, kv_indices, sparse_kv_num_blocks, MATMUL_PRECISION, IS_FULL_BLOCKS=True, ) # Write back dQ. 
dq_ptrs = DQ2 + offs_m2[:, None] * stride_dqm + offs_k[None, :] * stride_dqd dq = SM_SCALE if IS_DIVISIBLE and SAFE_HEAD_DIM: tl.store(dq_ptrs, dq) else: tl.store(dq_ptrs, dq, mask=(offs_m2[:, None] < Q_LEN) & (offs_k[None, :] < QK_HEAD_DIM)) else: # THIS BLOCK DOES DK & DV SPARSE_Q_MULTIPLE = (SPARSE_Q_BLOCK_SIZE // BLOCK_M1) SPARSE_KV_MULTIPLE = (SPARSE_KV_BLOCK_SIZE // BLOCK_N1) pid_mask = pid // SPARSE_KV_MULTIPLE stride_q_num_blks_h = 2 stride_q_idx_h = 2 stride_q_idx_n = 1 dv = tl.zeros([BLOCK_N1, V_HEAD_DIM_ROUNDED], dtype=tl.float32) dk = tl.zeros([BLOCK_N1, QK_HEAD_DIM_ROUNDED], dtype=tl.float32) start_n1 = pid BLOCK_N1 offs_n1 = start_n1 + tl.arange(0, BLOCK_N1) # load K and V: they stay in SRAM throughout the inner loop. k = load_checked_2d(K, offs_n1, offs_k, stride_kn, stride_kd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM) v = load_checked_2d(V, offs_n1, offs_v, stride_vn, stride_vd, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM) if PRESCALE_QK: k = (k * SM_SCALE * RCP_LN2).to(MATMUL_PRECISION) for off_g in range(0, GQA_SHARED_HEADS): off_hq1 = off_hkv * GQA_SHARED_HEADS + off_g # Offset Q, DQ, DO, DELTA & LSE. These inputs are offsetted by query heads. 
q_adj1 = (stride_qh * off_hq1 + stride_qz * off_zq).to(tl.int64) do_adj1 = (stride_doh * off_hq1 + stride_doz * off_zq).to(tl.int64) dq_adj1 = (stride_dqh * off_hq1 + stride_dqz * off_zq).to(tl.int64) off_chz1 = ((off_zq * HQ + off_hq1) * Q_LEN).to(tl.int64) Q1 = Q + q_adj1 DO1 = DO + do_adj1 # TODO: This does not work if DQ is not the same layout as Q (for example, # if Q is broadcasted) LSE1 = LSE + off_chz1 DELTA1 = DELTA + off_chz1 sparse_idx_hq1 = off_hq1 % SPARSE_HQ sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq1 sparse_q_num_blks_offset = sparse_hz_offset * stride_q_num_blks_h + pid_mask sparse_q_idx_offset = sparse_hz_offset * stride_q_idx_h + pid_mask * stride_q_idx_n # noqa: B950 # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Q_IDX and Q_NUM_BLKS are always contiguous. q_indices = Q_IDX + sparse_q_idx_offset q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading sparse_q_num_blocks = tl.load(Q_NUM_BLKS + sparse_q_num_blks_offset) offs_m1 = q_start + tl.arange(0, BLOCK_M1) dk, dv = bwd_dkdv_inner( arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, Q1, DO1, DELTA1, LSE1, dk, dv, k, v, off_zq, off_hq1, offs_n1, offs_m1, stride_qm, stride_qd, stride_dom, stride_dod, q_indices, sparse_q_num_blocks, MATMUL_PRECISION, IS_FULL_BLOCKS=False, ) if HAS_FULL_BLOCKS: # ~~~~~~~~~~~~~~~ fully unmasked blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # FULL_Q_IDX and FULL_Q_NUM_BLKS are always contiguous. 
q_indices = FULL_Q_IDX + sparse_q_idx_offset q_start = tl.load(q_indices) * SPARSE_Q_BLOCK_SIZE # first q block we're loading sparse_q_num_blocks = tl.load(FULL_Q_NUM_BLKS + sparse_q_num_blks_offset) offs_m1 = q_start + tl.arange(0, BLOCK_M1) dk, dv = bwd_dkdv_inner( arg_Q, arg_K, arg_V, arg_LSE, arg_DELTA, arg_DO, arg_DQ, arg_DV, arg_KV_NUM_BLKS, arg_KV_IDX, arg_Q_NUM_BLKS, arg_Q_IDX, arg_FULL_KV_NUM_BLKS, arg_FULL_KV_IDX, arg_FULL_Q_NUM_BLKS, arg_FULL_Q_IDX, out_ptr0, Q1, DO1, DELTA1, LSE1, dk, dv, k, v, off_zq, off_hq1, offs_n1, offs_m1, stride_qm, stride_qd, stride_dom, stride_dod, q_indices, sparse_q_num_blocks, MATMUL_PRECISION, IS_FULL_BLOCKS=True, ) # Write back dV and dK. dv_ptrs = DV + offs_n1[:, None] * stride_dvm + offs_v[None, :] * stride_dvd index_n = offs_n1[:, None] index_k = offs_k[None, :] index_v = offs_v[None, :] if IS_DIVISIBLE and SAFE_HEAD_DIM: tl.store(dv_ptrs, dv) else: tl.store(dv_ptrs, dv, mask=(index_n < KV_LEN) & (index_v < V_HEAD_DIM)) dk = SM_SCALE if SAFE_HEAD_DIM: mask = index_n < KV_LEN else: mask = (index_n < KV_LEN) & (index_k < QK_HEAD_DIM) # first compute broadcasted dk of shape [Bq, Hkv, KV_LEN, V_HEAD_DIM] # then reduce to dk of shape [Bkv, Hkv, KV_LEN, V_HEAD_DIM] xindex = index_k + 64index_n + 16384off_hkv + 65536off_zq tl.store(out_ptr0 + (tl.broadcast_to(xindex, dk.shape)), dk, mask) metadata: {'signature': {'arg_Q': 'fp16', 'arg_K': 'fp16', 'arg_V': 'fp16', 'arg_LSE': 'fp32', 'arg_DELTA': 'fp32', 'arg_DO': 'fp16', 'arg_DQ': 'fp16', 'arg_DV': 'fp16', 'arg_KV_NUM_BLKS': 'i32', 'arg_KV_IDX': 'i32', 'arg_Q_NUM_BLKS': 'i32', 'arg_Q_IDX': 'i32', 'arg_FULL_KV_NUM_BLKS': 'i32', 'arg_FULL_KV_IDX': 'i32', 'arg_FULL_Q_NUM_BLKS': 'i32', 'arg_FULL_Q_IDX': 'i32', 'out_ptr0': 'fp16'}, 'device': 0, 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): 
[['tt.divisibility', 16]], (7,): [['tt.divisibility', 16]], (8,): [['tt.divisibility', 16]], (9,): [['tt.divisibility', 16]], (10,): [['tt.divisibility', 16]], (11,): [['tt.divisibility', 16]], (12,): [['tt.divisibility', 16]], (13,): [['tt.divisibility', 16]], (14,): [['tt.divisibility', 16]], (15,): [['tt.divisibility', 16]], (16,): [['tt.divisibility', 16]]}], 'device_type': 'cuda', 'num_warps': 4, 'num_stages': 3, 'debug': True, 'cc': 100} Traceback (most recent call last): File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 748, in _precompile_config binary = triton.compile(compile_args, *compile_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/compiler/compiler.py", line 359, in compile next_module = compile_ir(module, metadata) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 456, in <lambda> stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 298, in make_ttgir pm.run(mod) RuntimeError: PassManager::run failed frames [('total', 3), ('ok', 3)] inline_call [] stats [('calls_captured', 8), ('unique_graphs', 3)] aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('ok', 1)] inductor [('triton_bundler_save_kernel', 8), ('async_compile_cache_miss', 3), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1), ('fxgraph_cache_bypass', 1)] graph_break [] F ==================================================== FAILURES ===================================================== _____________________________ TestFlexAttentionCUDA.test_GQA_score_mod1_cuda_float16 ______________________________ Traceback (most recent call last): File 
"/home/drisspg/.conda/envs/dev/lib/python3.12/unittest/case.py", line 58, in testPartExecutor yield File "/home/drisspg/.conda/envs/dev/lib/python3.12/unittest/case.py", line 634, in run self._callTestMethod(testMethod) File "/home/drisspg/.conda/envs/dev/lib/python3.12/unittest/case.py", line 589, in _callTestMethod if method() is not None: ^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper method(args, *kwargs) File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_utils.py", line 3224, in wrapper method(args, kwargs) File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 446, in instantiated_test raise rte File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test result = test(self, param_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 1349, in dep_fn return fn(self, args, kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/testing/_internal/common_device_type.py", line 1215, in dep_fn return fn(slf, args, *kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/test/inductor/test_flex_attention.py", line 1430, in test_GQA self.run_test(inputs) File "/home/drisspg/meta/pytorch/test/inductor/test_flex_attention.py", line 566, in run_test compiled_out.backward(backward_grad) File "/home/drisspg/meta/pytorch/torch/_tensor.py", line 625, in backward torch.autograd.backward( File "/home/drisspg/meta/pytorch/torch/autograd/__init__.py", line 354, in backward _engine_run_backward( File "/home/drisspg/meta/pytorch/torch/autograd/graph.py", line 829, in _engine_run_backward return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/drisspg/meta/pytorch/torch/autograd/function.py", line 315, in apply return user_fn(self, args) ^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2303, in backward return impl_fn() ^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2289, in impl_fn out = CompiledFunction._backward_impl(ctx, all_args) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/runtime_wrappers.py", line 2394, in _backward_impl CompiledFunction.compiled_bw = aot_config.bw_compiler( ^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_functorch/_aot_autograd/schemas.py", line 1256, in __call__ return self.compiler_fn(gm, example_inputs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_dynamo/backends/common.py", line 76, in _wrapped_bw_compiler disable( File "/home/drisspg/meta/pytorch/torch/_dynamo/eval_frame.py", line 1005, in _fn return fn(args, *kwargs) ^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_utils_internal.py", line 92, in wrapper_function return function(args, *kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 2428, in bw_compiler return inner_compile( ^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 773, in compile_fx_inner return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_dynamo/repro/after_aot.py", line 124, in debug_wrapper inner_compiled_fn = compiler_fn(gm, example_inputs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 952, in _compile_fx_inner mb_compiled_graph = fx_codegen_and_compile( ^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 
1652, in fx_codegen_and_compile return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/compile_fx.py", line 1506, in codegen_and_compile compiled_module = graph.compile_to_module() ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/graph.py", line 2318, in compile_to_module return self._compile_to_module() ^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/graph.py", line 2328, in _compile_to_module mod = self._compile_to_module_lines(wrapper_code) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/graph.py", line 2396, in _compile_to_module_lines mod = PyCodeCache.load_by_key_path( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/codecache.py", line 3466, in load_by_key_path mod = _reload_python_module(key, path, set_sys_modules=in_toplevel) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/compile_tasks.py", line 33, in _reload_python_module exec(code, mod.__dict__, mod.__dict__) File "/tmp/tmp0yiz3c94/az/caza2gzmsagyuusmf2ka3oat3na4xv6zudssk244xmlzsbv2knze.py", line 117, in <module> File "/home/drisspg/meta/pytorch/torch/_inductor/async_compile.py", line 489, in triton kernel.precompile( File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 437, in precompile self._precompile_worker() File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 459, in _precompile_worker compile_results.append(self._precompile_config(c)) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/meta/pytorch/torch/_inductor/runtime/triton_heuristics.py", line 748, in _precompile_config binary = triton.compile(compile_args, **compile_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/compiler/compiler.py", line 359, in compile next_module = compile_ir(module, metadata) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 456, in <lambda> stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/drisspg/.conda/envs/dev/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py", line 298, in make_ttgir pm.run(mod) RuntimeError: PassManager::run failed To execute this test, run the following from the base repo dir: python test/inductor/test_flex_attention.py TestFlexAttentionCUDA.test_GQA_score_mod1_cuda_float16 This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 ============================================= short test summary info ============================================= FAILED [5.1441s] test/inductor/test_flex_attention.py::TestFlexAttentionCUDA::test_GQA_score_mod1_cuda_float16 - RuntimeError: PassManager::run failed ================================== 1 failed, 1 passed, 1404 deselected in 18.10s ================================== ~/meta/pytorch flex-warning !1 ❯ ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160227 Approved by: https://github.com/Skylion007, https://github.com/Chillee	2025-08-11 23:30:20 +00:00
Sherlock Huang	99bc2f94c1	Update export/schema.py (#160220 ) Summary: A Model could have multiple ExportedPrograms: - for different methods, which can have different weights; - for different delegates, which can also have different weights. For this reason, we make weights per-ExportedProgram. Also, we clean up Model and Program. IIUC, Model and Program are not used anywhere, so it's OK to make a BC-breaking change. Test Plan: CI Rollback Plan: Differential Revision: D79917395 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160220 Approved by: https://github.com/angelayi, https://github.com/dolpm, https://github.com/jingsh	2025-08-11 23:14:08 +00:00
Yidi Wu	fc25c68f20	[hop][exc] make UncapturedHigherOrderOpError print user code and avoid re-raise (#159296 ) After the change, the error stacktrace is attached with the user code stack and is suppressed into 1 (without the scrolling up message). For example: ```python class Test(torch.nn.Module): def forward(self, c, x): def cond_fn(c, x): return c > 0 and x.size(0) < 20 def body_fn(c, x): return c - 1, x.sin() return torch._higher_order_ops.while_loop(cond_fn, body_fn, (c, x)) ``` Now gives the following error message: ```python Traceback (most recent call last): File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1705, in test_while_loop_size_mismatch_tensor_expansion self._run_test( ~~~~~~~~~~~~~~^ model=WhileLoopModels.SizeMismatchTensorExpansion(), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ...<2 lines>... dynamic=dynamic, ^^^^^^^^^^^^^^^^ ) ^ File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1417, in _run_test result = model(inputs_with_counters) File "/home/yidi/local/pytorch/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/nn/modules/module.py", line 1784, in _call_impl return forward_call(*args, **kwargs) File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1053, in forward return torch._higher_order_ops.while_loop(cond_fn, body_fn, (c, x)) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_higher_order_ops/while_loop.py", line 176, in while_loop return torch.compile( ~~~~~~~~~~~~~~ _while_loop_op_wrapper, backend=backend, fullgraph=True ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ )(flat_cond_fn, flat_body_fn, tuple(flat_inputs), tuple()) ~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/eval_frame.py", line 804, in compile_wrapper return fn(*args, **kwargs) 
File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 1595, in __call__ result = self._torchdynamo_orig_backend( frame, cache_entry, self.hooks, frame_state, skip=1 ) File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 1353, in __call__ result = self._inner_convert( frame, cache_entry, hooks, frame_state, skip=skip + 1 ) File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 682, in __call__ result = _compile( frame.f_code, ...<16 lines>... convert_frame_box=self._box, ) File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 1172, in _compile guarded_code = compile_inner(code, one_graph, hooks, transform) File "/home/yidi/local/pytorch/torch/_utils_internal.py", line 98, in wrapper_function return function(args, *kwargs) File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 858, in compile_inner return _compile_inner(code, one_graph, hooks, transform) File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 897, in _compile_inner out_code = transform_code_object(code, transform) File "/home/yidi/local/pytorch/torch/_dynamo/bytecode_transformation.py", line 1461, in transform_code_object transformations(instructions, code_options) ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 300, in _fn return fn(args, *kwargs) File "/home/yidi/local/pytorch/torch/_dynamo/convert_frame.py", line 818, in transform tracer.run() ~~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3528, in run super().run() ~~~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1372, in run while self.step(): ~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1276, in step self.dispatch_table[inst.opcode](self, inst) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 852, in wrapper return 
inner_fn(self, inst) File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2240, in CALL_FUNCTION_EX self.call_function(fn, argsvars.items, kwargsvars) ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1200, in call_function self.push(fn.call_function(self, args, kwargs)) # type: ignore[arg-type] ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/lazy.py", line 212, in realize_and_forward return getattr(self.realize(), name)(args, *kwargs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 91, in graph_break_as_hard_error raise exc.with_traceback(sys.exc_info()[2]) from None File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 77, in graph_break_as_hard_error return fn(args, *kwargs) File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 1287, in call_function ) = speculate_subgraph( ~~~~~~~~~~~~~~~~~~^ tx, ^^^ ...<33 lines>... 
supports_aliasing=self.supports_aliasing, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ) ^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 877, in speculate_subgraph raise ex File "/home/yidi/local/pytorch/torch/_dynamo/variables/higher_order_ops.py", line 718, in speculate_subgraph output = f.call_function(tx, args, sub_kwargs) File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 580, in call_function return super().call_function(tx, args, kwargs) ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 334, in call_function return tx.inline_user_function_return(self, [self.self_args(), args], kwargs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1217, in inline_user_function_return return InliningInstructionTranslator.inline_call(self, fn, args, kwargs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3733, in inline_call return tracer.inline_call_() ~~~~~~~~~~~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3936, in inline_call_ self.run() ~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1372, in run while self.step(): ~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1276, in step self.dispatch_table[inst.opcode](self, inst) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 852, in wrapper return inner_fn(self, inst) File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 2240, in CALL_FUNCTION_EX self.call_function(fn, argsvars.items, kwargsvars) ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1200, in 
call_function self.push(fn.call_function(self, args, kwargs)) # type: ignore[arg-type] ~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/lazy.py", line 212, in realize_and_forward return getattr(self.realize(), name)(args, *kwargs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 580, in call_function return super().call_function(tx, args, kwargs) ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/variables/functions.py", line 334, in call_function return tx.inline_user_function_return(self, [self.self_args(), args], kwargs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1217, in inline_user_function_return return InliningInstructionTranslator.inline_call(self, fn, args, kwargs) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3733, in inline_call return tracer.inline_call_() ~~~~~~~~~~~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 3936, in inline_call_ self.run() ~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1372, in run while self.step(): ~~~~~~~~~^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 1276, in step self.dispatch_table[inst.opcode](self, inst) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^ File "/home/yidi/local/pytorch/torch/_dynamo/symbolic_convert.py", line 830, in inner unimplemented_v2( ~~~~~~~~~~~~~~~~^ gb_type="Data-dependent branching", ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ...<5 lines>... 
], ^^ ) ^ File "/home/yidi/local/pytorch/torch/_dynamo/exc.py", line 580, in unimplemented_v2 raise Unsupported(msg) torch._dynamo.exc.UncapturedHigherOrderOpError: while_loop doesn't work unless it is captured completely with torch.compile. Got Data-dependent branching Explanation: Detected data-dependent branching (e.g. `if my_tensor.sum() > 0:`). Dynamo does not support tracing dynamic control flow. Hint: This graph break is fundamental - it is unlikely that Dynamo will ever be able to trace through your code. Consider finding a workaround. Hint: Use `torch.cond` to express dynamic control flow. Developer debug context: attempted to jump with TensorVariable() For more details about this graph break, please visit: https://pytorch-labs.github.io/compile-graph-break-site/gb/gb0170.html from user code: File "/home/yidi/local/pytorch/torch/_higher_order_ops/while_loop.py", line 167, in _while_loop_op_wrapper return while_loop_op(args, *kwargs) File "/home/yidi/local/pytorch/torch/_higher_order_ops/while_loop.py", line 137, in flat_cond_fn return cond_fn(carried, *additional) File "/home/yidi/local/pytorch/test/inductor/test_control_flow.py", line 1047, in cond_fn return c > 0 and x.size(0) < 20 Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo" To execute this test, run the following from the base repo dir: python test/inductor/test_control_flow.py WhileLoopTests.test_while_loop_size_mismatch_tensor_expansion_device_cpu_dynamic_False This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159296 Approved by: https://github.com/zou3519	2025-08-11 22:48:10 +00:00
Pat Vignola	5a40c57844	[MTIA] Implement isAvailable() for MTIA hooks (#160304 ) Summary: MTIA is missing the `isAvailable()` override, which is necessary for some of the device agnostic methods. Test Plan: `torch._C._get_accelerator()` Rollback Plan: Differential Revision: D79981115 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160304 Approved by: https://github.com/nautsimon	2025-08-11 21:45:11 +00:00
Nikita Shulga	7d2ec704e4	Fix MPS autocast for ConvTranspose3d (#160345 ) ## Summary - ensure ConvTranspose3d uses fp32 under MPS autocast - add MPS autocast test for ConvTranspose3d Generated by Codex, see https://chatgpt.com/codex/tasks/task_e_689a360388288327a2cac6f55bbfc42c Fixes https://github.com/pytorch/pytorch/issues/160332 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160345 Approved by: https://github.com/dcci	2025-08-11 21:01:52 +00:00
Sandeep Narendranath Karjala	fc80f6859e	Fix collective schedule logging and runtime tests (#160260 ) Summary: - Fix collective schedule logging so that only logs when collectives present - Fix runtime estimate test to check if each op has a number value Pull Request resolved: https://github.com/pytorch/pytorch/pull/160260 Approved by: https://github.com/Skylion007	2025-08-11 20:58:52 +00:00
PaulZhang12	cf0a0dcb0a	Make user defined Triton kernels serializable for fx_graph_runnable (#160002 ) Resolves issue https://github.com/pytorch/pytorch/issues/153475 where `fx_graph_runnable` didn't work with user defined triton kernels. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160002 Approved by: https://github.com/eellison	2025-08-11 20:54:33 +00:00
PyTorch MergeBot	b149c7204c	Revert "port distributed pipeline test files for Intel GPU (#159033 )" This reverts commit 76a0609b6bddb2bc40f1eb4ade12885023653d59. Reverted https://github.com/pytorch/pytorch/pull/159033 on behalf of https://github.com/clee2000 due to broke test_cpp_extensions_stream_and_event.py::TestCppExtensionStreamAndEvent::test_stream_event [GH job link](https://github.com/pytorch/pytorch/actions/runs/16890370216/job/47849586456) [HUD commit link](`76a0609b6b`) note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/159033#issuecomment-3176833314))	2025-08-11 20:44:45 +00:00
PyTorch MergeBot	09381f5dac	Revert "[Graph Partition] Pass all OSS unit tests (#154667 )" This reverts commit ca7315c17162ea21b1ca5ba23f4bf6168766c7b9. Reverted https://github.com/pytorch/pytorch/pull/154667 on behalf of https://github.com/clee2000 due to broke inductor/test_memory.py::TestOperatorReorderForPeakMemory::test_reorder_peak_memory_lpmf [GH job link](https://github.com/pytorch/pytorch/actions/runs/16885961204/job/47836769279) [HUD commit link](`ca7315c171`) note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/154667#issuecomment-3176805477))	2025-08-11 20:34:27 +00:00
Pian Pawakapan	9eedd2a20b	[PGO] no counterfactual suggestions for dynamic allowlist (#160231 ) Being more conservative with whitelist suggestions as we roll out suggestions; now we only suggest sources that were dynamic in previous runs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160231 Approved by: https://github.com/bobrenjc93	2025-08-11 20:13:25 +00:00
Edward Yang	c3dc8dc412	159965 is merged, no need to patch it in (#160275 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160275 Approved by: https://github.com/albanD, https://github.com/ZainRizvi	2025-08-11 19:55:04 +00:00
Liao, Wei	76a0609b6b	port distributed pipeline test files for Intel GPU (#159033 ) In this PR we will port all distributed pipeline test files. We could enable Intel GPU with following methods and try the best to keep the original code styles: 1. instantiate_device_type_tests() 2. use "torch.accelerator.current_accelerator()" to determine the accelerator backend 3. use "requires_accelerator_dist_backend()" to replace requires_nccl() 4. use "get_default_backend_for_device()" to get backend 5. enabled XPU for some test path 6. add TEST_MULTIACCELERATOR in common_utils for all backend. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159033 Approved by: https://github.com/guangyey, https://github.com/d4l3k Co-authored-by: Daisy Deng <daisy.deng@intel.com>	2025-08-11 19:43:15 +00:00
Simon Fan	c8205cb354	[autograd] match 0-dim gradients device type regardless of subclassness (#160165 ) Not sure if there are some subclasses where the outer.dim() == 0 but you wouldn't want to move it? FIXES https://github.com/pytorch/pytorch/issues/160084 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160165 Approved by: https://github.com/ezyang, https://github.com/albanD	2025-08-11 17:57:32 +00:00
Nikita Shulga	d25c4f954d	[MPS] Type-promote tensor-iterator common dtype (#160334 ) Otherwise, `torch.add(FloatTensor, IntTensor, alpha=2)` and `torch.add(FloatTensor, IntTensor, alpha=2)` were dispatched to different kernels Fixes https://github.com/pytorch/pytorch/issues/160208 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160334 Approved by: https://github.com/Skylion007, https://github.com/dcci	2025-08-11 17:53:56 +00:00
David Berard	d0e2240f68	[triton_heuristics] Optimize the triton launcher in pt2 (#160000 ) Summary: (Original author: Xu Zhao. Commandeered by David to land this since it is relatively urgent) We observed ~10us PT2-Triton launch overhead regression after pin update. Before Triton pin-update: {F1980557238} After Triton pin-update: {F1980557240} The root cause is because https://github.com/pytorch/pytorch/pull/145051 adds `_get_args_with_constexprs` to the cubin launcher caller function, which is on the critical path. The motivation for `_get_args_with_constexprs` was that between triton 3.2 and triton 3.3, the convention for calling Triton kernels (at the level that non-static-cuda-launcher inductor integrates) changed. Previously, the callable did not take constexpr arguments as parameters; after 3.3, it does. With pointwise/reduction kernels, we don't know the constexpr values until after autotuning occurs; so `_get_args_with_constexprs` would inject constexprs into the arguments list before calling the Triton kernel. The fix (in this PR) is to instead inject the constexpr args into the launcher string - this avoids the cost of sorting/reordering arguments which previously occurred upon execution of each kernel. Note that the static_cuda_launcher.py does not require constants to be passed to the cubin launcher (`e96c7c4bb0/torch/_inductor/runtime/static_cuda_launcher.py (L220)`), there is no need to pass in constexprs to the generated launcher code. 
The new launcher code needs to work on three cases: - StaticallyLaunchedCudaKernel - triton.compile.CompiledKernel - AOTInductor Analysis: https://docs.google.com/document/d/1PHaSmx2w59K8qpjw5_qzKWShfEgptf_Zpv_DL7YxiWU/edit?tab=t.0 Test Plan: Before: ``` $ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs 1.893x ``` ``` $ buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency x_val nop_python_function-walltime nop_triton_kernel-walltime nop_triton_compiled_kernel_run-walltime nop_inductor_kernel-walltime nop_inductor_kernel_cudagraph-walltime ------- ------------------------------ ---------------------------- ----------------------------------------- ------------------------------ ---------------------------------------- 0 0.00760921 1.80298 0.623282 5.25024 0.203722 19 0.00799885 4.78223 1.00226 5.8213 0.239084 average 0.00780403 3.29261 0.812769 5.53577 0.221403 ``` After: ``` buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency x_val nop_python_function-walltime nop_triton_kernel-walltime nop_triton_compiled_kernel_run-walltime nop_inductor_kernel-walltime nop_inductor_kernel_cudagraph-walltime ------- ------------------------------ ---------------------------- ----------------------------------------- ------------------------------ ---------------------------------------- 0 0.00747067 1.92589 0.726509 4.35459 0.204205 19 0.00747823 7.36852 1.26241 6.28208 0.239278 average 0.00747445 4.6472 0.994459 5.31834 0.221741 ``` ``` $ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs 1.985x ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160000 Approved by: https://github.com/jansel Co-authored-by: Xu Zhao <xzhao9@meta.com>	2025-08-11 17:22:40 +00:00
Shangdi Yu	9ccd0f5e31	Fix unbacked symint and memory leak in inductor memory planning (#159839 ) Summary: In memory planning, some allocation sizes involve unbacked symints. These unbacked symints are not known before they are computed at run time, so allocation pools that involve unbacked symints cannot be allocated until we have the values of the unbacked symints. So we add a notion of `earliest_available` to Allocation nodes. If an allocation node has an unbacked symint, it is available only when its live range begins. Then in AllocationPool, if a pool involves an Allocation node that has an earliest available time, we restrict its life range. If a block's earliest available time is later than a pool's life range's start time, we cannot allocate it from the pool. We also fix a memory leak that's caused by allocating a tensor without wrapping it with RAIIAtenTensor. In python wrapper for JIT inductor, `codegen_alloc_from_pool` doesn't actually write the alloc lines to wrapper, it just returns the string to alloc. However, in cpp_wrapper, `codegen_alloc_from_pool` actually writes to the wrapper. Specifically, it writes the following and returns string `RAIIAtenTensorHandle`. ``` AtenTensorHandle handle_name; AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch__alloc_from_pool(....); ``` This is bug-prone. If you write aoti_torch__alloc_from_pool lines, you must write the RAIIAtenTensorHandle as well, otherwise you get memory leaks. We remove the alloc_from_pool call from codegen_create, because this doesn't work for AOTI. In python wrapper, we can generate the same alloc_from_pool variable name for the same block, but cpp_wrapper will generate a different variable name for each call to alloc_from_pool. Test Plan: ``` python test/inductor/test_memory_planning.py ``` Rollback Plan: Differential Revision: D79603119 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159839 Approved by: https://github.com/jansel	2025-08-11 17:16:15 +00:00
Boyuan Feng	ca7315c171	[Graph Partition] Pass all OSS unit tests (#154667 ) Graph partition leads to 6.2% speedup on vision_maskrcnn, 5.8% speedup on yolov3. [P1819700563](https://www.internalfb.com/phabricator/paste/view/P1819700563), 39.5% speedup on speech_transformer inference [P1830602200](https://www.internalfb.com/phabricator/paste/view/P1830602200), 85% speedup on speech_transformer training [P1831115315](https://www.internalfb.com/phabricator/paste/view/P1831115315). Run the same diff on two days and both show speedup on average. [first TorchInductor Benchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Mon%2C%2021%20Jul%202025%2016%3A37%3A55%20GMT&stopTime=Mon%2C%2028%20Jul%202025%2016%3A37%3A55%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=75ef90fe89b82c967362a2d40fdf1af047202bc2&rBranch=main&rCommit=abcb24f4de11f8fedf2c2c9ff53b6092ef42306d) <img width="1885" height="752" alt="image" src="https://github.com/user-attachments/assets/13bba9fc-5dbf-42ad-8558-d54f7e367b41" /> [second TorchInductorBenchmark ci run](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2023%20Jul%202025%2016%3A38%3A27%20GMT&stopTime=Wed%2C%2030%20Jul%202025%2016%3A38%3A27%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(h100)&lBranch=bf/partition-turn-on&lCommit=66de27e29338c26b1be94733049868cb0309ea52&rBranch=main&rCommit=70d2e9ba455c3c910f6f95b24171c8eee7bc00bf) <img width="2513" height="1030" alt="image" src="https://github.com/user-attachments/assets/3a413dcb-2314-4292-919a-7ca181f9eeac" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/154667 Approved by: https://github.com/eellison	2025-08-11 16:25:12 +00:00
Richard Barnes	68a4b4b2e3	[codemod] Fix unreachable-break issue in caffe2/c10/cuda/CUDAFunctions.cpp +2 (#160257 ) Summary: LLVM has a warning `-Wunreachable-code-break` which identifies `break` statements that cannot be reached. These compromise readability, are misleading, and may identify bugs. This diff removes such statements. For questions/comments, contact r-barnes. - If you approve of this diff, please use the "Accept & Ship" button :-) Test Plan: Sandcastle Rollback Plan: Differential Revision: D79835614 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160257 Approved by: https://github.com/Skylion007	2025-08-11 16:09:24 +00:00
Xu Han	80cca83079	[inductor] Skip some AOTI UTs on Windows. (#160287 ) Skip some AOTI UTs on Windows, since it is not fully ready. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160287 Approved by: https://github.com/ezyang	2025-08-11 13:50:43 +00:00
Xu Han	515cb70367	[inductor] normalize_path_separator for test_different_file_paths_local_pgo (#160286 ) `normalize_path_separator` for test_different_file_paths_local_pgo Pull Request resolved: https://github.com/pytorch/pytorch/pull/160286 Approved by: https://github.com/ezyang	2025-08-11 13:50:18 +00:00
cyy	c184cb3852	[submodule] Bump fbgemm to latest (#158210 ) Merge the recent commits of FBGEMM and remove unnecessary CMake code. Specifically, we 1. enable `fbgemm_autovec` since the target is now correctly handled. 2. remove option `USE_FAKELOWP` which is not used. 3. remove `CAFFE2_COMPILER_SUPPORTS_AVX512_EXTENSIONS` check. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158210 Approved by: https://github.com/q10	2025-08-11 13:48:02 +00:00
PyTorch UpdateBot	2259dbed4e	Update slow tests (#158222 ) This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml). Update the list of slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158222 Approved by: https://github.com/pytorchbot	2025-08-11 12:00:13 +00:00
PyTorch UpdateBot	05029ad1c3	[xla hash update] update the pinned xla hash (#160306 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160306 Approved by: https://github.com/pytorchbot	2025-08-11 11:28:49 +00:00
cyy	cf4964be68	Remove unnecessary CMake checks for glog (#158185 ) With the update to CMake 3.27, some old scripts can be removed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158185 Approved by: https://github.com/malfet, https://github.com/Skylion007	2025-08-11 10:14:47 +00:00
Tanmay Sinha	ecea81117b	Fix clang builds by adding headers (#160252 ) Clang compiler from llvm-14 fails to build full torch from source with the message ``` no template named 'unordered_map' in namespace 'std' std::unordered_map<std::string, HandlerFunc> handlers_{}; ~~~~~^ ``` A similar issue here https://github.com/intel/llvm/issues/5264 Fix is to add the correct headers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160252 Approved by: https://github.com/Skylion007, https://github.com/cyyever	2025-08-11 09:03:14 +00:00
fduwjj	1c2cba17ea	[FR] Add stack_id and an optional print of stack_id to stack_trace mapping (#160119 ) To better help users debug with FR, we want to add stack_id and print a map between stack_id and stack_trace (optional) Screenshot: <img width="1029" height="529" alt="image" src="https://github.com/user-attachments/assets/8404a1d3-cc33-4f5f-971b-29609ec316c1" /> <img width="1620" height="358" alt="image" src="https://github.com/user-attachments/assets/3dd29c8c-ff68-41a2-acfd-e770036cfeb1" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160119 Approved by: https://github.com/H-Huang, https://github.com/wconstab	2025-08-11 07:27:10 +00:00
Nick Riasanovsky	ff0d56d035	[Inductor] [Triton] Enable Configuration warmup/rep iterations when benchmarking in inductor (#159982 ) Summary: When benchmarking on B200 Max Autotune, I discovered that the estimations from the autotune logs consistently produced a better ATEN result by > 20% on an example shape. Here is an example of the output: ``` Autotune Choices Stats: {"num_choices": 20, "num_triton_choices": 19, "best_kernel": "mm", "best_time": 0.3081120103597641, "best_triton_pos": 1, "best_triton_time": 0.6589759886264801, "best_triton_kernel": "triton_mm_16", "best_triton_kernel_desc": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0"} AUTOTUNE mm(3840x1152, 1152x49136) strides: [1, 3840], [49152, 1] dtypes: torch.bfloat16, torch.bfloat16 mm 0.3081 ms 100.0% triton_mm_16 0.6590 ms 46.8% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_17 0.6830 ms 45.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_13 0.7015 ms 43.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_9 0.8487 ms 36.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_11 0.8695 ms 35.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, 
num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_10 0.8797 ms 35.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_18 0.9089 ms 33.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=5, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_14 0.9718 ms 31.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_15 1.0169 ms 30.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 SingleProcess AUTOTUNE benchmarking takes 2.8574 seconds and 0.1032 seconds precompiling for 20 choices Removed 3483 outliers from 28645 samples 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:20<00:00, 20.00s/it] (M, N, K) pt2_matmul_maxautotune-latency pt2_matmul_maxautotune-speedup pt2_matmul_maxautotune-tflops ------------------- -------------------------------- -------------------------------- ------------------------------- (3840, 49136, 1152) 0.359392 (±8.27%) 1209.61 average 1209.61 ``` Based on my reading about B200 power usage, I believe this is due to the new for power aware benchmarking as a kernel may perform better in short bursts. This adds environment variables to expand autotuning iterations so we can get more consistent results between the estimation and the actual runtime. 
I did not update the default yet, even for B200 because I'm not sure how this is used in practice. This is the new output: ``` Autotune Choices Stats: {"num_choices": 20, "num_triton_choices": 19, "best_kernel": "mm", "best_time": 0.3848319947719574, "best_triton_pos": 1, "best_triton_time": 0.6287680268287659, "best_triton_kernel": "triton_mm_16", "best_triton_kernel_desc": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0"} AUTOTUNE mm(3840x1152, 1152x49136) strides: [1, 3840], [49152, 1] dtypes: torch.bfloat16, torch.bfloat16 mm 0.3848 ms 100.0% triton_mm_16 0.6288 ms 61.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_13 0.6299 ms 61.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_17 0.6728 ms 57.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_9 0.7189 ms 53.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_18 0.8566 ms 44.9% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=5, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_11 0.8693 ms 44.3% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, 
num_stages=3, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_14 0.9298 ms 41.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_10 0.9524 ms 40.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=4, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 triton_mm_15 1.0216 ms 37.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=2, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0 SingleProcess AUTOTUNE benchmarking takes 3.9245 seconds and 0.0965 seconds precompiling for 20 choices Removed 3537 outliers from 29530 samples 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:23<00:00, 23.70s/it] (M, N, K) pt2_matmul_maxautotune-latency pt2_matmul_maxautotune-speedup pt2_matmul_maxautotune-tflops ------------------- -------------------------------- -------------------------------- ------------------------------- (3840, 49136, 1152) 0.359328 (±9.71%) 1209.82 average 1209.82 ``` Test Plan: `TORCH_AUTOTUNE_REP=1000 CUDA_VISIBLE_DEVICES=2 ENABLE_MMA_V5_ATT_PIPELINE=1 TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 buck2 run mode/opt //pytorch/tritonbench:run -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -- --op gemm --iter $NUM_ITERS --input-loader /home/njriasan/parsed_shapes.json --only pt2_matmul_maxautotune` Rollback Plan: Reviewed By: NikhilAPatel Differential Revision: D79737929 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159982 Approved by: 
https://github.com/NikhilAPatel	2025-08-11 05:27:51 +00:00
Jiaxi WANG	334b38ccc4	Fix typo in README.md (#160160 ) The "Get the PyTorch Source" section is now located before the "Install Dependencies/Common" section, so "... using the “Get the PyTorch Source“ section below" should be "... using the “Get the PyTorch Source“ section above". Pull Request resolved: https://github.com/pytorch/pytorch/pull/160160 Approved by: https://github.com/BoyuanFeng	2025-08-11 05:09:59 +00:00
FFFrog	dc0d18e023	[CUDA] Remove the unnecessary CUDA_GUARD (#160249 ) `CUDA_GUARD` is unnecessary in `initDeviceStreamState`, because the `initSingleStream` has already done it. `29712314dd/c10/cuda/CUDAStream.cpp (L202-L203)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/160249 Approved by: https://github.com/Skylion007	2025-08-11 05:08:05 +00:00
cyy	8ae4d2652f	Tidy torch/csrc/jit/passes/onnx code (#160262 ) Apply clang-tidy fixes to torch/csrc/jit/passes/onnx Pull Request resolved: https://github.com/pytorch/pytorch/pull/160262 Approved by: https://github.com/justinchuby	2025-08-11 04:50:38 +00:00
Edward Z. Yang	8088cfa592	Add type assert for tensor_meta, based on real bug in autoparallel. (#157927 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157927 Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/wconstab	2025-08-11 04:22:02 +00:00
Nikita Shulga	d8cb3db533	Add unsigned support to `IValue` (#160102 ) - Moved repeated logic of saving int64/uint64 into a polymorphic container into `THPUtils_unpackInteger` - Added `TestPythonDispatch.test_dispatch_uint64` regression test Fixes https://github.com/pytorch/pytorch/issues/159168 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160102 Approved by: https://github.com/ezyang	2025-08-11 03:57:18 +00:00
Han, Xu	e7152ff8a6	[inductor] fix some windows inductor UTs (#160292 ) This PR is the UT part of https://github.com/pytorch/pytorch/pull/160161. Per @malfet's comment: https://github.com/pytorch/pytorch/pull/160161#pullrequestreview-3103812178 This PR does not land the turn-on change; it only lands the UT part. Changes: 1. Fixed `test_invalid_artifact_flag_error_msg`. 2. Skipped `test_distributed_rank_logging` and `test_disable_recursive_false`. 3. Skipped the whole UT `test_cpu_select_algorithm.py`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160292 Approved by: https://github.com/malfet	2025-08-11 02:55:37 +00:00
Nikita Shulga	842cc77ab9	[MPS] Extend addmm to integral types (#160270 ) By adding an `addmm` kernel, which is a logical continuation of the `mm` one. The only tricky part is how the alpha and beta constants are handled: they are passed as `optmath_t`, i.e. they could be int64, int32 or float. Unified all MM flavors instantiations thru `INSTANTIATE_MM_OPS` and tested that `addmm` metal kernel works as expected for floating types as well by testing it via ``` PYTORCH_MPS_PREFER_METAL=1 python test/test_mps.py -v -k test_output_match_addmm_mps_ ``` Fixes https://github.com/pytorch/pytorch/issues/154901 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160270 Approved by: https://github.com/Skylion007, https://github.com/dcci ghstack dependencies: #160228, #160234	2025-08-11 00:54:17 +00:00
PyTorch MergeBot	b602ea9cab	Revert "[inductor] turn on windows inductor UTs (#160161 )" This reverts commit 4416433c7c625127b7f975c92f8ec98ea4c67fd3. Reverted https://github.com/pytorch/pytorch/pull/160161 on behalf of https://github.com/xuhancn due to auto merged with two related issue ([comment](https://github.com/pytorch/pytorch/pull/160161#issuecomment-3172982125))	2025-08-11 00:04:25 +00:00
Xu Han	4416433c7c	[inductor] turn on windows inductor UTs (#160161 ) With this PR, we can turn on the inductor UTs on Windows CPU. changes: 1. Turn on inductor UTs on Windows CPU. 2. Add a shard to balance added UTs, otherwise it should run timeout. 3. Fixed `test_invalid_artifact_flag_error_msg`. 4. Skiped `test_distributed_rank_logging` and `test_disable_recursive_false`. 5. Skiped whole UT `test_cpu_select_algorithm.py`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160161 Approved by: https://github.com/jansel	2025-08-10 23:18:35 +00:00
Andy (An) Wang	05c19d1ace	[Inductor] Add back the revert part (#160054 ) Add back the reverted code (https://github.com/pytorch/pytorch/pull/159809) as we've figured out the actual root cause of the internal test failures. More details in the internal diff. Rollback Plan: Differential Revision: D79776691 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160054 Approved by: https://github.com/blaine-rister	2025-08-10 19:20:30 +00:00
Xu Han	d6786741a7	[inductor] slow test some Windows UTs. (#160267 ) Since Windows inductor UTs were enabled in https://github.com/pytorch/pytorch/pull/160161/, the main branch CI has been hitting timeouts. Let's move some UTs to the slow tests. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160267 Approved by: https://github.com/ezyang	2025-08-10 18:35:42 +00:00
PyTorch MergeBot	7ae0629d64	Revert "[inductor] turn on windows inductor UTs (#160161 )" This reverts commit f0980fc0bbd656d6c02d23ad97e945353b314f35. Reverted https://github.com/pytorch/pytorch/pull/160161 on behalf of https://github.com/clee2000 due to broke some inductor tests on windows inductor\test_codecache.py::TestStandaloneCompile::test_different_process [GH job link](https://github.com/pytorch/pytorch/actions/runs/16853706010/job/47748778757) [HUD commit link](`f0980fc0bb`). note to self: bad TD ([comment](https://github.com/pytorch/pytorch/pull/160161#issuecomment-3172784292))	2025-08-10 17:33:19 +00:00
Xu Han	0e3e377bd5	[inductor] fix CompiledArtifact.load path on Windows. (#160268 ) fix CompiledArtifact.load path on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160268 Approved by: https://github.com/ezyang	2025-08-10 14:22:52 +00:00
Isalia20	a84b60c0c4	[MPS] Sparse coalesce more dtypes to match cpu (#160254 ) More dtypes to match the cpu Pull Request resolved: https://github.com/pytorch/pytorch/pull/160254 Approved by: https://github.com/malfet	2025-08-10 12:25:18 +00:00
atalman	3ac86e728d	Add Alban and Piotr to list of maintainers (#160187 ) Add Alban and Piotr to list of maintainers Pull Request resolved: https://github.com/pytorch/pytorch/pull/160187 Approved by: https://github.com/albanD	2025-08-10 12:00:16 +00:00
Edward Yang	c9671dc865	Delete Python reference implementation from torchdim, as it is untested (#160115 ) Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/160115 Approved by: https://github.com/albanD	2025-08-10 11:21:33 +00:00
ghostspiders	af10f1f86c	Fix requires_cuda to requires_cuda_and_triton (#160222 ) Fixes #159399 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160222 Approved by: https://github.com/janeyx99	2025-08-10 07:05:52 +00:00
Edward Yang	5dddcd5b07	Correctly copy self.module_stack in ModuleStackTracer (#159956 ) There is a bigger cluster of issues which this does not completely fix, but I think this is a matter of good hygiene, especially because we immediately mutate the dict after assigning it. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159956 Approved by: https://github.com/pianpwk	2025-08-10 03:33:59 +00:00
PyTorch MergeBot	d3d359dbaf	Revert "Fix get_free_symbol_uses for several nodes. (#160134 )" This reverts commit db78943a1ca13a32a3d6045eb15e2b719ee13a2f. Reverted https://github.com/pytorch/pytorch/pull/160134 on behalf of https://github.com/malfet due to No, those are not pre-existing, see `df55ec7d4b/1` ([comment](https://github.com/pytorch/pytorch/pull/160134#issuecomment-3172314322))	2025-08-10 02:37:40 +00:00
Nikita Shulga	df55ec7d4b	[OpInfo][BE] Better inputs for addmm (#160234 ) Right now alpha and beta are both less than zero, which makes them useless for all addmm samples for integral types Pull Request resolved: https://github.com/pytorch/pytorch/pull/160234 Approved by: https://github.com/Skylion007 ghstack dependencies: #160228	2025-08-10 01:26:48 +00:00
Xu Han	f0980fc0bb	[inductor] turn on windows inductor UTs (#160161 ) With this PR, we can turn on the inductor UTs on Windows CPU. changes: 1. Turn on inductor UTs on Windows CPU. 2. Add a shard to balance added UTs, otherwise it should run timeout. 3. Fixed `test_invalid_artifact_flag_error_msg`. 4. Skiped `test_distributed_rank_logging` and `test_disable_recursive_false`. 5. Skiped whole UT `test_cpu_select_algorithm.py`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160161 Approved by: https://github.com/jansel	2025-08-09 21:06:00 +00:00
Laith Sakka	db78943a1c	Fix get_free_symbol_uses for several nodes. (#160134 ) get_free_symbol_uses is used to know what unbacked symbols are used by a given node. Not having get_free_symbol_uses defined correctly leads to: 1. Elimination of some nodes because no users are detected (see the added unit test). 2. Incorrect topological sort. Fix get_free_symbol_uses for NopKernel, ConcatKernel, InputsKernel, and external kernels. ComputedBuffer with NonOwningLayout is an interesting case: when the layout is NonOwningLayout we need to access the actual view op's base layout and detect the symbols in it, because those symbols are used when we codegen the ComputedBuffer. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160134 Approved by: https://github.com/bobrenjc93	2025-08-09 18:15:46 +00:00
thenumberouscode	29712314dd	[fx][pass] Support converting a float32 tensor to a scalar in FX trace. (#158216 ) Fixes https://github.com/pytorch/pytorch/issues/158083 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158216 Approved by: https://github.com/laithsakka	2025-08-09 15:13:13 +00:00
cyy	01f66d08d9	Remove outdated CMAKE_CUDA_COMPILER_VERSION branch (#160075 ) Remove the condition `if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0)` in cmake/Codegen.cmake, because we now default to CUDA >=12.0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160075 Approved by: https://github.com/Skylion007	2025-08-09 14:23:17 +00:00
PyTorch MergeBot	2f4c222617	Revert "Make user defined Triton kernels serializable for fx_graph_runnable (#160002 )" This reverts commit 4183d4ff3dcc1d87400326a9a7998c3f9e966f60. Reverted https://github.com/pytorch/pytorch/pull/160002 on behalf of https://github.com/albanD due to Breaks inductor tests in trunk ([comment](https://github.com/pytorch/pytorch/pull/160002#issuecomment-3170855866))	2025-08-09 14:01:58 +00:00
xinan.lin	8047421fbb	[Linter] Expanding the scope of detecting device-bias code. (#159949 ) Currently, the device-bias linter only targets functions decorated with @requires_gpu. This PR adds support for two new detection scenarios: 1. Detect device-bias code in functions decorated with @requires_triton. 2. Detect device-bias code for entire test suites that are defined as shared across GPUs. For example: ``` if __name__ == "__main__": if HAS_GPU: run_tests() ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159949 Approved by: https://github.com/EikanWang, https://github.com/jansel	2025-08-09 09:41:16 +00:00
PaulZhang12	4183d4ff3d	Make user defined Triton kernels serializable for fx_graph_runnable (#160002 ) Resolves issue https://github.com/pytorch/pytorch/issues/153475 where `fx_graph_runnable` didn't work with user defined triton kernels. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160002 Approved by: https://github.com/eellison	2025-08-09 09:26:05 +00:00
Sherlock Huang	fb887c3bb5	Add Sherlock and Zhengxu as codeowner for schema.py (#160233 ) Test Plan: CI Rollback Plan: Differential Revision: D79933462 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160233 Approved by: https://github.com/zhxchen17	2025-08-09 04:44:12 +00:00
PyTorch UpdateBot	bcf23ecc47	[vllm hash update] update the pinned vllm hash (#160235 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160235 Approved by: https://github.com/pytorchbot	2025-08-09 04:17:32 +00:00
Animesh Jain	303c614f3d	[dynamo] Be consistent with UserMethodVariable source (#160155 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160155 Approved by: https://github.com/StrongerXi	2025-08-09 04:16:14 +00:00
PyTorch UpdateBot	0d88593dd8	[audio hash update] update the pinned audio hash (#160153 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160153 Approved by: https://github.com/pytorchbot	2025-08-09 04:01:31 +00:00
Rob Timpe	5ed4f91779	[dynamo] support itertools.permutations (#159694 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159694 Approved by: https://github.com/guilhermeleobas ghstack dependencies: #159693	2025-08-09 03:01:58 +00:00
Rob Timpe	e07c52b2c0	[dynamo] Improve support for itertools.product (#159693 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159693 Approved by: https://github.com/guilhermeleobas, https://github.com/mlazos	2025-08-09 03:01:58 +00:00
cyy	10e3514c96	Remove tensorexpr tests (#158928 ) The tests are not maintained. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158928 Approved by: https://github.com/albanD, https://github.com/malfet	2025-08-09 02:21:22 +00:00
Shangdi Yu	11a3565f18	[Torch Native] Add test for packaging weight (#158750 ) Add test that require weights to be packaged for torch native For now, we need `package_weights_in_so=True` for compile standalone. The constants are in a `.o` file and will be added as a source to the CMakeLists.txt of the model. After we added weight deduping, we should be able to let this config be False. ``` python test/inductor/test_aot_inductor_package.py -k test_compile_with_exporter_weights ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158750 Approved by: https://github.com/desertfire	2025-08-09 01:04:21 +00:00
Ankita George	e96c7c4bb0	[dcp][hf] Improve HF consolidation algorithm (#158648 ) Before we had a bunch of if-else cases based on sharding strategy to decide how to save the tensor with different logic for different strategies. This can be consolidated into one function that uses an algorithm to handle all cases by finding the max possible contiguous bytes that can be written Differential Revision: [D78489438](https://our.internmc.facebook.com/intern/diff/D78489438/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158648 Approved by: https://github.com/saumishr	2025-08-09 00:11:22 +00:00
Jane Xu	9b803cdbe2	[BE] Remove more optim entries from docs coverage ignore list (#160194 ) This PR does privatize ReduceLRSchedulerOnPlateau.is_better -> ReduceLRSchedulerOnPlateau._is_better because that API was never meant to be public. A GitHub search for it also reveals that the API is not commonly used much. https://github.com/search?q=.is_better%28&type=code&p=2 If you do use this API and you rely on it for some reason, please file an issue. In the meantime, you can access it through `_is_better(...)`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160194 Approved by: https://github.com/albanD, https://github.com/Skylion007	2025-08-09 00:09:45 +00:00
Nikita Shulga	8c41cb800a	[MPS][BE] Combine all pre-MacOS14 xfail lists (#160228 ) It does not matter whether it started to fail after 13.1 or 13.3, fact that it still fails on latest MacOS Pull Request resolved: https://github.com/pytorch/pytorch/pull/160228 Approved by: https://github.com/dcci	2025-08-09 00:00:46 +00:00
Yanan Cao (PyTorch)	731ee31f7b	[TorchScript, PT2] Add torch._check compatibility support (#159988 ) Summary: Add support for torch._check() in TorchScript jit.script frontend. * It will be special cased to behave like torch._assert, turned into an if + raise exception. Test Plan: Unit tests Rollback Plan: Differential Revision: D79744604 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159988 Approved by: https://github.com/davidberard98	2025-08-08 23:14:13 +00:00
Ti-Tai Wang	566c6d52ef	[ONNX] Fix the export of the model having none as output (#160200 ) Fixes #160150 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160200 Approved by: https://github.com/justinchuby Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>	2025-08-08 23:09:34 +00:00
Aidyn-A	4e2ddb5db6	[Inductor][CUTLASS] Copy cutlass_mock_imports directory (#159724 ) Pip wheels of PyTorch nightly and 2.8 release candidates do not contain `cutlass_mock_imports`. This is the path to the source code: ``` root@8120d02fd9c5:$ tree ./torch/_inductor/codegen/cuda/cutlass_lib_extensions/ ./torch/_inductor/codegen/cuda/cutlass_lib_extensions/ ├── cutlass_mock_imports │ ├── cuda │ │ ├── __init__.py │ │ ├── cuda.py │ │ └── cudart.py │ ├── pydot │ │ └── __init__.py │ └── scipy │ ├── __init__.py │ └── special.py ├── evt_extensions.py └── gemm_operation_extensions.py 5 directories, 8 files ``` And this what installed wheel has: ``` root@8120d02fd9c5:$ tree /usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/ /usr/local/lib/python3.12/dist-packages/torch/_inductor/codegen/cuda/cutlass_lib_extensions/ ├── __init__.py ├── evt_extensions.py └── gemm_operation_extensions.py 1 directory, 3 files ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159724 Approved by: https://github.com/henrylhtsang	2025-08-08 22:56:05 +00:00
Kanya-Mo	9e07673deb	Fix test_fsdp_ep.py due to _MeshEnv API change (#158695 ) #132339 changed parent/child mesh related APIs from _MeshEnv. UT TestFSDPWithEP.test_e2e still uses old APIs and will fail: ``` File "/home/kanya/pytorch/test/distributed/checkpoint/e2e/test_fsdp_ep.py", line 77, in test_e2e mesh_fsdp_ep = _mesh_resources.create_child_mesh(mesh_fsdp_tp, ("dp",)) AttributeError: '_MeshEnv' object has no attribute 'create_child_mesh' To execute this test, run the following from the base repo dir: python test/distributed/checkpoint/e2e/test_fsdp_ep.py TestFSDPWithEP.test_e2e This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0. Did you mean: 'create_sub_mesh'? ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/158695 Approved by: https://github.com/Skylion007, https://github.com/nWEIdia	2025-08-08 22:36:47 +00:00
Eddie Yan	1128f4c2a8	[cuDNN][SDPA] cuDNN SDPA refactor/cleanup, nested tensor backward, test priority bump for `sm90`, `sm100` (#149282 ) cleanup tuple/tensor boilerplate in cuDNN SDPA, preparation for nested/ragged tensor backward Pull Request resolved: https://github.com/pytorch/pytorch/pull/149282 Approved by: https://github.com/drisspg Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>	2025-08-08 22:22:48 +00:00
Robert Hardwick	334ecbd4ff	Add torchao to install_inductor_benchmark_deps cleanup stage (#160191 ) It looks like `torchao` was missed from the cleanup during torchbench setup. Fixes #160188 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160191 Approved by: https://github.com/huydhn	2025-08-08 22:18:41 +00:00
PyTorch MergeBot	206c1eef65	Revert "[pytorch][dynamo_compile] Log stack_trace to dynamo_compile (#159655 )" This reverts commit 2ee22e435131369a7e4f8cc4732579acc29a941b. Reverted https://github.com/pytorch/pytorch/pull/159655 on behalf of https://github.com/clee2000 due to broke dynamo/test_utils.py::TestDynamoTimed::test_dynamo_timed [GH job link](https://github.com/pytorch/pytorch/actions/runs/16839294394/job/47711078667) [HUD commit link](`2ee22e4351`). Probably a landrace since it did run on the PR ([comment](https://github.com/pytorch/pytorch/pull/159655#issuecomment-3169400889))	2025-08-08 22:04:22 +00:00
Nikita Shulga	28ccc9e724	[MPS] Extend `index_put` to complex types (#160159 ) And delete confusing supported types check. Move all pseudo atomic (but eventually consistent) ops to `c10/metal/atomic.h` header Fixes https://github.com/pytorch/pytorch/issues/160034 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160159 Approved by: https://github.com/manuelcandales, https://github.com/dcci, https://github.com/Skylion007	2025-08-08 21:54:30 +00:00
Syed Tousif Ahmed	2247aa6d1d	Documents tuning NVLink performance on H100/H200 (#159792 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159792 Approved by: https://github.com/ngimel	2025-08-08 20:28:24 +00:00
Sheng Fu	1febab2a89	Do not treat ReinterpretView as a realized node (#159920 ) Summary: Do not treat ReinterpretView as a realized node Function [gather_origins](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/utils.py#L888](https://l.facebook.com/l.php?u=https%3A%2F%2Fgithub.com%2Fpytorch%2Fpytorch%2Fblob%2Fmain%2Ftorch%2F_inductor%2Futils.py%23L888&h=AT2PYr83thTo6VUjPs26Y8QAN6Sid16rvDMHtxO-Bp9FDwHr4J5PObtH3IhNTL-LPSRVC9WVJAcmwUToVWJIrDWb84i0j61QE55ySYAkGbuigqcNc7xczlirHhbiC9vMqiz91VwWdl4Pe2yKN7VIjjCiFUqw) calls is_realized_node to decide if a FX node should be included in the origins of a IR node. ReinterpretView is considered a realized node, so it is not included in the origins. It leads to an incomplete graph. For example: ``` @torchdynamo.optimize("inductor") def fn(input_data, weight): normalized_input = input_data * weight.unsqueeze(0) return normalized_input input_data = torch.randn(4272, 192, requires_grad=True).to(device) weight = torch.randn(192, requires_grad=True).to(device) fn(input_data, weight) ``` The original FX graph returned in [get_kernel_metadata](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/utils.py#L723](https://l.facebook.com/l.php?u=https%3A%2F%2Fgithub.com%2Fpytorch%2Fpytorch%2Fblob%2Fmain%2Ftorch%2F_inductor%2Futils.py%23L723&h=AT2PYr83thTo6VUjPs26Y8QAN6Sid16rvDMHtxO-Bp9FDwHr4J5PObtH3IhNTL-LPSRVC9WVJAcmwUToVWJIrDWb84i0j61QE55ySYAkGbuigqcNc7xczlirHhbiC9vMqiz91VwWdl4Pe2yKN7VIjjCiFUqw) is the following: %primals_2 : Tensor "f32[4272, 192][192, 1]cuda:0" = PlaceHolder[target=primals_2] %primals_1 : Tensor "f32[192][1]cuda:0" = PlaceHolder[target=primals_1] %mul : Tensor "f32[4272, 192][192, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%primals_2, %unsqueeze), kwargs = {}) return %mul The unsqueeze op is missing. 
With this DIFF, the new FX graph is the following: %primals_2 : Tensor "f32[4272, 192][192, 1]cuda:0" = PlaceHolder[target=primals_2] %primals_1 : Tensor "f32[192][1]cuda:0" = PlaceHolder[target=primals_1] %unsqueeze : Tensor "f32[1, 192][192, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%primals_1, 0), kwargs = {}) %mul : Tensor "f32[4272, 192][192, 1]cuda:0"[num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%primals_2, %unsqueeze), kwargs = {}) return %mul Pull Request resolved: https://github.com/pytorch/pytorch/pull/159920 Approved by: https://github.com/mlazos	2025-08-08 20:13:35 +00:00
Jovian Anthony Jaison	2ee22e4351	[pytorch][dynamo_compile] Log stack_trace to dynamo_compile (#159655 ) This change logs the stack trace of the code being compiled by Dynamo, improving visibility into what is compiled. It adds a stack_trace field to compilation metrics. This helps with debugging and analysis of Dynamo compilation behavior. Ref [D79287964](https://www.internalfb.com/diff/D79287964) Test Plan: $ python -m test_utils Internal: ref [D79372519](https://www.internalfb.com/diff/D79372519) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159655 Approved by: https://github.com/c00w	2025-08-08 19:53:47 +00:00
James Dong	c86040a8e6	[torch.export] Fix test_export_api_with_dynamic_shapes (#160164 ) Summary: Update test KJT's dynamic_shapes to match the newly exported fields. Test Plan: ``` buck test 'fbcode//mode/opt' fbcode//caffe2/test:test_export -- --exact 'caffe2/test:test_export - test_export_api_with_dynamic_shapes_cpp_runtime_nonstrict (caffe2.test.export.test_nativert.NativeRTTestExport)' File changed: fbcode//caffe2/test/export/test_export.py Buck UI: https://www.internalfb.com/buck2/8247eaf8-eaf9-4876-95cb-7b4263d15ef2 Test UI: https://www.internalfb.com/intern/testinfra/testrun/2533275093345198 Network: Up: 100KiB Down: 0B (reSessionID-72a2579f-df3f-4262-9aa3-de0db9687 Executing actions. Remaining 0/2 Command: test. Time elapsed: 2:20.5s Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. Build failure 0 ``` Rollback Plan: Reviewed By: malaybag Differential Revision: D79862872 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160164 Approved by: https://github.com/angelayi, https://github.com/ezyang	2025-08-08 19:45:30 +00:00
Anshul Sinha	72009ec6be	[replicate][be] improved readability and cleaned up remaining DDP code (#160133 ) Summary As much of ReplicateState functionality is copied from FSDPState, I fixed any remaining comments that incorrectly used FSDP instead of Replicate. In addition, instead of labeling modules FSDPModule or FSDPLinear, I have changed it so that it now uses Replicate____. Finally, I have removed some leftover code from the DDP implementation. I have included test cases to verify correctness. Test Case 1. pytest test/distributed/_composable/test_replicate_with_fsdp.py Pull Request resolved: https://github.com/pytorch/pytorch/pull/160133 Approved by: https://github.com/mori360 ghstack dependencies: #160128	2025-08-08 19:42:23 +00:00
Andres Lugo	5f5f508aa8	[ROCm] Ck backend UX refactor (#152951 ) Refactors how the enablement/disablement of CK Gemms and SDPA works. - Adds USE_ROCM_CK_GEMM compile flag for enabling CK gemms. - USE_ROCM_CK_GEMM is set to True by default on Linux - Updates USE_CK_FLASH_ATTENTION to USE_ROCM_CK_SDPA. - USE_ROCM_CK_SDPA is set to False by default - (USE_CK_FLASH_ATTENTION still works for now, but will be deprecated in a future release) - Prevents these CK libraries from being used unless pytorch has been built specifically with the functionality AND is running on a system architecture that supports it. - the getters for these library backends will also do some validity checking in case the user used an environment variable to change the backend. If invalid, (i.e. one of the cases mentioned above is false) the backend will be set as the current non-CK default Pull Request resolved: https://github.com/pytorch/pytorch/pull/152951 Approved by: https://github.com/eqy, https://github.com/jeffdaily, https://github.com/m-gallus Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: Jithun Nair <jithun.nair@amd.com> Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>	2025-08-08 18:40:17 +00:00
Yu, Guangye	da1f608ca3	Add UT for torch.accelerator memory-related API (#155200 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/155200 Approved by: https://github.com/albanD ghstack dependencies: #138222, #152932	2025-08-08 17:41:22 +00:00
Yu, Guangye	84f7e88aef	Add unified memory APIs for torch.accelerator (#152932 ) # Motivation The following API will be put under torch.accelerator - empty_cache - max_memory_allocated - max_memory_reserved - memory_allocated - memory_reserved - memory_stats - reset_accumulated_memory_stats - reset_peak_memory_stats Pull Request resolved: https://github.com/pytorch/pytorch/pull/152932 Approved by: https://github.com/albanD ghstack dependencies: #138222	2025-08-08 17:41:22 +00:00
Yu, Guangye	d7114f05b1	Add DeviceAllocator as the base device allocator (#138222 ) # Motivation In line with [RFC] [A device-agnostic Python device memory related API design for stream-based accelerators](https://github.com/pytorch/pytorch/issues/134978), some memory-related APIs are widely used in popular repositories, such as HuggingFace [so many if-else conditional code](https://github.com/search?q=repo%3Ahuggingface%2Faccelerate%20torch.cuda.empty_cache&type=code). We would like to introduce a generic API set under torch.accelerator namespace to generalize these user cases. <div align="center"> <table> <tr> <td> Device-specific memory APIs torch.xxx.foo</td> <td> Device-agnostic memory APIs torch.accelerator.foo</td> </tr> <tr> <td> ```python torch.xxx.empty_cache ``` </td> <td> ```python torch.accelerator.empty_cache ``` </td> </tr> <tr> <td> ```python torch.xxx.reset_peak_memory_stats ``` </td> <td> ```python torch.accelerator.reset_peak_memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.reset_accumulated_memory_stats ``` </td> <td> ```python torch.accelerator.reset_accumulated_memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_stats ``` </td> <td> ```python torch.accelerator.memory_stats ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_allocated ``` </td> <td> ```python torch.accelerator.memory_allocated ``` </td> </tr> <tr> <td> ```python torch.xxx.max_memory_allocated ``` </td> <td> ```python torch.accelerator.max_memory_allocated ``` </td> </tr> <tr> <td> ```python torch.xxx.memory_reserved ``` </td> <td> ```python torch.accelerator.memory_reserved ``` </td> </tr> <tr> <td> ```python torch.xxx.max_memory_reserved ``` </td> <td> ```python torch.accelerator.max_memory_reserved ``` </td> </tr> </table> </div> # Solution This design follows a similar pattern to `HostAllocator`. We're introducing a base class `DeviceAllocator`, from which `CUDAAllocator` and `XPUAllocator` will inherit. 
This allows us to provide a unified call path like: `torch.accelerator.empty_cache()` -> `GetDeviceAllocator(allocator)->empty_cache()`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/138222 Approved by: https://github.com/albanD, https://github.com/Camyll	2025-08-08 17:41:10 +00:00
albanD	c5ec5458a5	Don't build nccl when distributed is disabled (#160086 ) Because distributed doesn't build on recent compilers, I have to disable distributed, but this makes it still fail as nccl is still built Pull Request resolved: https://github.com/pytorch/pytorch/pull/160086 Approved by: https://github.com/Skylion007, https://github.com/janeyx99	2025-08-08 17:19:16 +00:00
Kurt Mohler	86eb65f7f0	[MPS] Move max_pool2d to Metal for `stride != 1` (#157876 ) This PR updates `max_pool2d` to use a Metal kernel instead of the old MPS graph impl. However, when the `stride` argument is 1 in all dimensions, the old implementation gives significantly better performance, so we fall back to it in that case. Below is a performance comparison of `max_pool2d` before and after this PR, obtained from this script: `2f02f2bf7a/max_pool_mps/perf.py` <details><summary>Click to expand</summary> case \| before PR \| after PR \| speedup \| \| case info -- \| -- \| -- \| -- \| -- \| -- 0 \| 0.014264 \| 0.004473 \| 3.188911245 \| \| (3, 2, 2), {'kernel_size': 2, 'return_indices': True} 1 \| 0.010752 \| 0.00421 \| 2.55391924 \| \| (3, 2, 2), {'kernel_size': 2, 'return_indices': False} 2 \| 0.020777 \| 0.006123 \| 3.393271272 \| \| (3, 10, 10), {'kernel_size': 5, 'return_indices': True} 3 \| 0.011065 \| 0.005759 \| 1.921340511 \| \| (3, 10, 10), {'kernel_size': 5, 'return_indices': False} 4 \| 0.01452 \| 0.007829 \| 1.854642994 \| \| (3, 100, 100), {'kernel_size': 5, 'return_indices': True} 5 \| 0.009258 \| 0.007075 \| 1.308551237 \| \| (3, 100, 100), {'kernel_size': 5, 'return_indices': False} 6 \| 0.188137 \| 0.168688 \| 1.115295694 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True} 7 \| 0.161362 \| 0.154746 \| 1.042753932 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False} 8 \| 0.182883 \| 0.16945 \| 1.079274122 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True} 9 \| 0.156875 \| 0.163346 \| 0.9603847049 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False} 10 \| 0.193433 \| 0.167396 \| 1.155541351 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True} 11 \| 0.158967 \| 
0.151246 \| 1.051049284 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False} 12 \| 0.931071 \| 0.932883 \| 0.9980576342 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True} 13 \| 0.324496 \| 0.3252 \| 0.9978351784 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False} 14 \| 0.944071 \| 0.936246 \| 1.008357846 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True} 15 \| 0.322171 \| 0.314854 \| 1.023239343 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False} 16 \| 0.894158 \| 0.886408 \| 1.008743152 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True} 17 \| 0.309338 \| 0.304146 \| 1.017070749 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False} 18 \| 0.606 \| 0.260546 \| 2.325884873 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True} 19 \| 0.30445 \| 0.231054 \| 1.317657344 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False} 20 \| 0.474708 \| 0.261925 \| 1.812381407 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True} 21 \| 0.23175 \| 0.231883 \| 0.9994264349 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False} 22 \| 0.434475 \| 0.266246 \| 1.631855502 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True} 23 \| 0.236942 \| 0.231792 \| 1.022218196 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False} 24 \| 0.202396 \| 0.174888 \| 1.157289237 \| \| (3, 1000, 
1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True} 25 \| 0.160679 \| 0.158246 \| 1.015374796 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False} 26 \| 0.200354 \| 0.184133 \| 1.088093932 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True} 27 \| 0.160779 \| 0.160679 \| 1.000622359 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False} 28 \| 0.199175 \| 0.178625 \| 1.115045486 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True} 29 \| 0.159458 \| 0.160883 \| 0.9911426316 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False} 30 \| 0.199021 \| 0.165329 \| 1.203787599 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True} 31 \| 0.156337 \| 0.158213 \| 0.9881425673 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False} 32 \| 0.180146 \| 0.174483 \| 1.032455884 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True} 33 \| 0.156988 \| 0.158167 \| 0.9925458534 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False} 34 \| 0.182133 \| 0.176521 \| 1.031792251 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True} 35 \| 0.169042 \| 0.156483 \| 1.080257919 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False} 36 \| 1.767821 \| 1.766254 \| 1.000887188 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True} 37 \| 1.059346 \| 1.058775 \| 1.000539302 \| \| (3, 1000, 1000), {'kernel_size': 
5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False} 38 \| 1.85755 \| 1.859429 \| 0.9989894747 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True} 39 \| 1.100417 \| 1.097683 \| 1.002490701 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False} 40 \| 1.843167 \| 1.847558 \| 0.9976233493 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True} 41 \| 1.090142 \| 1.093163 \| 0.9972364597 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False} 42 \| 0.480867 \| 0.251733 \| 1.910226311 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True} 43 \| 0.319246 \| 0.236479 \| 1.349997251 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False} 44 \| 0.49315 \| 0.256408 \| 1.923301925 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True} 45 \| 0.316746 \| 0.227854 \| 1.390127011 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False} 46 \| 0.4912 \| 0.257762 \| 1.905633879 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True} 47 \| 0.324771 \| 0.229371 \| 1.41592006 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False} 48 \| 0.152904 \| 0.095079 \| 1.608178462 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True} 49 \| 0.102963 \| 0.089217 \| 1.154073775 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False} 50 \| 0.155158 \| 0.095429 \| 1.625899884 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 
'return_indices': True} 51 \| 0.104338 \| 0.089979 \| 1.15958168 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False} 52 \| 0.153121 \| 0.096429 \| 1.587914424 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True} 53 \| 0.103642 \| 0.090254 \| 1.148336916 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False} 54 \| 0.191071 \| 0.165125 \| 1.157129447 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True} 55 \| 0.153971 \| 0.149021 \| 1.033216795 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False} 56 \| 0.193192 \| 0.166892 \| 1.157586942 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True} 57 \| 0.156617 \| 0.15215 \| 1.029359185 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False} 58 \| 0.178033 \| 0.167308 \| 1.06410333 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True} 59 \| 0.157425 \| 0.164404 \| 0.9575496947 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False} 60 \| 1.757638 \| 1.750896 \| 1.0038506 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True} 61 \| 1.048471 \| 1.047967 \| 1.000480931 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False} 62 \| 1.790708 \| 1.789767 \| 1.000525767 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True} 63 \| 1.054575 \| 1.054796 \| 0.9997904808 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False} 64 
\| 1.785837 \| 1.784192 \| 1.000921986 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True} 65 \| 1.054713 \| 1.054492 \| 1.00020958 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False} 66 \| 0.478267 \| 0.261017 \| 1.832321266 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True} 67 \| 0.32005 \| 0.226654 \| 1.412064204 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False} 68 \| 0.484008 \| 0.254721 \| 1.900149575 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True} 69 \| 0.321 \| 0.218842 \| 1.466811672 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False} 70 \| 0.482087 \| 0.248771 \| 1.937874591 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True} 71 \| 0.316558 \| 0.230533 \| 1.373156988 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False} 72 \| 0.137842 \| 0.085088 \| 1.619993419 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True} 73 \| 0.100671 \| 0.0769 \| 1.309115735 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False} 74 \| 0.148321 \| 0.086967 \| 1.705485989 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True} 75 \| 0.101392 \| 0.075454 \| 1.343759112 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False} 76 \| 0.150208 \| 0.083742 \| 1.793699697 \| \| (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True} 77 \| 0.099587 \| 0.075825 \| 1.313379492 \| \| (3, 
1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False} 78 \| 0.622546 \| 0.602729 \| 1.03287879 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True} 79 \| 0.531696 \| 0.5067 \| 1.049330965 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False} 80 \| 0.626646 \| 0.617038 \| 1.015571164 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True} 81 \| 0.530354 \| 0.525367 \| 1.009492412 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False} 82 \| 0.633933 \| 0.577775 \| 1.097197006 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True} 83 \| 0.533067 \| 0.526954 \| 1.011600633 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False} 84 \| 3.372867 \| 3.386412 \| 0.9960001914 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True} 85 \| 1.155975 \| 1.156604 \| 0.9994561665 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False} 86 \| 3.401921 \| 3.39755 \| 1.001286515 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True} 87 \| 1.202829 \| 1.192538 \| 1.008629494 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False} 88 \| 3.23675 \| 3.220238 \| 1.005127571 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True} 89 \| 1.077067 \| 1.085613 \| 0.9921279498 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False} 90 \| 1.572925 \| 0.925625 \| 1.699311276 \| \| (3, 2000, 2000), 
{'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True} 91 \| 0.791204 \| 0.793454 \| 0.9971642969 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False} 92 \| 1.572742 \| 0.922729 \| 1.704446268 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True} 93 \| 0.784292 \| 0.788871 \| 0.9941955022 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False} 94 \| 1.526546 \| 0.925708 \| 1.649057802 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True} 95 \| 0.769321 \| 0.787675 \| 0.9766985114 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False} 96 \| 0.736033 \| 0.612808 \| 1.201082558 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True} 97 \| 0.574625 \| 0.530925 \| 1.082309177 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False} 98 \| 0.722021 \| 0.614488 \| 1.174996094 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True} 99 \| 0.563171 \| 0.533721 \| 1.055178642 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False} 100 \| 0.735725 \| 0.613992 \| 1.198264798 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True} 101 \| 0.583487 \| 0.532513 \| 1.095723485 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False} 102 \| 0.656383 \| 0.575313 \| 1.140914598 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True} 103 \| 0.559796 \| 0.509079 \| 1.099625009 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 
'stride': None, 'padding': 0, 'return_indices': False} 104 \| 0.662046 \| 0.572362 \| 1.156691045 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True} 105 \| 0.552633 \| 0.508671 \| 1.086425214 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False} 106 \| 0.634108 \| 0.574629 \| 1.103508525 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True} 107 \| 0.534013 \| 0.510996 \| 1.045043405 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False} 108 \| 7.056642 \| 7.066717 \| 0.9985743026 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True} 109 \| 4.144275 \| 4.142658 \| 1.000390329 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False} 110 \| 7.172683 \| 7.189867 \| 0.9976099697 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True} 111 \| 4.162538 \| 4.158875 \| 1.000880767 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False} 112 \| 7.194233 \| 7.181837 \| 1.001726021 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True} 113 \| 4.294083 \| 4.196062 \| 1.023360236 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False} 114 \| 1.875692 \| 0.891071 \| 2.104986022 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True} 115 \| 1.097479 \| 0.781175 \| 1.404907991 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False} 116 \| 1.8883 \| 0.89015 \| 2.121327866 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 
'padding': 1, 'return_indices': True} 117 \| 1.101329 \| 0.778542 \| 1.414604479 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False} 118 \| 1.872833 \| 0.893654 \| 2.095702587 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True} 119 \| 1.096712 \| 0.784579 \| 1.397835017 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False} 120 \| 0.513029 \| 0.374417 \| 1.370207549 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True} 121 \| 0.349546 \| 0.305763 \| 1.143192603 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False} 122 \| 0.518929 \| 0.377487 \| 1.374693698 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True} 123 \| 0.364662 \| 0.3145 \| 1.159497615 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False} 124 \| 0.521275 \| 0.375242 \| 1.389170189 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True} 125 \| 0.367488 \| 0.308354 \| 1.191773092 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False} 126 \| 0.652342 \| 0.569308 \| 1.145850752 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True} 127 \| 0.555696 \| 0.506892 \| 1.096280865 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False} 128 \| 0.654333 \| 0.570367 \| 1.147213987 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True} 129 \| 0.548925 \| 0.505825 \| 1.085207335 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 
'return_indices': False} 130 \| 0.655908 \| 0.571904 \| 1.146884792 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True} 131 \| 0.560808 \| 0.508238 \| 1.103435792 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False} 132 \| 6.949462 \| 6.949112 \| 1.000050366 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True} 133 \| 4.072913 \| 4.065013 \| 1.001943413 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False} 134 \| 7.200896 \| 7.197792 \| 1.000431243 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True} 135 \| 4.291367 \| 4.218538 \| 1.017264038 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False} 136 \| 7.1823 \| 7.306933 \| 0.9829431856 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True} 137 \| 4.151175 \| 4.149592 \| 1.000381483 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False} 138 \| 1.781279 \| 0.884288 \| 2.014365229 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True} 139 \| 1.050804 \| 0.774362 \| 1.356993241 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False} 140 \| 1.860758 \| 0.884637 \| 2.103414169 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True} 141 \| 1.099908 \| 0.775887 \| 1.417613647 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False} 142 \| 1.857387 \| 0.885738 \| 2.096993693 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True} 
143 \| 1.105279 \| 0.77365 \| 1.428655077 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False} 144 \| 0.489408 \| 0.269583 \| 1.815426047 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True} 145 \| 0.322525 \| 0.236979 \| 1.360985573 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False} 146 \| 0.515475 \| 0.265813 \| 1.93923924 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True} 147 \| 0.315525 \| 0.228146 \| 1.382995976 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False} 148 \| 0.503438 \| 0.277204 \| 1.816128194 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True} 149 \| 0.335421 \| 0.228275 \| 1.469372467 \| \| (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False} 150 \| 5.72495 \| 4.909554 \| 1.166083518 \| \| (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': True} 151 \| 4.45215 \| 4.251333 \| 1.047236243 \| \| (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': False} 152 \| 29.953021 \| 29.879879 \| 1.002447868 \| \| (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True} 153 \| 9.854683 \| 9.839517 \| 1.001541336 \| \| (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False} 154 \| 6.178033 \| 5.697375 \| 1.084364817 \| \| (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': True} 155 \| 6.280317 \| 5.712525 \| 1.099394226 \| \| (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': False} 156 \| 10.256062 \| 11.336527 \| 0.9046917103 \| \| (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 
50, 'return_indices': True} 157 \| 9.469546 \| 11.33705 \| 0.8352742556 \| \| (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': False} 158 \| 0.119087 \| 0.0797 \| 1.494190715 \| \| (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True} 159 \| 0.098713 \| 0.047173 \| 2.092574142 \| \| (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False} 160 \| 0.960812 \| 0.675762 \| 1.421820108 \| \| (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': True} 161 \| 0.536546 \| 0.485958 \| 1.104099531 \| \| (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': False} 162 \| 2.555225 \| 1.791567 \| 1.426251432 \| \| (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True} 163 \| 1.419087 \| 1.305137 \| 1.087308842 \| \| (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False} 164 \| 5.182008 \| 3.48085 \| 1.488719135 \| \| (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': True} 165 \| 2.831779 \| 2.498537 \| 1.133374851 \| \| (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': False} 166 \| 8.546038 \| 5.7783 \| 1.478988284 \| \| (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': True} 167 \| 4.731004 \| 4.161975 \| 1.136720908 \| \| (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': False} 168 \| 0.084754 \| 0.07435 \| 1.139932751 \| \| (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True} 169 \| 0.057933 \| 0.043096 \| 1.344277891 \| \| (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False} 170 \| 2.568592 \| 1.802117 \| 1.425319222 \| \| (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True} 171 \| 1.433054 \| 1.307342 \| 1.096158465 \| \| (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False} 172 \| 10.3213 \| 7.111604 \| 1.451332217 \| \| (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': True} 173 \| 5.680525 \| 5.168129 \| 1.099145358 \| \| (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': False} 174 \| 1.02255 \| 1.01375 \| 1.008680641 
\| \| (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': False} 175 \| 3.074233 \| 3.094383 \| 0.993488201 \| \| (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': True} 176 \| 1.016812 \| 1.030575 \| 0.9866453194 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False} 177 \| 3.053658 \| 3.089504 \| 0.9883974903 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True} 178 \| 1.025863 \| 1.032088 \| 0.9939685376 \| \| (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': False} 179 \| 3.798942 \| 3.799213 \| 0.9999286694 \| \| (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': True} 180 \| 4.492979 \| 4.493421 \| 0.999901634 \| \| (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': False} 181 \| 51.543363 \| 51.266204 \| 1.005406271 \| \| (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': True} 182 \| 1.018008 \| 1.001587 \| 1.016394981 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': False} 183 \| 3.035404 \| 3.003113 \| 1.010752509 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': True} 184 \| 0.610421 \| 0.56 \| 1.0900375 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': False} 185 \| 1.138983 \| 0.757296 \| 1.504012962 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': True} 186 \| 0.641558 \| 0.557808 \| 1.150141267 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': False} 187 \| 1.181475 \| 0.754725 \| 1.565437742 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': True} 188 \| 1.03045 \| 1.026904 \| 1.003453098 \| \| (10, 1000, 1000), {'kernel_size': 4, 
'padding': 1, 'stride': (1, 1), 'return_indices': False} 189 \| 3.041421 \| 3.0263 \| 1.00499653 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': True} 190 \| 0.609929 \| 0.572304 \| 1.065743032 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': False} 191 \| 1.146875 \| 0.756446 \| 1.516135983 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': True} 192 \| 0.645187 \| 0.561708 \| 1.148616363 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': False} 193 \| 1.181721 \| 0.758054 \| 1.558887625 \| \| (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': True} 194 \| 0.927654 \| 0.925946 \| 1.0018446 \| \| (10, 1000, 1000), {'kernel_size': 1, 'return_indices': False} 195 \| 2.749983 \| 2.740354 \| 1.00351378 \| \| (10, 1000, 1000), {'kernel_size': 1, 'return_indices': True} </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/157876 Approved by: https://github.com/malfet	2025-08-08 16:40:10 +00:00
Animesh Jain	a4f69a5da0	[dynamo][guards] Remove guards on stdlib modules (#159913 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159913 Approved by: https://github.com/StrongerXi	2025-08-08 16:26:04 +00:00
Adam J. Stewart	231c72240d	CMake build: preserve PYTHONPATH (#160144 ) Fixes #160092 I'm very new to CMake, so let me know if there's a fancier way to do this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160144 Approved by: https://github.com/malfet Co-authored-by: Xuehai Pan <XuehaiPan@outlook.com>	2025-08-08 16:03:49 +00:00
gaoyvfeng	50f23ff6f8	rename-HAS_CUDA-to-HAS_CUDA_AND_TRITON (#159883 ) Fixes #159399 "Modified torch.testing._internal.inductor_utils and test/inductor" Pull Request resolved: https://github.com/pytorch/pytorch/pull/159883 Approved by: https://github.com/janeyx99	2025-08-08 15:44:52 +00:00
zpcore	8a37f0c903	improve gather and scatter_add strategy (#160140 ) As title. This PR made a small fix on top of https://github.com/meta-pytorch/autoparallel/pull/81. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160140 Approved by: https://github.com/fmassa	2025-08-08 15:06:24 +00:00
Edward Z. Yang	b5fd7223b1	Improve pin_memory error message on CPU-only systems (#159994 ) ## Summary - clarify pin_memory error message when no accelerator backend is available ## Testing - `python repro_pin_memory.py` (fails: Need to provide pin_memory allocator to use pin memory) - `lintrunner -a` ------ https://chatgpt.com/codex/tasks/task_e_6893ba92c93483238a9bdfdd6c52812b Pull Request resolved: https://github.com/pytorch/pytorch/pull/159994 Approved by: https://github.com/albanD	2025-08-08 14:36:45 +00:00
Edward Yang	9fa8ce26cf	Working setup with runnable PyTorch on Codex. (#159968 ) Sample transcript: https://chatgpt.com/s/cd_68938effc1a88191ae78bc82a8cefe94 This makes use of https://github.com/pytorch/pytorch/pull/159965 to bypass doing an actual build and use nightly. Things to improve: - Once USE_NIGHTLY is in main, we can remove the patching - We should just keep using the latest nightly, instead of a hard-coded one Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159968 Approved by: https://github.com/wdvr	2025-08-08 14:34:15 +00:00
David Berard	62bac07981	[inductor][triton] support profile_scratch launcher arg (#159772 ) This adds support for Triton after https://github.com/triton-lang/triton/pull/7258 landed. https://github.com/triton-lang/triton/pull/7258 adds a new argument to all the Triton kernels - a profile_scratch argument, similar to global_scratch. This PR updates the static cuda launcher and the AOTI kernel callers to pass in these arguments when calling the Triton kernel. Tests: https://github.com/pytorch/pytorch/pull/159158. I also verified these test locally with triton 3.2, 3.3, and 3.4. Fixes: * static_cuda_launcher (test/repro: `python tools/dynamo/verify_dynamo.py`) * AOTI calling logic (test/repro: `TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor_opinfo.py -k test_comprehensive_linalg_vander_cuda_float32`) Differential Revision: [D79825121](https://our.internmc.facebook.com/intern/diff/D79825121) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159772 Approved by: https://github.com/NikhilAPatel, https://github.com/eellison	2025-08-08 14:27:38 +00:00
Isalia20	7f4cb4a3e0	[MPS] coalesce for sparse tensors (#159729 ) MPS coalesce function for sparse tensors Pull Request resolved: https://github.com/pytorch/pytorch/pull/159729 Approved by: https://github.com/malfet Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>	2025-08-08 13:49:55 +00:00
Aidyn-A	556e2a73f4	[Test][Easy] Use float16 dtype in test_sort_large (#159939 ) The test fails with: >RuntimeError: var_mean only support floating point and complex dtypes Pull Request resolved: https://github.com/pytorch/pytorch/pull/159939 Approved by: https://github.com/eqy	2025-08-08 09:56:44 +00:00
Xuehai Pan	178515d0ff	[BE][PYFMT] remove `black`: finish `black -> ruff format` migration (#144557 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/144557 Approved by: https://github.com/ezyang	2025-08-08 07:46:10 +00:00
codingwithsurya	3a56237440	[SymmMem] Send tensors with unerased type information to NVSHMEM Triton kernels (#159788 ) This PR introduces a small `@triton.jit` wrapper function over our core NVSHMEM extern functions for users to send tensors as inputs to their NVSHMEM Triton kernels (rather than pointers). The goal is to abstract away tedious details from the developer, like manual byte-size calculations and handling of raw `int64` pointers. This lets developers work directly with typed Triton tensors and element counts, which will also be useful if you want to do, for instance, some local math on the data. ----- TODO: This is almost complete. One pending item is a tensor-aware implementation of `nvshmem.putmem_signal_block` and `nvshmem.signal_wait_until`. From my investigation, I found the root cause to be that this specific tensor API uses local addresses instead of remote addresses for the peer: ``` Pointer-Based Version: Rank 0 → Rank 1: Local buffer: 0x430300a00 (src) Remote buffer: 0x2430300c00 (dst) ← Rank 1's memory Remote signal: 0x2430301600 (sig) ← Rank 1's signal Rank 1 (waiting): Local signal: 0x430301600 (waits here) Tensor-Based Version: Rank 0 → Rank 1: Local buffer: 0x430300a00 (src) Local buffer: 0x430300c00 (dst) ← this is wrong Local signal: 0x430300e00 (sig) ← this is wrong Rank 1 (waiting): Local signal: 0x430300e00 (waits here) ``` Next Steps: Need mechanism to resolve local tensor → remote PE address, equivalent to handle.buffer_ptrs[peer] lookup. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159788 Approved by: https://github.com/mandroid6, https://github.com/ngimel ghstack dependencies: #158515, #158718, #159136, #159215, #159701, #159734, #159755, #159756	2025-08-08 05:20:42 +00:00
codingwithsurya	e0d8a315c5	[SymmMem] Add helpful docstrings for all NVSHMEM APIs (#159756 ) Fed Claude Code NVSHMEM Documentation and asked it to generate helpful docstrings. Verified for correctness. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159756 Approved by: https://github.com/mandroid6, https://github.com/ngimel ghstack dependencies: #158515, #158718, #159136, #159215, #159701, #159734, #159755	2025-08-08 05:20:42 +00:00
codingwithsurya	bfff2e3592	[SymmMem] Refactor NVSHMEM Reduction API to be more ergonomic with automatic dtype‐based dispatch (#159755 ) This change introduces a single, generic Triton‐extern wrapper for NVSHMEM team‐based reductions. We now expose one function, `nvshmem.reduce(team, dest, source, nreduce, operation, dtype_id)`, that covers all supported ops (sum, max, min, prod) and dtypes (int8…int64, uint8…uint64, float16, bfloat16, float32, float64). It accepts real dtype objects (torch.dtype or tl.dtype) directly in the Triton kernel launch. Internally, we normalize dtype_id (handling tl.dtype, torch.dtype, str, or constexpr) into the canonical NVSHMEM typename and assemble the proper function name, e.g. nvshmem_float_sum_reduce or nvshmem_bfloat16_prod_reduce Pull Request resolved: https://github.com/pytorch/pytorch/pull/159755 Approved by: https://github.com/ngimel ghstack dependencies: #158515, #158718, #159136, #159215, #159701, #159734	2025-08-08 05:20:36 +00:00
codingwithsurya	1c881440f4	[SymmMem] Initialize NVSHMEM module only for kernels that have nvshmem in their name (#159734 ) Previously, a global post-compile hook initialized the NVSHMEM module for all Triton kernels, which was inefficient. This change conditionally initializes `_nvshmemx_cumodule_init(kernel.module)` only for Triton kernels containing "nvshmem" in their name. Also updated the names for all of our nvshmem kernels to align with this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159734 Approved by: https://github.com/ngimel ghstack dependencies: #158515, #158718, #159136, #159215, #159701	2025-08-08 05:20:29 +00:00
codingwithsurya	7c4f7b9340	[SymmMem] Add Triton 3.4 support to NVSHMEM Triton and fix CI tests (make device library discoverable + fix peer calculation bug) (#159701 ) This PR introduces support for Triton 3.4 and resolves several CI and test-related issues. Triton 3.4 Compatibility - The JIT post-compile hook has been updated from the legacy JITFunction.compiled_hook to the new API path at triton.knobs.runtime.jit_post_compile_hook. - The internal parameter for kernel semantics in extern function definitions has been updated from _semantic to _builder to align with API changes. Fix CI Errors - The new logic inspects the RPATH of libtorch_nvshmem.so to find the NVSHMEM device library, preventing CI tests from being skipped. - Added a decorator to run NVSHMEM tests only on H100s (compatible hardware) Peer Rank Calculation Fix - The peer calculation in test_nvshmem_triton.py was changed from peer = (world_size - 1) - rank to peer = 1 - rank. Reasoning: The previous logic was only valid for a 2-rank setup. In the 8-rank CI environment, it incorrectly mapped peers (e.g., rank 0 to 7), breaking tests that assume a 0↔1 communication pattern. This was reproduced and validated on an 8-rank dev setup. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159701 Approved by: https://github.com/ngimel ghstack dependencies: #158515, #158718, #159136, #159215	2025-08-08 05:20:22 +00:00
codingwithsurya	1783d6e966	[SymmMem] Fix flaky wait_until test (#159215 ) When playing around with it, I noticed some flakiness in this test across sessions. After debugging, turns out the heavy sync primitives that I was calling (like `nvshmem_quiet()` or `nvshmem_fence()`) from inside Triton kernels was causing deadlocks. The original test tried to guarantee ordering: `put(data) -> fence/quiet -> put(flag)`. But the GPU thread got stuck in `quiet()` waiting for network confirmation while holding the SM, creating a deadlock. The fix was realizing `wait_until` already provides all the sync you need. Just do: - PE A: `nvshmem_wait_until(&ivar, ...)` - PE B: `nvshmem_put(&ivar_on_PE_A, ...)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159215 Approved by: https://github.com/mandroid6, https://github.com/ngimel ghstack dependencies: #158515, #158718, #159136	2025-08-08 05:20:16 +00:00
codingwithsurya	ea7fe0ecf6	[SymmMem] Standardize NVSHMEM Triton wrappers on byte-based APIs + improve code clarity (#159136 ) Quick refactor for consistency and clarity. 1. We now standardize all NVSHMEM data-moving collectives (put, get, alltoall, broadcast) to use their byte-based *_mem_block variants. This makes the API behavior more predictable and avoids mixing paradigms. 2. Previously, some functions operated on element counts (nelems), while others expected byte sizes but still used `nelems` as the param name. That inconsistency was easy to miss and could lead to bugs, especially for devs not familiar with the NVSHMEM internals. To clean this up: • All byte-based APIs now use nbytes or nbytes_per_pe to make the units explicit. • Typed APIs consistently use nelems for element counts. • Docstrings were added or updated to clarify expected units. Also did some code cleanup — removed unused functions, fixed typos in comments, and did some general housekeeping. This should make the API more intuitive and reduce friction for developers. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159136 Approved by: https://github.com/mandroid6, https://github.com/ngimel ghstack dependencies: #158515, #158718	2025-08-08 05:20:09 +00:00
codingwithsurya	b0b229b197	[SymmMem] Use _get_default_group() instead of group.WORLD for group_name access (#158718 ) Both approaches functionally return the default process group created by `init_process_group()` but `_get_default_group()` is a dedicated function with [better error handling and type safety](`4869f71170/torch/distributed/distributed_c10d.py (L1300-L1310)`). Pull Request resolved: https://github.com/pytorch/pytorch/pull/158718 Approved by: https://github.com/Skylion007, https://github.com/fduwjj ghstack dependencies: #158515	2025-08-08 05:20:02 +00:00
codingwithsurya	b5c937259b	[SymmMem] Add NVSHMEM Reduction support (sum, min, max) into Triton (#158515 ) Implements sum_reduce, min_reduce, and max_reduce collective operations for NVSHMEM Triton kernels. Enables parallel reduction computations across PE teams for int64 data types. Tests: `python test/distributed/test_nvshmem_triton.py` <details> <summary> Quick debug print for sanity check </summary> ```markdown ============================================================ [Rank 1] Starting min/max reduction test with world_size=2 ============================================================ ============================================================ [Rank 0] Starting min/max reduction test with world_size=2 ============================================================ [Rank 0] Source data for min/max: [10, 20] [Rank 1] Source data for min/max: [15, 5] [Rank 1] All values across PEs: [Rank 0] All values across PEs: - Position 0: [10, 15] - Position 0: [10, 15] - Position 1: [20, 5] - Position 1: [20, 5] [Rank 1] Expected min: [10, 5] [Rank 0] Expected min: [10, 5] [Rank 1] Expected max: [15, 20] [Rank 0] Expected max: [15, 20] [Rank 0] Executing MIN reduction... [Rank 1] Executing MIN reduction... [Rank 0] Executing MAX reduction... [Rank 1] Executing MAX reduction... /data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once /data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. 
warnings.warn( # warn only once [Rank 1] Results: [Rank 0] Results: [Rank 1] MIN reduction result: [10, 5] [Rank 1] MAX reduction result: [15, 20] [Rank 0] MIN reduction result: [10, 5] [Rank 0] MAX reduction result: [15, 20] [Rank 1] ============================================================ [Rank 1] Min/Max reduction test PASSED ✓ [Rank 1] ============================================================ [Rank 0] ============================================================ [Rank 0] Min/Max reduction test PASSED ✓ [Rank 0] ============================================================ ...... ============================================================ ============================================================ [Rank 0] Starting sum reduction test with world_size=2 [Rank 1] Starting sum reduction test with world_size=2 ============================================================ ============================================================ [Rank 0] Configuration: [Rank 1] Configuration: - nreduce: 3 (number of separate reductions) - nreduce: 3 (number of separate reductions) - dtype: torch.int64 - dtype: torch.int64 [Rank 1] Source data: [2, 4, 6] [Rank 1] Contribution explanation: [Rank 0] Source data: [1, 2, 3] [Rank 0] Contribution explanation: - Element 0: 2 = (rank=1+1) * (index=0+1) - Element 0: 1 = (rank=0+1) * (index=0+1) - Element 1: 4 = (rank=1+1) * (index=1+1) - Element 1: 2 = (rank=0+1) * (index=1+1) - Element 2: 6 = (rank=1+1) * (index=2+1) - Element 2: 3 = (rank=0+1) * (index=2+1) [Rank 1] Initial destination: [-1, -1, -1] [Rank 0] Initial destination: [-1, -1, -1] [Rank 0] Expected results after reduction: [3, 6, 9] [Rank 1] Expected results after reduction: [3, 6, 9] [Rank 0] Executing sum reduction... [Rank 1] Executing sum reduction... [Rank 1] Sum reduction completed /data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. 
Using the current device set by the user. warnings.warn( # warn only once [Rank 0] Sum reduction completed /data/users/suryasub/pytorch/torch/distributed/distributed_c10d.py:4809: UserWarning: No device id is provided via `init_process_group` or `barrier `. Using the current device set by the user. warnings.warn( # warn only once [Rank 0] Results after reduction: [Rank 0] Destination buffer: [3, 6, 9] [Rank 1] Results after reduction: [Rank 0] Verification: - Reduction 0: PE0: 1 + PE1: 2 = 3 Result: 3, Match: ✓ - Reduction 1: PE0: 2 + PE1: 4 = 6 Result: 6, Match: ✓ [Rank 1] Destination buffer: [3, 6, 9] - Reduction 2: PE0: 3 + PE1: 6 = 9 [Rank 1] Verification: - Reduction 0: PE0: 1 + PE1: 2 = 3 Result: 9, Match: ✓ Result: 3, Match: ✓ - Reduction 1: PE0: 2 + PE1: 4 = 6 Result: 6, Match: ✓ - Reduction 2: PE0: 3 + PE1: 6 = 9 Result: 9, Match: ✓ [Rank 0] ============================================================ [Rank 0] Sum reduction test PASSED ✓ [Rank 0] All 3 reductions computed correctly across 2 PEs [Rank 0] ============================================================ [Rank 1] ============================================================ [Rank 1] Sum reduction test PASSED ✓ [Rank 1] All 3 reductions computed correctly across 2 PEs [Rank 1] ============================================================ ``` </details> Pull Request resolved: https://github.com/pytorch/pytorch/pull/158515 Approved by: https://github.com/mandroid6, https://github.com/ngimel	2025-08-08 05:19:55 +00:00
PyTorch UpdateBot	24257f5bfa	[vllm hash update] update the pinned vllm hash (#159822 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned vllm hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159822 Approved by: https://github.com/pytorchbot	2025-08-08 04:13:48 +00:00
Yiming Zhou	017259f9c6	[benchmarks] Add nativert benchmark (#159922 ) Add NativeRT as an option in the PT2 OSS benchmark ``` python ./benchmarks/dynamo/huggingface.py --performance --inference --export-nativert python ./benchmarks/dynamo/timm_models.py --performance --inference --export-nativert python ./benchmarks/dynamo/torchbench.py --performance --inference --export-nativert ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159922 Approved by: https://github.com/angelayi	2025-08-08 03:38:32 +00:00
xinan.lin	2ea40fba84	[Linter] Improve device-bias linter by adding detection for `with torch.device("cuda")`. (#159926 ) ``` For example, detect the following situation: >>>Lint for test/dynamo/test_modes.py: Error (TEST_DEVICE_BIAS) [device-bias] `@requires_gpu` function should not hardcode `with torch.device('cuda')`, suggest to use torch.device(GPU_TYPE) 687 \| flex_attention as flex_attention_eager, 688 \| ) 689 \| >>> 690 \| with torch.device("cuda"): 691 \| flex_attention = torch.compile(flex_attention_eager, dynamic=False) 692 \| 693 \| with self.assertRaisesRegex( ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159926 Approved by: https://github.com/EikanWang, https://github.com/jansel ghstack dependencies: #159759	2025-08-08 03:20:42 +00:00
Aaron Gokaslan	beb4d7816d	[BE]: ruff PLC0207 - use maxsplit kwarg (#160107 ) Automatically replaces split with rsplit when relevant and only performs the split up to the first (or last) value. This allows early return of the split function and improves efficiency. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160107 Approved by: https://github.com/albanD	2025-08-08 03:14:59 +00:00
Guilherme Leobas	3fcd79e023	Fix infinite loop when iterating over an empty zip (#159673 ) Dynamo would enter in an infinite recursion when `ZipVariable.next_variable(tx)` was called and there was no iterable to be iterated Pull Request resolved: https://github.com/pytorch/pytorch/pull/159673 Approved by: https://github.com/williamwen42	2025-08-08 02:50:21 +00:00
bobrenjc93	05c417715f	integrate kernacle into inductor (#160121 ) This adds integration into inductor in two parts 1) It kicks off the best config lookup at lowering time within mm.py 2) It awaits the future at scheduling time in select_algorithm.py Notably this does not do the following 1) Support for enumerating between mm, addmm and bmm 2) Support for enumerating between exhaustive/max 3) Enumerating different hardware SKUs eg. H100, A100, etc. those will come in the next diffs Differential Revision: [D79824921](https://our.internmc.facebook.com/intern/diff/D79824921/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160121 Approved by: https://github.com/izaitsevfb	2025-08-08 02:14:44 +00:00
Georgia Phillips	ba4ccf5d67	turn on execution frame cleanup by default (#160110 ) Summary: Turning execution frame cleanup back on since D78621408 is done Test Plan: See D78621408 Rollback Plan: Differential Revision: D79730674 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160110 Approved by: https://github.com/jingsh	2025-08-08 02:13:48 +00:00
Wenyuan Chi	d68c323692	Log max_autotune exceptions (#159687 ) (#159688 ) Summary: Exceptions during autotune kernel precompilation are now systematically captured and reported via the chromium_event_logger, enabling better debugging and analysis of autotune failures. Currently, exceptions are dumped to the console in the following format:: ``` [0/0] RuntimeError: No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 262144 Hardware limit:232448 Reducing block sizes or `num_stages` may help. [0/0] Runtime error during autotuning: [0/0] No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 262144 Hardware limit:232448 Reducing block sizes or `num_stages` may help.. [0/0] Ignoring this choice. ``` The exception tracebacks: ``` # inner exception traceback: File "/torch/_inductor/runtime/triton_heuristics.py", line 603, in _make_launchers launchers.append(result.make_launcher()) ^^^^^^^^^^^^^^^^^^^^^^ File "/torch/_inductor/runtime/triton_heuristics.py", line 1503, in make_launcher self.kernel.load_kernel(device) File "/torch/_inductor/runtime/static_cuda_launcher.py", line 113, in load_kernel (self.function, self.n_regs, self.n_spills) = _StaticCudaLauncher._load_kernel( # wrapped exception traceback: File "/usr/local/fbcode/platform010/lib/python3.12/concurrent/futures/thread.py", line 59, in run result = self.fn(self.args, *self.kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "<trimmed>#link-tree/torch/_inductor/select_algorithm.py", line 2596, in precompile_with_captured_stdout choice.precompile() File "<trimmed>#link-tree/torch/_inductor/select_algorithm.py", line 1881, in precompile self.bmreq.precompile() File "<trimmed>#link-tree/torch/_inductor/autotune_process.py", line 660, in precompile getattr(mod, self.kernel_name).precompile() File "<trimmed>#link-tree/torch/_inductor/runtime/triton_heuristics.py", line 440, in precompile self._make_launchers() File 
"<trimmed>#link-tree/torch/_inductor/runtime/triton_heuristics.py", line 608, in _make_launchers raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}") ``` With this change, the exception details will also be logged in the metadata of the `{name}_template_precompiling` event. The format: ``` { "exceptions": [ { "choice_type": "triton", "choice": "ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=64, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=False, num_stages=5, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0", "exception_message": "No valid triton configs. OutOfMemoryError: out of resource: triton_mm Required: 262144 Hardware limit:232448 Reducing block sizes or `num_stages` may help.", "exception": "OutOfMemoryError", "required_memory": "262144", "hardware_limit": "232448" } ] } ``` Test Plan: buck2 run //scripts/wychi:test_autotune_mm 2>&1 > /tmp/mylog.txt Rollback Plan: Differential Revision: D79420953 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159688 Approved by: https://github.com/stashuk-olek	2025-08-08 01:30:08 +00:00
Edward Z. Yang	03b254e49f	Extend torch function support to ALL arguments, not just scalar type (but not insides of list) (#145089 ) Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/145089 Approved by: https://github.com/albanD, https://github.com/zou3519	2025-08-07 23:43:53 +00:00
PyTorch MergeBot	195b5c2e27	Revert "dynamo: Remove passing or deleted dynamo_expected_failures (#159691 )" This reverts commit 36f46d082a4954921cb8493223f000f2aab79ed7. Reverted https://github.com/pytorch/pytorch/pull/159691 on behalf of https://github.com/izaitsevfb due to breaking dynamo tests ([comment](https://github.com/pytorch/pytorch/pull/159691#issuecomment-3166067241))	2025-08-07 22:55:51 +00:00
Anshul Sinha	f077c2402e	[replicate][be] improved readability of test case description (#160128 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/160128 Approved by: https://github.com/mori360	2025-08-07 22:51:58 +00:00
Patrick C. Toulme	d46768db04	[MTIA] Allow users who know what they are doing to ignore all device mismatches in tracing and take a preferred device. (#159931 ) Summary: Device mismatches in tracing can most often be ignored. These are only logical mismatches not physical. Take any intermediate computation, and that computation will not actually materialize in a compiled binary execution. So a device mismatch in the middle of the program is not real. The runtime will never materialize those tensors on CPU device during the execution, as they are temporary allocations. If a user knows his tensors at graph input are all on the correct device, then he can ignore all tracing errors. Users who know what they are doing should have an escape hatch to ignore any device mismatch in tracing. Users can set ``` torch._functorch.config.fake_tensor_prefer_device_type = 'mtia' ``` to forcefully override any mismatch and prefer the non cpu device. This unblocks vLLM graph mode for MTIA. Test Plan: Added two unit tests. Rollback Plan: Differential Revision: D79698438 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159931 Approved by: https://github.com/jansel	2025-08-07 22:37:15 +00:00
clr	36f46d082a	dynamo: Remove passing or deleted dynamo_expected_failures (#159691 ) partially generated with ``` for TESTCASE in $(ls \| cut -f1 -d'.' \| grep -v CPython \| uniq); do if grep "$TESTCASE" -m 1 .. -r; then echo; else sl rm "$TESTCASE"* ; fi; done ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159691 Approved by: https://github.com/xmfan	2025-08-07 21:41:50 +00:00
Sherlock Huang	8147370733	Fix qembeddingbag_byte_prepack_meta to use sym_sizes (#159985 ) Summary: In qembeddingbag_byte_prepack_meta, weight.sizes() would return a concrete int. we should use .sym_size() to return a SymInt instead. Test Plan: CI Rollback Plan: Reviewed By: kqfu, henryoier Differential Revision: D79744512 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159985 Approved by: https://github.com/jerryzh168, https://github.com/henryoier	2025-08-07 21:22:29 +00:00
Angela Yi	e619c6bb90	[export] Apply move_to_device_pass to all submodules (#159992 ) Previously we only applied this move_to_device_pass to the toplevel graph. However if we have HOO, this pass will not be applied on the HOO submodules. This PR modifies the pass to run on all submodules. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159992 Approved by: https://github.com/yiming0416	2025-08-07 18:51:15 +00:00
Will Constable	3cf7b4024e	[DTensor] Support user-supplied Generator for random ops (#159933 ) If the user provides a generator kwarg to a random op (e.g. nn.init.uniform_(..., generator=my_generator)), we can still advance that generator's state in a SPMD-global way so that each local-tensor gets appropriate values and the generator advances to the same state as if it had operated on the full tensor. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159933 Approved by: https://github.com/fduwjj, https://github.com/XilunWu, https://github.com/wanchaol	2025-08-07 18:47:22 +00:00
Xu Han	21392c0e06	[inductor] disable flex decoding on Windows. (#160072 ) Discussed with @jianan-gu and @Valentine233 , disable flex decoding on Windows. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160072 Approved by: https://github.com/angelayi	2025-08-07 18:07:36 +00:00
Aleksei Nikiforov	ee1fb43450	Fix docker image creation (#158634 ) Since switching from wheel 0.34.2 to wheel 0.45.1 python symlinks are no longer correctly created. Migrate to packaging package for symlink creation Pull Request resolved: https://github.com/pytorch/pytorch/pull/158634 Approved by: https://github.com/malfet	2025-08-07 17:41:47 +00:00
Aidyn-A	0bd3af4fb8	Further fix failing tests in test/inductor/test_analysis.py (#160070 ) This is a follow up on #159800 as other tests are still failing. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160070 Approved by: https://github.com/aorenste	2025-08-07 17:32:58 +00:00
Ankita George	8399cf88ce	Use only safetensors APIs in HFStorageReader (#159681 ) Get rid of the logic to read the metadata from the header of the safetensors file manually and use the functions as part of safe_open() to get the metadata. This is much cleaner and allows us to not rely on our own custom methods to get metadata, but use safetensors provided APIs Differential Revision: [D79460272](https://our.internmc.facebook.com/intern/diff/D79460272/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159681 Approved by: https://github.com/saumishr ghstack dependencies: #159405, #159406	2025-08-07 17:23:03 +00:00
Ankita George	0b187b3114	DCP HF reader: use safe_open instead of reading the bytes (#159406 ) Reading the bytes and converting to tensors is much slower than using safe_open. For a 8B model across 8 ranks, took ~30s to load before this change and ~4s after. Differential Revision: [D78994259](https://our.internmc.facebook.com/intern/diff/D78994259/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159406 Approved by: https://github.com/saumishr ghstack dependencies: #159405	2025-08-07 17:23:03 +00:00
Ankita George	69cc606fda	HF component update to not use fsspec components (#159405 ) Update HF components to not inherit from fsspec components and instead use filesystem writer/reader. The reason is because there doesn't seem to be much of a need for fsspec, since users are using mounted storage. Using local storage will allow for performance improvements because we can take advantage of the safe_open API provided by HF safetensors (30s vs 4s for load of 8b model), which is a significant performance win over reading bytes and converting to tensors, which is what we are doing now. Also, we can use the official methods provided by HF instead of relying on reading the metadata by bytes and loading it Differential Revision: [D78993550](https://our.internmc.facebook.com/intern/diff/D78993550/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159405 Approved by: https://github.com/saumishr	2025-08-07 17:22:54 +00:00
Markus Hoehnerbach	57f738b635	[inductor] move all cpu scalars using pinned memory for graph partition (#155360 ) (#158983 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/158983 Approved by: https://github.com/eellison ghstack dependencies: #158758	2025-08-07 17:07:26 +00:00
Markus Hoehnerbach	e167c7d0f3	[inductor] allocate non-blocking copy destinations in pinned memory (#155121 ) (#158758 ) Fixes #155121 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158758 Approved by: https://github.com/EikanWang, https://github.com/eellison	2025-08-07 17:07:26 +00:00
Shivam Raikundalia	b1a602762e	[Profiler] Update README (#159816 ) Summary: Updated README with code structure and explanation of core features within profiler Test Plan: N/A Rollback Plan: Differential Revision: D79604189 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159816 Approved by: https://github.com/sanrise, https://github.com/aaronenyeshi	2025-08-07 16:44:41 +00:00
Han, Xu	e1cf0d496e	[inductor] unification for inductor debug. (#159998 ) Unification inductor debug build, follow @desertfire 's suggestion: https://github.com/pytorch/pytorch/pull/159938#pullrequestreview-3093803196 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159998 Approved by: https://github.com/angelayi	2025-08-07 16:38:00 +00:00
Xu Han	06824f3c72	[inductor] fix test_dynamo_timed on Windows. (#159981 ) Fixed `test_dynamo_timed `: <img width="1030" height="389" alt="image" src="https://github.com/user-attachments/assets/02d84dd8-6a65-4f91-8d4c-48ba0a81fac1" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159981 Approved by: https://github.com/angelayi	2025-08-07 16:37:52 +00:00
PyTorch MergeBot	f3a4d742ec	Revert "Add DeviceAllocator as the base device allocator (#138222 )" This reverts commit f7a66da5f9f6b8b75119b1ee8ce9ddc23e15570e. Reverted https://github.com/pytorch/pytorch/pull/138222 on behalf of https://github.com/jithunnair-amd due to Broke ROCm periodic runs on MI300 e.g. https://github.com/pytorch/pytorch/actions/runs/16764977800/job/47470050573 ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3164941815))	2025-08-07 16:34:36 +00:00
PyTorch MergeBot	74da2604c9	Revert "Add unified memory APIs for torch.accelerator (#152932 )" This reverts commit 15f1173e5d72d6d45faba4cecd135e0160f06c6f. Reverted https://github.com/pytorch/pytorch/pull/152932 on behalf of https://github.com/jithunnair-amd due to Broke ROCm periodic runs on MI300 e.g. https://github.com/pytorch/pytorch/actions/runs/16764977800/job/47470050573 ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3164941815))	2025-08-07 16:34:36 +00:00
PyTorch MergeBot	c4e64467b5	Revert "Add UT for torch.accelerator memory-related API (#155200 )" This reverts commit 4604f0482c2b4a3001b62e5bc5085149a9bb053c. Reverted https://github.com/pytorch/pytorch/pull/155200 on behalf of https://github.com/jithunnair-amd due to Broke ROCm periodic runs on MI300 e.g. https://github.com/pytorch/pytorch/actions/runs/16764977800/job/47470050573 ([comment](https://github.com/pytorch/pytorch/pull/138222#issuecomment-3164941815))	2025-08-07 16:34:36 +00:00
Zain Rizvi	90b78ee50f	Move xla jobs to unstable workflow (#159272 ) Disables the job on PRs completely, so that we don't litter people's CI signals and use machines unnecessarily. If you want to run these xla tests, add the ciflow/unstable label to your PR Pull Request resolved: https://github.com/pytorch/pytorch/pull/159272 Approved by: https://github.com/atalman, https://github.com/malfet	2025-08-07 16:22:52 +00:00
Xilun Wu	e248719ac0	[DTensor] support _StridedShard in view op (#159656 ) Summary Some thoughts on view-op and `_StridedShard` interaction: 1. `_StridedShard` has no impact on sharding (i.e. how tensor is partitioned) compared to `Shard`. It only changes how shards permute across the devices. 2. `view()` op on DTensor strictly forbids shard redistribution which means if `view()` may cause shard permutation across devices, it should be rejected. This is enforced in today's sharding prop for `view()`. 3. Since DTensor `view()` won't introduce any redistribution, it's certain that `placements` won't change except the inner `dim` attribute of `Shard` or `_StridedShard`. Therefore, to support `_StridedShard` in `view()` op, the only change required is to keep `_StridedShard` as `_StridedShard` in the output spec. Test `pytest test/distributed/tensor/test_view_ops.py` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159656 Approved by: https://github.com/wconstab	2025-08-07 15:59:25 +00:00
Aleksei Nikiforov	f60454cce8	S390X: update test dependencies (#158636 ) numba currently doesn't build from source due to https://github.com/numba/numba/pull/10073 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158636 Approved by: https://github.com/malfet	2025-08-07 15:58:30 +00:00
rzou	8ab5868a21	Actually run the einops tests in CI (#159776 ) The test filter was wrong, it should not start with "test/". Test Plan: - wait for CI - Tested locally with `python test/run_test.py --einops --verbose` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159776 Approved by: https://github.com/atalman, https://github.com/StrongerXi	2025-08-07 15:23:06 +00:00
Wang, Chuanqi	d20c4c20e6	[CI] Update xpu ci use rolling driver for new features (#158340 ) Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/158340 Approved by: https://github.com/seemethere Co-authored-by: xinan.lin <xinan.lin@intel.com>	2025-08-07 15:18:51 +00:00
Zhengxu Chen	83875cdb55	[nativert] Expose ModelRunner to public through pimpl type ModelRunnerHandle. (#159989 ) Summary: Today users outside of pytorch core cannot `#include <torch/nativert/ModelRunner.h>`. It turns out that we should place a header inside `torch/csrc/api/include/`. Placing every single nativert header here would pollute the namespace a lot and that's not what we want in general. Therefore here we just create a Handle type which holds a pointer to decouple the actual type from header definition. Test Plan: CI Rollback Plan: Differential Revision: D79751098 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159989 Approved by: https://github.com/dolpm	2025-08-07 14:23:21 +00:00
PyTorch MergeBot	a53d14d5f8	Revert "unskipped mobilenet_v3 quantization and mobilenet_v2 quantization plus tests from https://github.com/pytorch/pytorch/issues/125438 (#157786 )" This reverts commit 3a2c3c8ed365eb4e4cf4620c25d70b2f70483762. Reverted https://github.com/pytorch/pytorch/pull/157786 on behalf of https://github.com/albanD due to Breaks lint ([comment](https://github.com/pytorch/pytorch/pull/157786#issuecomment-3164126250))	2025-08-07 13:09:33 +00:00
Dev Sashidhar	8cb91e20bc	Renaming HAS_XPU to HAS_XPU_AND_TRITON (#159908 ) This PR follows up on the discussion in #159399 where @Akabbaj and @janeyx99 mentioned renaming HAS_XPU to HAS_XPU_AND_TRITON for consistency. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159908 Approved by: https://github.com/janeyx99, https://github.com/guangyey	2025-08-07 11:24:44 +00:00
Huy Do	b0df7715e8	Remove benchmark dependencies from regular ROCm CI images (#160047 ) Instead, use a new `pytorch-linux-jammy-rocm-n-py3-benchmarks` image for Docker benchmark job. This addresses 2 issues: * The current ROCm failures in trunk w.r.t librosa version https://github.com/pytorch/pytorch/actions/runs/16789466749/job/47549950994 that TorchBench pulls in. * Reduce the size of the regular ROCm CI images by removing TorchBench models, which is needed only for benchmarking jobs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160047 Approved by: https://github.com/malfet, https://github.com/izaitsevfb	2025-08-07 09:26:58 +00:00
Avik Chaudhuri	422bd6808b	dataclass pytree fix (#159916 ) Differential Revision: D79687243 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159916 Approved by: https://github.com/XuehaiPan, https://github.com/angelayi	2025-08-07 08:22:41 +00:00
thenumberouscode	24f43d0da7	[inductor] [cpu] fix the dype hardcoded to int64 in store_reduction (#157904 ) ## Fixes https://github.com/pytorch/pytorch/issues/157683 ## mini repro * Just copy the code from the issue to reproduce it. ```python import torch device = "cpu" # Input tensors v2_0 = torch.randn(16, 24, 59, dtype=torch.complex64, device=device) v3_0 = torch.randn(16, 24, 59, dtype=torch.complex64, device=device) def my_model(v2_0, v3_0): v6_0 = -v3_0 v4_0 = v2_0 * v3_0 v1_0 = v4_0.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) v0_0 = v2_0.to(torch.int32) v5_0 = v0_0.amax(dim=0) return v6_0, v4_0, v1_0, v0_0, v5_0 v6_0, v4_0, v1_0, v0_0, v5_0 = my_model(v2_0, v3_0) print("v6_0", v6_0.shape) print("v4_0", v4_0.shape) compiled_model = torch.compile(my_model, backend="inductor") v6_0, v4_0, v1_0, v0_0, v5_0 = compiled_model(v2_0, v3_0) print("v6_0", v6_0.shape) print("v4_0", v4_0.shape) print("v1_0", v1_0.shape) print("v0_0", v0_0.shape) print("v5_0", v5_0.shape) ``` error_stack ``` /home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: 附注：candidate: ‘template<class dst_t, class src_t> std::enable_if_t<(! is_same_v<dst_t, src_t>), at::vec::CPU_CAPABILITY::Vectorized<T> > at::vec::CPU_CAPABILITY::convert(const at::vec::CPU_CAPABILITY::Vectorized<T>&)’ 41 \| convert(const Vectorized<src_t>& src) { \| ^~~~~~~ /home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: 附注： template argument deduction/substitution failed: /tmp/torchinductor_admin/6k/c6kr65o43rlmp2cmkpn5ezewhe5bla4w72hpcrg5biyelrs4skyw.main.cpp:37:99: 错误：模板参数数目不对(不应是 4 个而应是 2 个) 37 \| auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec); ``` ## summary The C++ kernel generated by the Inductor had the wrong data type for the output variable; it should be int32_t instead of int64_t. This incorrect data type led to an incompatible data type conversion, which caused the g++ compilation to fail. 
The original code that caused the problem. ``` def my_model(v2_0, v3_0): v6_0 = -v3_0 v4_0 = v2_0 * v3_0 v1_0 = v4_0.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1) v0_0 = v2_0.to(torch.int32) // The original code that caused the problem. v5_0 = v0_0.amax(dim=0) ``` ## proof procedure The c++ kernel generated by inductor: ```c++ #include <torch/csrc/inductor/cpp_prefix.h> extern "C" void kernel(const int32_t* in_ptr0, int32_t* out_ptr0) { { for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(1416L); x0+=static_cast<int64_t>(16L)) { { int32_t tmp_acc0_arr[16]; for (int i = 0; i < 16; i++) { tmp_acc0_arr[i] = std::numeric_limits<int32_t>::min(); } int32_t tmp_acc0 = std::numeric_limits<int32_t>::min(); at::vec::Vectorized<int32_t> tmp_acc0_vec = at::vec::Vectorized<int32_t>(std::numeric_limits<int32_t>::min()); for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(16L); x1+=static_cast<int64_t>(1L)) { { if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1408L))) { auto tmp0 = at::vec::Vectorized<int32_t>::loadu(in_ptr0 + static_cast<int64_t>(x0 + 1416Lx1), static_cast<int64_t>(16)); tmp_acc0_vec = at::vec::maximum(tmp_acc0_vec, tmp0); } if(C10_UNLIKELY(x0 >= static_cast<int64_t>(1408L) && x0 < static_cast<int64_t>(1416L))) { for (int64_t x0_tail = static_cast<int64_t>(1408L);x0_tail < static_cast<int64_t>(1416L); x0_tail++) { auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail + 1416Lx1)]; tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)] = max_propagate_nan(tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)], tmp0); } } } } if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(1408L))) { // impossible data type conversion which would caused the g++ compilation to fail. 
auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec); int32_t_tmp_acc0_vec.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16)); } if(C10_UNLIKELY(x0 >= static_cast<int64_t>(1408L) && x0 < static_cast<int64_t>(1416L))) { for (int64_t x0_tail = static_cast<int64_t>(1408L);x0_tail < static_cast<int64_t>(1416L); x0_tail++) { out_ptr0[static_cast<int64_t>(x0_tail)] = tmp_acc0_arr[x0_tail - static_cast<int64_t>(1408L)]; } } } } } } ``` the compilers complains ```text /home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: 附注：candidate: ‘template<class dst_t, class src_t> std::enable_if_t<(! is_same_v<dst_t, src_t>), at::vec::CPU_CAPABILITY::Vectorized<T> > at::vec::CPU_CAPABILITY::convert(const at::vec::CPU_CAPABILITY::Vectorized<T>&)’ 41 \| convert(const Vectorized<src_t>& src) { \| ^~~~~~~ /home/admin/pytorch/pytorch/torch/include/ATen/cpu/vec/vec_convert.h:41:1: 附注： template argument deduction/substitution failed: /tmp/torchinductor_admin/6k/c6kr65o43rlmp2cmkpn5ezewhe5bla4w72hpcrg5biyelrs4skyw.main.cpp:37:99: 错误：模板参数数目不对(不应是 4 个而应是 2 个) 37 \| auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec); ``` so the following line have problem ```c++ // this line means that tmp_acc0_vec should be Vectorized<int64_t>, and it will convert it to Vectorized<int32_t>. auto int32_t_tmp_acc0_vec = at::vec::convert<int32_t,1,int64_t,2>(tmp_acc0_vec); ``` The issue is that tmp_acc0_vec is of type Vectorized<int32_t>, but the template parameters expect it to be Vectorized<int64_t>. and it will convert it to a Vectorized<int32_t>. this is conflict. the conversion should not be exist for tmp_acc0_vec is already Vectorized<int32_t>.The following line hardcodes the output variable type to int64, which causes unnecessary and incorrect type conversions. 
`d89f30ad45/torch/_inductor/codegen/cpp.py (L2985-L2993)` Pull Request resolved: https://github.com/pytorch/pytorch/pull/157904 Approved by: https://github.com/jgong5	2025-08-07 08:03:05 +00:00
Sherlock Huang	aa75e917bd	[Export Schema] Remove deviceAllocationMap field (#159653 ) Summary: This field is not used today, and it's not useful either. The device allocation is configured at model loading time, specified by user. It shouldn't be part of the model definition. Test Plan: CI Rollback Plan: Differential Revision: D79385513 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159653 Approved by: https://github.com/zhxchen17	2025-08-07 07:31:42 +00:00
PyTorch UpdateBot	3f1636ebef	[audio hash update] update the pinned audio hash (#160046 ) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/160046 Approved by: https://github.com/pytorchbot	2025-08-07 04:16:35 +00:00
IlyasMoutawwakil	c859ba7114	Make onnx export SDPA match aten behavior (#159973 ) This PR makes onnx sdpa export match the behavior of aten sdpa when boolean mask is used. @justinchuby ```python import onnxruntime as ort import torch class ScaledDotProductAttention(torch.nn.Module): def forward(self, query, key, value, attn_mask): return torch.nn.functional.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask) model = ScaledDotProductAttention() attn_mask = torch.ones(2, 4, 8, 8).bool() # boolean mask for attention attn_mask[0, 0, 0, :] = False # masking an entire row (padding token) query = key = value = torch.randn(2, 4, 8, 16) output = model(query, key, value, attn_mask) torch.onnx.export( model, (query, key, value, attn_mask), "scaled_dot_product_attention.onnx", input_names=["query", "key", "value", "attn_mask"], output_names=["output"], dynamo=false, # or True, ) ort_session = ort.InferenceSession("scaled_dot_product_attention.onnx") np_inputs = {"query": query.numpy(), "key": key.numpy(), "value": value.numpy(), "attn_mask": attn_mask.numpy()} onnx_outputs = ort_session.run(None, np_inputs)[0] torch.testing.assert_close(output, torch.tensor(onnx_outputs), equal_nan=True) ``` fails the assertion because the ort model outputs nans. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159973 Approved by: https://github.com/xadupre, https://github.com/titaiwangms	2025-08-07 04:06:07 +00:00
Simon Fan	d4c1a08c89	Relax unclaimed successes in dtype op tests when running under TEST_WITH_DYNAMO/TEST_WITH_INDUCTOR (#159976 ) This PR changes the behavior for compile wrapped op tests: - supported_but_unclaimed_forward - supported_but_unclaimed_backward These typically manifest when the op doesn't support inputs of certain dtypes. But under torch.compile, Dynamo/AOTAutograd will trace the graph with FakeTensors, which @ezyang and @eellison tell me need to run decomps before op dispatch. The decomp may map this test to a different op, one that does support the dtype. I suspect all of our failures here are due to decomps, and so I propose to just disable this check for compile. ~~TODO: re-enable all the failed tests.~~ jk there were no failed tests outside of compiled autograd due to this. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159976 Approved by: https://github.com/ezyang	2025-08-07 02:38:45 +00:00
Nikita Shulga	81d72fb1f7	Move smoke binary builds to 3.12 (#159993 ) And limit them just to stable CUDA version (as there weren't any recent instances when only one of those jobs failed to build) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159993 Approved by: https://github.com/ngimel ghstack dependencies: #159986, #159990	2025-08-07 01:59:30 +00:00
Nikita Shulga	d0226719a9	[BE][EZ] Delete remains of split-build logic (#159990 ) Hopefully last piece of https://github.com/pytorch/pytorch/issues/138750 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159990 Approved by: https://github.com/atalman ghstack dependencies: #159986	2025-08-07 01:59:30 +00:00
Edward Yang	38d65c6465	Add a USE_NIGHTLY option to setup.py (#159965 ) If you run python setup.py develop with USE_NIGHTLY, instead of actually building PyTorch we will just go ahead and download the corresponding nightly version you specified and dump its binaries. This is intended to obsolete tools/nightly.py. There's some UX polish for detecting what the latest nightly is if you pass in a blank string. I only tested on OS X. Coded with claude code. Signed-off-by: Edward Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159965 Approved by: https://github.com/malfet	2025-08-07 01:44:20 +00:00
Yu, Guangye	2ba2f598f3	[Dynamo] Add torch.xpu.stream to trace rules (#159844 ) # Motivation Previously, I thought using `with stream:` was sufficient. However, many older scripts still use `torch.xpu.stream` as the context manager. To maintain backward compatibility, I had to include `torch.xpu.stream` in the trace rules. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159844 Approved by: https://github.com/jansel	2025-08-07 01:35:50 +00:00
Laith Sakka	1bb5e6c076	update expected results (#159867 ) refresh due to https://github.com/pytorch/pytorch/pull/159696 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159867 Approved by: https://github.com/masnesral	2025-08-07 01:18:36 +00:00
Denghui Dong	8b0be7b65a	[Profiler] Fix unexpected C return events (#159574 ) The fix in https://github.com/pytorch/pytorch/pull/155446 addressed the "stack empty" issue that's easily reproducible on CPython 3.12.0-4. While this issue can also appear in other versions, it's not as easy to reproduce there. I recently found a new cause for this problem. `1df5d00145/Python/ceval.c (L5807-L5836)` In the CPython 3.10 implementation, PyTrace_C_CALL and PyTrace_C_RETURN/PyTrace_C_EXCEPTION are supposed to appear in pairs. However, when c_profilefunc is changed, unexpected PyTrace_C_RETURN/PyTrace_C_EXCEPTION events can occur. Here is the code to reproduce this problem. ``` import threading import time import torch from threading import Event, Lock lock = Lock() lock.acquire() event1 = Event() event2 = Event() event3 = Event() def run(): event1.set() event2.wait() lock.acquire() event3.set() threading.Thread(target=run).start() with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU], with_stack=True): event1.wait() event2.set() time.sleep(1) with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU], with_stack=True): lock.release() event3.wait() ``` <img width="1766" height="1250" alt="image" src="https://github.com/user-attachments/assets/6794eeca-7364-429e-91eb-62cdad116bd3" /> To fix this problem, we can record active_frames_ and remaining_start_frames_ for each thread, and when the PyTrace_C_RETURN/PyTrace_C_EXCEPTION event occurs, we can determine whether to record this event based on these two fields. In reality, even without this fix, the final data appears to be right since the match process can handle this case (it would just result in an exception log being printed). Do you think the fix is necessary? Pull Request resolved: https://github.com/pytorch/pytorch/pull/159574 Approved by: https://github.com/sraikund16	2025-08-07 01:17:55 +00:00
Xuehai Pan	5cedc5a0ff	[BE][PYFMT] migrate PYFMT for `torch/[p-z]*/` to `ruff format` (#144552 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/144552 Approved by: https://github.com/ezyang	2025-08-07 00:09:56 +00:00
William Wen	fd606a3a91	[dynamo] update pytorch-labs -> meta-pytorch in graph break URLs (#159975 ) Related PR: https://github.com/meta-pytorch/compile-graph-break-site/pull/30 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159975 Approved by: https://github.com/Lucaskabela	2025-08-06 23:57:31 +00:00
Animesh Jain	3daef4d128	[dynamo] Trace nn.Module __delattr__ (#159969 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159969 Approved by: https://github.com/atalman, https://github.com/malfet, https://github.com/StrongerXi	2025-08-06 23:43:19 +00:00
PyTorch MergeBot	cb4b29b754	Revert "[pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#159874 )" This reverts commit 9fd5b5f73589cf08dca60910368cc0f05c7906c8. Reverted https://github.com/pytorch/pytorch/pull/159874 on behalf of https://github.com/malfet due to Broke lint ([comment](https://github.com/pytorch/pytorch/pull/159874#issuecomment-3161896978))	2025-08-06 23:21:29 +00:00
drisspg	a6bc296207	[FlexAttention] Update the guard semantics for divisibility (#159884 ) We don't add guards unless we know (and another guard has ensured this) that this is a safe optimization Pull Request resolved: https://github.com/pytorch/pytorch/pull/159884 Approved by: https://github.com/Chillee	2025-08-06 23:12:44 +00:00
Thomas Bohnstingl	64dc30c213	[HOP, map] Rework of map autograd to the new interface (#153343 ) This PR reworks the current autograd implementation of map to the new interface. @pytorchbot label "topic: not user facing" Pull Request resolved: https://github.com/pytorch/pytorch/pull/153343 Approved by: https://github.com/ydwu4	2025-08-06 23:02:42 +00:00
Nathan Brown	93da9952a7	gloo: fix building system gloo with CUDA/HIP (#146637 ) Fix incorrect linking of Gloo's libraries when building with system Gloo. Previously, either Gloo's native library or Gloo's CUDA library were linked. However, Gloo had changed such that all users of Gloo must link the native library, and can optionally link the CUDA or HIP library for Gloo + CUDA/HIP support. This had been updated when building/linking with vendored Gloo, but not when using system Gloo. Fixes: #146239 Reported-by: Adam J Stewart <ajstewart426@gmail.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/146637 Approved by: https://github.com/malfet	2025-08-06 22:56:31 +00:00
christinaburge	3a2c3c8ed3	unskipped mobilenet_v3 quantization and mobilenet_v2 quantization plus tests from https://github.com/pytorch/pytorch/issues/125438 (#157786 ) These tests now pass on AArch64 in our downstream CI. `test_quantization.py::TestNumericSuiteEager::test_mobilenet_v2 <- test/quantization/eager/test_numeric_suite_eager.py PASSED [2.4434s] [ 35%]` Pull Request resolved: https://github.com/pytorch/pytorch/pull/157786 Approved by: https://github.com/jerryzh168, https://github.com/malfet	2025-08-06 22:41:07 +00:00
Jovian Anthony Jaison	9fd5b5f735	[pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#159874 ) Summary: Writing torch.compile worker logs to dedicated_log_rank{RANK} if we're running on mast. Test Plan: See: D79456310 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159874 Approved by: https://github.com/c00w	2025-08-06 22:33:04 +00:00
Xiaochang Wu	2507ae63f2	Partitioner: Fix to align partition node order with original graph (#157892 ) Fixes #157891 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157892 Approved by: https://github.com/ezyang	2025-08-06 22:12:47 +00:00
Lucas Kabela	40c4d61f9a	[Dynamo][Better Engineering] Typing `torch/_dynamo/guards.py` (#159315 ) As part of the better engineering effort, we would like to improve our type support to improve dev experience in dynamo. This PR adds strict typing support to `torch/_dynamo/guards.py` Running ``` mypy torch/_dynamo/guards.py --linecount-report /tmp/coverage_log ``` \| -------- \| Lines Annotated \| Lines Total \| % lines covered \| Funcs Annotated \| Funcs Total \| % funcs covered \| \| -------- \| ------- \| -------- \| ------- \| ------- \| ------- \| ------- \| \| Main \| 2030 \| 3945 \| 51.46% \| 70 \| 138 \| 50.72% \| \| This PR \| 4055 \| 4055 \| 100.00% \| 138 \| 138 \| 100.00% \| \| Delta \| +2025 \| +90 \| +48.54% \| +68 \| 0 \| +49.28% \| Pull Request resolved: https://github.com/pytorch/pytorch/pull/159315 Approved by: https://github.com/williamwen42, https://github.com/Skylion007	2025-08-06 21:52:14 +00:00
Tom Ritchford	a5725965ea	Remove unnecessary "# noqa: set_linter" comments (#159467 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159467 Approved by: https://github.com/eellison	2025-08-06 21:31:52 +00:00
Ruben Rodriguez Buchillon	289f62ce8a	[inductor][ez] fixup scaled_mm (#159948 ) Summary: This reverts the part of #159383 for scaled_mm where now, like before, we pass through the normal input_nodes (not the triton_input_nodes) to select_algorithm - #159383 refactored how kwargs are retrieved - it introduced this notion of KernelInputs that wrap input_nodes - scaled_mm uses unsqueezed input nodes for triton to retrieve params - the issue: it uses a squeezed (regular) bias for select_algorithm instead This fixes that by passing the original input nodes rather than the triton input nodes. Test Plan: ``` buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:fp8 -- --exact 'caffe2/test/inductor:fp8 - test_rowwise_scaling_shape_1024,1024,512_has_bias_True_use_fast_accum_True_persistent_matmul_False (caffe2.test.inductor.test_fp8.TestFP8Lowering)' buck test '@fbcode//mode/opt' fbcode//caffe2/test/inductor:fp8 -- --exact 'caffe2/test/inductor:fp8 - test_rowwise_scaling_shape_1024,1024,512_has_bias_True_use_fast_accum_True_persistent_matmul_True (caffe2.test.inductor.test_fp8.TestFP8Lowering)' ``` This set of tests was failing, and is passing now Side note: these tests were failing I believe because the unsqueezed bias made the ATEN choice no longer eligible, and there is some minor numerical discrepancy between ATEN and Triton for this. I'm not sure the test should be written like that, as we're implicitly relying on ATEN being the choice here. Reviewers: Subscribers: Tasks: Tags: Differential Revision: [D79717654](https://our.internmc.facebook.com/intern/diff/D79717654) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159948 Approved by: https://github.com/izaitsevfb, https://github.com/eellison	2025-08-06 21:25:48 +00:00
Nikita Shulga	512b4730e3	[EZ] Remove useless `cross_compile_arm64` (#159986 ) As we don't have any Intel Mac runners in CI for last 2+ years Pull Request resolved: https://github.com/pytorch/pytorch/pull/159986 Approved by: https://github.com/atalman	2025-08-06 21:01:05 +00:00
Xia, Weiwen	d2368aa6f3	[CPUBLAS] add macros for brgemm APIs for versioning (#158629 ) Summary Add macros for brgemm, so that callers (e.g., Torchao's cpp kernels) know which APIs are available. It is useful when callers need to co-work with old versions of PyTorch. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158629 Approved by: https://github.com/CaoE, https://github.com/Valentine233, https://github.com/ezyang	2025-08-06 20:54:05 +00:00
Mwiza Kunda	0afaeb7c4e	Improve `extract_test_fn` (#158637 ) The current implementation assumes test functions are resolved as test_module.TestClass.test_fn, however this would not work for modules nested in directories e.g. inductor.test_torchinductor.TestClass.test_fn Pull Request resolved: https://github.com/pytorch/pytorch/pull/158637 Approved by: https://github.com/jbschlosser	2025-08-06 20:45:21 +00:00
Alan Du	50580b5053	Add minimal nn.functional.log_softmax support for NestedTensor (#159662 ) This only works for the jagged layout and for the non-batch and non-jagged dimensions. I did this mostly by copy-pasting from the existing softmax implementation, but it seems fairly straightforward and I think it should work. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159662 Approved by: https://github.com/jbschlosser	2025-08-06 20:34:02 +00:00
Frank Seide	b8ef60b6bc	Enable XNNPACK aarch64 builds (#159762 ) Summary: This fixes the build of TorchScript's XNNPACK dependency for our aarch64 device. Thanks to andrewjcg for proposing this fix. Rollback Plan: Reviewed By: andrewjcg Differential Revision: D79497613 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159762 Approved by: https://github.com/frankseide, https://github.com/malfet Co-authored-by: Frank Seide <seide@meta.com>	2025-08-06 20:20:32 +00:00
Nikita Shulga	0de2a45a48	[BE] Merge 3 CUDA build jobs into one (#159890 ) Before this change there were build+test jobs: - sm89 build+tests - sm75 build+distributed_test - sm_75 build+pr_time_benchmark test This change compiles all 3 builds into one (for 2 architectures) and skips testing sm86 as it never found any new regressions that were not found at the same time on sm89 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159890 Approved by: https://github.com/clee2000, https://github.com/seemethere	2025-08-06 20:09:55 +00:00
xinan.lin	12a54e4ac1	[Inductor UT][Fix XPU CI] Fix case failures introduced by community. (#159759 ) Fixes #159631 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159759 Approved by: https://github.com/EikanWang, https://github.com/jansel	2025-08-06 20:02:20 +00:00
Nikita Shulga	d10e9e4781	[MPS] Remove all pre-MacOS14 logic (#159912 ) Delete older enums, checks for MacOS-13.3+ for int64 support, etc Fixes https://github.com/pytorch/pytorch/issues/159275 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159912 Approved by: https://github.com/manuelcandales	2025-08-06 19:48:12 +00:00
Xu Han	c71950907d	[inductor] add _get_inductor_debug_symbol_cflags for debug symbol control. (#159938 ) We need to add inductor debug symbol support for debugging crash cases. When debug symbol generation is turned on: On Windows, it should create a [module_name].pdb file, which helps debugging with WinDbg. On Linux, it should create some debug sections in the binary file. I also added a UT for it. It works well for Windows inductor debugging. <img width="1648" height="833" alt="image" src="https://github.com/user-attachments/assets/5282a7de-cef3-4a38-9cd4-a0e63482c8b6" /> Pull Request resolved: https://github.com/pytorch/pytorch/pull/159938 Approved by: https://github.com/jansel, https://github.com/angelayi	2025-08-06 19:31:45 +00:00
Divyansh Khanna	6fa3592dc6	Dataloader benchmark script (#159432 ) This script adds a simple dataloading benchmark tracking throughput and memory. The output looks like this ``` System Information: PyTorch version: 2.9.0a0+gitf87d117 PyTorch location: /home/divyanshkhanna/pytorch/torch/__init__.py Torchvision version: 0.24.0a0+f52c4f1 Torchvision location: /home/divyanshkhanna/pytorch/vision/torchvision/__init__.py CUDA available: True CUDA device: NVIDIA PG509-210 CPU count: 192 Physical CPU cores: 96 Total system memory: 1510.11 GB Loading dataset from imagenet/val (1 copies) Dataset size: 50000 --- Benchmarking DataLoader with worker_method=multiprocessing --- Memory before DataLoader creation: 500.59 MB Detailed memory information: USS (Unique Set Size): 499.00 MB PSS (Proportional Set Size): 500.74 MB RSS (Resident Set Size): 497.39 MB Memory after DataLoader creation: 1127.61 MB Memory increase: 627.02 MB Starting training loop with 1 epochs (max 100 batches per epoch) Epoch 1, Batch 10, Time: 0.2910s, Memory: 12044.50 MB Epoch 1, Batch 20, Time: 0.2909s, Memory: 12185.71 MB Epoch 1, Batch 30, Time: 0.2909s, Memory: 10654.93 MB Epoch 1, Batch 40, Time: 0.2909s, Memory: 12378.26 MB Epoch 1, Batch 50, Time: 0.2907s, Memory: 12402.28 MB Epoch 1, Batch 60, Time: 0.2909s, Memory: 10559.35 MB Epoch 1, Batch 70, Time: 0.2907s, Memory: 12644.69 MB Epoch 1, Batch 80, Time: 0.2909s, Memory: 12654.65 MB Epoch 1, Batch 90, Time: 0.2909s, Memory: 12727.20 MB Epoch 1, Batch 100, Time: 0.2908s, Memory: 12722.09 MB Results: Worker method: multiprocessing DataLoader init time: 0.1553 seconds Average batch time: 0.3408 seconds Samples per second: 375.53 Peak memory usage: 12738.76 MB Memory increase: 12238.17 MB ``` > TODO: This script right now is CPU-only friendly and GPU friendly. But it might be worth upgrading it to test against a canonical DistributedDataParallel setup on say a 1x8 node. 
Or maybe we can keep that as a separate script inside `benchmarks` Pull Request resolved: https://github.com/pytorch/pytorch/pull/159432 Approved by: https://github.com/ramanishsingh	2025-08-06 19:05:19 +00:00
PyTorch MergeBot	ba37f589d4	Revert "[dynamo] Be consistent with storing func source for UserMethodVariable (#159696 )" This reverts commit ee62177c196d716fc3a2d641370bed8a673a45d3. Reverted https://github.com/pytorch/pytorch/pull/159696 on behalf of https://github.com/anijain2305 due to broke internal tests ([comment](https://github.com/pytorch/pytorch/pull/159696#issuecomment-3161196192))	2025-08-06 18:41:05 +00:00
Bin Bao	44dd3684d2	[AOTI] Fix memory leak from all_reduce (#159818 ) Summary: This PR solves two issues: 1. When lowering the all_reduce op, Inductor expects to convert it to the in-place version, all_reduce_, but it was calling ir._AllReduceKernel.create_inplace instead of ir._AllReduce_Kernel.create_inplace. This triggers a tricky bug in AOTI because it generates a cpp call to the functional version aoti_torch_cpu__c10d_functional_all_reduce, but the later corresponding wait operation will still wait on the input to aoti_torch_cpu__c10d_functional_all_reduce instead of the output from aoti_torch_cpu__c10d_functional_all_reduce. This causes an unwaited tensor, leading to a memory leak. 2. Since AOTI generates the inplace version aoti_torch_cpu__c10d_functional_all_reduce_ now, the return tensor from aoti_torch_cpu__c10d_functional_all_reduce_ doesn't get used. It will be released when the program exits, so it's not a memory leak but it will unnecessarily hold that tensor, which causes a high memory water mark. This PR generates a tensor delete operation right after calling aoti_torch_cpu__c10d_functional_all_reduce_. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159818 Approved by: https://github.com/henryhu6, https://github.com/yushangdi	2025-08-06 18:11:14 +00:00
Georgia Phillips	c669b0ab87	Fix execution frame cleanup logic (#158717 ) Summary: This fixes a bug in the execution frame cleanup logic - previously, whenever we hit the time interval to clear out the frames, we were removing any cached execution frames beyond the configured minimum number (frameEntry.used was unused). Instead, we only want to clear frames that were NOT USED during the last time interval. This diff refactors the executor to have the correct logic. Test Plan: ``` buck2 test 'mode/dev-nosan' fbcode//sigmoid/inference/test_gpu:model_runner_test -- ModelRunnerTest.Basic_InterpreterCuda_Multithread_Cleanup --run-disabled --print-passing-details ``` Rollback Plan: Differential Revision: D78621408 Pull Request resolved: https://github.com/pytorch/pytorch/pull/158717 Approved by: https://github.com/dolpm	2025-08-06 18:04:24 +00:00
Luca Wehrstedt	d7a855d67d	[async-TP] Make scaled-mm + reduce-scatter preserve alignment of scales (#159957 ) After https://github.com/pytorch/pytorch/pull/157905 started using cuBLAS for row-wise scaling on CUDA 12.9+, this broke some downstream tests for fp8 which were testing "odd" shapes. After checking in with the cuBLAS team this turned out to be due to the scale tensors' starting addresses not being aligned to 16 bytes. PyTorch storages are always aligned at 256 bytes, hence this came from a "slicing" of the scale tensor being done inside async-TP when chunking a matmul in order to overlap it with reduce-scatter. Pull Request resolved: https://github.com/pytorch/pytorch/pull/159957 Approved by: https://github.com/vkuzo, https://github.com/danielvegamyhre	2025-08-06 17:42:26 +00:00
Meet Vadakkanchery	4c01991b38	[DCP][Prototype] Checkpoint replication via PGTransport (#157963 ) (#159801 ) Summary: ### PR Context Introduce simple replication logic via PGTransport. The goal is to showcase a working prototype of replication via PGTransport, in this impl we assume world_sizes are equal allowing us to create perfect bi-directional pairs for the purpose of choosing replica "partners". Test Plan: CI Rollback Plan: Differential Revision: D79590797 Pull Request resolved: https://github.com/pytorch/pytorch/pull/159801 Approved by: https://github.com/saumishr	2025-08-06 16:52:03 +00:00
Bin Bao	a4b07fe8f6	[AOTI] Add more default options to compile_standalone (#158560 ) Summary: When compiling for standalone, make embed_kernel_binary and emit_multi_arch_kernel default to True, and add a default name for model_name_for_generated_files to make the generated cpp project easier to understand. Also improved the weights object file naming to be more readable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158560 Approved by: https://github.com/yushangdi	2025-08-06 15:59:27 +00:00
Mikayla Gawarecki	d87161c3c8	[Easy] Fix wrong propagation of fallback_ops_dict in gen_aoti_c_shim (#159904 ) Pull Request resolved: https://github.com/pytorch/pytorch/pull/159904 Approved by: https://github.com/janeyx99	2025-08-06 15:09:18 +00:00

3926 changed files with 224829 additions and 132859 deletions

									
										15

.bc-linter.yml
									
										Normal file
									
												View File
												
				@ -0,0 +1,15 @@

				version: 1

				paths:

				include:

				  - "**/*.py"

				exclude:

				  - ".*"

				  - ".*/**"

				  - "**/.*/**"

				  - "**/.*"

				  - "**/_*/**"

				  - "**/_*.py"

				  - "**/test/**"

				  - "**/benchmarks/**"

				  - "**/test_*.py"

				  - "**/*_test.py"

									
										34

.ci/aarch64_linux/aarch64_ci_build.sh
									
												View File
												
				@ -3,8 +3,22 @@ set -eux -o pipefail

				GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

				if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then

				# Set CUDA architecture lists to match x86 build_cuda.sh

				if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then

				    export TORCH_CUDA_ARCH_LIST="8.0;9.0"

				elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then

				    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"

				elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then

				    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"

				elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then

				    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"

				fi

				# Compress the fatbin with -compress-mode=size for CUDA 13

				if [[ "$DESIRED_CUDA" == *"13"* ]]; then

				    export TORCH_NVCC_FLAGS="-compress-mode=size"

				    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801

				    export BUILD_BUNDLE_PTXAS=1

				fi

				SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

				@ -18,14 +32,22 @@ cd /

				# on the mounted pytorch repo

				git config --global --add safe.directory /pytorch

				pip install -r /pytorch/requirements.txt

				pip install auditwheel==6.2.0

				pip install auditwheel==6.2.0 wheel

				if [ "$DESIRED_CUDA" = "cpu" ]; then

				    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."

				    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files

				    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn

				    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn

				else

				    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"

				    export USE_SYSTEM_NCCL=1

				    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files

				    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda

				    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)

				    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then

				        echo "Bundling CUDA libraries with wheel for aarch64."

				    else

				        echo "Using nvidia libs from pypi for aarch64."

				        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"

				        export USE_NVIDIA_PYPI_LIBS=1

				    fi

				    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda

				fi

									
										305

.ci/aarch64_linux/aarch64_wheel_ci_build.py
									
												View File
												
				@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]:

				    return check_output(["ls", "-1", path]).decode().split("\n")

				def build_ArmComputeLibrary() -> None:

				    """

				    Using ArmComputeLibrary for aarch64 PyTorch

				    """

				    print("Building Arm Compute Library")

				    acl_build_flags = [

				        "debug=0",

				        "neon=1",

				        "opencl=0",

				        "os=linux",

				        "openmp=1",

				        "cppthreads=0",

				        "arch=armv8a",

				        "multi_isa=1",

				        "fixed_format_kernels=1",

				        "build=native",

				    ]

				    acl_install_dir = "/acl"

				    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")

				    if os.path.isdir(acl_install_dir):

				        shutil.rmtree(acl_install_dir)

				    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):

				        check_call(

				            [

				                "git",

				                "clone",

				                "https://github.com/ARM-software/ComputeLibrary.git",

				                "-b",

				                "v25.02",

				                "--depth",

				                "1",

				                "--shallow-submodules",

				            ]

				        )

				    check_call(

				        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,

				        cwd=acl_checkout_dir,

				    )

				    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:

				        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")

				def replace_tag(filename) -> None:

				    with open(filename) as f:

				        lines = f.readlines()

				@ -69,61 +26,186 @@ def replace_tag(filename) -> None:

				        f.writelines(lines)

				def patch_library_rpath(

				    folder: str,

				    lib_name: str,

				    use_nvidia_pypi_libs: bool = False,

				    desired_cuda: str = "",

				) -> None:

				    """Apply patchelf to set RPATH for a library in torch/lib"""

				    lib_path = f"{folder}/tmp/torch/lib/{lib_name}"

				    if use_nvidia_pypi_libs:

				        # For PyPI NVIDIA libraries, construct CUDA RPATH

				        cuda_rpaths = [

				            "$ORIGIN/../../nvidia/cudnn/lib",

				            "$ORIGIN/../../nvidia/nvshmem/lib",

				            "$ORIGIN/../../nvidia/nccl/lib",

				            "$ORIGIN/../../nvidia/cusparselt/lib",

				        ]

				        if "130" in desired_cuda:

				            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")

				        else:

				            cuda_rpaths.extend(

				                [

				                    "$ORIGIN/../../nvidia/cublas/lib",

				                    "$ORIGIN/../../nvidia/cuda_cupti/lib",

				                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",

				                    "$ORIGIN/../../nvidia/cuda_runtime/lib",

				                    "$ORIGIN/../../nvidia/cufft/lib",

				                    "$ORIGIN/../../nvidia/curand/lib",

				                    "$ORIGIN/../../nvidia/cusolver/lib",

				                    "$ORIGIN/../../nvidia/cusparse/lib",

				                    "$ORIGIN/../../nvidia/nvtx/lib",

				                    "$ORIGIN/../../nvidia/cufile/lib",

				                ]

				            )

				        # Add $ORIGIN for local torch libs

				        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"

				    else:

				        # For bundled libraries, just use $ORIGIN

				        rpath = "$ORIGIN"

				    if os.path.exists(lib_path):

				        os.system(

				            f"cd {folder}/tmp/torch/lib/; "

				            f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"

				        )

				def copy_and_patch_library(

				    src_path: str,

				    folder: str,

				    use_nvidia_pypi_libs: bool = False,

				    desired_cuda: str = "",

				) -> None:

				    """Copy a library to torch/lib and patch its RPATH"""

				    if os.path.exists(src_path):

				        lib_name = os.path.basename(src_path)

				        shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")

				        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)

				def package_cuda_wheel(wheel_path, desired_cuda) -> None:

				    """

				    Package the cuda wheel libraries

				    """

				    folder = os.path.dirname(wheel_path)

				    wheelname = os.path.basename(wheel_path)

				    os.mkdir(f"{folder}/tmp")

				    os.system(f"unzip {wheel_path} -d {folder}/tmp")

				    libs_to_copy = [

				        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",

				        "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",

				        "/usr/local/cuda/lib64/libcudnn.so.9",

				        "/usr/local/cuda/lib64/libcublas.so.12",

				        "/usr/local/cuda/lib64/libcublasLt.so.12",

				        "/usr/local/cuda/lib64/libcudart.so.12",

				        "/usr/local/cuda/lib64/libcufft.so.11",

				        "/usr/local/cuda/lib64/libcusparse.so.12",

				        "/usr/local/cuda/lib64/libcusparseLt.so.0",

				        "/usr/local/cuda/lib64/libcusolver.so.11",

				        "/usr/local/cuda/lib64/libcurand.so.10",

				        "/usr/local/cuda/lib64/libnccl.so.2",

				        "/usr/local/cuda/lib64/libnvJitLink.so.12",

				        "/usr/local/cuda/lib64/libnvrtc.so.12",

				        "/usr/local/cuda/lib64/libcudnn_adv.so.9",

				        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",

				        "/usr/local/cuda/lib64/libcudnn_graph.so.9",

				        "/usr/local/cuda/lib64/libcudnn_ops.so.9",

				        "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",

				        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",

				        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",

				        "/lib64/libgomp.so.1",

				        "/usr/lib64/libgfortran.so.5",

				        "/acl/build/libarm_compute.so",

				        "/acl/build/libarm_compute_graph.so",

				        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",

				        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",

				        "/usr/local/lib/libnvpl_lapack_core.so.0",

				        "/usr/local/lib/libnvpl_blas_core.so.0",

				    ]

				    # Delete original wheel since it will be repackaged

				    os.system(f"rm {wheel_path}")

				    if "129" in desired_cuda:

				        libs_to_copy += [

				            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",

				            "/usr/local/cuda/lib64/libcufile.so.0",

				            "/usr/local/cuda/lib64/libcufile_rdma.so.1",

				    # Check if we should use PyPI NVIDIA libraries or bundle system libraries

				    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"

				    if use_nvidia_pypi_libs:

				        print("Using nvidia libs from pypi - skipping CUDA library bundling")

				        # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages

				        # We only need to bundle non-NVIDIA libraries

				        minimal_libs_to_copy = [

				            "/lib64/libgomp.so.1",

				            "/usr/lib64/libgfortran.so.5",

				            "/acl/build/libarm_compute.so",

				            "/acl/build/libarm_compute_graph.so",

				            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",

				            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",

				            "/usr/local/lib/libnvpl_lapack_core.so.0",

				            "/usr/local/lib/libnvpl_blas_core.so.0",

				        ]

				    # Copy libraries to unzipped_folder/a/lib

				    for lib_path in libs_to_copy:

				        lib_name = os.path.basename(lib_path)

				        shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}")

				        os.system(

				            f"cd {folder}/tmp/torch/lib/; "

				            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"

				        )

				        # Copy minimal libraries to unzipped_folder/torch/lib

				        for lib_path in minimal_libs_to_copy:

				            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)

				        # Patch torch libraries used for searching libraries

				        torch_libs_to_patch = [

				            "libtorch.so",

				            "libtorch_cpu.so",

				            "libtorch_cuda.so",

				            "libtorch_cuda_linalg.so",

				            "libtorch_global_deps.so",

				            "libtorch_python.so",

				            "libtorch_nvshmem.so",

				            "libc10.so",

				            "libc10_cuda.so",

				            "libcaffe2_nvrtc.so",

				            "libshm.so",

				        ]

				        for lib_name in torch_libs_to_patch:

				            patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)

				    else:

				        print("Bundling CUDA libraries with wheel")

				        # Original logic for bundling system CUDA libraries

				        # Common libraries for all CUDA versions

				        common_libs = [

				            # Non-NVIDIA system libraries

				            "/lib64/libgomp.so.1",

				            "/usr/lib64/libgfortran.so.5",

				            "/acl/build/libarm_compute.so",

				            "/acl/build/libarm_compute_graph.so",

				            # Common CUDA libraries (same for all versions)

				            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",

				            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",

				            "/usr/local/lib/libnvpl_lapack_core.so.0",

				            "/usr/local/lib/libnvpl_blas_core.so.0",

				            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",

				            "/usr/local/cuda/lib64/libcudnn.so.9",

				            "/usr/local/cuda/lib64/libcusparseLt.so.0",

				            "/usr/local/cuda/lib64/libcurand.so.10",

				            "/usr/local/cuda/lib64/libnccl.so.2",

				            "/usr/local/cuda/lib64/libnvshmem_host.so.3",

				            "/usr/local/cuda/lib64/libcudnn_adv.so.9",

				            "/usr/local/cuda/lib64/libcudnn_cnn.so.9",

				            "/usr/local/cuda/lib64/libcudnn_graph.so.9",

				            "/usr/local/cuda/lib64/libcudnn_ops.so.9",

				            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",

				            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",

				            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",

				            "/usr/local/cuda/lib64/libcufile.so.0",

				            "/usr/local/cuda/lib64/libcufile_rdma.so.1",

				            "/usr/local/cuda/lib64/libcusparse.so.12",

				        ]

				        # CUDA version-specific libraries

				        if "13" in desired_cuda:

				            minor_version = desired_cuda[-1]

				            version_specific_libs = [

				                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",

				                "/usr/local/cuda/lib64/libcublas.so.13",

				                "/usr/local/cuda/lib64/libcublasLt.so.13",

				                "/usr/local/cuda/lib64/libcudart.so.13",

				                "/usr/local/cuda/lib64/libcufft.so.12",

				                "/usr/local/cuda/lib64/libcusolver.so.12",

				                "/usr/local/cuda/lib64/libnvJitLink.so.13",

				                "/usr/local/cuda/lib64/libnvrtc.so.13",

				                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",

				            ]

				        elif "12" in desired_cuda:

				            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")

				            minor_version = desired_cuda[-1]

				            version_specific_libs = [

				                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",

				                "/usr/local/cuda/lib64/libcublas.so.12",

				                "/usr/local/cuda/lib64/libcublasLt.so.12",

				                "/usr/local/cuda/lib64/libcudart.so.12",

				                "/usr/local/cuda/lib64/libcufft.so.11",

				                "/usr/local/cuda/lib64/libcusolver.so.11",

				                "/usr/local/cuda/lib64/libnvJitLink.so.12",

				                "/usr/local/cuda/lib64/libnvrtc.so.12",

				                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",

				            ]

				        else:

				            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

				        # Combine all libraries

				        libs_to_copy = common_libs + version_specific_libs

				        # Copy libraries to unzipped_folder/torch/lib

				        for lib_path in libs_to_copy:

				            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)

				    # Make sure the wheel is tagged with manylinux_2_28

				    for f in os.scandir(f"{folder}/tmp/"):

				@ -131,14 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:

				            replace_tag(f"{f.path}/WHEEL")

				            break

				    os.mkdir(f"{folder}/cuda_wheel")

				    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")

				    shutil.move(

				        f"{folder}/cuda_wheel/{wheelname}",

				        f"{folder}/{wheelname}",

				        copy_function=shutil.copy2,

				    )

				    os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/")

				    os.system(f"wheel pack {folder}/tmp/ -d {folder}")

				    os.system(f"rm -rf {folder}/tmp/")

				def complete_wheel(folder: str) -> str:

				@ -161,14 +237,7 @@ def complete_wheel(folder: str) -> str:

				            f"/{folder}/dist/{repaired_wheel_name}",

				        )

				    else:

				        repaired_wheel_name = wheel_name.replace(

				            "linux_aarch64", "manylinux_2_28_aarch64"

				        )

				        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")

				        os.rename(

				            f"/{folder}/dist/{wheel_name}",

				            f"/{folder}/dist/{repaired_wheel_name}",

				        )

				        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]

				    print(f"Copying {repaired_wheel_name} to artifacts")

				    shutil.copy2(

				@ -205,10 +274,20 @@ if __name__ == "__main__":

				    ).decode()

				    print("Building PyTorch wheel")

				    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "

				    build_vars = ""

				    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)

				    if enable_cuda:

				        build_vars = "MAX_JOBS=5 " + build_vars

				        build_vars += "MAX_JOBS=5 "

				        # Handle PyPI NVIDIA libraries vs bundled libraries

				        use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"

				        if use_nvidia_pypi_libs:

				            print("Configuring build for PyPI NVIDIA libraries")

				            # Configure for dynamic linking (matching x86 logic)

				            build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "

				        else:

				            print("Configuring build for bundled NVIDIA libraries")

				            # Keep existing static linking approach - already configured above

				    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")

				    desired_cuda = os.getenv("DESIRED_CUDA")

				@ -234,23 +313,17 @@ if __name__ == "__main__":

				        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

				    if enable_mkldnn:

				        build_ArmComputeLibrary()

				        print("build pytorch with mkldnn+acl backend")

				        build_vars += (

				            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "

				            "ACL_ROOT_DIR=/acl "

				            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "

				            "ACL_INCLUDE_DIR=/acl/build "

				            "ACL_LIBRARY=/acl/build "

				        )

				        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "

				        build_vars += "ACL_ROOT_DIR=/acl "

				        if enable_cuda:

				            build_vars += "BLAS=NVPL "

				        else:

				            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "

				            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "

				    else:

				        print("build pytorch without mkldnn backend")

				    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")

				    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")

				    if enable_cuda:

				        print("Updating Cuda Dependency")

				        filename = os.listdir("/pytorch/dist/")

									
										80

.ci/aarch64_linux/build_aarch64_wheel.py
									
												View File
												
				@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):

				        try:

				            with socket.create_connection((addr, port), timeout=timeout):

				                return

				        except (ConnectionRefusedError, socket.timeout):  # noqa: PERF203

				        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203

				            if i == attempt_cnt - 1:

				                raise

				            time.sleep(timeout)

				@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:

				        )

				def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:

				    print("Building OpenBLAS")

				    host.run_cmd(

				        f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"

				    )

				    make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"

				    host.run_cmd(

				        f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS"

				    )

				def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:

				    print("Building Arm Compute Library")

				    acl_build_flags = " ".join(

				        [

				            "debug=0",

				            "neon=1",

				            "opencl=0",

				            "os=linux",

				            "openmp=1",

				            "cppthreads=0",

				            "arch=armv8a",

				            "multi_isa=1",

				            "fixed_format_kernels=1",

				            "build=native",

				        ]

				    )

				    host.run_cmd(

				        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"

				    )

				    host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")

				def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:

				    host.run_cmd("pip3 install auditwheel")

				    host.run_cmd(

				@ -438,13 +404,11 @@ def build_torchvision(

				        )

				        build_vars += f"BUILD_VERSION={version}.dev{build_date}"

				    elif build_version is not None:

				        build_vars += (

				            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"

				        )

				        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"

				    if host.using_docker():

				        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

				    host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel")

				    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")

				    vision_wheel_name = host.list_dir("vision/dist")[0]

				    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))

				@ -495,13 +459,11 @@ def build_torchdata(

				        )

				        build_vars += f"BUILD_VERSION={version}.dev{build_date}"

				    elif build_version is not None:

				        build_vars += (

				            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"

				        )

				        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"

				    if host.using_docker():

				        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

				    host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel")

				    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")

				    wheel_name = host.list_dir("data/dist")[0]

				    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))

				@ -553,13 +515,11 @@ def build_torchtext(

				        )

				        build_vars += f"BUILD_VERSION={version}.dev{build_date}"

				    elif build_version is not None:

				        build_vars += (

				            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"

				        )

				        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"

				    if host.using_docker():

				        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

				    host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel")

				    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")

				    wheel_name = host.list_dir("text/dist")[0]

				    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))

				@ -613,16 +573,14 @@ def build_torchaudio(

				        )

				        build_vars += f"BUILD_VERSION={version}.dev{build_date}"

				    elif build_version is not None:

				        build_vars += (

				            f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-')[0]}"

				        )

				        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"

				    if host.using_docker():

				        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

				    host.run_cmd(

				        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \

				        && ./packaging/ffmpeg/build.sh \

				        && {build_vars} python3 setup.py bdist_wheel"

				        && {build_vars} python3 -m build --wheel --no-isolation"

				    )

				    wheel_name = host.list_dir("audio/dist")[0]

				@ -708,7 +666,6 @@ def start_build(

				    configure_system(

				        host, compiler=compiler, use_conda=use_conda, python_version=python_version

				    )

				    build_OpenBLAS(host, git_clone_flags)

				    if host.using_docker():

				        print("Move libgfortant.a into a standard location")

				@ -731,10 +688,12 @@ def start_build(

				        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"

				    )

				    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")

				    print("Building PyTorch wheel")

				    build_opts = ""

				    if pytorch_build_number is not None:

				        build_opts += f" --build-number {pytorch_build_number}"

				        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"

				    # Breakpad build fails on aarch64

				    build_vars = "USE_BREAKPAD=0 "

				    if branch == "nightly":

				@ -751,15 +710,18 @@ def start_build(

				    if host.using_docker():

				        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

				    if enable_mkldnn:

				        build_ArmComputeLibrary(host, git_clone_flags)

				        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")

				        print("build pytorch with mkldnn+acl backend")

				        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"

				        build_vars += " BLAS=OpenBLAS"

				        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"

				        build_vars += " ACL_ROOT_DIR=/acl"

				        host.run_cmd(

				            f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"

				            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"

				        )

				        print("Repair the wheel")

				        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]

				        ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"

				        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"

				        host.run_cmd(

				            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"

				        )

				@ -771,7 +733,7 @@ def start_build(

				    else:

				        print("build pytorch without mkldnn backend")

				        host.run_cmd(

				            f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"

				            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"

				        )

				    print("Deleting build folder")

				@ -915,7 +877,7 @@ def terminate_instances(instance_type: str) -> None:

				def parse_arguments():

				    from argparse import ArgumentParser

				    parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")

				    parser = ArgumentParser("Build and test AARCH64 wheels using EC2")

				    parser.add_argument("--key-name", type=str)

				    parser.add_argument("--debug", action="store_true")

				    parser.add_argument("--build-only", action="store_true")

				@ -1012,7 +974,7 @@ if __name__ == "__main__":

				        install_condaforge_python(host, args.python_version)

				        sys.exit(0)

				    python_version = args.python_version if args.python_version is not None else "3.9"

				    python_version = args.python_version if args.python_version is not None else "3.10"

				    if args.use_torch_from_pypi:

				        configure_system(host, compiler=args.compiler, python_version=python_version)

									
										4

.ci/docker/README.md
									
												View File
												
				@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit

				   If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:

				   ```bash

				   docker build \

				      ....

				      --build-arg "NEW_ARG_1=${NEW_ARG_1}"

				     ....

				     --build-arg "NEW_ARG_1=${NEW_ARG_1}"

				   ```

				3. **Update Dockerfile logic**:

									
										9

.ci/docker/almalinux/Dockerfile
									
												View File
												
				@ -64,8 +64,13 @@ FROM cuda as cuda12.9

				RUN bash ./install_cuda.sh 12.9

				ENV DESIRED_CUDA=12.9

				FROM cuda as cuda13.0

				RUN bash ./install_cuda.sh 13.0

				ENV DESIRED_CUDA=13.0

				FROM ${ROCM_IMAGE} as rocm

				ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"

				ARG PYTORCH_ROCM_ARCH

				ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}

				ADD ./common/install_mkl.sh install_mkl.sh

				RUN bash ./install_mkl.sh && rm install_mkl.sh

				ENV MKLROOT /opt/intel

				@ -76,10 +81,10 @@ ADD ./common/install_mnist.sh install_mnist.sh

				RUN bash ./install_mnist.sh

				FROM base as all_cuda

				COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8

				COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6

				COPY --from=cuda12.8  /usr/local/cuda-12.8 /usr/local/cuda-12.8

				COPY --from=cuda12.9  /usr/local/cuda-12.9 /usr/local/cuda-12.9

				COPY --from=cuda13.0  /usr/local/cuda-13.0 /usr/local/cuda-13.0

				# Final step

				FROM ${BASE_TARGET} as final

									
										6

.ci/docker/almalinux/build.sh
									
												View File
												
				@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in

				    ;;

				  rocm*)

				    BASE_TARGET=rocm

				    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"

				    # add gfx950, gfx115x conditionally starting in ROCm 7.0

				    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then

				        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"

				    fi

				    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"

				    ;;

				  *)

				    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"

									
										137

.ci/docker/build.sh
									
												View File
												
				@ -76,13 +76,16 @@ elif [[ "$image" == *cuda*linter* ]]; then

				elif [[ "$image" == *linter* ]]; then

				  # Use a separate Dockerfile for linter to keep a small image size

				  DOCKERFILE="linter/Dockerfile"

				elif [[ "$image" == *riscv* ]]; then

				  # Use RISC-V specific Dockerfile

				  DOCKERFILE="ubuntu-cross-riscv/Dockerfile"

				fi

				_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb

				_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b

				_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152

				_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96

				if [[ "$image" == *rocm* ]]; then

				  _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6

				  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d

				  _UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e

				  _UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77

				fi

				tag=$(echo $image | awk -F':' '{print $2}')

				@ -110,6 +113,17 @@ case "$tag" in

				    UCX_COMMIT=${_UCX_COMMIT}

				    UCC_COMMIT=${_UCC_COMMIT}

				    TRITON=yes

				    INSTALL_MINGW=yes

				    ;;

				  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)

				    CUDA_VERSION=13.0.0

				    ANACONDA_PYTHON_VERSION=3.10

				    GCC_VERSION=11

				    VISION=yes

				    KATEX=yes

				    UCX_COMMIT=${_UCX_COMMIT}

				    UCC_COMMIT=${_UCC_COMMIT}

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)

				    CUDA_VERSION=12.8.1

				@ -122,28 +136,6 @@ case "$tag" in

				    TRITON=yes

				    INDUCTOR_BENCHMARKS=yes

				    ;;

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)

				    CUDA_VERSION=12.8.1

				    ANACONDA_PYTHON_VERSION=3.12

				    GCC_VERSION=9

				    VISION=yes

				    KATEX=yes

				    UCX_COMMIT=${_UCX_COMMIT}

				    UCC_COMMIT=${_UCC_COMMIT}

				    TRITON=yes

				    INDUCTOR_BENCHMARKS=yes

				    ;;

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)

				    CUDA_VERSION=12.8.1

				    ANACONDA_PYTHON_VERSION=3.13

				    GCC_VERSION=9

				    VISION=yes

				    KATEX=yes

				    UCX_COMMIT=${_UCX_COMMIT}

				    UCC_COMMIT=${_UCC_COMMIT}

				    TRITON=yes

				    INDUCTOR_BENCHMARKS=yes

				    ;;

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm)

				    CUDA_VERSION=12.8.1

				    ANACONDA_PYTHON_VERSION=3.12

				@ -165,18 +157,18 @@ case "$tag" in

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-py3-clang12-onnx)

				    ANACONDA_PYTHON_VERSION=3.9

				    ANACONDA_PYTHON_VERSION=3.10

				    CLANG_VERSION=12

				    VISION=yes

				    ONNX=yes

				    ;;

				  pytorch-linux-jammy-py3.9-clang12)

				    ANACONDA_PYTHON_VERSION=3.9

				  pytorch-linux-jammy-py3.10-clang12)

				    ANACONDA_PYTHON_VERSION=3.10

				    CLANG_VERSION=12

				    VISION=yes

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-noble-rocm-n-py3)

				  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)

				    if [[ $tag =~ "jammy" ]]; then

				      ANACONDA_PYTHON_VERSION=3.10

				    else

				@ -184,45 +176,35 @@ case "$tag" in

				    fi

				    GCC_VERSION=11

				    VISION=yes

				    ROCM_VERSION=6.4

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    KATEX=yes

				    UCX_COMMIT=${_UCX_COMMIT}

				    UCC_COMMIT=${_UCC_COMMIT}

				    INDUCTOR_BENCHMARKS=yes

				    ;;

				  pytorch-linux-noble-rocm-alpha-py3)

				    ANACONDA_PYTHON_VERSION=3.12

				    GCC_VERSION=11

				    VISION=yes

				    ROCM_VERSION=7.0

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    KATEX=yes

				    UCX_COMMIT=${_UCX_COMMIT}

				    UCC_COMMIT=${_UCC_COMMIT}

				    INDUCTOR_BENCHMARKS=yes

				    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"

				    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"

				    if [[ $tag =~ "benchmarks" ]]; then

				      INDUCTOR_BENCHMARKS=yes

				    fi

				    ;;

				  pytorch-linux-jammy-xpu-2025.0-py3)

				    ANACONDA_PYTHON_VERSION=3.9

				    GCC_VERSION=11

				    VISION=yes

				    XPU_VERSION=2025.0

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-xpu-2025.1-py3)

				    ANACONDA_PYTHON_VERSION=3.9

				  pytorch-linux-jammy-xpu-n-1-py3)

				    ANACONDA_PYTHON_VERSION=3.10

				    GCC_VERSION=11

				    VISION=yes

				    XPU_VERSION=2025.1

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)

				    ANACONDA_PYTHON_VERSION=3.9

				  pytorch-linux-jammy-xpu-n-py3)

				    ANACONDA_PYTHON_VERSION=3.10

				    GCC_VERSION=11

				    VISION=yes

				    XPU_VERSION=2025.2

				    NINJA_VERSION=1.9.0

				    TRITON=yes

				    ;;

				  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)

				    ANACONDA_PYTHON_VERSION=3.10

				    GCC_VERSION=11

				    VISION=yes

				    KATEX=yes

				@ -230,8 +212,8 @@ case "$tag" in

				    DOCS=yes

				    INDUCTOR_BENCHMARKS=yes

				    ;;

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)

				    ANACONDA_PYTHON_VERSION=3.9

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)

				    ANACONDA_PYTHON_VERSION=3.10

				    CUDA_VERSION=12.8.1

				    CLANG_VERSION=12

				    VISION=yes

				@ -242,8 +224,8 @@ case "$tag" in

				    CLANG_VERSION=18

				    VISION=yes

				    ;;

				  pytorch-linux-jammy-py3.9-gcc11)

				    ANACONDA_PYTHON_VERSION=3.9

				  pytorch-linux-jammy-py3.10-gcc11)

				    ANACONDA_PYTHON_VERSION=3.10

				    GCC_VERSION=11

				    VISION=yes

				    KATEX=yes

				@ -270,13 +252,10 @@ case "$tag" in

				    TRITON_CPU=yes

				    ;;

				  pytorch-linux-jammy-linter)

				    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.

				    # We will need to update mypy version eventually, but that's for another day. The task

				    # would be to upgrade mypy to 1.0.0 with Python 3.11

				    PYTHON_VERSION=3.9

				    PYTHON_VERSION=3.10

				    ;;

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)

				    PYTHON_VERSION=3.9

				  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)

				    PYTHON_VERSION=3.10

				    CUDA_VERSION=12.8.1

				    ;;

				  pytorch-linux-jammy-aarch64-py3.10-gcc11)

				@ -284,7 +263,6 @@ case "$tag" in

				    GCC_VERSION=11

				    ACL=yes

				    VISION=yes

				    CONDA_CMAKE=yes

				    OPENBLAS=yes

				    # snadampal: skipping llvm src build install because the current version

				    # from pytorch/llvm:9.0.1 is x86 specific

				@ -295,13 +273,15 @@ case "$tag" in

				    GCC_VERSION=11

				    ACL=yes

				    VISION=yes

				    CONDA_CMAKE=yes

				    OPENBLAS=yes

				    # snadampal: skipping llvm src build install because the current version

				    # from pytorch/llvm:9.0.1 is x86 specific

				    SKIP_LLVM_SRC_BUILD_INSTALL=yes

				    INDUCTOR_BENCHMARKS=yes

				    ;;

				  pytorch-linux-noble-riscv64-py3.12-gcc14)

				    GCC_VERSION=14

				    ;;

				  *)

				    # Catch-all for builds that are not hardcoded.

				    VISION=yes

				@ -365,7 +345,7 @@ docker build \

				       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \

				       --build-arg "KATEX=${KATEX:-}" \

				       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \

				       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \

				       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \

				       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \

				       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \

				       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \

				@ -382,6 +362,7 @@ docker build \

				       --build-arg "OPENBLAS=${OPENBLAS:-}" \

				       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \

				       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \

				       --build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \

				       -f $(dirname ${DOCKERFILE})/Dockerfile \

				       -t "$tmp_tag" \

				       "$@" \

				@ -422,7 +403,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

				fi

				if [ -n "$GCC_VERSION" ]; then

				  if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then

				  if [[ "$image" == *riscv* ]]; then

				    # Check RISC-V cross-compilation toolchain version

				    if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then

				      echo "RISC-V GCC_VERSION=$GCC_VERSION, but:"

				      drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version

				      exit 1

				    fi

				  elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then

				    echo "GCC_VERSION=$GCC_VERSION, but:"

				    drun gcc --version

				    exit 1

				@ -455,12 +443,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then

				  echo "expecting triton to not be installed, but it is"

				  exit 1

				fi

				# Sanity check cmake version.  Executorch reinstalls cmake and I'm not sure if

				# they support 4.0.0 yet, so exclude them from this check.

				CMAKE_VERSION=$(drun cmake --version)

				if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then

				  echo "CMake version is not 4.0.0:"

				  drun cmake --version

				  exit 1

				fi

									
										6

.ci/docker/centos-rocm/Dockerfile
									
												View File
												
				@ -56,9 +56,13 @@ ENV INSTALLED_VISION ${VISION}

				# Install rocm

				ARG ROCM_VERSION

				RUN mkdir ci_commit_pins

				COPY ./common/common_utils.sh common_utils.sh

				COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt

				COPY ./common/install_rocm.sh install_rocm.sh

				RUN bash ./install_rocm.sh

				RUN rm install_rocm.sh

				RUN rm install_rocm.sh common_utils.sh

				RUN rm -r ci_commit_pins

				COPY ./common/install_rocm_magma.sh install_rocm_magma.sh

				RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}

				RUN rm install_rocm_magma.sh

2

.ci/docker/ci_commit_pins/executorch.txt

View File

 @ -1 +1 @@
 aa978594cc155fa8af48cd949f5b5f1823a
 deb42f2a8e48f5032b4a98ee781a15fa87a157cf

2

.ci/docker/ci_commit_pins/huggingface-requirements.txt Normal file

View File

 @ -0,0 +1,2 @@
 transformers==4.56.0
 soxr==0.5.0

1

.ci/docker/ci_commit_pins/huggingface.txt

View File

				`@ -1 +0,0 @@`
				`243e186efbf7fb93328dd6b34927a4e8c8f24395`

2

.ci/docker/ci_commit_pins/nccl-cu12.txt

View File

 @ -1 +1 @@
 v2.27.5-1
 v2.27.5-1

1

.ci/docker/ci_commit_pins/nccl-cu13.txt Normal file

View File

				`@ -0,0 +1 @@`
				`v2.27.7-1`

1

.ci/docker/ci_commit_pins/rocm-composable-kernel.txt Normal file

View File

				`@ -0,0 +1 @@`
				`7fe50dc3da2069d6645d9deb8c017a876472a977`

2

.ci/docker/ci_commit_pins/torchbench.txt

View File

 @ -1 +1 @@
 e03a63be43e33596f7f0a43b0f530353785e4a59
 a23feff57432129df84d8099e622773cf77925

2

.ci/docker/ci_commit_pins/triton-xpu.txt

View File

 @ -1 +1 @@
 ae324eeac8e102a2b40370e341460f3791353398
 b0418a9a454b2b93ab8d71f40e59d2297157fae

2

.ci/docker/ci_commit_pins/triton.txt

View File

 @ -1 +1 @@
 f7888497a1eb9e98d4c07537f0d0bcfe180d1363
 ffcb92cdbe98d9f97e4e6f95247e46dfc9fd

									
										27

.ci/docker/common/install_acl.sh
									
										Normal file → Executable file
									
												View File
												
				@ -1,16 +1,27 @@

				set -euo pipefail

				#!/bin/bash

				# Script used only in CD pipeline

				readonly version=v25.02

				readonly src_host=https://github.com/ARM-software

				readonly src_repo=ComputeLibrary

				set -eux

				ACL_VERSION=${ACL_VERSION:-"v25.02"}

				ACL_INSTALL_DIR="/acl"

				# Clone ACL

				[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git

				cd ${src_repo}

				git checkout $version

				git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules

				ACL_CHECKOUT_DIR="ComputeLibrary"

				# Build with scons

				pushd $ACL_CHECKOUT_DIR

				scons -j8  Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \

				  os=linux arch=armv8a build=native multi_isa=1 \

				  fixed_format_kernels=1 openmp=1 cppthreads=0

				popd

				# Install ACL

				sudo mkdir -p ${ACL_INSTALL_DIR}

				for d in arm_compute include utils support src build

				do

				  sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}

				done

				rm -rf $ACL_CHECKOUT_DIR

									
										9

.ci/docker/common/install_cpython.sh
									
												View File
												
				@ -66,8 +66,9 @@ function do_cpython_build {

				        ln -s pip3 ${prefix}/bin/pip

				    fi

				    # install setuptools since python 3.12 is required to use distutils

				    ${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0

				    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")

				    # packaging is needed to create symlink since wheel no longer provides needed information

				    ${prefix}/bin/pip install packaging==25.0 wheel==0.45.1 setuptools==80.9.0

				    local abi_tag=$(${prefix}/bin/python -c "from packaging.tags import interpreter_name, interpreter_version; import sysconfig ; from sysconfig import get_config_var; print('{0}{1}-{0}{1}{2}'.format(interpreter_name(), interpreter_version(), 't' if sysconfig.get_config_var('Py_GIL_DISABLED') else ''))")

				    ln -sf ${prefix} /opt/python/${abi_tag}

				}

				@ -82,10 +83,6 @@ function build_cpython {

				        py_suffix=${py_ver::-1}

				        py_folder=$py_suffix

				    fi

				    # Only b3 is available now

				    if [ "$py_suffix" == "3.14.0" ]; then

				        py_suffix="3.14.0b3"

				    fi

				    wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz

				    do_cpython_build $py_ver Python-$py_suffix

									
										106

.ci/docker/common/install_cuda.sh
									
												View File
												
				@ -10,7 +10,7 @@ else

				  arch_path='sbsa'

				fi

				NVSHMEM_VERSION=3.3.9

				NVSHMEM_VERSION=3.3.24

				function install_cuda {

				  version=$1

				@ -62,14 +62,16 @@ function install_nvshmem {

				  mkdir -p "${tmpdir}" && cd "${tmpdir}"

				  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html

				  filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"

				  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"

				  # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshmem-ver

				  filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"

				  suffix=".tar.xz"

				  url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"

				  # download, unpack, install

				  wget -q "${url}"

				  tar xf "${filename}.tar.gz"

				  cp -a "libnvshmem/include/"* /usr/local/cuda/include/

				  cp -a "libnvshmem/lib/"*     /usr/local/cuda/lib64/

				  tar xf "${filename}${suffix}"

				  cp -a "${filename}/include/"* /usr/local/cuda/include/

				  cp -a "${filename}/lib/"*     /usr/local/cuda/lib64/

				  # cleanup

				  cd ..

				@ -126,74 +128,6 @@ function install_129 {

				  ldconfig

				}

				# Prunes the CUDA 12.4 install under /usr/local/cuda-12.4.
				#
				# Static libraries in lib64 are rewritten in place by nvprune (per NVIDIA's
				# CUDA binary utilities docs, it keeps only the device code for the listed
				# -gencode targets), then several GUI/profiler directories are deleted.
				#
				# Environment read:
				#   OVERRIDE_GENCODE        - if non-empty, replaces GENCODE
				#   OVERRIDE_GENCODE_CUDNN  - if non-empty, replaces GENCODE_CUDNN
				# Exported as a side effect: NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN,
				# CUDA_BASE.
				function prune_124 {

				  echo "Pruning CUDA 12.4"

				  #####################################################################################

				  # CUDA 12.4 prune static libs

				  #####################################################################################

				  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"

				  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"

				  # Default target lists; GENCODE_CUDNN is GENCODE plus sm_61.
				  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

				  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

				  # Let the caller substitute its own target lists.
				  if [[ -n "$OVERRIDE_GENCODE" ]]; then

				      export GENCODE=$OVERRIDE_GENCODE

				  fi

				  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then

				      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN

				  fi

				  # Prune every lib64 entry matching ".a", skipping names that contain
				  # culibos, cudart, cudnn, cublas or metis.
				  # NOTE(review): grep "\.a" is unanchored, so it matches any name that
				  # contains ".a", not only names ending in ".a" - confirm that is intended.
				  # $NVPRUNE, $GENCODE and $CUDA_LIB_DIR are expanded by this shell inside the
				  # double-quoted string; xargs only substitutes {} with each file name.

				  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \

				      | xargs -I {} bash -c \

				                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"

				  # prune the cuBLAS and cuBLASLt static libs with the GENCODE_CUDNN target list

				  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a

				  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

				  #####################################################################################

				  # CUDA 12.4 prune visual tools

				  #####################################################################################

				  # CUDA_BASE already ends in "/", so the paths below contain a double slash.
				  export CUDA_BASE="/usr/local/cuda-12.4/"

				  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/

				}

				# Prunes the CUDA 12.6 install under /usr/local/cuda-12.6.
				#
				# Static libraries in lib64 are rewritten in place by nvprune (per NVIDIA's
				# CUDA binary utilities docs, it keeps only the device code for the listed
				# -gencode targets), then several GUI/profiler directories are deleted.
				#
				# Environment read:
				#   OVERRIDE_GENCODE        - if non-empty, replaces GENCODE
				#   OVERRIDE_GENCODE_CUDNN  - if non-empty, replaces GENCODE_CUDNN
				# Exported as a side effect: NVPRUNE, CUDA_LIB_DIR, GENCODE, GENCODE_CUDNN,
				# CUDA_BASE.
				function prune_126 {

				  echo "Pruning CUDA 12.6"

				  #####################################################################################

				  # CUDA 12.6 prune static libs

				  #####################################################################################

				  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"

				  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"

				  # Default target lists; GENCODE_CUDNN is GENCODE plus sm_61.
				  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

				  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

				  # Let the caller substitute its own target lists.
				  if [[ -n "$OVERRIDE_GENCODE" ]]; then

				      export GENCODE=$OVERRIDE_GENCODE

				  fi

				  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then

				      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN

				  fi

				  # Prune every lib64 entry matching ".a", skipping names that contain
				  # culibos, cudart, cudnn, cublas or metis.
				  # NOTE(review): grep "\.a" is unanchored, so it matches any name that
				  # contains ".a", not only names ending in ".a" - confirm that is intended.
				  # $NVPRUNE, $GENCODE and $CUDA_LIB_DIR are expanded by this shell inside the
				  # double-quoted string; xargs only substitutes {} with each file name.

				  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \

				      | xargs -I {} bash -c \

				                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"

				  # prune the cuBLAS and cuBLASLt static libs with the GENCODE_CUDNN target list

				  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a

				  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

				  #####################################################################################

				  # CUDA 12.6 prune visual tools

				  #####################################################################################

				  # CUDA_BASE already ends in "/", so the paths below contain a double slash.
				  export CUDA_BASE="/usr/local/cuda-12.6/"

				  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/

				}

				function install_128 {

				  CUDNN_VERSION=9.8.0.87

				  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"

				@ -212,18 +146,38 @@ function install_128 {

				  ldconfig

				}

				# Installs the CUDA 13.0 stack into the current container: the CUDA 13.0.0
				# toolkit, cuDNN, NVSHMEM, NCCL and cuSPARSELt, then refreshes the dynamic
				# linker cache.
				#
				# Reads:  NVSHMEM_VERSION (set near the top of this script).
				# Sets:   CUDNN_VERSION (not local, matching the other install_* functions).
				# Relies on install_cuda / install_cudnn / install_nvshmem from this script and
				# on install_nccl.sh / install_cusparselt.sh being in the working directory.
				function install_130 {
				  CUDNN_VERSION=9.13.0.50
				  # install_cusparselt.sh selects the 0.8.0.4 cuda13 archive for CUDA 13, so the
				  # message names 0.8.0 rather than the 0.7.1 used by the CUDA 12.x installers.
				  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.8.0"
				  # install CUDA 13.0 in the same container
				  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux

				  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
				  install_cudnn 13 "${CUDNN_VERSION}"

				  install_nvshmem 13 "${NVSHMEM_VERSION}"

				  # The helper scripts pick their pinned versions from CUDA_VERSION.
				  CUDA_VERSION=13.0 bash install_nccl.sh

				  CUDA_VERSION=13.0 bash install_cusparselt.sh

				  ldconfig
				}

				# idiomatic parameter and option handling in sh

				while test $# -gt 0

				do

				    case "$1" in

				    12.4) install_124; prune_124

				    12.4) install_124;

				        ;;

				    12.6|12.6.*) install_126; prune_126

				    12.6|12.6.*) install_126;

				        ;;

				    12.8|12.8.*) install_128;

				        ;;

				    12.9|12.9.*) install_129;

				        ;;

				    13.0|13.0.*) install_130;

				        ;;

				    *) echo "bad argument $1"; exit 1

				        ;;

				    esac

									
										10

.ci/docker/common/install_cusparselt.sh
									
												View File
												
				@ -5,7 +5,15 @@ set -ex

				# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html

				mkdir tmp_cusparselt && cd tmp_cusparselt

				if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then

				if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then

				    arch_path='sbsa'

				    export TARGETARCH=${TARGETARCH:-$(uname -m)}

				    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then

				        arch_path='x86_64'

				    fi

				    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"

				    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz

				elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then

				    arch_path='sbsa'

				    export TARGETARCH=${TARGETARCH:-$(uname -m)}

				    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then

									
										23

.ci/docker/common/install_executorch.sh
									
												View File
												
				@ -42,22 +42,27 @@ install_pip_dependencies() {

				  # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current

				  # numba and scipy version used in PyTorch CI

				  conda_run pip uninstall -y numba scipy

				  # Yaspin is needed for running CI test (get_benchmark_analysis_data.py)

				  pip_install yaspin==3.1.0

				  popd

				}

				setup_executorch() {

				  pushd executorch

				  export PYTHON_EXECUTABLE=python

				  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

				  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"

				  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true

				  popd

				}

				clone_executorch

				install_buck2

				install_conda_dependencies

				install_pip_dependencies

				setup_executorch

				if [ $# -eq 0 ]; then

				  clone_executorch

				  install_buck2

				  install_conda_dependencies

				  install_pip_dependencies

				  pushd executorch

				  setup_executorch

				  popd

				else

				  "$@"

				fi

									
										11

.ci/docker/common/install_inductor_benchmark_deps.sh
									
												View File
												
				@ -5,9 +5,7 @@ set -ex

				source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

				function install_huggingface() {

				  local version

				  commit=$(get_pinned_commit huggingface)

				  pip_install "git+https://github.com/huggingface/transformers@${commit}"

				  pip_install -r huggingface-requirements.txt

				}

				function install_timm() {

				@ -26,15 +24,12 @@ function install_torchbench() {

				  python install.py --continue_on_fail

				  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488

				  # is regressing speedup metric. This needs to be investigated further

				  pip install transformers==4.38.1

				  echo "Print all dependencies after TorchBench is installed"

				  python -mpip freeze

				  popd

				  chown -R jenkins torchbench

				  chown -R jenkins /opt/conda

				}

				# Pango is needed for weasyprint which is needed for doctr

				@ -48,4 +43,4 @@ install_huggingface

				install_timm

				# Clean up

				conda_run pip uninstall -y torch torchvision torchaudio triton

				conda_run pip uninstall -y torch torchvision torchaudio triton torchao

									
										10

.ci/docker/common/install_mingw.sh
									
										Normal file
									
												View File
												
				@ -0,0 +1,10 @@

				#!/bin/bash
				#
				# Adds the MinGW-w64 C++ cross-compiler (the "-posix" package variant) to the
				# image so Windows x86-64 binaries can be cross-compiled.
				# -e stops at the first failing command; -x echoes every command to the log.
				set -ex

				install_mingw_toolchain() {
				  # Install MinGW-w64 for Windows cross-compilation
				  apt-get update
				  apt-get install -y g++-mingw-w64-x86-64-posix

				  echo "MinGW-w64 installed successfully"
				  # Smoke test: under `set -e` a missing cross-compiler fails the script here.
				  x86_64-w64-mingw32-g++ --version
				}

				install_mingw_toolchain

									
										2

.ci/docker/common/install_nccl.sh
									
												View File
												
				@ -7,6 +7,8 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then

				  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)

				elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then

				  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)

				elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then

				  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)

				else

				  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"

				  exit 1

									
										4

.ci/docker/common/install_onnx.sh
									
												View File
												
				@ -19,8 +19,8 @@ pip_install \

				  transformers==4.36.2

				pip_install coloredlogs packaging

				pip_install onnxruntime==1.18.1

				pip_install onnxscript==0.3.1

				pip_install onnxruntime==1.23.0

				pip_install onnxscript==0.5.4

				# Cache the transformers model to be used later by ONNX tests. We need to run the transformers

				# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

									
										12

.ci/docker/common/install_openblas.sh
									
										Normal file → Executable file
									
												View File
												
				@ -3,8 +3,10 @@

				set -ex

				cd /

				git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules

				OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}

				# Clone OpenBLAS

				git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules

				OPENBLAS_CHECKOUT_DIR="OpenBLAS"

				OPENBLAS_BUILD_FLAGS="

				@ -17,5 +19,7 @@ CFLAGS=-O3

				BUILD_BFLOAT16=1

				"

				make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}

				make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}

				make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR

				sudo make install -C $OPENBLAS_CHECKOUT_DIR

				rm -rf $OPENBLAS_CHECKOUT_DIR

									
										15

.ci/docker/common/install_rocm.sh
									
												View File
												
				@ -2,6 +2,11 @@

				set -ex

				# for pip_install function

				source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

				ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"

				# Encodes a dotted version string as one fixed-width number so two versions
				# can be compared with integer tests such as -eq / -lt.
				#   $1 - version such as "7.0" or "6.4.1" (up to four dot-separated fields)
				# Prints the first field space-padded to width 3, then three zero-padded
				# 3-digit fields; printf treats missing fields as 0.
				ver() {
				    local fields
				    fields=$(echo "$1" | tr '.' ' ')
				    # Deliberately unquoted: each field must reach printf as its own argument.
				    printf "%3d%03d%03d%03d" $fields
				}

				@ -37,12 +42,6 @@ EOF

				    rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"

				    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"

				    # Special case for ROCM_VERSION == 7.0

				    if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then

				        rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"

				        amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"

				    fi

				    # Add amdgpu repository

				    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`

				    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

				@ -113,6 +112,8 @@ EOF

				        rm -rf HIP clr

				    fi

				    pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

				    # Cleanup

				    apt-get autoclean && apt-get clean

				    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

				@ -176,6 +177,8 @@ install_centos() {

				      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"

				  done

				  pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

				  # Cleanup

				  yum clean all

				  rm -rf /var/cache/yum

									
										4

.ci/docker/common/install_rocm_magma.sh
									
												View File
												
				@ -12,8 +12,8 @@ function do_install() {

				    rocm_version_nodot=${rocm_version//./}

				    # Version 2.7.2 + ROCm related updates

				    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6

				    # https://github.com/icl-utk-edu/magma/pull/65

				    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec

				    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

				    rocm_dir="/opt/rocm"

									
										8

.ci/docker/common/install_triton.sh
									
												View File
												
				@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then

				  cd python

				fi

				pip_install pybind11==2.13.6

				pip_install pybind11==3.0.1

				# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527

				as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py

				@ -66,15 +66,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"

				  # Triton needs at least gcc-9 to build

				  apt-get install -y g++-9

				  CXX=g++-9 conda_run python setup.py bdist_wheel

				  CXX=g++-9 conda_run python -m build --wheel --no-isolation

				elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then

				  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain

				  add-apt-repository -y ppa:ubuntu-toolchain-r/test

				  apt-get install -y g++-9

				  CXX=g++-9 conda_run python setup.py bdist_wheel

				  CXX=g++-9 conda_run python -m build --wheel --no-isolation

				else

				  conda_run python setup.py bdist_wheel

				  conda_run python -m build --wheel --no-isolation

				fi

				# Copy the wheel to /opt for multi stage docker builds

									
										8

.ci/docker/common/install_ucc.sh
									
												View File
												
				@ -44,8 +44,12 @@ function install_ucc() {

				  ./autogen.sh

				  # We only run distributed tests on Tesla M60 and A10G

				  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"

				  if [[ -n "$CUDA_VERSION"  && $CUDA_VERSION == 13* ]]; then

				    NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"

				  else

				    # We only run distributed tests on Tesla M60 and A10G

				    NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"

				  fi

				  if [[ -n "$ROCM_VERSION" ]]; then

				    if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then

									
										61

.ci/docker/common/install_xpu.sh
									
												View File
												
				@ -34,18 +34,27 @@ function install_ubuntu() {

				    # The xpu-smi packages

				    apt-get install -y flex bison xpu-smi

				    # Compute and Media Runtimes

				    apt-get install -y \

				        intel-opencl-icd intel-level-zero-gpu level-zero \

				        intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \

				        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \

				        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \

				        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo

				    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then

				        apt-get install -y intel-ocloc

				    if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then

				        # Compute and Media Runtimes

				        apt-get install -y \

				            intel-opencl-icd intel-level-zero-gpu level-zero \

				            intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \

				            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \

				            libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \

				            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo

				        # Development Packages

				        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev

				    else # rolling driver

				        apt-get install -y \

				            intel-opencl-icd libze-intel-gpu1 libze1 \

				            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \

				            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \

				            libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \

				            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc

				        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev

				    fi

				    # Development Packages

				    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev

				    # Install Intel Support Packages

				    apt-get install -y ${XPU_PACKAGES}

				@ -56,10 +65,14 @@ function install_ubuntu() {

				function install_rhel() {

				    . /etc/os-release

				    if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then

				        echo "RHEL version ${VERSION_ID} not supported"

				        exit

				    if [[ "${ID}" == "rhel" ]]; then

				        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then

				            echo "RHEL version ${VERSION_ID} not supported"

				            exit

				        fi

				    elif [[ "${ID}" == "almalinux" ]]; then

				        # Workaround for almalinux8, which is used by quay.io/pypa/manylinux_2_28_x86_64

				        VERSION_ID="8.8"

				    fi

				    dnf install -y 'dnf-command(config-manager)'

				@ -130,18 +143,18 @@ function install_sles() {

				}

				# Default use GPU driver LTS releases

				XPU_DRIVER_VERSION="/lts/2350"

				if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then

				    # Use GPU driver rolling releases

				    XPU_DRIVER_VERSION=""

				# Default use GPU driver rolling releases

				XPU_DRIVER_VERSION=""

				if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then

				    # Use GPU driver LTS releases

				    XPU_DRIVER_VERSION="/lts/2350"

				fi

				# Default use Intel® oneAPI Deep Learning Essentials 2025.0

				if [[ "$XPU_VERSION" == "2025.1" ]]; then

				    XPU_PACKAGES="intel-deep-learning-essentials-2025.1"

				# Default use Intel® oneAPI Deep Learning Essentials 2025.1

				if [[ "$XPU_VERSION" == "2025.2" ]]; then

				    XPU_PACKAGES="intel-deep-learning-essentials-2025.2"

				else

				    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"

				    XPU_PACKAGES="intel-deep-learning-essentials-2025.1"

				fi

				# The installation depends on the base OS

									
										9

.ci/docker/common/patch_libstdc.sh
									
										Executable file
									
												View File
												
				@ -0,0 +1,9 @@

				#!/bin/bash

				set -xe

				# Script used in Linux x86 and aarch64 CD pipeline

				# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols.

				# see: https://github.com/pytorch/pytorch/issues/133437

				LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a)

				nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20-  > weaken-symbols.txt

				objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED

									
										13

.ci/docker/libtorch/Dockerfile
									
												View File
												
				@ -69,6 +69,19 @@ RUN bash ./install_cuda.sh 12.9

				RUN bash ./install_magma.sh 12.9

				RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda

				FROM cuda as cuda13.0

				RUN bash ./install_cuda.sh 13.0

				RUN bash ./install_magma.sh 13.0

				RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda

				# Install libibverbs for libtorch and copy to CUDA directory

				RUN apt-get update -y && \

				    apt-get install -y libibverbs-dev librdmacm-dev && \

				    cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \

				    cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \

				    cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \

				    cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/

				FROM cpu as rocm

				ARG ROCM_VERSION

				ARG PYTORCH_ROCM_ARCH

									
										12

.ci/docker/libtorch/build.sh
									
												View File
												
				@ -39,13 +39,21 @@ case ${DOCKER_TAG_PREFIX} in

				        DOCKER_GPU_BUILD_ARG=""

				        ;;

				    rocm*)

				        # we want the patch version of 6.4 instead

				        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then

				        # we want the patch version of 7.0 instead

				        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then

				            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"

				        fi

				        # we want the patch version of 6.4 instead

				        if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then

				            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"

				        fi

				        BASE_TARGET=rocm

				        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

				        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"

				        # add gfx950, gfx115x conditionally starting in ROCm 7.0

				        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then

				            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"

				        fi

				        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"

				        ;;

				    *)

5

.ci/docker/manywheel/Dockerfile_2_28

View File

 @ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
     /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
     done;
 ADD ./common/patch_libstdc.sh patch_libstdc.sh
 RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
 @ -175,6 +176,6 @@ ENV XPU_DRIVER_TYPE ROLLING
 RUN python3 -m pip install --upgrade pip && \
     python3 -mpip install cmake==3.28.4
 ADD ./common/install_xpu.sh install_xpu.sh
 ENV XPU_VERSION 2025.1
 ENV XPU_VERSION 2025.2
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

12

.ci/docker/manywheel/Dockerfile_2_28_aarch64

View File

 @ -62,6 +62,13 @@ ARG OPENBLAS_VERSION
 ADD ./common/install_openblas.sh install_openblas.sh
 RUN bash ./install_openblas.sh && rm install_openblas.sh
 # Install Arm Compute Library
 FROM base as arm_compute
 # use python3.9 to install scons
 RUN python3.9 -m pip install scons==4.7.0
 RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
 COPY ./common/install_acl.sh install_acl.sh
 RUN bash ./install_acl.sh && rm install_acl.sh
 FROM base as final
 # remove unnecessary python versions
 @ -70,4 +77,7 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
 ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
 COPY --from=arm_compute /acl /acl
 ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH
 ADD ./common/patch_libstdc.sh patch_libstdc.sh
 RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

13

.ci/docker/manywheel/Dockerfile_cuda_aarch64

View File

 @ -86,6 +86,15 @@ FROM base as nvpl
 ADD ./common/install_nvpl.sh install_nvpl.sh
 RUN bash ./install_nvpl.sh && rm install_nvpl.sh
 # Install Arm Compute Library
 FROM base as arm_compute
 # use python3.9 to install scons
 RUN python3.9 -m pip install scons==4.7.0
 RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
 COPY ./common/install_acl.sh install_acl.sh
 RUN bash ./install_acl.sh && rm install_acl.sh
 FROM base as final
 FROM final as cuda_final
 ARG BASE_CUDA_VERSION
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
 @ -93,5 +102,9 @@ COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BAS
 COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=nvpl /opt/nvpl/lib/  /usr/local/lib/
 COPY --from=nvpl /opt/nvpl/include/  /usr/local/include/
 COPY --from=arm_compute /acl /acl
 RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
 ENV PATH=/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/acl/build/:$LD_LIBRARY_PATH
 ADD ./common/patch_libstdc.sh patch_libstdc.sh
 RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

71

.ci/docker/manywheel/Dockerfile_cxx11-abi

View File

 @ -1,71 +0,0 @@
 FROM centos:8 as base
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
 ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 # change to a valid repo
 RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo
 # enable to install ninja-build
 RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo
 RUN yum -y update
 RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo
 RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++
 FROM base as openssl
 ADD ./common/install_openssl.sh install_openssl.sh
 RUN bash ./install_openssl.sh && rm install_openssl.sh
 # Install python
 FROM base as python
 RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel
 ADD common/install_cpython.sh install_cpython.sh
 RUN bash ./install_cpython.sh && rm install_cpython.sh
 FROM base as conda
 ADD ./common/install_conda_docker.sh install_conda.sh
 RUN bash ./install_conda.sh && rm install_conda.sh
 RUN /opt/conda/bin/conda install -y cmake
 FROM base as intel
 # Install MKL
 COPY --from=python             /opt/python                           /opt/python
 COPY --from=python             /opt/_internal                        /opt/_internal
 COPY --from=conda              /opt/conda                            /opt/conda
 ENV PATH=/opt/conda/bin:$PATH
 ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh
 FROM base as patchelf
 ADD ./common/install_patchelf.sh install_patchelf.sh
 RUN bash ./install_patchelf.sh && rm install_patchelf.sh
 RUN cp $(which patchelf) /patchelf
 FROM base as jni
 ADD ./common/install_jni.sh install_jni.sh
 ADD ./java/jni.h jni.h
 RUN bash ./install_jni.sh && rm install_jni.sh
 FROM base as libpng
 ADD ./common/install_libpng.sh install_libpng.sh
 RUN bash ./install_libpng.sh && rm install_libpng.sh
 FROM base as final
 COPY --from=openssl            /opt/openssl                          /opt/openssl
 COPY --from=python             /opt/python                           /opt/python
 COPY --from=python             /opt/_internal                        /opt/_internal
 COPY --from=intel              /opt/intel                            /opt/intel
 COPY --from=conda              /opt/conda                            /opt/conda
 COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
 COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
 COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
 COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
 COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
 COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
 COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
 COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
 RUN yum install -y ninja-build

3

.ci/docker/manywheel/Dockerfile_s390x

View File

 @ -115,6 +115,9 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
 # cmake-3.28.0 from pip for onnxruntime
 RUN python3 -mpip install cmake==3.28.0
 ADD ./common/patch_libstdc.sh patch_libstdc.sh
 RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
 # build onnxruntime 1.21.0 from sources.
 # it is not possible to build it from sources using pip,
 # so just build it from upstream repository.

									
										29

.ci/docker/manywheel/build.sh
									
												View File
												
				@ -28,6 +28,7 @@ fi

				MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}

				DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}

				OPENBLAS_VERSION=${OPENBLAS_VERSION:-}

				ACL_VERSION=${ACL_VERSION:-}

				case ${image} in

				    manylinux2_28-builder:cpu)

				@ -41,13 +42,6 @@ case ${image} in

				        GPU_IMAGE=arm64v8/almalinux:8

				        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"

				        MANY_LINUX_VERSION="2_28_aarch64"

				        OPENBLAS_VERSION="v0.3.30"

				        ;;

				    manylinuxcxx11-abi-builder:cpu-cxx11-abi)

				        TARGET=final

				        GPU_IMAGE=""

				        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"

				        MANY_LINUX_VERSION="cxx11-abi"

				        ;;

				    manylinuxs390x-builder:cpu-s390x)

				        TARGET=final

				@ -67,6 +61,12 @@ case ${image} in

				        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"

				        MANY_LINUX_VERSION="2_28"

				        ;;

				    manylinux2_28-builder:cuda13*)

				        TARGET=cuda_final

				        GPU_IMAGE=amd64/almalinux:8

				        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"

				        MANY_LINUX_VERSION="2_28"

				        ;;

				    manylinuxaarch64-builder:cuda*)

				        TARGET=cuda_final

				        GPU_IMAGE=amd64/almalinux:8

				@ -75,15 +75,23 @@ case ${image} in

				        DOCKERFILE_SUFFIX="_cuda_aarch64"

				        ;;

				    manylinux2_28-builder:rocm*)

				        # we want the patch version of 6.4 instead

				        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then

				        # we want the patch version of 7.0 instead

				        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then

				            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"

				        fi

				        # we want the patch version of 6.4 instead

				        if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then

				            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"

				        fi

				        TARGET=rocm_final

				        MANY_LINUX_VERSION="2_28"

				        DEVTOOLSET_VERSION="11"

				        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete

				        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"

				        # add gfx950, gfx115x conditionally starting in ROCm 7.0

				        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then

				            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"

				        fi

				        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"

				        ;;

				    manylinux2_28-builder:xpu)

				@ -115,7 +123,8 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

				DOCKER_BUILDKIT=1 docker build  \

				    ${DOCKER_GPU_BUILD_ARG} \

				    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \

				    --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \

				    --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \

				    --build-arg "ACL_VERSION=${ACL_VERSION:-}" \

				    --target "${TARGET}" \

				    -t "${tmp_tag}" \

				    $@ \

									
										5

.ci/docker/manywheel/build_scripts/ssl-check.py
									
												View File
												
				@ -10,11 +10,6 @@ BAD_SSL = "https://self-signed.badssl.com"

				print("Testing SSL certificate checking for Python:", sys.version)

				if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):

				    print("This version never checks SSL certs; skipping tests")

				    sys.exit(0)

				EXC = OSError

				print(f"Connecting to {GOOD_SSL} should work")

73

.ci/docker/requirements-ci.txt

View File

 @ -10,6 +10,11 @@ boto3==1.35.42
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:
 build==1.3.0
 #Description: A simple, correct Python build frontend.
 #Pinned versions: 1.3.0
 #test that import:
 click
 #Description: Command Line Interface Creation Kit
 #Pinned versions:
 @ -47,10 +52,10 @@ flatbuffers==24.12.23
 #Pinned versions: 24.12.23
 #test that import:
 hypothesis==5.35.1
 hypothesis==6.56.4
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
 #Pinned versions: 5.35.1
 #Pinned versions: 6.56.4
 #test that import: test_xnnpack_integration.py, test_pruning_op.py, test_nn.py
 junitparser==2.1.1
 @ -63,11 +68,12 @@ lark==0.12.0
 #Pinned versions: 0.12.0
 #test that import:
 librosa>=0.6.2 ; python_version < "3.11"
 librosa==0.10.2 ; python_version == "3.12"
 librosa>=0.6.2 ; python_version < "3.11" and platform_machine != "s390x"
 librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
 #test that import: test_spectral_ops.py
 #librosa depends on numba; disable it for s390x while numba is disabled too
 #mkl #this breaks linux-bionic-rocm4.5-py3.7
 #Description: Intel oneAPI Math Kernel Library
 @ -92,8 +98,9 @@ librosa==0.10.2 ; python_version == "3.12"
 #Pinned versions:
 #test that import:
 mypy==1.16.0
 mypy==1.16.0 ; platform_system == "Linux"
 # Pin MyPy version because new errors are likely to appear with each release
 # Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
 @ -104,20 +111,18 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch
 ninja==1.11.1.3
 ninja==1.11.1.4
 #Description: build system. Used in some tests. Used in build to generate build
 #time tracing information
 #Pinned versions: 1.11.1.3
 #Pinned versions: 1.11.1.4
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 numba==0.49.0 ; python_version < "3.9"
 numba==0.55.2 ; python_version == "3.9"
 numba==0.55.2 ; python_version == "3.10"
 numba==0.60.0 ; python_version == "3.12"
 numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
 numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #Pinned versions: 0.55.2, 0.60.0
 #test that import: test_numba_integration.py
 #For numba issue see https://github.com/pytorch/pytorch/issues/51511
 #Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
 #numpy
 #Description: Provides N-dimensional arrays and linear algebra
 @ -131,7 +136,7 @@ numba==0.60.0 ; python_version == "3.12"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
 numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
 numpy==1.22.4; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13"
 @ -163,12 +168,12 @@ optree==0.13.0
 pillow==11.0.0
 #Description:  Python Imaging Library fork
 #Pinned versions: 10.3.0
 #Pinned versions: 11.0.0
 #test that import:
 protobuf==5.29.4
 protobuf==5.29.5
 #Description:  Google's data interchange format
 #Pinned versions: 5.29.4
 #Pinned versions: 5.29.5
 #test that import: test_tensorboard.py, test/onnx/*
 psutil
 @ -211,7 +216,7 @@ pytest-subtests==0.13.1
 #Pinned versions:
 #test that import:
 xdoctest==1.1.0
 xdoctest==1.3.0
 #Description: runs doctests in pytest
 #Pinned versions: 1.3.0
 #test that import:
 @ -236,10 +241,9 @@ pygments==2.15.0
 #Pinned versions: 14.1.0
 #test that import:
 scikit-image==0.19.3 ; python_version < "3.10"
 scikit-image==0.22.0 ; python_version >= "3.10"
 scikit-image==0.22.0
 #Description: image processing routines
 #Pinned versions:
 #Pinned versions: 0.22.0
 #test that import: test_nn.py
 #scikit-learn
 @ -261,13 +265,8 @@ scipy==1.14.1 ; python_version >= "3.12"
 #Pinned versions:
 #test that import:
 tb-nightly==2.13.0a20230426
 #Description: TensorBoard
 #Pinned versions:
 #test that import:
 # needed by torchgen utils
 typing-extensions>=4.10.0
 typing-extensions==4.12.2
 #Description: type hints for python
 #Pinned versions:
 #test that import:
 @ -307,7 +306,7 @@ pytest-cpp==2.3.0
 #Pinned versions: 2.3.0
 #test that import:
 z3-solver==4.15.1.0
 z3-solver==4.15.1.0 ; platform_machine != "s390x"
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:
 @ -328,8 +327,6 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting
 # Python-3.9 binaries
 PyGithub==2.3.0
 sympy==1.13.3
 @ -342,7 +339,7 @@ onnx==1.18.0
 #Pinned versions:
 #test that import:
 onnxscript==0.3.1
 onnxscript==0.5.3
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
 @ -362,9 +359,10 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py
 # To build PyTorch itself
 pyyaml
 pyyaml==6.0.2
 pyzstd
 setuptools>=70.1.0
 setuptools==78.1.1
 packaging==23.1
 six
 scons==4.5.2 ; platform_machine == "aarch64"
 @ -379,13 +377,16 @@ dataclasses_json==0.6.7
 #Pinned versions: 0.6.7
 #test that import:
 cmake==4.0.0
 cmake==3.31.6
 #Description: required for building
 tlparse==0.3.30
 tlparse==0.4.0
 #Description: required for log parsing
 cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
 filelock==3.18.0
 #Description: required for inductor testing
 cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" and platform_system != "Darwin"
 #Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
 #test that import: test_cuda.py

9

.ci/docker/requirements-docs.txt

View File

 @ -1,8 +1,15 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
 standard-imghdr==3.13.0; python_version >= "3.13"
 #Description: This is needed by Sphinx, so it needs to be added here.
 # The reasons are as follows:
 # 1) This module was removed from the Python standard library in Python 3.13 (https://peps.python.org/pep-0594/#imghdr);
 # 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
 # Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is that it is probably
 # something related to the Docker setup. We can investigate this later.

2

.ci/docker/triton_version.txt

View File

 @ -1 +1 @@
 .4.0
 .5.0

2

.ci/docker/triton_xpu_version.txt

View File

 @ -1 +1 @@
 .4.0
 .5.0

									
										155

.ci/docker/ubuntu-cross-riscv/Dockerfile
									
										Normal file
									
												View File
												
				@ -0,0 +1,155 @@

				# Cross-compilation Docker container for RISC-V architecture

				ARG UBUNTU_VERSION

				FROM --platform=linux/amd64 ubuntu:${UBUNTU_VERSION} as base

				ARG UBUNTU_VERSION

				ENV GCC_VERSION=14

				ENV PYTHON_VERSION=3.12.3

				ENV DEBIAN_FRONTEND=noninteractive

				ENV CC=riscv64-linux-gnu-gcc-${GCC_VERSION}

				ENV CXX=riscv64-linux-gnu-g++-${GCC_VERSION}

				ENV QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/

				ENV SYSROOT=/opt/sysroot

				# Install basic dependencies

				RUN apt-get update && apt-get install -y \

				    ninja-build \

				    autoconf \

				    automake \

				    libtool \

				    patchelf \

				    ccache \

				    git \

				    wget \

				    python3-pip \

				    python3-venv \

				    python-is-python3 \

				    cmake \

				    sudo \

				    lsb-release \

				    gcc-${GCC_VERSION}-riscv64-linux-gnu \

				    g++-${GCC_VERSION}-riscv64-linux-gnu \

				    pkg-config \

				    && rm -rf /var/lib/apt/lists/*

				# Install user

				COPY ./common/install_user.sh install_user.sh

				RUN bash ./install_user.sh && rm install_user.sh

				FROM base as python

				ARG ZLIB_VERSION=1.3.1

				ARG FFI_VERSION=3.4.6

				ARG BZ2_VERSION=1.0.8

				ARG XZ_VERSION=5.4.6

				ARG OPENSSL_VERSION=3.2.1

				# Set up sysroot directory for dependencies

				ENV PKG_CONFIG_PATH=${SYSROOT}/lib/pkgconfig

				ENV PKG_CONFIG_SYSROOT_DIR=${SYSROOT}

				WORKDIR /opt

				# Build zlib (for compression)

				RUN echo "--- Building zlib ---" \

				    && wget -c https://www.zlib.net/zlib-${ZLIB_VERSION}.tar.gz \

				    && tar -xf zlib-${ZLIB_VERSION}.tar.gz --no-same-permissions --no-same-owner \

				    && cd zlib-${ZLIB_VERSION}/ \

				    && mkdir build && cd build \

				    && ../configure --prefix=${SYSROOT} \

				    && make -j$(nproc) && make install \

				    && cd ../..

				# Build libffi (for ctypes module)

				RUN echo "--- Building libffi ---" \

				    && wget -c https://github.com/libffi/libffi/releases/download/v${FFI_VERSION}/libffi-${FFI_VERSION}.tar.gz \

				    && tar -xf libffi-${FFI_VERSION}.tar.gz --no-same-permissions --no-same-owner \

				    && cd libffi-${FFI_VERSION}/ \

				    && mkdir build && cd build \

				    && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \

				    && make -j$(nproc) && make install \

				    && cd ../..

				# Build bzip2 (for bz2 module)

				RUN echo "--- Building bzip2 ---" \

				    && wget -c https://sourceware.org/pub/bzip2/bzip2-${BZ2_VERSION}.tar.gz \

				    && tar -xf bzip2-${BZ2_VERSION}.tar.gz --no-same-permissions --no-same-owner \

				    && cd bzip2-${BZ2_VERSION}/ \

				    && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} bzip2 bzip2recover libbz2.a \

				    && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} -f Makefile-libbz2_so \

				    && make install PREFIX=${SYSROOT} \

				    && cp libbz2.so.${BZ2_VERSION} ${SYSROOT}/lib/ \

				    && cd ${SYSROOT}/lib/ \

				    && ln -sf libbz2.so.${BZ2_VERSION} libbz2.so.1.0 \

				    && ln -sf libbz2.so.1.0 libbz2.so \

				    && cd /opt/

				# Build xz (for lzma module)

				RUN echo "--- Building xz ---" \

				    && wget -c https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz \

				    && tar -xf xz-${XZ_VERSION}.tar.gz --no-same-permissions --no-same-owner \

				    && cd xz-${XZ_VERSION} \

				    && mkdir build && cd build \

				    && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \

				    && make -j$(nproc) && make install \

				    && cd ../..

				# Build OpenSSL (for ssl module)

				RUN echo "--- Building OpenSSL ---" \

				    && wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \

				    && tar -xf openssl-${OPENSSL_VERSION}.tar.gz --no-same-permissions --no-same-owner \

				    && cd openssl-${OPENSSL_VERSION}/ \

				    && mkdir build && cd build \

				    && ../Configure linux64-riscv64 --prefix=${SYSROOT} \

				    && make -j$(nproc) && make install_sw \

				    && cd ../..

				# Build SQLite3 (for sqlite3 module)

				RUN echo "--- Building SQLite3 ---" \

				    && wget -c https://www.sqlite.org/2024/sqlite-autoconf-3450200.tar.gz \

				    && tar -xf sqlite-autoconf-3450200.tar.gz --no-same-permissions --no-same-owner \

				    && cd sqlite-autoconf-3450200 \

				    && mkdir build && cd build \

				    && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \

				    && make -j$(nproc) && make install \

				    && cd ../..

				# Build and install RISC-V Python with all modules

				RUN wget -c https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \

				    && tar -xf Python-${PYTHON_VERSION}.tgz --no-same-permissions --no-same-owner \

				    && cd Python-${PYTHON_VERSION} \

				    && mkdir build && cd build \

				    && ../configure \

				        --host=riscv64-linux-gnu \

				        --build=x86_64-linux-gnu \

				        --prefix=${SYSROOT} \

				        --enable-shared \

				        --disable-ipv6 \

				        --with-build-python=/usr/bin/python3 \

				        --with-ensurepip=no \

				        ac_cv_file__dev_ptmx=yes \

				        ac_cv_file__dev_ptc=no \

				    && make -j$(nproc) \

				    && make install

				FROM base as final

				COPY --from=python             /opt/sysroot                       /opt/sysroot

				# Install crossenv and cmake

				RUN pip install crossenv cmake==4.0.0 --break-system-packages \

				    && /usr/bin/python3 -m crossenv ${SYSROOT}/bin/python3 /opt/riscv-cross-env

				# Add pip-installed cmake binaries to PATH

				ENV PATH="/usr/local/bin:${PATH}"

				# Set up cross Python environment

				SHELL ["/bin/bash", "-c"]

				RUN source /opt/riscv-cross-env/bin/activate \

				    && pip install setuptools pyyaml typing_extensions wheel

				# Set default environment variables for PyTorch build

				ENV Python_ROOT_DIR=${SYSROOT}

				ENV OPENSSL_ROOT_DIR=${SYSROOT}

				USER jenkins

				CMD ["bash"]

									
										10

.ci/docker/ubuntu-rocm/Dockerfile
									
												View File
												
				@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION}

				# Install rocm

				ARG ROCM_VERSION

				RUN mkdir ci_commit_pins

				COPY ./common/common_utils.sh common_utils.sh

				COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt

				COPY ./common/install_rocm.sh install_rocm.sh

				RUN bash ./install_rocm.sh

				RUN rm install_rocm.sh

				RUN rm install_rocm.sh common_utils.sh

				RUN rm -r ci_commit_pins

				COPY ./common/install_rocm_magma.sh install_rocm_magma.sh

				RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}

				RUN rm install_rocm_magma.sh

				@ -96,11 +100,11 @@ ARG ANACONDA_PYTHON_VERSION

				ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION

				COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh

				COPY ./common/common_utils.sh common_utils.sh

				COPY ci_commit_pins/huggingface.txt huggingface.txt

				COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt

				COPY ci_commit_pins/timm.txt timm.txt

				COPY ci_commit_pins/torchbench.txt torchbench.txt

				RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

				# (optional) Install non-default Ninja version

				ARG NINJA_VERSION

									
										4

.ci/docker/ubuntu-xpu/Dockerfile
									
												View File
												
				@ -56,10 +56,10 @@ RUN rm install_openssl.sh

				ARG INDUCTOR_BENCHMARKS

				COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh

				COPY ./common/common_utils.sh common_utils.sh

				COPY ci_commit_pins/huggingface.txt huggingface.txt

				COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt

				COPY ci_commit_pins/timm.txt timm.txt

				RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt

				# Install XPU Dependencies

				ARG XPU_VERSION

									
										11

.ci/docker/ubuntu/Dockerfile
									
												View File
												
				@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"

				# (optional) Install UCC

				ARG UCX_COMMIT

				ARG UCC_COMMIT

				ARG CUDA_VERSION

				ENV UCX_COMMIT $UCX_COMMIT

				ENV UCC_COMMIT $UCC_COMMIT

				ENV UCX_HOME /usr

				@ -96,11 +97,16 @@ RUN rm install_openssl.sh

				ARG INDUCTOR_BENCHMARKS

				COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh

				COPY ./common/common_utils.sh common_utils.sh

				COPY ci_commit_pins/huggingface.txt huggingface.txt

				COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt

				COPY ci_commit_pins/timm.txt timm.txt

				COPY ci_commit_pins/torchbench.txt torchbench.txt

				RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt

				RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

				ARG INSTALL_MINGW

				COPY ./common/install_mingw.sh install_mingw.sh

				RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi

				RUN rm install_mingw.sh

				ARG TRITON

				ARG TRITON_CPU

				@ -181,7 +187,6 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm

				RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

				# AWS specific CUDA build guidance

				ENV TORCH_CUDA_ARCH_LIST Maxwell

				ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"

				ENV CUDA_PATH /usr/local/cuda

									
										2

.ci/libtorch/build.sh
									
												View File
												
				@ -7,4 +7,4 @@ set -ex

				SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

				USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh

				USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh

									
										31

.ci/lumen_cli/README.md
									
										Normal file
									
												View File
												
				@ -0,0 +1,31 @@

				# 🔧 Lumen_cli

				A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.

				## Features

				- **Build**

				    - external projects (e.g. vLLM)

				## 📦 Installation

				Run the following from the root of the PyTorch repo:

				```bash

				pip install -e .ci/lumen_cli

				```

				## Run the cli tool

				The CLI tool must be run from the root of the PyTorch repo. For example, to build the external vLLM target:

				```bash

				python -m cli.run build external vllm

				```

				This runs the build steps with the default behaviour for the vLLM project.

				To see the help messages, run:

				```bash

				python3 -m cli.run --help

				```

				## Add customized external build logics

				To add a new external build target:

				1. create the build function in cli/lib folder

				2. register your target and its runner class in the `_TARGETS` mapping in `cli/build_cli/register_build.py`

				3. [optional] create your ci config file in .github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml

0

test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once → .ci/lumen_cli/cli/build_cli/init.py

View File

									
										37

.ci/lumen_cli/cli/build_cli/register_build.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,37 @@

				import argparse

				import logging

				from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec

				from cli.lib.core.vllm.vllm_build import VllmBuildRunner

				logger = logging.getLogger(__name__)

				# Maps each external build target name to its argparse configuration and runner class.
				# Every key here is exposed as: python -m cli.run build external {target}
				_TARGETS: dict[str, TargetSpec] = {
				    "vllm": {
				        "runner": VllmBuildRunner,
				        "help": "Build vLLM using docker buildx.",
				    }
				    # add yours ...
				}

				def register_build_commands(subparsers: argparse._SubParsersAction) -> None:
				    """
				    Attach the ``build`` command, and its ``external`` subcommand, to ``subparsers``.

				    The ``external`` description lists every entry of ``_TARGETS`` with its help
				    text, and each entry is registered as a target subcommand.
				    """
				    build_cmd = subparsers.add_parser(
				        "build", help="Build related commands", formatter_class=RichHelp
				    )
				    build_sub = build_cmd.add_subparsers(dest="build_command", required=True)

				    # One aligned "name  help" line per registered target.
				    target_lines = []
				    for target_name, target_spec in _TARGETS.items():
				        target_lines.append(f"  {target_name:12} {target_spec.get('help', '')}")
				    overview = "\n".join(target_lines)

				    external_cmd = build_sub.add_parser(
				        "external",
				        help="Build external targets",
				        description="Build third-party targets.\n\nAvailable targets:\n" + overview,
				        formatter_class=RichHelp,
				    )
				    register_targets(external_cmd, _TARGETS)

0

test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex → .ci/lumen_cli/cli/lib/init.py

View File

									
										71

.ci/lumen_cli/cli/lib/common/cli_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,71 @@

				"""

				Cli Argparser Utility helpers for CLI tasks.

				"""

				import argparse

				from abc import ABC, abstractmethod

				try:

				    from typing import Any, Callable, Required, TypedDict  # Python 3.11+

				except ImportError:

				    from typing import Any, Callable, TypedDict

				    from typing_extensions import Required  # Fallback for Python <3.11

				class BaseRunner(ABC):
				    """Abstract base class for CLI runners: stores the given args and requires ``run``."""

				    def __init__(self, args: Any) -> None:
				        # Keep the args object so subclasses can read it from ``run``.
				        self.args = args

				    @abstractmethod
				    def run(self) -> None:
				        """runs main logics, required"""

				# Pretty help formatter: combines the raw-description formatter (keeps newlines
				# in descriptions) with the defaults formatter (shows argument defaults).
				class RichHelp(
				    argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
				):
				    pass

				class TargetSpec(TypedDict, total=False):
				    """
				    Specification of one CLI target subcommand.

				    Only ``runner`` is required; every other key may be omitted.
				    """

				    # Runner class for the target; instantiated with the parsed args.
				    runner: Required[type[BaseRunner]]
				    # Short help text for the target.
				    help: str
				    # Longer description for the target.
				    description: str
				    # Optional hook that receives the target's ArgumentParser.
				    add_arguments: Callable[[argparse.ArgumentParser], None]

				def register_targets(
				    parser: argparse.ArgumentParser,
				    target_specs: dict[str, TargetSpec],
				    common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None,
				) -> None:
				    """
				    Register one required ``target`` subcommand per entry of ``target_specs``.

				    Each subparser gets ``func`` (builds the runner with the parsed args and calls
				    ``run()``) and ``_runner_class`` as defaults. The spec's ``add_arguments``
				    hook, when callable, and then ``common_args`` are applied to the subparser.
				    """
				    choices = ",".join(target_specs.keys())
				    subcommands = parser.add_subparsers(
				        dest="target", required=True, metavar="{" + choices + "}"
				    )

				    for target_name, target_spec in target_specs.items():
				        runner_cls = target_spec["runner"]
				        # Fall back to the runner's docstring when no description is given.
				        description = target_spec.get("description") or runner_cls.__doc__ or ""
				        sub = subcommands.add_parser(
				            target_name,
				            help=target_spec.get("help", ""),
				            description=description.strip(),
				            formatter_class=RichHelp,
				        )
				        # The class is bound as a default argument so every lambda keeps its own runner.
				        sub.set_defaults(
				            func=lambda args, cls=runner_cls: cls(args).run(),
				            _runner_class=runner_cls,
				        )

				        extra_args_hook = target_spec.get("add_arguments")
				        if callable(extra_args_hook):
				            extra_args_hook(sub)
				        if common_args:
				            common_args(sub)

									
										42

.ci/lumen_cli/cli/lib/common/docker_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,42 @@

				"""

				Docker Utility helpers for CLI tasks.

				"""

				import logging

				from typing import Optional

				import docker

				from docker.errors import APIError, NotFound

				logger = logging.getLogger(__name__)

				# lazy singleton so we don't reconnect every call

				_docker_client: Optional[docker.DockerClient] = None

				def _get_client() -> docker.DockerClient:
				    """Return the module-level Docker client, creating it with ``docker.from_env()`` on first use."""
				    global _docker_client
				    if _docker_client is not None:
				        return _docker_client
				    _docker_client = docker.from_env()
				    return _docker_client

				def local_image_exists(
				    image_name: str, client: Optional[docker.DockerClient] = None
				) -> bool:
				    """
				    Return True if a local Docker image exists.

				    An empty ``image_name`` yields False. ``NotFound`` and ``APIError`` from the
				    lookup are logged and reported as False.
				    """
				    if not image_name:
				        return False

				    if not client:
				        client = _get_client()

				    try:
				        client.images.get(image_name)
				    except (NotFound, APIError) as e:
				        detail = e.explanation if hasattr(e, "explanation") else str(e)
				        logger.error("Error when checking Docker image '%s': %s", image_name, detail)
				        return False
				    return True

									
										110

.ci/lumen_cli/cli/lib/common/envs_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,110 @@

				"""

				Environment Variables and Dataclasses Utility helpers for CLI tasks.

				"""

				import os

				from dataclasses import field, fields, is_dataclass, MISSING

				from pathlib import Path

				from textwrap import indent

				from typing import Optional, Union

				from cli.lib.common.utils import str2bool

				def get_env(name: str, default: str = "") -> str:
				    """Return environment variable ``name``; an unset or empty value yields ``default``."""
				    value = os.environ.get(name)
				    return value if value else default

				def env_path_optional(
				    name: str,
				    default: Optional[Union[str, Path]] = None,
				    resolve: bool = True,
				) -> Optional[Path]:
				    """
				    Read environment variable ``name`` as a Path.

				    Falls back to ``default`` when the variable is unset or empty, and returns
				    None when neither provides a value. The path is resolved when ``resolve`` is true.
				    """
				    raw = get_env(name) or default
				    if not raw:
				        return None
				    candidate = Path(raw)
				    if resolve:
				        return candidate.resolve()
				    return candidate

				def env_path(
				    name: str,
				    default: Optional[Union[str, Path]] = None,
				    resolve: bool = True,
				) -> Path:
				    """
				    Read environment variable ``name`` as a Path via ``env_path_optional``.

				    Raises:
				        ValueError: if neither the variable nor ``default`` yields a path.
				    """
				    found = env_path_optional(name, default, resolve)
				    if found:
				        return found
				    raise ValueError(f"Missing path value for {name}")

				def env_bool(
				    name: str,
				    default: bool = False,
				) -> bool:
				    """Read environment variable ``name`` as a bool via ``str2bool``; unset or empty yields ``default``."""
				    raw = get_env(name)
				    return str2bool(raw) if raw else default

				def env_bool_field(
				    name: str,
				    default: bool = False,
				):
				    """Build a dataclass field whose default factory reads ``name`` through ``env_bool``."""

				    def _read_env() -> bool:
				        return env_bool(name, default)

				    return field(default_factory=_read_env)

				def env_path_field(
				    name: str,
				    default: Union[str, Path] = "",
				    *,
				    resolve: bool = True,
				) -> Path:
				    """Build a dataclass field whose default factory reads ``name`` through ``env_path``."""

				    def _read_env() -> Path:
				        return env_path(name, default, resolve=resolve)

				    return field(default_factory=_read_env)

				def env_str_field(
				    name: str,
				    default: str = "",
				) -> str:
				    """Build a dataclass field whose default factory reads ``name`` through ``get_env``."""

				    def _read_env() -> str:
				        return get_env(name, default)

				    return field(default_factory=_read_env)

				def generate_dataclass_help(cls) -> str:
				    """
				    Build an indented ``name = default`` listing for every field of a dataclass.

				    A field without a default is shown as ``<required>``; a default factory that
				    raises is shown as ``<error: ...>``.

				    Raises:
				        TypeError: if ``cls`` is not a dataclass.
				    """
				    if not is_dataclass(cls):
				        raise TypeError(f"{cls} is not a dataclass")

				    def _default_of(fld):
				        if fld.default is not MISSING:
				            return fld.default
				        if fld.default_factory is MISSING:
				            return "<required>"
				        try:
				            return fld.default_factory()
				        except Exception as e:
				            return f"<error: {e}>"

				    rows = []
				    for fld in fields(cls):
				        rows.append(f"{fld.name:<22} = {repr(_default_of(fld))}")
				    return indent("\n".join(rows), "    ")

				def with_params_help(params_cls: type, title: str = "Parameter defaults"):
				    """
				    Class decorator factory: appends the help listing generated from the
				    dataclass ``params_cls`` (e.g., VllmParameters) to the decorated class's
				    docstring, under the heading ``title``.

				    Raises:
				        TypeError: if ``params_cls`` is not a dataclass.
				    """
				    if not is_dataclass(params_cls):
				        raise TypeError(f"{params_cls} must be a dataclass")

				    def _decorator(cls: type) -> type:
				        help_block = generate_dataclass_help(params_cls)
				        existing_doc = cls.__doc__ or ""
				        cls.__doc__ = existing_doc + f"\n\n{title}:\n{help_block}"
				        return cls

				    return _decorator

									
										143

.ci/lumen_cli/cli/lib/common/gh_summary.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,143 @@

				from __future__ import annotations

				import logging

				import os

				import textwrap

				from pathlib import Path

				from typing import TYPE_CHECKING

				from cli.lib.common.utils import get_wheels

				from jinja2 import Template

				if TYPE_CHECKING:

				    from collections.abc import Iterable, Mapping

				logger = logging.getLogger(__name__)

				_TPL_CONTENT = Template(

				    textwrap.dedent("""\

				    ## {{ title }}

				    ```{{ lang }}

				    {{ content }}

				    ```

				""")

				)

				_TPL_LIST_ITEMS = Template(

				    textwrap.dedent("""\

				    ## {{ title }}

				    {% for it in items %}

				    - {{ it.pkg }}: {{ it.relpath }}

				    {% else %}

				    _(no item found)_

				    {% endfor %}

				    """)

				)

				_TPL_TABLE = Template(

				    textwrap.dedent("""\

				    {%- if rows %}

				    | {{ cols | join(' | ') }} |

				    |{%- for _ in cols %} --- |{%- endfor %}

				    {%- for r in rows %}

				    | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}

				    {%- endfor %}

				    {%- else %}

				    _(no data)_

				    {%- endif %}

				""")

				)

				def gh_summary_path() -> Path | None:

				    """Return the Path to the GitHub step summary file, or None if not set."""

				    p = os.environ.get("GITHUB_STEP_SUMMARY")

				    return Path(p) if p else None

				def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
				    """
				    Write Markdown to the GitHub step summary file named by GITHUB_STEP_SUMMARY.

				    The text is dedented and stripped, and a single trailing newline is added.

				    Args:
				        md: Markdown content to write.
				        append_content: append to the file when True (default), otherwise
				            overwrite the whole file.

				    Returns:
				        True if the content was written, False if GITHUB_STEP_SUMMARY is not
				        set (e.g. when running locally).
				    """
				    summary_file = gh_summary_path()
				    if not summary_file:
				        logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
				        return False

				    if append_content:
				        mode = "a"
				    else:
				        mode = "w"
				    cleaned = textwrap.dedent(md).strip() + "\n"
				    with summary_file.open(mode, encoding="utf-8") as handle:
				        handle.write(cleaned)
				    return True

				def md_heading(text: str, level: int = 2) -> str:
				    """Return a Markdown heading line for ``text``; ``level`` is clamped to 1-6."""
				    depth = min(level, 6)
				    if depth < 1:
				        depth = 1
				    return "#" * depth + f" {text}\n"

				def md_details(summary: str, content: str) -> str:
				    """Return a collapsible ``<details>`` block with ``summary`` as its label and ``content`` inside."""
				    pieces = [
				        "<details>",
				        f"<summary>{summary}</summary>",
				        "",
				        content,
				        "",
				        "</details>",
				        "",
				    ]
				    return "\n".join(pieces)

				def summarize_content_from_file(
				    output_dir: Path,
				    freeze_file: str,
				    title: str = "Content from file",
				    code_lang: str = "",  # e.g. "text" or "ini"
				) -> bool:
				    """
				    Render ``output_dir / freeze_file`` as a titled code block and write it to the
				    GitHub step summary.

				    Returns False when the file does not exist; otherwise returns the result of
				    ``write_gh_step_summary``.
				    """
				    source = Path(output_dir) / freeze_file
				    if not source.exists():
				        return False
				    body = source.read_text(encoding="utf-8").strip()
				    return write_gh_step_summary(render_content(body, title=title, lang=code_lang))

				def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
				    """
				    Write the wheels found under ``path`` to the GitHub step summary as a titled list.

				    Returns False when no wheel is found; otherwise returns the result of
				    ``write_gh_step_summary``.
				    """
				    wheels = get_wheels(path, max_depth=max_depth)
				    if wheels:
				        return write_gh_step_summary(render_list(wheels, title=title))
				    return False

				def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
				    """
				    Render a list of dicts as a Markdown table using Jinja template.

				    Columns are the union of all row keys, in first-seen order, so the same
				    input always produces the same table.

				    Args:
				        rows: mappings of column name to cell value.

				    Returns:
				        The rendered table, stripped and terminated by a single newline.
				    """
				    rows = list(rows)
				    # dict.fromkeys de-duplicates while keeping insertion order; a set would make
				    # the column order depend on string hashing and differ between runs.
				    cols = list(dict.fromkeys(key for row in rows for key in row))
				    return _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"

				def render_list(
				    items: Iterable[Mapping[str, str]],
				    *,
				    title: str = "List",
				) -> str:
				    """
				    Render ``items`` as a titled Markdown list with the list-items template.

				    The template reads ``pkg`` and ``relpath`` from every item.
				    """
				    tpl = _TPL_LIST_ITEMS
				    md = tpl.render(title=title, items=items)
				    return md

				def render_content(
				    content: str,
				    *,
				    title: str = "Content",
				    lang: str = "text",
				) -> str:
				    """Render ``content`` as a titled fenced code block (fence language ``lang``) with the content template."""
				    return _TPL_CONTENT.render(title=title, content=content, lang=lang)

									
										69

.ci/lumen_cli/cli/lib/common/git_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,69 @@

				"""

				Git Utility helpers for CLI tasks.

				"""

				import logging

				from pathlib import Path

				from cli.lib.common.path_helper import remove_dir

				from git import GitCommandError, RemoteProgress, Repo

				logger = logging.getLogger(__name__)

				class PrintProgress(RemoteProgress):
				    """Simple progress logger for git operations."""

				    def __init__(self, interval: int = 5):
				        super().__init__()
				        self._interval = interval
				        self._last_percent = -1

				    def update(self, op_code, cur, max=None, message=""):
				        """
				        Log progress: a percentage line whenever the percentage changes and is a
				        multiple of the interval, or the plain message when no counts are given.
				        """
				        text = self._cur_line or message
				        if not (max and cur):
				            if text:
				                logger.info(text)
				            return

				        pct = int(cur / max * 100)
				        if pct == self._last_percent or pct % self._interval != 0:
				            return
				        self._last_percent = pct
				        logger.info("Progress: %d%% - %s", pct, text)

				def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False):
				    """
				    Clone ``repo`` and check out the commit pinned for ``target``.

				    Args:
				        target: name used to look up the pinned commit and in log messages.
				        repo: URL or path passed to ``Repo.clone_from``.
				        dst: destination directory; defaults to ``target``. Any existing
				            directory there is removed first.
				        update_submodules: when true, init and recursively update submodules.

				    Returns:
				        Tuple ``(repo_object, pinned_commit)``.

				    Raises:
				        GitCommandError: logged, then re-raised, when a git operation fails.
				    """
				    checkout_dir = dst or target
				    try:
				        logger.info("Cloning %s to %s", target, checkout_dir)

				        # Start from a clean directory, then clone and fetch everything.
				        remove_dir(checkout_dir)
				        cloned = Repo.clone_from(repo, checkout_dir, progress=PrintProgress())
				        cloned.git.fetch("--all", "--tags")

				        # Move to the pinned commit.
				        pinned = get_post_build_pinned_commit(target)
				        logger.info("Checking out pinned %s commit %s", target, pinned)
				        cloned.git.checkout(pinned)

				        # Submodules are optional.
				        if update_submodules and cloned.submodules:
				            logger.info("Updating %d submodule(s)", len(cloned.submodules))
				            for submodule in cloned.submodules:
				                submodule.update(init=True, recursive=True, progress=PrintProgress())

				        logger.info("Successfully cloned %s", target)
				        return cloned, pinned
				    except GitCommandError:
				        logger.exception("Git operation failed")
				        raise

				def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str:
				    """
				    Return the stripped content of the pin file ``<prefix>/<name>.txt``.

				    Raises:
				        FileNotFoundError: if the pin file does not exist.
				    """
				    pin_file = Path(prefix).joinpath(f"{name}.txt")
				    if pin_file.exists():
				        return pin_file.read_text(encoding="utf-8").strip()
				    raise FileNotFoundError(f"Pin file not found: {pin_file}")

									
										14

.ci/lumen_cli/cli/lib/common/logger.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,14 @@

				"""

				Logger Utility helpers for CLI tasks.

				"""

				import logging

				import sys

				def setup_logging(level: int = logging.INFO):
				    """Configure root logging through ``logging.basicConfig``: timestamped format, output on stdout."""
				    log_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
				    logging.basicConfig(stream=sys.stdout, format=log_format, level=level)

									
										62

.ci/lumen_cli/cli/lib/common/path_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,62 @@

				"""Path utility helpers for CLI tasks."""

				import logging

				import shutil

				from pathlib import Path

				from typing import Union

				logger = logging.getLogger(__name__)

				def get_path(path: Union[str, Path], resolve: bool = False) -> Path:
				    """
				    Convert ``path`` to a Path, resolved to an absolute path when ``resolve`` is true.

				    Raises:
				        ValueError: if ``path`` is None or empty.
				    """
				    if not path:
				        raise ValueError("Path cannot be None or empty")
				    as_path = Path(path)
				    if resolve:
				        return as_path.resolve()
				    return as_path

				def ensure_dir_exists(path: Union[str, Path]) -> Path:
				    """Create the directory (and missing parents) when absent and return it as a Path."""
				    directory = get_path(path)
				    directory.mkdir(exist_ok=True, parents=True)
				    return directory

				def remove_dir(path: Union[str, Path, None]) -> None:
				    """Recursively delete ``path`` when it exists; a None or empty path is a no-op."""
				    if path:
				        target = get_path(path)
				        if target.exists():
				            shutil.rmtree(target)

				def force_create_dir(path: Union[str, Path]) -> Path:
				    """Delete ``path`` when present, recreate it as an empty directory, and return it."""
				    remove_dir(path)
				    fresh_dir = ensure_dir_exists(path)
				    return fresh_dir

				def copy(src: Union[str, Path], dst: Union[str, Path]) -> None:
				    """
				    Copy a file or a directory tree from ``src`` to ``dst``.

				    The parent directory of ``dst`` is created when missing. Directories are
				    copied into an existing destination (``dirs_exist_ok=True``).

				    Raises:
				        FileNotFoundError: if ``src`` does not exist.
				        ValueError: if ``src`` is neither a regular file nor a directory.
				    """
				    source = get_path(src, resolve=True)
				    destination = get_path(dst, resolve=True)
				    if not source.exists():
				        raise FileNotFoundError(f"Source does not exist: {source}")

				    destination.parent.mkdir(parents=True, exist_ok=True)

				    if source.is_dir():
				        shutil.copytree(source, destination, dirs_exist_ok=True)
				    elif source.is_file():
				        shutil.copy2(source, destination)
				    else:
				        raise ValueError(f"Unsupported path type: {source}")

				def is_path_exist(path: Union[str, Path, None]) -> bool:
				    """Return True when ``path`` is non-empty and exists on disk."""
				    if not path:
				        return False
				    return get_path(path).exists()

									
										71

.ci/lumen_cli/cli/lib/common/pip_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,71 @@

				import glob

				import logging

				import shlex

				import shutil

				import sys

				from collections.abc import Iterable

				from importlib.metadata import PackageNotFoundError, version  # noqa: UP035

				from typing import Optional, Union

				from cli.lib.common.utils import run_command

				logger = logging.getLogger(__name__)

				def pip_install_packages(
				    packages: Iterable[str] = (),
				    env=None,
				    *,
				    requirements: Optional[str] = None,
				    constraints: Optional[str] = None,
				    prefer_uv: bool = False,
				) -> None:
				    """
				    Install packages with ``python -m pip install`` or, when ``prefer_uv`` is set
				    and a ``uv`` executable is on PATH, with ``python -m uv pip install``.

				    Args:
				        packages: package specifiers appended to the command.
				        env: extra environment passed to ``run_command``.
				        requirements: optional file passed with ``-r``.
				        constraints: optional file passed with ``-c``.
				        prefer_uv: use uv when it is available.
				    """
				    use_uv = prefer_uv and shutil.which("uv") is not None

				    cmd = [sys.executable, "-m"]
				    cmd.extend(["uv", "pip", "install"] if use_uv else ["pip", "install"])
				    if requirements:
				        cmd.extend(["-r", requirements])
				    if constraints:
				        cmd.extend(["-c", constraints])
				    cmd.extend(packages)

				    # Quote once; the same string is logged and executed.
				    quoted_cmd = " ".join(shlex.quote(part) for part in cmd)
				    logger.info("pip installing packages: %s", quoted_cmd)
				    run_command(quoted_cmd, env=env)

				def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False):
				    """Install the first file matching glob ``pattern``, with ``[extras]`` appended when given."""
				    wheel_path = first_matching_pkg(pattern)
				    if extras:
				        target = f"{wheel_path}[{extras}]"
				    else:
				        target = wheel_path
				    logger.info("Installing %s...", target)
				    pip_install_packages([target], prefer_uv=pref_uv)

				def run_python(args: Union[str, list[str]], env=None):
				    """
				    Run the python in the current environment.

				    ``args`` may be one command-line string (split with shlex) or a list of
				    arguments; it is executed through ``run_command`` with ``env``.
				    """
				    argv = shlex.split(args) if isinstance(args, str) else args
				    full_cmd = [sys.executable] + argv
				    run_command(" ".join(shlex.quote(part) for part in full_cmd), env=env)

				def pkg_exists(name: str) -> bool:
				    """Return True when package ``name`` has installed metadata, logging the outcome."""
				    try:
				        installed_version = version(name)
				    except PackageNotFoundError:
				        logger.info("%s is not installed", name)
				        return False
				    logger.info("%s already exist with version: %s", name, installed_version)
				    return True

				def first_matching_pkg(pattern: str) -> str:
				    """
				    Return the lexicographically first path matching glob ``pattern``.

				    Raises:
				        FileNotFoundError: if nothing matches.
				    """
				    candidates = glob.glob(pattern)
				    if not candidates:
				        raise FileNotFoundError(f"No wheel matching: {pattern}")
				    return min(candidates)

									
										139

.ci/lumen_cli/cli/lib/common/utils.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,139 @@

				"""

				General Utility helpers for CLI tasks.

				"""

				import logging

				import os

				import shlex

				import subprocess

				import sys

				from contextlib import contextmanager

				from pathlib import Path

				from typing import Optional

				logger = logging.getLogger(__name__)

				def run_command(
				    cmd: str,
				    use_shell: bool = False,
				    log_cmd: bool = True,
				    cwd: Optional[str] = None,
				    env: Optional[dict] = None,
				    check: bool = True,
				) -> int:
				    """
				    Run a command with optional shell execution.

				    Args:
				        cmd: command line; split with shlex unless ``use_shell`` is set.
				        use_shell: run ``cmd`` as-is through /bin/bash.
				        log_cmd: log the command before running it.
				        cwd: working directory for the child process.
				        env: variables layered on top of the current environment.
				        check: raise when the command exits non-zero.

				    Returns:
				        The exit code of the command.

				    Raises:
				        subprocess.CalledProcessError: if ``check`` is true and the exit code is non-zero.
				    """
				    if use_shell:
				        args, log_prefix, executable = cmd, "[shell]", "/bin/bash"
				    else:
				        args, log_prefix, executable = shlex.split(cmd), "[cmd]", None

				    if log_cmd:
				        logger.info("%s %s", log_prefix, cmd if use_shell else " ".join(args))

				    # Caller-provided variables override the inherited environment.
				    merged_env = dict(os.environ)
				    merged_env.update(env or {})

				    completed = subprocess.run(
				        args,
				        shell=use_shell,
				        executable=executable,
				        stdout=sys.stdout,
				        stderr=sys.stderr,
				        cwd=cwd,
				        env=merged_env,
				        check=False,
				    )
				    exit_code = completed.returncode
				    if check and exit_code != 0:
				        logger.error("%s Command failed (exit %s): %s", log_prefix, exit_code, cmd)
				        raise subprocess.CalledProcessError(exit_code, cmd if use_shell else args)
				    return exit_code

				def str2bool(value: Optional[str]) -> bool:
				    """
				    Convert environment variables to boolean values.

				    Matching ignores case and surrounding whitespace. None and the empty
				    string are treated as False.

				    Args:
				        value: raw string value, or None.

				    Returns:
				        True or False according to the recognised spellings.

				    Raises:
				        ValueError: if ``value`` is a truthy non-string, or a string that is
				            not a recognised spelling.
				    """
				    if not value:
				        return False
				    if not isinstance(value, str):
				        raise ValueError(
				            f"Expected a string value for boolean conversion, got {type(value)}"
				        )

				    value = value.strip().lower()
				    true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"}
				    # "disabled" mirrors "enabled" in the true set; it used to be rejected.
				    false_value_set = {"0", "false", "f", "no", "n", "off", "disable", "disabled"}

				    if value in true_value_set:
				        return True
				    if value in false_value_set:
				        return False
				    raise ValueError(f"Invalid string value for boolean conversion: {value}")

				@contextmanager
				def temp_environ(updates: dict[str, str]):
				    """
				    Temporarily set environment variables and restore them after the block.

				    Variables that did not exist before are removed again on exit.

				    Args:
				        updates: Dict of environment variables to set.
				    """
				    # A unique sentinel tells "was unset" apart from any real value.
				    sentinel = object()
				    saved = {}
				    for key in updates:
				        saved[key] = os.environ.get(key, sentinel)

				    try:
				        os.environ.update(updates)
				        yield
				    finally:
				        for key, previous in saved.items():
				            if previous is sentinel:
				                os.environ.pop(key, None)
				            else:
				                os.environ[key] = previous  # type: ignore[arg-type]

				@contextmanager
				def working_directory(path: str):
				    """
				    Temporarily change the working directory inside a context.

				    An empty ``path`` leaves the working directory untouched.
				    """
				    if path:
				        original_dir = os.getcwd()
				        try:
				            os.chdir(path)
				            yield
				        finally:
				            os.chdir(original_dir)
				    else:
				        # No-op context
				        yield

				def get_wheels(
				    output_dir: Path,
				    max_depth: Optional[int] = None,
				) -> list[dict[str, str]]:
				    """
				    Return a list of wheels found in the given output directory.

				    Args:
				        output_dir: directory to search recursively.
				        max_depth: deepest directory level to scan (0 = ``output_dir`` itself);
				            None scans the whole tree.

				    Returns:
				        One ``{"pkg": ..., "relpath": ...}`` dict per ``.whl`` file, where ``pkg``
				        is the file name up to the first ``-`` and ``relpath`` is relative to
				        ``output_dir``. Empty when ``output_dir`` does not exist.
				    """
				    root = Path(output_dir)
				    if not root.exists():
				        return []

				    items: list[dict[str, str]] = []
				    for dirpath, dirnames, filenames in os.walk(root):
				        depth = len(Path(dirpath).relative_to(root).parts)
				        if max_depth is not None and depth >= max_depth:
				            # Nothing below this level can be reported, so do not descend.
				            dirnames[:] = []
				        else:
				            # Visit sub-directories in a stable order, like the file names below.
				            dirnames.sort()
				        if max_depth is not None and depth > max_depth:
				            continue
				        for fname in sorted(filenames):
				            if fname.endswith(".whl"):
				                pkg = fname.split("-")[0]
				                relpath = str((Path(dirpath) / fname).relative_to(root))
				                items.append({"pkg": pkg, "relpath": relpath})
				    return items

									
										292

.ci/lumen_cli/cli/lib/core/vllm/lib.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,292 @@

				import logging

				import os

				import textwrap

				from typing import Any

				from cli.lib.common.gh_summary import write_gh_step_summary

				from cli.lib.common.git_helper import clone_external_repo

				from cli.lib.common.pip_helper import pip_install_packages

				from cli.lib.common.utils import run_command, temp_environ, working_directory

				from jinja2 import Template

				logger = logging.getLogger(__name__)

				_TPL_VLLM_INFO = Template(

				    textwrap.dedent("""\

				    ##  Vllm against Pytorch CI Test Summary

				    **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})

				    {%- if torch_sha %}

				    **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})

				    {%- endif %}

				""")

				)

				def sample_vllm_test_library():
				    """
				    Return the built-in pool of vLLM test plans, keyed by test plan name.

				    This is a simple sample to unblock vLLM CI development. It mimics
				    https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml
				    (see run_test_plan for how a plan is executed).

				    Each plan is a dict with:
				      - "title": human-readable name ("%N" is a placeholder for shard info)
				      - "id": identifier of the plan
				      - "steps": shell commands, run in order
				      - optional "env_vars", "package_install", "num_gpus", "parallelism"
				    """
				    # TODO(elainewy): Read from yaml file to handle the env and tests for vllm
				    return {
				        "vllm_basic_correctness_test": {
				            "title": "Basic Correctness Test",
				            "id": "vllm_basic_correctness_test",
				            "env_vars": {
				                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
				            },
				            "steps": [
				                "pytest -v -s basic_correctness/test_cumem.py",
				                "pytest -v -s basic_correctness/test_basic_correctness.py",
				                "pytest -v -s basic_correctness/test_cpu_offload.py",
				            ],
				        },
				        "vllm_basic_models_test": {
				            "title": "Basic models test",
				            "id": "vllm_basic_models_test",
				            "steps": [
				                "pytest -v -s models/test_transformers.py",
				                "pytest -v -s models/test_registry.py",
				                "pytest -v -s models/test_utils.py",
				                "pytest -v -s models/test_vision.py",
				                "pytest -v -s models/test_initialization.py",
				            ],
				        },
				        "vllm_entrypoints_test": {
				            "title": "Entrypoints Test ",
				            "id": "vllm_entrypoints_test",
				            "env_vars": {
				                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
				            },
				            "steps": [
				                " ".join(
				                    [
				                        "pytest",
				                        "-v",
				                        "-s",
				                        "entrypoints/llm",
				                        "--ignore=entrypoints/llm/test_generate.py",
				                        "--ignore=entrypoints/llm/test_collective_rpc.py",
				                    ]
				                ),
				                "pytest -v -s entrypoints/llm/test_generate.py",
				                "pytest -v -s entrypoints/offline_mode",
				            ],
				        },
				        "vllm_regression_test": {
				            "title": "Regression Test",
				            "id": "vllm_regression_test",
				            "package_install": ["modelscope"],
				            "steps": [
				                "pytest -v -s test_regression.py",
				            ],
				        },
				        "vllm_lora_tp_test_distributed": {
				            "title": "LoRA TP Test (Distributed)",
				            "id": "vllm_lora_tp_test_distributed",
				            "env_vars": {
				                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
				            },
				            "num_gpus": 4,
				            "steps": [
				                "pytest -v -s -x lora/test_chatglm3_tp.py",
				                "pytest -v -s -x lora/test_llama_tp.py",
				                "pytest -v -s -x lora/test_llm_with_multi_loras.py",
				            ],
				        },
				        "vllm_distributed_test_28_failure_test": {
				            "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure",
				            "id": "vllm_distributed_test_28_failure_test",
				            "env_vars": {
				                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
				            },
				            "num_gpus": 4,
				            "steps": [
				                "pytest -v -s distributed/test_sequence_parallel.py",
				            ],
				        },
				        "vllm_lora_28_failure_test": {
				            "title": "LoRA pytorch 2.8 failure test",
				            "id": "vllm_lora_28_failure_test",
				            "steps": ["pytest -v lora/test_quant_model.py"],
				        },
				        "vllm_multi_model_processor_test": {
				            "title": "Multi-Modal Processor Test",
				            "id": "vllm_multi_model_processor_test",
				            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
				            "steps": [
				                "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py",
				            ],
				        },
				        "vllm_multi_model_test_28_failure_test": {
				            "title": "Multi-Model Test (Failed 2.8 release)",
				            "id": "vllm_multi_model_test_28_failure_test",
				            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
				            "steps": [
				                "pytest -v -s models/multimodal/generation/test_voxtral.py",
				                "pytest -v -s models/multimodal/pooling",
				            ],
				        },
				        "vllm_pytorch_compilation_unit_tests": {
				            "title": "PyTorch Compilation Unit Tests",
				            "id": "vllm_pytorch_compilation_unit_tests",
				            "steps": [
				                "pytest -v -s compile/test_pass_manager.py",
				                "pytest -v -s compile/test_fusion.py",
				                "pytest -v -s compile/test_fusion_attn.py",
				                "pytest -v -s compile/test_silu_mul_quant_fusion.py",
				                "pytest -v -s compile/test_sequence_parallelism.py",
				                "pytest -v -s compile/test_async_tp.py",
				                "pytest -v -s compile/test_fusion_all_reduce.py",
				                "pytest -v -s compile/test_decorator.py",
				            ],
				        },
				        "vllm_language_model_test_extended_generation_28_failure_test": {
				            "title": "Language Models Test (Extended Generation) 2.8 release failure",
				            # The id previously read "vllm_languagde_..." and did not match the key.
				            "id": "vllm_language_model_test_extended_generation_28_failure_test",
				            "package_install": [
				                "--no-build-isolation",
				                "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8",
				            ],
				            "steps": [
				                "pytest -v -s models/language/generation/test_mistral.py",
				            ],
				        },
				        "vllm_distributed_test_2_gpu_28_failure_test": {
				            "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure",
				            "id": "vllm_distributed_test_2_gpu_28_failure_test",
				            "env_vars": {
				                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
				            },
				            "num_gpus": 4,
				            "steps": [
				                "pytest -v -s distributed/test_sequence_parallel.py",
				            ],
				        },
				        # TODO(elainewy):need to add g6 with 4 gpus to run this test
				        "vllm_lora_test": {
				            "title": "LoRA Test %N",
				            "id": "lora_test",
				            "parallelism": 4,
				            "steps": [
				                "echo '[checking] list sharded lora tests:'",
				                " ".join(
				                    [
				                        "pytest -q --collect-only lora",
				                        "--shard-id=$$BUILDKITE_PARALLEL_JOB",
				                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
				                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
				                    ]
				                ),
				                "echo '[checking] Done. list lora tests'",
				                " ".join(
				                    [
				                        "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB",
				                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
				                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
				                    ]
				                ),
				            ],
				        },
				    }

				def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0):
				    """
				    Tell whether a test plan is sharded and validate the shard arguments.

				    Args:
				        tests: test plan entry; its optional "parallelism" value is the
				            number of shards the plan expects.
				        title: plan title, used only in error messages.
				        shard_id: index of the shard to run. Treated as zero-based because
				            it is substituted for $$BUILDKITE_PARALLEL_JOB / --shard-id.
				        num_shards: total number of shards requested by the caller.

				    Returns:
				        False when the plan declares no parallelism (missing, 0 or 1),
				        True when it is sharded and the shard arguments are valid.

				    Raises:
				        RuntimeError: if num_shards differs from the plan's parallelism, or
				            shard_id is outside [0, num_shards).
				    """
				    parallelism = int(tests.get("parallelism", "0"))
				    if parallelism <= 1:
				        return False

				    # Validate the shard count first so a missing/incorrect num_shards is
				    # reported with the expected value rather than as a bad shard id.
				    if num_shards != parallelism:
				        raise RuntimeError(
				            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
				        )

				    # Valid ids are 0 .. num_shards - 1; shard_id == num_shards is out of range.
				    if not 0 <= shard_id < num_shards:
				        raise RuntimeError(
				            f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided"
				        )
				    return True

				def run_test_plan(
				    test_plan: str,
				    test_target: str,
				    tests_map: dict[str, Any],
				    shard_id: int = 0,
				    num_shards: int = 0,
				):
				    """
				    Run every step of one test plan taken from ``tests_map``.

				    Args:
				        test_plan: key of the plan inside ``tests_map``.
				        test_target: name of the target under test; only used for logging.
				        tests_map: pool of available plans.
				        shard_id: shard to run when the plan is parallel.
				        num_shards: total shard count when the plan is parallel.

				    Raises:
				        RuntimeError: if the plan is unknown, the shard arguments are
				            rejected by check_parallelism, or at least one step exits
				            with a non-zero code. All steps are attempted before failing.
				    """
				    logger.info("run %s tests.....", test_target)
				    if test_plan not in tests_map:
				        raise RuntimeError(
				            f"test {test_plan} not found, please add it to test plan pool"
				        )

				    plan = tests_map[test_plan]
				    extra_packages = plan.get("package_install", [])
				    display_title = plan.get("title", "unknown test")

				    sharded = check_parallelism(plan, display_title, shard_id, num_shards)
				    if sharded:
				        display_title = display_title.replace("%N", f"{shard_id}/{num_shards}")
				    logger.info("Running tests: %s", display_title)

				    if extra_packages:
				        logger.info("Installing packages: %s", extra_packages)
				        pip_install_packages(packages=extra_packages, prefer_uv=True)

				    with working_directory(plan.get("working_directory", "tests")):
				        with temp_environ(plan.get("env_vars", {})):
				            failed_steps = []
				            for raw_step in plan["steps"]:
				                logger.info("Running step: %s", raw_step)
				                command = raw_step
				                if sharded:
				                    command = replace_buildkite_placeholders(
				                        raw_step, shard_id, num_shards
				                    )
				                    logger.info("Running parallel step: %s", command)
				                # check=False: keep going so every failing step is reported.
				                exit_code = run_command(cmd=command, check=False, use_shell=True)
				                if exit_code != 0:
				                    failed_steps.append(command)
				                logger.info("Finish running step: %s", command)

				            if failed_steps:
				                logger.error("Failed tests: %s", failed_steps)
				                raise RuntimeError(
				                    f"{len(failed_steps)} pytest runs failed: {failed_steps}"
				                )
				            logger.info("Done. All tests passed")

				def clone_vllm(dst: str = "vllm"):
				    """
				    Clone the vLLM repository into ``dst`` via clone_external_repo.

				    Returns:
				        The second element of clone_external_repo's result, which callers
				        use as the vLLM commit.
				    """
				    _repo, vllm_commit = clone_external_repo(
				        repo="https://github.com/vllm-project/vllm.git",
				        target="vllm",
				        update_submodules=True,
				        dst=dst,
				    )
				    return vllm_commit

				def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
				    """
				    Substitute the Buildkite shard placeholders in a step command.

				    "$$BUILDKITE_PARALLEL_JOB_COUNT" becomes ``num_shards`` and
				    "$$BUILDKITE_PARALLEL_JOB" becomes ``shard_id``.
				    """
				    # Order matters: the job placeholder is a prefix of the job-count
				    # placeholder, so the longer one has to be replaced first.
				    substitutions = (
				        ("$$BUILDKITE_PARALLEL_JOB_COUNT", str(num_shards)),
				        ("$$BUILDKITE_PARALLEL_JOB", str(shard_id)),
				    )
				    for placeholder, replacement in substitutions:
				        step = step.replace(placeholder, replacement)
				    return step

				def summarize_build_info(vllm_commit: str) -> bool:
				    """
				    Render the vLLM build-info template and write it to the GitHub step summary.

				    The torch commit is read from the GITHUB_SHA environment variable.
				    Returns whatever write_gh_step_summary returns.
				    """
				    rendered = _TPL_VLLM_INFO.render(
				        vllm_commit=vllm_commit, torch_sha=os.getenv("GITHUB_SHA")
				    )
				    # Surrounding whitespace is trimmed; the text ends with one newline.
				    return write_gh_step_summary(rendered.strip() + "\n")

									
										296

.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,296 @@

				import logging

				import os

				import textwrap

				from dataclasses import dataclass

				from pathlib import Path

				from typing import Optional

				from cli.lib.common.cli_helper import BaseRunner

				from cli.lib.common.docker_helper import local_image_exists

				from cli.lib.common.envs_helper import (

				    env_bool_field,

				    env_path_field,

				    env_str_field,

				    with_params_help,

				)

				from cli.lib.common.gh_summary import (

				    gh_summary_path,

				    summarize_content_from_file,

				    summarize_wheels,

				)

				from cli.lib.common.path_helper import (

				    copy,

				    ensure_dir_exists,

				    force_create_dir,

				    get_path,

				    is_path_exist,

				)

				from cli.lib.common.utils import run_command

				from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info

				logger = logging.getLogger(__name__)

				# Default path for docker build artifacts

				_DEFAULT_RESULT_PATH = "./shared"

				# Temp folder inside the vllm work directory; torch wheels are copied here for the docker build

				_VLLM_TEMP_FOLDER = "tmp"

				@dataclass
				class VllmBuildParameters:
				    """
				    Parameters defining the vllm external input configurations.
				    Combine with VllmDockerBuildArgs to define the vllm build environment.

				    Every field is declared through an env_*_field helper that is given an
				    environment variable name and, where present, a default value.
				    __post_init__ raises ValueError when an enabled flag lacks a usable
				    resource or when output_dir is empty.
				    """

				    # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH.
				    # Otherwise the docker build pulls torch nightly during the build.
				    # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True
				    use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True)
				    torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")

				    # USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE.
				    # Otherwise, pull the dockerfile's default image remotely.
				    # BASE_IMAGE: name:tag (only needed when use_local_base_image is True)
				    use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True)
				    base_image: str = env_str_field("BASE_IMAGE")

				    # USE_LOCAL_DOCKERFILE: when true ("1"), use a local Dockerfile; requires DOCKERFILE_PATH.
				    # Otherwise, use vllm's default dockerfile.torch_nightly for the build.
				    # DOCKERFILE_PATH: path to the Dockerfile used when use_local_dockerfile is True
				    use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
				    dockerfile_path: Path = env_path_field(
				        "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile"
				    )

				    # the cleaning script to remove torch dependencies from pip
				    # NOTE(review): the key is lower-case, unlike the other fields; confirm it is intentional.
				    cleaning_script: Path = env_path_field(
				        "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
				    )

				    # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
				    output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")

				    # --- Build args ----------------------------------------------------------
				    target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")
				    tag_name: str = env_str_field("TAG", "vllm-wheels")
				    cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1")
				    python_version: str = env_str_field("PYTHON_VERSION", "3.12")
				    max_jobs: str = env_str_field("MAX_JOBS", "64")
				    sccache_bucket: str = env_str_field("SCCACHE_BUCKET")
				    sccache_region: str = env_str_field("SCCACHE_REGION")
				    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

				    def __post_init__(self):
				        """
				        Validate that every enabled flag has the resource it depends on.

				        Raises:
				            ValueError: if a flag is enabled but its resource is empty or
				                rejected by the matching check function, or if output_dir
				                is empty.
				        """
				        # (flag name, flag value, attribute holding the resource,
				        #  check function for the resource, error message)
				        checks = [
				            (
				                "use_torch_whl",
				                self.use_torch_whl,
				                "torch_whls_path",
				                is_path_exist,
				                "TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1",
				            ),
				            (
				                "use_local_base_image",
				                self.use_local_base_image,
				                "base_image",
				                local_image_exists,
				                f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1",
				            ),
				            (
				                "use_local_dockerfile",
				                self.use_local_dockerfile,
				                "dockerfile_path",
				                is_path_exist,
				                " DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1",
				            ),
				        ]
				        for flag_name, enabled, attr_name, check_func, error_msg in checks:
				            if not enabled:
				                # Log which flag is off (the flag's value alone is always False here).
				                logger.info("flag %s is not set", flag_name)
				                continue
				            value = getattr(self, attr_name)
				            if not value or not check_func(value):
				                raise ValueError(error_msg)

				        if not self.output_dir:
				            raise ValueError("missing required output_dir")

				@with_params_help(VllmBuildParameters)
				class VllmBuildRunner(BaseRunner):
				    """
				    Build vLLM using docker buildx.

				    Environment variable options:
				        "USE_TORCH_WHEEL":      "1: use local wheels; 0: pull nightly from pypi",
				        "TORCH_WHEELS_PATH":    "Path to local wheels (when USE_TORCH_WHEEL=1)",

				        "USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image",
				        "BASE_IMAGE":           "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)",

				        "USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly",
				        "DOCKERFILE_PATH":      "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)",

				        "OUTPUT_DIR":           "e.g. './shared'",

				        "TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'",
				        "CUDA_VERSION":         "e.g. '12.8.1'",
				        "PYTHON_VERSION":       "e.g. '3.12'",
				        "MAX_JOBS":             "e.g. '64'",
				        "SCCACHE_BUCKET":       "e.g. 'my-bucket'",
				        "SCCACHE_REGION":       "e.g. 'us-west-2'",
				    """

				    def __init__(self, args=None):
				        # Directory the vllm repo is cloned into and the docker build runs from.
				        self.work_directory = "vllm"

				    def run(self):
				        """
				        main function to run vllm build
				        1. prepare vllm build environment
				        2. prepare the docker build command args
				        3. run docker build

				        The GH summary is generated even when the docker build fails.
				        """
				        inputs = VllmBuildParameters()
				        logger.info("Running vllm build with inputs: %s", inputs)
				        # Clone into the same directory the copy helpers and the build use.
				        vllm_commit = clone_vllm(dst=self.work_directory)

				        self.cp_torch_cleaning_script(inputs)
				        self.cp_dockerfile_if_exist(inputs)
				        # cp torch wheels from root direct to vllm workspace if exist
				        self.cp_torch_whls_if_exist(inputs)

				        # make sure the output dir to store the build artifacts exist
				        ensure_dir_exists(Path(inputs.output_dir))

				        cmd = self._generate_docker_build_cmd(inputs)
				        logger.info("Running docker build: \n %s", cmd)

				        try:
				            run_command(cmd, cwd=self.work_directory, env=os.environ.copy())
				        finally:
				            self.genearte_vllm_build_summary(vllm_commit, inputs)

				    def genearte_vllm_build_summary(
				        self, vllm_commit: str, inputs: VllmBuildParameters
				    ):
				        """
				        Write build info and wheel listings to the GitHub step summary.

				        Does nothing (beyond logging) when gh_summary_path() is falsy.
				        The misspelled method name is kept for existing callers.
				        """
				        if not gh_summary_path():
				            logger.info("Skipping, not detect GH Summary env var....")
				            return
				        logger.info("Generate GH Summary ...")
				        # summarize vllm build info
				        summarize_build_info(vllm_commit)

				        # summarize vllm build artifacts
				        vllm_artifact_dir = inputs.output_dir / "wheels"
				        summarize_content_from_file(
				            vllm_artifact_dir,
				            "build_summary.txt",
				            title="Vllm build env pip package summary",
				        )
				        summarize_wheels(
				            inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
				        )
				        summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")

				    def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
				        """
				        Copy the local torch wheels into the temp folder of the work directory.

				        Returns:
				            The temp folder path, or "" when use_torch_whl is off.
				        """
				        if not inputs.use_torch_whl:
				            return ""
				        tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}"
				        tmp_path = Path(tmp_dir)
				        force_create_dir(tmp_path)
				        copy(inputs.torch_whls_path, tmp_dir)
				        return tmp_dir

				    def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
				        """Copy the cleaning script into the work directory as use_existing_torch.py."""
				        script = get_path(inputs.cleaning_script, resolve=True)
				        vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
				        copy(script, vllm_script)

				    def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
				        """
				        Copy the local Dockerfile to docker/Dockerfile.nightly_torch in the
				        work directory when use_local_dockerfile is on; otherwise leave the
				        repo's own file in place.
				        """
				        if not inputs.use_local_dockerfile:
				            logger.info("using vllm default dockerfile.torch_nightly for build")
				            return
				        dockerfile_path = get_path(inputs.dockerfile_path, resolve=True)
				        vllm_torch_dockerfile = Path(
				            f"./{self.work_directory}/docker/Dockerfile.nightly_torch"
				        )
				        copy(dockerfile_path, vllm_torch_dockerfile)

				    def get_result_path(self, path):
				        """
				        Get the absolute path of the result path
				        (falls back to _DEFAULT_RESULT_PATH when ``path`` is empty).
				        """
				        if not path:
				            path = _DEFAULT_RESULT_PATH
				        abs_path = get_path(path, resolve=True)
				        return abs_path

				    def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str:
				        """
				        Return the TORCH_WHEELS_PATH build arg, or "" when no wheel dir is given.

				        The arg always points at the temp folder relative to the build
				        context, not at ``torch_whl_dir`` itself.
				        """
				        if not torch_whl_dir:
				            return ""
				        return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}"

				    def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]:
				        """
				        Returns:
				            - base_image_arg: docker buildx arg string for base image
				            - final_base_image_arg:  docker buildx arg string for vllm-base stage
				            - pull_flag: "--pull=false" when the image exists locally, otherwise ""
				        All three are "" when use_local_base_image is off.
				        """
				        if not inputs.use_local_base_image:
				            return "", "", ""

				        base_image = inputs.base_image

				        # set both base image and final base image to the same local image
				        base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}"
				        final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}"

				        if local_image_exists(base_image):
				            pull_flag = "--pull=false"
				            return base_image_arg, final_base_image_arg, pull_flag
				        # Pass the image name itself; a set literal would be logged as {'name'}.
				        logger.info(
				            "[INFO] Local image not found:%s will try to pull from remote", base_image
				        )
				        return base_image_arg, final_base_image_arg, ""

				    def _generate_docker_build_cmd(
				        self,
				        inputs: VllmBuildParameters,
				    ) -> str:
				        """
				        Assemble the ``docker buildx build`` command line from ``inputs``.

				        Optional arguments that do not apply are rendered as empty strings.
				        """
				        base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args(
				            inputs
				        )
				        # Only advertise the wheel folder when wheels were actually copied
				        # into it (cp_torch_whls_if_exist skips the copy when the flag is off).
				        torch_arg = self._get_torch_wheel_path_arg(
				            inputs.torch_whls_path if inputs.use_torch_whl else None
				        )

				        return textwrap.dedent(
				            f"""
				            docker buildx build \
				                --output type=local,dest={inputs.output_dir} \
				                -f docker/Dockerfile.nightly_torch \
				                {pull_flag} \
				                {torch_arg} \
				                {base_image_arg} \
				                {final_base_image_arg} \
				                --build-arg max_jobs={inputs.max_jobs} \
				                --build-arg CUDA_VERSION={inputs.cuda_version} \
				                --build-arg PYTHON_VERSION={inputs.python_version} \
				                --build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \
				                --build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \
				                --build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \
				                --build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \
				                --target {inputs.target_stage} \
				                -t {inputs.tag_name} \
				                --progress=plain .
				        """
				        ).strip()

									
										280

.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,280 @@

				import logging

				import os

				import re

				import subprocess

				import sys

				from collections.abc import Iterable

				from dataclasses import dataclass

				from enum import Enum

				from pathlib import Path

				from typing import Any

				from cli.lib.common.cli_helper import BaseRunner

				from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env

				from cli.lib.common.path_helper import copy, get_path, remove_dir

				from cli.lib.common.pip_helper import (

				    pip_install_first_match,

				    pip_install_packages,

				    pkg_exists,

				    run_python,

				)

				from cli.lib.common.utils import run_command, working_directory

				from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library

				logger = logging.getLogger(__name__)

				@dataclass
				class VllmTestParameters:
				    """
				    Parameters defining the vllm external test input.

				    !!!DO NOT ADD SECRETS IN THIS CLASS!!!
				    If an environment variable's name differs from the secret's own name,
				    store only that variable name here and fetch the secret directly from
				    the environment at runtime.
				    """

				    torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist")
				    vllm_whls_path: Path = env_path_field(
				        "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels"
				    )
				    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
				    cleaning_script: Path = env_path_field(
				        "cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
				    )

				    def __post_init__(self):
				        """Raise ValueError when either wheel directory does not exist."""
				        # Checked in declaration order: torch wheels first, then vllm wheels.
				        for attr_name in ("torch_whls_path", "vllm_whls_path"):
				            if not getattr(self, attr_name).exists():
				                raise ValueError(f"missing {attr_name}")

				class TestInpuType(Enum):
				    """Kind of test input handed to the vllm test runner."""

				    # A named test plan was supplied.
				    TEST_PLAN = "test_plan"
				    # No recognised input was supplied.
				    UNKNOWN = "unknown"

				class VllmTestRunner(BaseRunner):
				    """
				    Runner that prepares a vllm checkout, installs the locally built wheels
				    and test dependencies, then executes the requested test plan.
				    """

				    def __init__(self, args: Any):
				        self.work_directory = "vllm"
				        self.shard_id = args.shard_id
				        self.num_shards = args.num_shards

				        # A non-empty test plan selects the TEST_PLAN input type.
				        has_plan = bool(args.test_plan)
				        self.test_plan = args.test_plan if has_plan else ""
				        self.test_type = (
				            TestInpuType.TEST_PLAN if has_plan else TestInpuType.UNKNOWN
				        )

				        # Matches the structure in the artifacts.zip from the torch build
				        self.TORCH_WHL_PATH_REGEX = "torch*.whl"
				        self.TORCH_WHL_EXTRA = "opt-einsum"
				        self.TORCH_ADDITIONAL_WHLS_REGEX = [
				            "vision/torchvision*.whl",
				            "audio/torchaudio*.whl",
				        ]

				        # Match the structure of the artifacts.zip from vllm external build
				        self.VLLM_TEST_WHLS_REGEX = [
				            "xformers/*.whl",
				            "vllm/vllm*.whl",
				            "flashinfer-python/flashinfer*.whl",
				        ]

				    def prepare(self):
				        """
				        Prepare the test environment for vllm: set env vars, clone the vllm
				        repo, then install all wheels and test dependencies.
				        """
				        test_params = VllmTestParameters()
				        logger.info("Display VllmTestParameters %s", test_params)
				        self._set_envs(test_params)

				        clone_vllm(dst=self.work_directory)
				        self.cp_torch_cleaning_script(test_params)
				        with working_directory(self.work_directory):
				            remove_dir(Path("vllm"))
				            self._install_wheels(test_params)
				            self._install_dependencies()
				        # verify the torches are not overridden by test dependencies
				        check_versions()

				    def run(self):
				        """
				        Main entry point: prepare the environment, then run the test plan.

				        Raises:
				            ValueError: if no test plan was supplied.
				        """
				        self.prepare()
				        try:
				            with working_directory(self.work_directory):
				                if self.test_type != TestInpuType.TEST_PLAN:
				                    raise ValueError(f"Unknown test type {self.test_type}")
				                # Shard arguments are forwarded only for sharded runs;
				                # otherwise run_test_plan uses its own defaults.
				                shard_args = (
				                    (self.shard_id, self.num_shards) if self.num_shards > 1 else ()
				                )
				                run_test_plan(
				                    self.test_plan,
				                    "vllm",
				                    sample_vllm_test_library(),
				                    *shard_args,
				                )
				        finally:
				            # double check the torches are not overridden by other packages
				            check_versions()

				    def cp_torch_cleaning_script(self, params: VllmTestParameters):
				        """Copy the cleaning script into the work directory as use_existing_torch.py."""
				        source_script = get_path(params.cleaning_script, resolve=True)
				        destination = Path(f"./{self.work_directory}/use_existing_torch.py")
				        copy(source_script, destination)

				    def _install_wheels(self, params: VllmTestParameters):
				        """Install torch (if missing), torch-related wheels, then the vllm wheels."""
				        logger.info("Running vllm test with inputs: %s", params)
				        torch_root = str(params.torch_whls_path)
				        if not pkg_exists("torch"):
				            # install torch from local whls if it's not installed yet.
				            pip_install_first_match(
				                f"{torch_root}/{self.TORCH_WHL_PATH_REGEX}", self.TORCH_WHL_EXTRA
				            )

				        for whl_pattern in self.TORCH_ADDITIONAL_WHLS_REGEX:
				            pip_install_first_match(f"{torch_root}/{whl_pattern}")
				        logger.info("Done. Installed torch and other torch-related wheels ")

				        logger.info("Installing vllm wheels")
				        vllm_root = str(params.vllm_whls_path)
				        for whl_pattern in self.VLLM_TEST_WHLS_REGEX:
				            pip_install_first_match(f"{vllm_root}/{whl_pattern}")
				        logger.info("Done. Installed vllm wheels")

				    def _install_test_dependencies(self):
				        """
				        Rewrite requirements/test.in from the vllm repo so that it refers to
				        the local torch wheels, compile it to test.txt at runtime, and
				        install the result.
				        """
				        logger.info("generate test.txt from requirements/test.in with local torch whls")
				        preprocess_test_in()
				        copy("requirements/test.txt", "snapshot_constraint.txt")

				        compile_cmd = " ".join(
				            [
				                f"{sys.executable} -m uv pip compile requirements/test.in",
				                "-o test.txt",
				                "--index-strategy unsafe-best-match",
				                "--constraint snapshot_constraint.txt",
				                "--torch-backend cu128",
				            ]
				        )
				        run_command(compile_cmd)
				        pip_install_packages(requirements="test.txt", prefer_uv=True)
				        logger.info("Done. installed requirements for test dependencies")

				    def _install_dependencies(self):
				        """Install test utilities, common/build requirements and test requirements."""
				        pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True)
				        pip_install_packages(packages=["hf_transfer"], prefer_uv=True)
				        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

				        # using script from vllm repo to remove all torch packages from requirements txt
				        run_python("use_existing_torch.py")

				        # install common packages
				        for requirements_file in ("requirements/common.txt", "requirements/build.txt"):
				            pip_install_packages(requirements=requirements_file, prefer_uv=True)
				        # install test packages
				        self._install_test_dependencies()

				    def _set_envs(self, inputs: VllmTestParameters):
				        """
				        Export TORCH_CUDA_ARCH_LIST and HF_TOKEN for the test run.

				        Raises:
				            ValueError: if HF_TOKEN or TORCH_CUDA_ARCH_LIST ends up empty.
				        """
				        os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list
				        if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")):
				            logger.warning(
				                "Missing supported TORCH_CUDA_ARCH_LIST. "
				                "Currently support TORCH_CUDA_ARCH_LIST env var "
				                "with supported arch [8.0, 8.9, 9.0]"
				            )

				        os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "")

				        required_envs = (
				            (
				                "HF_TOKEN",
				                "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var",
				            ),
				            (
				                "TORCH_CUDA_ARCH_LIST",
				                "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var",
				            ),
				        )
				        for env_name, error_msg in required_envs:
				            if not get_env(env_name):
				                raise ValueError(error_msg)

				def preprocess_test_in(
				    target_file: str = "requirements/test.in", additional_packages: Iterable[str] = ()
				):
				    """
				    Rewrite ``target_file`` in place (relative to the current directory).

				    Lines that pin torch, torchvision, torchaudio, xformers, mamba_ssm or
				    any of ``additional_packages`` with ``==``, ``@`` or ``>=`` are removed,
				    and the locally installed torch/torchvision/torchaudio entries reported
				    by ``pip freeze`` in the form "<name> @ file://<LOCAL_PATH>" are put at
				    the top of the file.

				    Args:
				        target_file: requirements file to rewrite.
				        additional_packages: extra package names whose pins are removed.
				    """
				    pkgs_to_remove = [
				        "torch",
				        "torchvision",
				        "torchaudio",
				        "xformers",
				        "mamba_ssm",
				        *(additional_packages or ()),
				    ]

				    # Read current requirements
				    target_path = Path(target_file)
				    lines = target_path.read_text().splitlines()

				    # Remove lines starting with the package names (==, @, >=), case-insensitive.
				    # Names are escaped so characters such as "." or "+" match literally.
				    names = "|".join(re.escape(pkg) for pkg in pkgs_to_remove)
				    pattern = re.compile(rf"^({names})\s*(==|@|>=)", re.IGNORECASE)
				    kept_lines = [line for line in lines if not pattern.match(line)]

				    # Get local installed torch/vision/audio from pip freeze
				    # This is hacky, but it works
				    pip_freeze = subprocess.check_output(["pip", "freeze"], text=True)
				    header_lines = [
				        line
				        for line in pip_freeze.splitlines()
				        if re.match(
				            r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE
				        )
				    ]

				    # Write back: header_lines + blank + kept_lines
				    out = "\n".join(header_lines + [""] + kept_lines) + "\n"
				    target_path.write_text(out)
				    logger.info("[INFO] Updated %s", target_file)

				def validate_cuda(value: str) -> bool:
				    """
				    Check a whitespace-separated CUDA arch list against the supported archs.

				    Args:
				        value: arch list string, e.g. "8.0 9.0".

				    Returns:
				        True only if ``value`` names at least one arch and every arch is one
				        of 8.0, 8.9 or 9.0. An empty or whitespace-only string is rejected;
				        without that guard ``all()`` over no items would report it as valid.
				    """
				    VALID_VALUES = {"8.0", "8.9", "9.0"}
				    archs = value.split()
				    return bool(archs) and all(v in VALID_VALUES for v in archs)

				def check_versions():
				    """
				    Run ``pkg_exists`` for each package the vLLM test setup relies on,
				    logging before and after the pass.
				    """
				    logger.info("Double check installed packages")
				    for package_name in ("torch", "xformers", "torchvision", "torchaudio", "vllm"):
				        pkg_exists(package_name)
				    logger.info("Done. checked installed packages")

									
										40

.ci/lumen_cli/cli/run.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,40 @@

				# main.py

				import argparse

				import logging

				from cli.build_cli.register_build import register_build_commands

				from cli.lib.common.logger import setup_logging

				from cli.test_cli.register_test import register_test_commands

				logger = logging.getLogger(__name__)

				def main():
				    """
				    CLI entry point: build the parser tree, parse ``sys.argv``, configure
				    logging and dispatch to the handler stored on the parsed namespace.
				    """
				    # Top-level parser with a mandatory first-level subcommand
				    parser = argparse.ArgumentParser(description="Lumos CLI")
				    subparsers = parser.add_subparsers(dest="command", required=True)
				    parser.add_argument(
				        "--log-level",
				        default="INFO",
				        help="Log level (DEBUG, INFO, WARNING, ERROR)",
				    )

				    # Each register_* call attaches its own second-level subcommands
				    register_build_commands(subparsers)
				    register_test_commands(subparsers)

				    # Parse only once every option has been registered
				    args = parser.parse_args()

				    # Unknown level names fall back to INFO
				    level_name = args.log_level.upper()
				    setup_logging(getattr(logging, level_name, logging.INFO))
				    logger.debug("Parsed args: %s", args)

				    if not hasattr(args, "func"):
				        # No handler attached to the namespace: show usage instead
				        parser.print_help()
				        return
				    args.func(args)

				if __name__ == "__main__":

				    main()

0

test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_interpreter_convert_to_bool_raises → .ci/lumen_cli/cli/test_cli/init.py

View File

									
										62

.ci/lumen_cli/cli/test_cli/register_test.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,62 @@

				import argparse

				import logging

				from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec

				from cli.lib.core.vllm.vllm_test import VllmTestRunner

				logger = logging.getLogger(__name__)

				# Maps targets to their argparse configuration and runner

				# Each entry adds a new target reachable as: python -m cli.run test external {target}, executed by its runner

				_TARGETS: dict[str, TargetSpec] = {
				    # "runner" is the class registered for the target; "help" is the text
				    # shown next to the target name in the "external" overview.
				    "vllm": {
				        "runner": VllmTestRunner,
				        "help": "test vLLM with pytorch main",
				    }
				    # add yours ...
				}

				def common_args(parser: argparse.ArgumentParser) -> None:
				    """
				    Add common CLI arguments to the given parser.

				    Registers ``--shard-id`` and ``--num-shards`` (both integers defaulting
				    to 1) and a required mutually exclusive group that currently holds only
				    ``-tp/--test-plan``.
				    """
				    parser.add_argument(
				        "--shard-id",
				        type=int,
				        default=1,
				        # The value is parsed with type=int, so the example must be a single
				        # integer; a comma-separated list would be rejected by argparse.
				        help="a single shard id to run, e.g. '1'",
				    )
				    parser.add_argument(
				        "--num-shards",
				        type=int,
				        default=1,
				        help="a number of shards to run, e.g. '4'",
				    )
				    # Required group: exactly one way of selecting what to run must be given.
				    group = parser.add_mutually_exclusive_group(required=True)
				    group.add_argument(
				        "-tp",
				        "--test-plan",
				        type=str,
				        help="a pre-defined test plan to run, e.g. 'basic_correctness_test'",
				    )

				def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
				    """
				    Attach the ``test`` command and its ``external`` subcommand to
				    ``subparsers``, then register every entry of ``_TARGETS`` under it.
				    """
				    test_parser = subparsers.add_parser(
				        "test",
				        help="test related commands",
				        formatter_class=RichHelp,
				    )
				    test_subparsers = test_parser.add_subparsers(dest="test_command", required=True)

				    # One "name  help" line per target, shown in the external command description
				    target_lines = [
				        f"  {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
				    ]
				    overview = "\n".join(target_lines)

				    external_parser = test_subparsers.add_parser(
				        "external",
				        help="Test external targets",
				        description="Test third-party targets.\n\nAvailable targets:\n" + overview,
				        formatter_class=RichHelp,
				    )
				    register_targets(external_parser, _TARGETS, common_args=common_args)

									
										23

.ci/lumen_cli/pyproject.toml
									
										Normal file
									
												View File
												
				@ -0,0 +1,23 @@

				[project]

				name = "lumen-ci"

				version = "0.1.0"

				dependencies = [

				    "pyyaml==6.0.2",

				    "GitPython==3.1.45",

				    "docker==7.1.0",

				    "pytest==7.3.2",

				    "uv==0.8.6"

				]

				[tool.setuptools]

				packages = ["cli"]

				[tool.setuptools.package-dir]

				cli = "cli"

				[tool.ruff.lint]

				# Enable preview mode for linting

				preview = true

				# Now you can select your preview rules, like RUF048

				extend-select = ["RUF048"]

									
										47

.ci/lumen_cli/tests/test_app.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,47 @@

				# tests/test_cli.py

				import io

				import sys

				import unittest

				from contextlib import redirect_stderr, redirect_stdout

				from unittest.mock import patch

				from cli.run import main

				class TestArgparseCLI(unittest.TestCase):
				    """Drives ``cli.run.main`` through ``sys.argv`` and checks its dispatch and help."""

				    @patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None)
				    @patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None)
				    def test_cli_run_build_external(self, mock_init, mock_run):
				        from cli.run import main  # import after patches if needed

				        argv = ["cli.run", "build", "external", "vllm"]
				        with patch.object(sys, "argv", argv):
				            # argparse reports errors through SystemExit; swallow it so the
				            # mock assertions below decide the outcome
				            try:
				                main()
				            except SystemExit:
				                pass

				        # The runner was constructed once and run() was invoked without arguments
				        mock_init.assert_called_once()
				        mock_run.assert_called_once_with()

				    def test_build_help(self):
				        argv = ["cli.run", "build", "--help"]
				        with patch.object(sys, "argv", argv):
				            captured_out = io.StringIO()
				            captured_err = io.StringIO()

				            # --help always raises SystemExit(0)
				            with self.assertRaises(SystemExit) as exit_ctx:
				                with redirect_stdout(captured_out), redirect_stderr(captured_err):
				                    main()
				            self.assertEqual(exit_ctx.exception.code, 0)

				            help_text = captured_out.getvalue()
				            self.assertIn("usage", help_text)
				            self.assertIn("external", help_text)

				if __name__ == "__main__":

				    unittest.main()

									
										115

.ci/lumen_cli/tests/test_cli_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,115 @@

				import argparse

				import io

				import unittest

				from contextlib import redirect_stderr

				from unittest.mock import patch

				from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec

				# ---- Dummy runners for unittests----

				class FooRunner(BaseRunner):
				    """Foo description from docstring."""

				    # The class docstring above is asserted verbatim against the generated
				    # help text in TestRegisterTargets, so it must stay unchanged.
				    def run(self) -> None:  # replaced by mock
				        pass

				class BarRunner(BaseRunner):
				    # Dummy runner without a class docstring; registered as the second
				    # target in the metavar test.
				    def run(self) -> None:  # replaced by mock
				        pass

				def add_foo_args(p: argparse.ArgumentParser) -> None:
				    """Register the required integer option ``--x`` on ``p``."""
				    p.add_argument(
				        "--x",
				        type=int,
				        required=True,
				        help="x value",
				    )

				def common_args(p: argparse.ArgumentParser) -> None:
				    """Register the boolean ``--verbose`` switch on ``p``."""
				    p.add_argument(
				        "--verbose",
				        action="store_true",
				        help="verbose flag",
				    )

				def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser:
				    """
				    Create an ``app`` parser and register ``specs`` on it through
				    ``register_targets``, sharing the module's ``common_args``.
				    """
				    app_parser = argparse.ArgumentParser(prog="app", formatter_class=RichHelp)
				    register_targets(parser=app_parser, target_specs=specs, common_args=common_args)
				    return app_parser

				def get_subparser(
				    parser: argparse.ArgumentParser, name: str
				) -> argparse.ArgumentParser:
				    """
				    Return the sub-parser registered as ``name`` on ``parser``.

				    Uses the first ``_SubParsersAction`` found among the parser's private
				    ``_subparsers`` group actions.
				    """
				    group_actions = parser._subparsers._group_actions  # type: ignore[attr-defined]
				    subparser_actions = (
				        action
				        for action in group_actions
				        if isinstance(action, argparse._SubParsersAction)
				    )
				    first_action = next(subparser_actions)
				    return first_action.choices[name]

				class TestRegisterTargets(unittest.TestCase):
				    """Checks the parsers that ``register_targets`` builds from target specs."""

				    def test_metavar_lists_targets(self):
				        target_specs: dict[str, TargetSpec] = {
				            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
				            "bar": {"runner": BarRunner},
				        }
				        app_parser = build_parser(target_specs)
				        group_actions = app_parser._subparsers._group_actions  # type: ignore[attr-defined]
				        targets_action = next(
				            action
				            for action in group_actions
				            if isinstance(action, argparse._SubParsersAction)
				        )
				        self.assertEqual(targets_action.metavar, "{foo,bar}")

				    def test_add_arguments_and_common_args_present(self):
				        target_specs: dict[str, TargetSpec] = {
				            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
				        }
				        foo_parser = get_subparser(build_parser(target_specs), "foo")
				        rendered_help = foo_parser.format_help()
				        # Both the target-specific and the shared option must be listed
				        self.assertIn("--x", rendered_help)
				        self.assertIn("--verbose", rendered_help)

				    def test_runner_constructed_with_ns_and_run_called(self):
				        target_specs: dict[str, TargetSpec] = {
				            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
				        }
				        app_parser = build_parser(target_specs)

				        with (
				            patch.object(FooRunner, "__init__", return_value=None) as mock_init,
				            patch.object(FooRunner, "run", return_value=None) as mock_run,
				        ):
				            parsed = app_parser.parse_args(["foo", "--x", "3", "--verbose"])
				            parsed.func(parsed)  # set by register_targets

				            # __init__ received the Namespace as its only positional argument
				            self.assertEqual(mock_init.call_count, 1)
				            (received_ns,), _ = mock_init.call_args
				            self.assertIsInstance(received_ns, argparse.Namespace)

				            # run() called with no args
				            mock_run.assert_called_once_with()

				    def test_runner_docstring_used_as_description_when_missing(self):
				        target_specs: dict[str, TargetSpec] = {
				            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
				        }
				        foo_parser = get_subparser(build_parser(target_specs), "foo")
				        rendered_help = foo_parser.format_help()
				        self.assertIn("Foo description from docstring.", rendered_help)

				    def test_missing_target_raises_systemexit_with_usage(self):
				        target_specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}}
				        app_parser = build_parser(target_specs)
				        stderr_buf = io.StringIO()
				        with self.assertRaises(SystemExit), redirect_stderr(stderr_buf):
				            app_parser.parse_args([])
				        self.assertIn("usage:", stderr_buf.getvalue())

				if __name__ == "__main__":

				    unittest.main()

									
										75

.ci/lumen_cli/tests/test_docker_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,75 @@

				import unittest

				from unittest import mock

				from unittest.mock import MagicMock

				import docker.errors as derr

				from cli.lib.common.docker_helper import _get_client, local_image_exists

				class TestDockerImageHelpers(unittest.TestCase):
				    """Tests for ``local_image_exists`` and the ``_get_client`` client cache."""

				    def setUp(self):
				        # Replace the module-level cached client with None for each test
				        singleton_patch = mock.patch("cli.lib.common.docker_helper._docker_client", None)
				        self.addCleanup(singleton_patch.stop)
				        singleton_patch.start()

				    def test_local_image_exists_true(self):
				        # images.get returning any object (no exception) means the image exists
				        fake_client = MagicMock()
				        fake_client.images.get.return_value = object()
				        self.assertTrue(local_image_exists("repo:tag", client=fake_client))

				    def test_local_image_exists_not_found_false(self):
				        fake_client = MagicMock()
				        # docker.errors.NotFound is reported as "image absent"
				        fake_client.images.get.side_effect = derr.NotFound("nope")
				        self.assertFalse(local_image_exists("missing:latest", client=fake_client))

				    def test_local_image_exists_api_error_false(self):
				        fake_client = MagicMock()
				        fake_client.images.get.side_effect = derr.APIError("boom", None)
				        self.assertFalse(local_image_exists("broken:tag", client=fake_client))

				    def test_local_image_exists_uses_lazy_singleton(self):
				        # Patch docker.from_env used by _get_client()
				        with mock.patch(
				            "cli.lib.common.docker_helper.docker.from_env"
				        ) as mock_from_env:
				            fake_docker_client = MagicMock()
				            mock_from_env.return_value = fake_docker_client

				            # First call creates the client through from_env
				            first = _get_client()
				            self.assertIs(first, fake_docker_client)
				            mock_from_env.assert_called_once()

				            # Second call returns the same client without another from_env call
				            second = _get_client()
				            self.assertIs(second, fake_docker_client)
				            mock_from_env.assert_called_once()  # still once

				    def test_local_image_exists_without_client_param_calls_get_client_once(self):
				        # Without an explicit client, local_image_exists obtains one via _get_client
				        with mock.patch("cli.lib.common.docker_helper._get_client") as mock_get_client:
				            fake_client = MagicMock()
				            mock_get_client.return_value = fake_client

				            for image_ref in ("repo:tag", "repo:tag2"):
				                local_image_exists(image_ref)

				            # _get_client is consulted on every call; here it is mocked, so
				            # each lookup is counted
				            self.assertEqual(mock_get_client.call_count, 2)
				            self.assertEqual(fake_client.images.get.call_count, 2)
				            fake_client.images.get.assert_any_call("repo:tag")
				            fake_client.images.get.assert_any_call("repo:tag2")

				if __name__ == "__main__":

				    unittest.main()

									
										149

.ci/lumen_cli/tests/test_envs_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,149 @@

				import os

				import unittest

				from dataclasses import dataclass

				from pathlib import Path

				from unittest.mock import patch

				import cli.lib.common.envs_helper as m

				class TestEnvHelpers(unittest.TestCase):
				    """Tests for the environment-variable helpers in ``cli.lib.common.envs_helper``."""

				    def setUp(self):
				        # Snapshot the environment so tearDown can put it back
				        self._env_backup = dict(os.environ)

				    def tearDown(self):
				        # Drop whatever the test left behind and restore the snapshot
				        os.environ.clear()
				        os.environ.update(self._env_backup)

				    # -------- get_env --------
				    def test_get_env_unset_returns_default(self):
				        with patch.dict(os.environ, {}, clear=True):
				            value = m.get_env("FOO", "default")
				            self.assertEqual(value, "default")

				    def test_get_env_empty_returns_default(self):
				        with patch.dict(os.environ, {"FOO": ""}, clear=True):
				            value = m.get_env("FOO", "default")
				            self.assertEqual(value, "default")

				    def test_get_env_set_returns_value(self):
				        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
				            value = m.get_env("FOO", "default")
				            self.assertEqual(value, "bar")

				    def test_get_env_not_exist_returns_default(self):
				        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
				            value = m.get_env("TEST_NOT_EXIST", "default")
				            self.assertEqual(value, "default")

				    def test_get_env_not_exist_without_default(self):
				        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
				            value = m.get_env("TEST_NOT_EXIST")
				            self.assertEqual(value, "")

				    # -------- env_bool --------
				    def test_env_bool_uses_default_when_unset(self):
				        with patch.dict(os.environ, {}, clear=True):
				            self.assertTrue(m.env_bool("FLAG", default=True))
				            self.assertFalse(m.env_bool("FLAG", default=False))

				    def test_env_bool_uses_str2bool_when_set(self):
				        # Swap in a local str2bool so the test does not rely on the real parser
				        def fake_str2bool(s: str) -> bool:
				            return s.lower() in {"1", "true", "yes", "on", "y"}

				        with (
				            patch.dict(os.environ, {"FLAG": "yEs"}, clear=True),
				            patch.object(m, "str2bool", fake_str2bool),
				        ):
				            self.assertTrue(m.env_bool("FLAG", default=False))

				    # -------- env_path_optional / env_path --------
				    def test_env_path_optional_unset_returns_none_by_default(self):
				        with patch.dict(os.environ, {}, clear=True):
				            self.assertIsNone(m.env_path_optional("P"))

				    def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self):
				        with patch.dict(os.environ, {"P": ""}, clear=True):
				            self.assertIsNone(m.env_path_optional("P"))

				    def test_env_path_optional_unset_returns_default_str(self):
				        # String default with the default resolve setting -> absolute path expected
				        default_str = "x/y"
				        with patch.dict(os.environ, {}, clear=True):
				            result = m.env_path_optional("P", default=default_str)
				            self.assertIsInstance(result, Path)
				            self.assertIsNotNone(result)
				            if result:
				                self.assertTrue(result.is_absolute())
				                self.assertEqual(result.parts[-2:], ("x", "y"))

				    def test_env_path_optional_unset_returns_default_path_no_resolve(self):
				        default_path = Path("z")
				        with patch.dict(os.environ, {}, clear=True):
				            result = m.env_path_optional("P", default=default_path, resolve=False)
				            self.assertEqual(result, default_path)

				    def test_env_path_optional_respects_resolve_true(self):
				        with patch.dict(os.environ, {"P": "a/b"}, clear=True):
				            result = m.env_path_optional("P", resolve=True)
				            self.assertIsInstance(result, Path)
				            if result:
				                self.assertTrue(result.is_absolute())

				    def test_env_path_optional_respects_resolve_false(self):
				        with patch.dict(os.environ, {"P": "rel/dir"}, clear=True):
				            result = m.env_path_optional("P", resolve=False)
				            self.assertEqual(result, Path("rel/dir"))
				            if result:
				                self.assertFalse(result.is_absolute())

				    def test_env_path_raises_when_missing_and_default_none(self):
				        with patch.dict(os.environ, {}, clear=True):
				            with self.assertRaises(ValueError):
				                m.env_path("P", None, resolve=True)

				    def test_env_path_returns_path_when_present(self):
				        expected = Path("./b").resolve()
				        with patch.dict(os.environ, {"P": str(expected)}, clear=True):
				            result = m.env_path("P", None, resolve=True)
				            self.assertEqual(result, expected)

				    # -------- dataclass field helpers --------
				    def test_dataclass_fields_read_env_at_instantiation(self):
				        @dataclass
				        class Cfg:
				            flag: bool = m.env_bool_field("FLAG", default=False)
				            out: Path = m.env_path_field("OUT", default="ab", resolve=True)
				            name: str = m.env_str_field("NAME", default="anon")

				        # First instance: every field comes from the patched environment
				        first_env = {"FLAG": "true", "OUT": "outdir", "NAME": "alice"}
				        with patch.dict(os.environ, first_env, clear=True):
				            cfg1 = Cfg()
				            self.assertTrue(cfg1.flag)
				            self.assertIsInstance(cfg1.out, Path)
				            self.assertTrue(cfg1.out.is_absolute())
				            self.assertEqual(cfg1.name, "alice")
				            cfg1.name = "bob"  # change instance value
				            self.assertEqual(cfg1.name, "bob")  # change is reflected

				        # Second instance under a different environment picks up the new values
				        with patch.dict(os.environ, {"FLAG": "false", "NAME": ""}, clear=True):
				            cfg2 = Cfg()
				            self.assertFalse(cfg2.flag)  # str2bool("false") -> False
				            self.assertIn("ab", str(cfg2.out))
				            self.assertIsInstance(cfg2.out, Path)
				            self.assertTrue(cfg2.out.is_absolute())
				            self.assertEqual(cfg2.name, "anon")  # empty -> fallback to default

				    def test_dataclass_path_field_with_default_value(self):
				        @dataclass
				        class C2:
				            out: Path = m.env_path_field("OUT", default="some/dir", resolve=False)

				        with patch.dict(os.environ, {}, clear=True):
				            instance = C2()
				            self.assertEqual(instance.out, Path("some/dir"))

				if __name__ == "__main__":

				    unittest.main()

									
										122

.ci/lumen_cli/tests/test_path_helper.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,122 @@

				# test_path_utils.py

				# Run: pytest -q

				import os

				import unittest

				from pathlib import Path

				from tempfile import TemporaryDirectory

				from cli.lib.common.path_helper import (

				    copy,

				    ensure_dir_exists,

				    force_create_dir,

				    get_path,

				    is_path_exist,

				    remove_dir,

				)

				class TestPathHelper(unittest.TestCase):
				    """Tests for the filesystem helpers in ``cli.lib.common.path_helper``.

				    Every test works inside a fresh temporary directory that is removed in
				    ``tearDown``.
				    """

				    def setUp(self):
				        self.tmpdir = TemporaryDirectory()
				        self.tmp_path = Path(self.tmpdir.name)

				    def tearDown(self):
				        self.tmpdir.cleanup()

				    # -------- get_path --------
				    def test_get_path_returns_path_for_str(self):
				        # Use relative path to avoid absolute-ness
				        rel_str = "sub/f.txt"
				        # Remember where we were so the process does not stay inside the
				        # temporary directory that tearDown deletes; a dangling cwd would
				        # leak into every test that runs afterwards.
				        try:
				            restore_to = os.getcwd()
				        except FileNotFoundError:
				            # cwd was already removed by someone else: fall back to the
				            # parent of our temp dir, which outlives this test
				            restore_to = str(self.tmp_path.parent)
				        os.chdir(self.tmp_path)
				        try:
				            p = get_path(rel_str, resolve=False)
				        finally:
				            os.chdir(restore_to)
				        self.assertIsInstance(p, Path)
				        self.assertFalse(p.is_absolute())
				        self.assertEqual(str(p), rel_str)

				    def test_get_path_resolves(self):
				        rel_str = "sub/f.txt"
				        p = get_path(str(self.tmp_path / rel_str), resolve=True)
				        self.assertTrue(p.is_absolute())
				        self.assertTrue(str(p).endswith(rel_str))

				    def test_get_path_with_path_input(self):
				        p_in = self.tmp_path / "sub/f.txt"
				        p_out = get_path(p_in, resolve=False)
				        self.assertTrue(str(p_out) == str(p_in))

				    def test_get_path_with_none_raises(self):
				        with self.assertRaises(ValueError):
				            get_path(None)  # type: ignore[arg-type]

				    def test_get_path_invalid_type_raises(self):
				        with self.assertRaises(TypeError):
				            get_path(123)  # type: ignore[arg-type]

				    # -------- ensure_dir_exists / force_create_dir / remove_dir --------
				    def test_ensure_dir_exists_creates_and_is_idempotent(self):
				        d = self.tmp_path / "made"
				        ensure_dir_exists(d)
				        self.assertTrue(d.exists() and d.is_dir())
				        # Second call on an existing directory must not raise
				        ensure_dir_exists(d)

				    def test_force_create_dir_clears_existing(self):
				        d = self.tmp_path / "fresh"
				        (d / "inner").mkdir(parents=True)
				        (d / "inner" / "f.txt").write_text("x")
				        force_create_dir(d)
				        self.assertTrue(d.exists())
				        self.assertEqual(list(d.iterdir()), [])

				    def test_remove_dir_none_is_noop(self):
				        remove_dir(None)  # type: ignore[arg-type]

				    def test_remove_dir_nonexistent_is_noop(self):
				        ghost = self.tmp_path / "ghost"
				        remove_dir(ghost)

				    def test_remove_dir_accepts_str(self):
				        d = self.tmp_path / "to_rm"
				        d.mkdir()
				        remove_dir(str(d))
				        self.assertFalse(d.exists())

				    # -------- copy --------
				    def test_copy_file_to_file(self):
				        src = self.tmp_path / "src.txt"
				        dst = self.tmp_path / "out" / "dst.txt"
				        src.write_text("hello")
				        copy(src, dst)
				        self.assertEqual(dst.read_text(), "hello")

				    def test_copy_dir_to_new_dir(self):
				        src = self.tmp_path / "srcdir"
				        (src / "a").mkdir(parents=True)
				        (src / "a" / "f.txt").write_text("content")
				        dst = self.tmp_path / "destdir"
				        copy(src, dst)
				        self.assertEqual((dst / "a" / "f.txt").read_text(), "content")

				    def test_copy_dir_into_existing_dir_overwrite_true_merges(self):
				        src = self.tmp_path / "srcdir"
				        dst = self.tmp_path / "destdir"
				        (src / "x").mkdir(parents=True)
				        (src / "x" / "new.txt").write_text("new")
				        dst.mkdir()
				        (dst / "existing.txt").write_text("old")
				        copy(src, dst)
				        # Pre-existing destination content survives next to the copied tree
				        self.assertEqual((dst / "existing.txt").read_text(), "old")
				        self.assertEqual((dst / "x" / "new.txt").read_text(), "new")

				    def test_is_str_path_exist(self):
				        p = self.tmp_path / "x.txt"
				        p.write_text("1")
				        self.assertTrue(is_path_exist(str(p)))
				        self.assertTrue(is_path_exist(p))
				        self.assertFalse(is_path_exist(str(self.tmp_path / "missing")))
				        self.assertFalse(is_path_exist(self.tmp_path / "missing"))
				        self.assertFalse(is_path_exist(""))

				if __name__ == "__main__":

				    unittest.main()

									
										185

.ci/lumen_cli/tests/test_run_plan.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,185 @@

				# tests/test_run_test_plan.py

				import importlib

				from contextlib import nullcontext

				from types import SimpleNamespace

				from unittest.mock import MagicMock

				import pytest

				MOD = "cli.lib.core.vllm.lib"

				# We import inside tests so the MOD override above applies everywhere

				run_test_plan_import_path = f"{MOD}.run_test_plan"

				def _get_cmd(c):

				    # Support both kwargs and positional args

				    return c.kwargs.get("cmd", c.args[0] if c.args else None)

				def _get_check(c):

				    if "check" in c.kwargs:

				        return c.kwargs["check"]

				    # If positional, assume second arg is 'check' when present; default False

				    return c.args[1] if len(c.args) > 1 else False

				@pytest.fixture
				def patch_module(monkeypatch):
				    """
				    Replace 'pip_install_packages', 'run_command', 'working_directory',
				    'temp_environ' and 'logger' on the module named by MOD, and return a
				    namespace exposing the module, its run_test_plan and the fakes.
				    """
				    module = importlib.import_module(MOD)

				    # Plain mocks for the install and command helpers
				    pip_install_packages = MagicMock(name="pip_install_packages")
				    run_command = MagicMock(name="run_command", return_value=0)

				    # The two context-manager helpers record their argument and do nothing else
				    temp_calls: list[dict] = []
				    workdir_calls: list[str] = []

				    def fake_working_directory(path: str):
				        workdir_calls.append(path)
				        return nullcontext()

				    def fake_temp_env(map: dict[str, str]):
				        temp_calls.append(map)
				        return nullcontext()

				    logger = SimpleNamespace(
				        info=MagicMock(name="logger.info"),
				        error=MagicMock(name="logger.error"),
				    )

				    # raising=True makes setattr fail if the module lacks the attribute
				    replacements = {
				        "pip_install_packages": pip_install_packages,
				        "run_command": run_command,
				        "working_directory": fake_working_directory,
				        "temp_environ": fake_temp_env,
				        "logger": logger,
				    }
				    for attr_name, fake in replacements.items():
				        monkeypatch.setattr(module, attr_name, fake, raising=True)

				    return SimpleNamespace(
				        module=module,
				        run_test_plan=module.run_test_plan,  # expose to avoid getattr("constant") (Ruff B009)
				        pip_install_packages=pip_install_packages,
				        run_command=run_command,
				        temp_calls=temp_calls,
				        workdir_calls=workdir_calls,
				        logger=logger,
				    )

				def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module):
				    # Both steps run in order, unchecked, inside the plan's workdir and env
				    run_test_plan = patch_module.run_test_plan

				    expected_cmds = [
				        "export A=x && pytest -q",
				        "export B=y && pytest -q tests/unit",
				    ]
				    basic_plan = {
				        "title": "Basic suite",
				        "package_install": [],
				        "working_directory": "tests",
				        "env_vars": {"GLOBAL_FLAG": "1"},
				        "steps": list(expected_cmds),
				    }
				    tests_map = {"basic": basic_plan}

				    # Three zero exit codes are queued; the plan above has two steps
				    patch_module.run_command.side_effect = [0, 0, 0]

				    run_test_plan("basic", "cpu", tests_map)

				    recorded = patch_module.run_command.call_args_list
				    assert [_get_cmd(c) for c in recorded] == expected_cmds
				    for recorded_call in recorded:
				        assert _get_check(recorded_call) is False

				    assert patch_module.workdir_calls == ["tests"]
				    assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}]

				def test_installs_packages_when_present(monkeypatch, patch_module):

				    run_test_plan = patch_module.module.run_test_plan

				    tests_map = {

				        "with_pkgs": {

				            "title": "Needs deps",

				            "package_install": ["timm==1.0.0", "flash-attn"],

				            "steps": ["pytest -q"],

				        }

				    }

				    patch_module.run_command.return_value = 0

				    run_test_plan("with_pkgs", "gpu", tests_map)

				    patch_module.pip_install_packages.assert_called_once_with(

				        packages=["timm==1.0.0", "flash-attn"],

				        prefer_uv=True,

				    )

				def test_raises_on_missing_plan(patch_module):
				    # An unknown plan name must raise RuntimeError naming the missing test
				    run_test_plan = patch_module.module.run_test_plan
				    with pytest.raises(RuntimeError) as exc_info:
				        run_test_plan("nope", "cpu", tests_map={})
				    message = str(exc_info.value)
				    assert "test nope not found" in message

				def test_aggregates_failures_and_raises(monkeypatch, patch_module):
				    # Every step runs even after a failure; failures are reported together
				    run_test_plan = patch_module.module.run_test_plan

				    mixed_plan = {
				        "title": "Some pass some fail",
				        "steps": [
				            "pytest test_a.py",  # 0 → pass
				            "pytest test_b.py",  # 1 → fail
				            "pytest test_c.py",  # 2 → fail
				        ],
				    }
				    tests_map = {"mix": mixed_plan}

				    # Exit codes handed out in step order: pass, fail, fail
				    patch_module.run_command.side_effect = [0, 1, 2]

				    with pytest.raises(RuntimeError) as exc_info:
				        run_test_plan("mix", "cpu", tests_map)

				    assert "2 pytest runs failed" in str(exc_info.value)
				    # The failed-tests report goes through logger.error exactly once
				    patch_module.logger.error.assert_called_once()
				    # All three commands were attempted
				    assert patch_module.run_command.call_count == 3

				def test_custom_working_directory_used(patch_module):

				    run_test_plan = patch_module.module.run_test_plan

				    tests_map = {

				        "customwd": {

				            "title": "Custom wd",

				            "working_directory": "examples/ci",

				            "steps": ["pytest -q"],

				        }

				    }

				    patch_module.run_command.return_value = 0

				    run_test_plan("customwd", "cpu", tests_map)

				    assert patch_module.workdir_calls == ["examples/ci"]

									
										143

.ci/lumen_cli/tests/test_utils.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,143 @@

				import os

				import tempfile

				import unittest

				from pathlib import Path

				from cli.lib.common.utils import temp_environ, working_directory  # <-- replace import

				class EnvIsolatedTestCase(unittest.TestCase):
				    """Test base that isolates each test's environment variables and working directory.

				    ``setUp`` snapshots ``os.environ`` and the current directory, then moves the
				    process into a fresh temporary directory. ``tearDown`` undoes all of that.
				    """

				    def setUp(self):
				        """Record env/cwd and switch into a newly created scratch directory."""
				        self._env_backup = dict(os.environ)

				        # The current directory may already have been deleted; in that case
				        # fall back to the system temp dir and remember that instead.
				        try:
				            self._cwd_backup = os.getcwd()
				        except FileNotFoundError:
				            self._cwd_backup = tempfile.gettempdir()
				            os.chdir(self._cwd_backup)

				        # Each test runs inside its own scratch directory.
				        self._temp_dir = tempfile.mkdtemp()
				        os.chdir(self._temp_dir)

				    def tearDown(self):
				        """Leave the scratch directory, delete it, and restore the saved environment."""
				        import shutil

				        # Step out of the scratch directory before removing it.
				        try:
				            os.chdir(self._cwd_backup)
				        except OSError:
				            os.chdir(tempfile.gettempdir())

				        # Removal is best-effort; a failure here must not fail the test.
				        try:
				            shutil.rmtree(self._temp_dir, ignore_errors=True)
				        except Exception:
				            pass

				        # Drop variables the test introduced, then put back every saved value.
				        saved = self._env_backup
				        for added in set(os.environ.keys()) - set(saved.keys()):
				            os.environ.pop(added, None)
				        for name, value in saved.items():
				            os.environ[name] = value

				class TestTempEnviron(EnvIsolatedTestCase):
				    """Checks that ``temp_environ`` applies overrides only for the duration of its block."""

				    def test_sets_and_restores_new_var(self):
				        """A previously unset variable exists inside the block and is gone afterwards."""
				        name = "TEST_TMP_ENV_NEW"
				        self.assertNotIn(name, os.environ)

				        with temp_environ({name: "123"}):
				            self.assertEqual(os.environ[name], "123")

				        # Removed again on exit.
				        self.assertNotIn(name, os.environ)

				    def test_overwrites_and_restores_existing_var(self):
				        """An existing variable is overridden inside the block and restored on exit."""
				        name = "TEST_TMP_ENV_OVERWRITE"
				        os.environ[name] = "orig"

				        with temp_environ({name: "override"}):
				            self.assertEqual(os.environ[name], "override")

				        # Original value is back.
				        self.assertEqual(os.environ[name], "orig")

				    def test_multiple_vars_and_missing_cleanup(self):
				        """With several overrides, new names are removed and old names get their value back."""
				        fresh, existing = "TEST_ENV_V1", "TEST_ENV_V2"
				        os.environ.pop(fresh, None)
				        os.environ[existing] = "keep"

				        with temp_environ({fresh: "a", existing: "b"}):
				            self.assertEqual(os.environ[fresh], "a")
				            self.assertEqual(os.environ[existing], "b")

				        # Newly added -> removed; pre-existing -> restored.
				        self.assertNotIn(fresh, os.environ)
				        self.assertEqual(os.environ[existing], "keep")

				    def test_restores_even_on_exception(self):
				        """The environment is cleaned up even when the block raises."""
				        name = "TEST_TMP_ENV_EXCEPTION"
				        self.assertNotIn(name, os.environ)

				        with self.assertRaises(RuntimeError), temp_environ({name: "x"}):
				            self.assertEqual(os.environ[name], "x")
				            raise RuntimeError("boom")

				        # Removed despite the exception.
				        self.assertNotIn(name, os.environ)

				class TestWorkingDirectory(EnvIsolatedTestCase):
				    """Checks that ``working_directory`` switches the cwd only for the duration of its block."""

				    def test_changes_and_restores(self):
				        """The cwd is the target inside the block and the starting directory afterwards."""
				        before = Path.cwd()
				        with tempfile.TemporaryDirectory() as scratch:
				            destination = Path(scratch) / "wd"
				            destination.mkdir()
				            with working_directory(str(destination)):
				                self.assertEqual(Path.cwd().resolve(), destination.resolve())
				        self.assertEqual(Path.cwd(), before)

				    def test_noop_when_empty_path(self):
				        """An empty path leaves the cwd untouched, both inside and after the block."""
				        before = Path.cwd()
				        with working_directory(""):
				            self.assertEqual(Path.cwd(), before)
				        self.assertEqual(Path.cwd(), before)

				    def test_restores_on_exception(self):
				        """The starting directory is restored even when the block raises."""
				        before = Path.cwd()
				        with tempfile.TemporaryDirectory() as scratch:
				            destination = Path(scratch) / "wd_exc"
				            destination.mkdir()
				            with self.assertRaises(ValueError), working_directory(str(destination)):
				                # Resolve both sides so symlinked temp roots
				                # (e.g. /var -> /private/var) compare equal.
				                self.assertEqual(Path.cwd().resolve(), destination.resolve())
				                raise ValueError("boom")
				        self.assertEqual(Path.cwd().resolve(), before.resolve())

				    def test_raises_for_missing_dir(self):
				        """A non-existent target raises ``FileNotFoundError`` and the cwd stays put."""
				        before = Path.cwd()
				        with tempfile.TemporaryDirectory() as scratch:
				            absent = Path(scratch) / "does_not_exist"
				            # os.chdir should raise before the context manager yields.
				            with self.assertRaises(FileNotFoundError), working_directory(str(absent)):
				                pass
				        self.assertEqual(Path.cwd(), before)

				# Allow running this file directly; verbosity=2 prints one result line per test.
				if __name__ == "__main__":

				    unittest.main(verbosity=2)

									
										176

.ci/lumen_cli/tests/test_vllm.py
									
										Normal file
									
												View File
												
				@ -0,0 +1,176 @@

				import os

				import tempfile

				import unittest

				from pathlib import Path

				from unittest.mock import MagicMock, patch

				import cli.lib.core.vllm.vllm_build as vllm_build

				_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build"

				class TestVllmBuildParameters(unittest.TestCase):
				    """Checks how ``vllm_build.VllmBuildParameters`` resolves and validates its inputs.

				    Each test pins ``os.environ`` with ``patch.dict(..., clear=True)`` and stubs the
				    existence checks imported into the build module, so nothing touches the real
				    file system or Docker.
				    """

				    def _assert_rejected_in_scratch_cwd(self, expected_in_message, **param_overrides):
				        """Construct the parameters from inside an empty temp dir and expect ``ValueError``.

				        The original working directory is restored *before* the temporary
				        directory is removed. Without this the process would be left sitting in
				        a deleted directory, and any later ``os.getcwd()`` call would raise
				        ``FileNotFoundError``.

				        Args:
				            expected_in_message: Substring that must appear in the error text.
				            **param_overrides: Keyword arguments passed to ``VllmBuildParameters``.
				        """
				        try:
				            previous_cwd = os.getcwd()
				        except FileNotFoundError:
				            # The current directory may already be gone; fall back to a safe place.
				            previous_cwd = tempfile.gettempdir()

				        with tempfile.TemporaryDirectory() as td:
				            os.chdir(td)
				            try:
				                with self.assertRaises(ValueError) as cm:
				                    vllm_build.VllmBuildParameters(**param_overrides)
				            finally:
				                os.chdir(previous_cwd)

				        self.assertIn(expected_in_message, str(cm.exception))

				    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
				    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True)
				    @patch(
				        "cli.lib.common.envs_helper.env_path_optional",
				        side_effect=lambda name, default=None, resolve=True: {
				            "DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"),
				            "TORCH_WHEELS_PATH": Path("/abs/dist"),
				            "OUTPUT_DIR": Path("/abs/shared"),
				        }.get(name, Path(default) if default is not None else None),
				    )
				    @patch.dict(
				        os.environ,
				        {
				            "USE_TORCH_WHEEL": "1",
				            "USE_LOCAL_BASE_IMAGE": "1",
				            "USE_LOCAL_DOCKERFILE": "1",
				            "BASE_IMAGE": "my/image:tag",
				            "DOCKERFILE_PATH": "vllm/Dockerfile",
				            "TORCH_WHEELS_PATH": "dist",
				            "OUTPUT_DIR": "shared",
				        },
				        clear=True,
				    )
				    def test_params_success_normalizes_and_validates(
				        self, mock_env_path, mock_is_path, mock_local_img
				    ):
				        """With every input present, the paths come back as the stubbed absolute paths."""
				        params = vllm_build.VllmBuildParameters()

				        self.assertEqual(params.torch_whls_path, Path("/abs/dist"))
				        self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile"))
				        self.assertEqual(params.output_dir, Path("/abs/shared"))
				        self.assertEqual(params.base_image, "my/image:tag")

				    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
				    @patch.dict(
				        os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True
				    )
				    def test_params_missing_torch_whls_raises(self, _is_path):
				        """Requesting torch wheels whose path does not exist is rejected."""
				        self._assert_rejected_in_scratch_cwd(
				            "TORCH_WHEELS_PATH",
				            use_local_base_image=False,
				            use_local_dockerfile=False,
				        )

				    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False)
				    @patch.dict(
				        os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True
				    )
				    def test_params_missing_local_base_image_raises(self, _local_img):
				        """Requesting a local base image that is not present is rejected."""
				        self._assert_rejected_in_scratch_cwd(
				            "BASE_IMAGE",
				            use_torch_whl=False,
				            use_local_dockerfile=False,
				        )

				    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
				    @patch.dict(
				        os.environ,
				        {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"},
				        clear=True,
				    )
				    def test_params_missing_dockerfile_raises(self, _is_path):
				        """Requesting a local Dockerfile whose path does not exist is rejected."""
				        self._assert_rejected_in_scratch_cwd(
				            "DOCKERFILE_PATH",
				            use_torch_whl=False,
				            use_local_base_image=False,
				        )

				    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
				    @patch.dict(
				        os.environ,
				        {"OUTPUT_DIR": ""},
				        clear=True,
				    )
				    def test_params_missing_output_dir(self, _is_path):
				        """An empty ``OUTPUT_DIR`` makes construction fail with ``FileNotFoundError``."""
				        with self.assertRaises(FileNotFoundError):
				            vllm_build.VllmBuildParameters()

				class TestBuildCmdAndRun(unittest.TestCase):
				    """Covers docker command generation and the top-level ``VllmBuildRunner.run`` flow."""

				    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
				    def test_generate_docker_build_cmd_includes_bits(self, _exists):
				        """The generated command carries every expected flag and build-arg."""
				        runner = vllm_build.VllmBuildRunner()

				        inputs = MagicMock()
				        inputs.output_dir = Path("/abs/out")
				        inputs.use_local_base_image = True
				        inputs.base_image = "img:tag"
				        inputs.torch_whls_path = Path("./vllm/tmp")
				        inputs.max_jobs = 64
				        inputs.cuda_version = "12.8.1"
				        inputs.python_version = "3.12"
				        inputs.sccache_bucket = "my-bucket"
				        inputs.sccache_region = "us-west-2"
				        inputs.torch_cuda_arch_list = "8.0;9.0"
				        inputs.target_stage = "export-wheels"
				        inputs.tag_name = "vllm-wheels"

				        cmd = runner._generate_docker_build_cmd(inputs)
				        # Collapse all whitespace so the checks ignore how the command is wrapped.
				        squashed = " ".join(cmd.split())

				        expected_fragments = (
				            "--output type=local,dest=/abs/out",
				            "-f docker/Dockerfile.nightly_torch",
				            "--pull=false",
				            "--build-arg TORCH_WHEELS_PATH=tmp",
				            "--build-arg BUILD_BASE_IMAGE=img:tag",
				            "--build-arg FINAL_BASE_IMAGE=img:tag",
				            "--build-arg max_jobs=64",
				            "--build-arg CUDA_VERSION=12.8.1",
				            "--build-arg PYTHON_VERSION=3.12",
				            "--build-arg USE_SCCACHE=1",
				            "--build-arg SCCACHE_BUCKET_NAME=my-bucket",
				            "--build-arg SCCACHE_REGION_NAME=us-west-2",
				            "--build-arg torch_cuda_arch_list='8.0;9.0'",
				            "--target export-wheels",
				            "-t vllm-wheels",
				        )
				        for fragment in expected_fragments:
				            self.assertIn(fragment, squashed)

				    @patch(f"{_VLLM_BUILD_MODULE}.run_command")
				    @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists")
				    @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm")
				    @patch.object(
				        vllm_build.VllmBuildRunner,
				        "_generate_docker_build_cmd",
				        return_value="docker buildx ...",
				    )
				    @patch.dict(
				        os.environ,
				        {
				            "USE_TORCH_WHEEL": "0",
				            "USE_LOCAL_BASE_IMAGE": "0",
				            "USE_LOCAL_DOCKERFILE": "0",
				            "OUTPUT_DIR": "shared",
				        },
				        clear=True,
				    )
				    def test_run_calls_clone_prepare_and_build(
				        self, mock_gen, mock_clone, mock_ensure, mock_run
				    ):
				        """``run`` clones, ensures the output dir, builds the command and executes it once."""
				        params = MagicMock()
				        params.output_dir = Path("shared")
				        params.use_local_dockerfile = False
				        params.use_torch_whl = False

				        # Hand the runner our stub parameters instead of reading the environment.
				        with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params):
				            runner = vllm_build.VllmBuildRunner()
				            runner.run()

				        mock_clone.assert_called_once()
				        mock_ensure.assert_called_once_with(Path("shared"))
				        mock_gen.assert_called_once_with(params)
				        mock_run.assert_called_once()

				        _, kwargs = mock_run.call_args
				        # Use a TestCase assertion rather than a bare ``assert`` so the check
				        # still runs under ``python -O`` and reports both values on failure.
				        self.assertEqual(kwargs.get("cwd"), "vllm")

									
										16

.ci/magma-rocm/Makefile
									
												View File
												
				@ -1,11 +1,11 @@

				SHELL=/usr/bin/env bash

				DOCKER_CMD ?= docker

				DESIRED_ROCM ?= 6.4

				DESIRED_ROCM ?= 7.0

				DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))

				PACKAGE_NAME = magma-rocm

				# inherit this from underlying docker image, do not pass this env var to docker

				#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201

				#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201

				DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \

					-v $(shell git rev-parse --show-toplevel)/.ci:/builder \

				@ -16,20 +16,20 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \

					magma-rocm/build_magma.sh

				.PHONY: all

				all: magma-rocm70

				all: magma-rocm64

				all: magma-rocm63

				.PHONY:

				clean:

					$(RM) -r magma-*

					$(RM) -r output

				.PHONY: magma-rocm70

				magma-rocm70: DESIRED_ROCM := 7.0

				magma-rocm70:

					$(DOCKER_RUN)

				.PHONY: magma-rocm64

				magma-rocm64: DESIRED_ROCM := 6.4

				magma-rocm64:

					$(DOCKER_RUN)

				.PHONY: magma-rocm63

				magma-rocm63: DESIRED_ROCM := 6.3

				magma-rocm63:

					$(DOCKER_RUN)

									
										6

.ci/magma-rocm/build_magma.sh
									
												View File
												
				@ -6,8 +6,8 @@ set -eou pipefail

				# The script expects DESIRED_CUDA and PACKAGE_NAME to be set

				ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

				# Version 2.7.2 + ROCm related updates

				MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6

				# https://github.com/icl-utk-edu/magma/pull/65

				MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec

				# Folders for the build

				PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata

				@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE

				# Fetch magma sources and verify checksum

				pushd ${PACKAGE_DIR}

				git clone https://bitbucket.org/icl/magma.git

				git clone https://github.com/jeffdaily/magma

				pushd magma

				git checkout ${MAGMA_VERSION}

				popd

									
										7

.ci/magma/Makefile
									
												View File
												
				@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \

					magma/build_magma.sh

				.PHONY: all

				all: magma-cuda130

				all: magma-cuda129

				all: magma-cuda128

				all: magma-cuda126

				@ -25,6 +26,12 @@ clean:

					$(RM) -r magma-*

					$(RM) -r output

				.PHONY: magma-cuda130

				magma-cuda130: DESIRED_CUDA := 13.0

				magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120

				magma-cuda130:

					$(DOCKER_RUN)

				.PHONY: magma-cuda129

				magma-cuda129: DESIRED_CUDA := 12.9

				magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120

									
										2

.ci/magma/build_magma.sh
									
												View File
												
				@ -28,6 +28,7 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}

				patch < ${PACKAGE_FILES}/CMake.patch

				patch < ${PACKAGE_FILES}/cmakelists.patch

				patch -p0 < ${PACKAGE_FILES}/thread_queue.patch

				patch -p1 < ${PACKAGE_FILES}/cuda13.patch

				patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch

				patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch

				# The build.sh script expects to be executed from the sources root folder

				@ -37,6 +38,7 @@ popd

				# Package recipe, license and tarball

				# Folder and package name are backward compatible for the build workflow

				cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh

				cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch

				cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch

				cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch

				cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch

									
										26

.ci/magma/package_files/cuda13.patch
									
										Normal file
									
												View File
												
				@ -0,0 +1,26 @@

				diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp

				index 73fed1b20..e77519bfe 100644

				--- a/interface_cuda/interface.cpp

				+++ b/interface_cuda/interface.cpp

				@@ -438,14 +438,20 @@ magma_print_environment()

				         cudaDeviceProp prop;

				         err = cudaGetDeviceProperties( &prop, dev );

				         check_error( err );

				+        #ifdef MAGMA_HAVE_CUDA

				+#if CUDA_VERSION < 13000

				         printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n",

				                 dev,

				                 prop.name,

				                 prop.clockRate / 1000.,

				+#else

				+        printf( "%% device %d: %s, ??? MHz clock, %.1f MiB memory, capability %d.%d\n",

				+                dev,

				+                prop.name,

				+#endif

				                 prop.totalGlobalMem / (1024.*1024.),

				                 prop.major,

				                 prop.minor );

				-        #ifdef MAGMA_HAVE_CUDA

				         int arch = prop.major*100 + prop.minor*10;

				         if ( arch < MAGMA_CUDA_ARCH_MIN ) {

				             printf("\n"

									
										4

.ci/manywheel/build.sh
									
												View File
												
				@ -5,10 +5,6 @@ set -ex

				SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

				case "${GPU_ARCH_TYPE:-BLANK}" in

				    BLANK)

				        # Legacy behavior for CircleCI

				        bash "${SCRIPTPATH}/build_cuda.sh"

				        ;;

				    cuda)

				        bash "${SCRIPTPATH}/build_cuda.sh"

				        ;;

									
										35

.ci/manywheel/build_common.sh
									
												View File
												
				@ -138,28 +138,11 @@ fi

				echo "Calling setup.py bdist at $(date)"

				if [[ "$USE_SPLIT_BUILD" == "true" ]]; then

				    echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"

				    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \

				    BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \

				time CMAKE_ARGS=${CMAKE_ARGS[@]} \

				    EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \

				    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \

				    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \

				    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR

				    echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"

				    echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"

				    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \

				    BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \

				    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \

				    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \

				    CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR

				    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"

				else

				    time CMAKE_ARGS=${CMAKE_ARGS[@]} \

				        EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \

				        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \

				        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \

				        python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR

				fi

				    python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR

				echo "Finished setup.py bdist at $(date)"

				# Build libtorch packages

				@ -272,10 +255,6 @@ ls /tmp/$WHEELHOUSE_DIR

				mkdir -p "/$WHEELHOUSE_DIR"

				mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/

				if [[ "$USE_SPLIT_BUILD" == "true" ]]; then

				    mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true

				fi

				if [[ -n "$BUILD_PYTHONLESS" ]]; then

				    mkdir -p /$LIBTORCH_HOUSE_DIR

				    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR

				@ -452,16 +431,8 @@ if [[ -z "$BUILD_PYTHONLESS" ]]; then

				  pushd $PYTORCH_ROOT/test

				  # Install the wheel for this Python version

				  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then

				    pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true

				  fi

				  pip uninstall -y "$TORCH_PACKAGE_NAME"

				  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then

				    pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v

				  fi

				  pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v

				  # Print info on the libraries installed in this wheel

									
										107

.ci/manywheel/build_cuda.sh
									
												View File
												
				@ -66,6 +66,9 @@ case ${CUDA_VERSION} in

				            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"

				        fi

				        ;;

				    13.0)

				        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"

				        ;;

				    12.6)

				        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"

				        ;;

				@ -110,13 +113,18 @@ DEPS_SONAME=(

				)

				# CUDA_VERSION 12.6, 12.8, 12.9

				if [[ $CUDA_VERSION == 12* ]]; then

				# CUDA_VERSION 12.*, 13.*

				if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then

				    export USE_STATIC_CUDNN=0

				    # Try parallelizing nvcc as well

				    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"

				    TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"

				    # Compress the fatbin with -compress-mode=size for CUDA 13

				    if [[ $CUDA_VERSION == 13* ]]; then

				        export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size"

				    fi

				    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then

				        echo "Bundling with cudnn and cublas."

				        DEPS_LIST+=(

				            "/usr/local/cuda/lib64/libcudnn_adv.so.9"

				            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"

				@ -126,15 +134,11 @@ if [[ $CUDA_VERSION == 12* ]]; then

				            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"

				            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"

				            "/usr/local/cuda/lib64/libcudnn.so.9"

				            "/usr/local/cuda/lib64/libcublas.so.12"

				            "/usr/local/cuda/lib64/libcublasLt.so.12"

				            "/usr/local/cuda/lib64/libcusparseLt.so.0"

				            "/usr/local/cuda/lib64/libcudart.so.12"

				            "/usr/local/cuda/lib64/libnvrtc.so.12"

				            "/usr/local/cuda/lib64/libnvrtc-builtins.so"

				            "/usr/local/cuda/lib64/libcufile.so.0"

				            "/usr/local/cuda/lib64/libcufile_rdma.so.1"

				            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"

				            "/usr/local/cuda/lib64/libnvshmem_host.so.3"

				            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"

				        )

				        DEPS_SONAME+=(

				@ -146,41 +150,86 @@ if [[ $CUDA_VERSION == 12* ]]; then

				            "libcudnn_engines_precompiled.so.9"

				            "libcudnn_heuristic.so.9"

				            "libcudnn.so.9"

				            "libcublas.so.12"

				            "libcublasLt.so.12"

				            "libcusparseLt.so.0"

				            "libcudart.so.12"

				            "libnvrtc.so.12"

				            "libnvrtc-builtins.so"

				            "libnvshmem_host.so.3"

				            "libcufile.so.0"

				            "libcufile_rdma.so.1"

				            "libcupti.so.12"

				            "libnvperf_host.so"

				        )

				        # Add libnvToolsExt only if CUDA version is not 12.9

				        if [[ $CUDA_VERSION != 12.9* ]]; then

				            DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")

				            DEPS_SONAME+=("libnvToolsExt.so.1")

				        if [[ $CUDA_VERSION == 13* ]]; then

				            DEPS_LIST+=(

				                "/usr/local/cuda/lib64/libcublas.so.13"

				                "/usr/local/cuda/lib64/libcublasLt.so.13"

				                "/usr/local/cuda/lib64/libcudart.so.13"

				                "/usr/local/cuda/lib64/libnvrtc.so.13"

				                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13"

				                "/usr/local/cuda/lib64/libibverbs.so.1"

				                "/usr/local/cuda/lib64/librdmacm.so.1"

				                "/usr/local/cuda/lib64/libmlx5.so.1"

				                "/usr/local/cuda/lib64/libnl-3.so.200"

				                "/usr/local/cuda/lib64/libnl-route-3.so.200")

				            DEPS_SONAME+=(

				                "libcublas.so.13"

				                "libcublasLt.so.13"

				                "libcudart.so.13"

				                "libnvrtc.so.13"

				                "libcupti.so.13"

				                "libibverbs.so.1"

				                "librdmacm.so.1"

				                "libmlx5.so.1"

				                "libnl-3.so.200"

				                "libnl-route-3.so.200")

				            export USE_CUPTI_SO=1

				            export ATEN_STATIC_CUDA=0

				            export USE_CUDA_STATIC_LINK=0

				            export USE_CUFILE=0

				        else

				            DEPS_LIST+=(

				                "/usr/local/cuda/lib64/libcublas.so.12"

				                "/usr/local/cuda/lib64/libcublasLt.so.12"

				                "/usr/local/cuda/lib64/libcudart.so.12"

				                "/usr/local/cuda/lib64/libnvrtc.so.12"

				                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")

				            DEPS_SONAME+=(

				                "libcublas.so.12"

				                "libcublasLt.so.12"

				                "libcudart.so.12"

				                "libnvrtc.so.12"

				                "libcupti.so.12")

				            if [[ $CUDA_VERSION != 12.9* ]]; then

				                DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")

				                DEPS_SONAME+=("libnvToolsExt.so.1")

				            fi

				        fi

				    else

				        echo "Using nvidia libs from pypi."

				        CUDA_RPATHS=(

				            '$ORIGIN/../../nvidia/cublas/lib'

				            '$ORIGIN/../../nvidia/cuda_cupti/lib'

				            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'

				            '$ORIGIN/../../nvidia/cuda_runtime/lib'

				            '$ORIGIN/../../nvidia/cudnn/lib'

				            '$ORIGIN/../../nvidia/cufft/lib'

				            '$ORIGIN/../../nvidia/curand/lib'

				            '$ORIGIN/../../nvidia/cusolver/lib'

				            '$ORIGIN/../../nvidia/cusparse/lib'

				            '$ORIGIN/../../nvidia/cusparselt/lib'

				            '$ORIGIN/../../cusparselt/lib'

				            '$ORIGIN/../../nvidia/nccl/lib'

				            '$ORIGIN/../../nvidia/nvshmem/lib'

				            '$ORIGIN/../../nvidia/nvtx/lib'

				            '$ORIGIN/../../nvidia/cufile/lib'

				            '$ORIGIN/../../nvidia/nccl/lib'

				            '$ORIGIN/../../nvidia/cusparselt/lib'

				        )

				        if [[ $CUDA_VERSION == 13* ]]; then

				            CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib')

				        else

				            CUDA_RPATHS+=(

				                '$ORIGIN/../../nvidia/cublas/lib'

				                '$ORIGIN/../../nvidia/cuda_cupti/lib'

				                '$ORIGIN/../../nvidia/cuda_nvrtc/lib'

				                '$ORIGIN/../../nvidia/cuda_runtime/lib'

				                '$ORIGIN/../../nvidia/cufft/lib'

				                '$ORIGIN/../../nvidia/curand/lib'

				                '$ORIGIN/../../nvidia/cusolver/lib'

				                '$ORIGIN/../../nvidia/cusparse/lib'

				                '$ORIGIN/../../cusparselt/lib'

				                '$ORIGIN/../../nvidia/nvtx/lib'

				                '$ORIGIN/../../nvidia/cufile/lib'

				            )

				        fi

				        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")

				        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'

				        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'

									
										2

.ci/manywheel/build_libtorch.sh
									
												View File
												
				@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then

				    export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr

				fi

				echo "Calling 'python -m pip install .' at $(date)"

				echo "Calling -m pip install . -v --no-build-isolation at $(date)"

				if [[ $LIBTORCH_VARIANT = *"static"* ]]; then

				    STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"

									
										4

.ci/manywheel/build_rocm.sh
									
												View File
												
				@ -107,6 +107,10 @@ if [[ $ROCM_INT -ge 60200 ]]; then

				    ROCM_SO_FILES+=("librocm-core.so")

				fi

				if [[ $ROCM_INT -ge 70000 ]]; then

				    ROCM_SO_FILES+=("librocroller.so")

				fi

				OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`

				if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then

				    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"

									
										1

.ci/manywheel/build_xpu.sh
									
												View File
												
				@ -25,6 +25,7 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh

				export USE_STATIC_MKL=1

				export USE_ONEMKL=1

				export USE_XCCL=1

				export USE_MPI=0

				WHEELHOUSE_DIR="wheelhousexpu"

				LIBTORCH_HOUSE_DIR="libtorch_housexpu"

									
										71

.ci/pytorch/build.sh
									
												View File
												
				@ -50,9 +50,6 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then

				  export ATEN_THREADING=NATIVE

				fi

				# Enable LLVM dependency for TensorExpr testing

				export USE_LLVM=/opt/llvm

				export LLVM_DIR=/opt/llvm/lib/cmake/llvm

				if ! which conda; then

				  # In ROCm CIs, we are doing cross compilation on build machines with

				@ -92,7 +89,28 @@ fi

				if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then

				  export USE_MKLDNN=1

				  export USE_MKLDNN_ACL=1

				  export ACL_ROOT_DIR=/ComputeLibrary

				  export ACL_ROOT_DIR=/acl

				fi

				if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

				  if [[ -f /opt/riscv-cross-env/bin/activate ]]; then

				    # shellcheck disable=SC1091

				    source /opt/riscv-cross-env/bin/activate

				  else

				    echo "Activation file not found"

				    exit 1

				  fi

				  export CMAKE_CROSSCOMPILING=TRUE

				  export CMAKE_SYSTEM_NAME=Linux

				  export CMAKE_SYSTEM_PROCESSOR=riscv64

				  export USE_CUDA=0

				  export USE_MKLDNN=0

				  export SLEEF_TARGET_EXEC_USE_QEMU=ON

				  sudo chown -R jenkins /var/lib/jenkins/workspace /opt

				fi

				if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then

				@ -155,6 +173,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then

				  source /opt/intel/oneapi/mpi/latest/env/vars.sh

				  # Enable XCCL build

				  export USE_XCCL=1

				  export USE_MPI=0

				  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA

				  export USE_KINETO=0

				  export TORCH_XPU_ARCH_LIST=pvc

				@ -176,8 +195,16 @@ fi

				# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of

				# memory to build and will OOM

				if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then

				  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"

				if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then

				  J=2  # default to 2 jobs

				  case "$RUNNER" in

				    linux.12xlarge.memory|linux.24xlarge.memory)

				      J=24

				      ;;

				  esac

				  echo "Building FlashAttention with job limit $J"

				  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}"

				fi

				if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then

				@ -192,7 +219,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then

				  export USE_ASAN=1

				  export REL_WITH_DEB_INFO=1

				  export UBSAN_FLAGS="-fno-sanitize-recover=all"

				  unset USE_LLVM

				fi

				if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then

				@ -207,13 +233,15 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then

				  export BUILD_STATIC_RUNTIME_BENCHMARK=ON

				fi

				if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then

				if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then

				  export CMAKE_BUILD_TYPE=Debug

				elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then

				  export CMAKE_BUILD_TYPE=RelWithAssert

				fi

				# Do not change workspace permissions for ROCm and s390x CI jobs

				# as it can leave workspace with bad permissions for cancelled jobs

				if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then

				if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then

				  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)

				  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")

				  cleanup_workspace() {

				@ -258,31 +286,26 @@ else

				    # XLA test build fails when WERROR=1

				    # set only when building other architectures

				    # or building non-XLA tests.

				    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&

				          "$BUILD_ENVIRONMENT" != *xla* ]]; then

				    if [[ "$BUILD_ENVIRONMENT" != *rocm*  && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then

				      # Install numpy-2.0.2 for builds which are backward compatible with 1.X

				      python -mpip install numpy==2.0.2

				      WERROR=1 python setup.py clean

				      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then

				        python3 tools/packaging/split_wheel.py bdist_wheel

				      else

				        WERROR=1 python setup.py bdist_wheel

				      fi

				      WERROR=1 python -m build --wheel --no-isolation

				    else

				      python setup.py clean

				      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then

				        source .ci/pytorch/install_cache_xla.sh

				      fi

				      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then

				        echo "USE_SPLIT_BUILD cannot be used with xla or rocm"

				        exit 1

				      else

				        python setup.py bdist_wheel

				      fi

				      python -m build --wheel --no-isolation

				    fi

				    pip_install_whl "$(echo dist/*.whl)"

				    if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then

				      # Regression test for https://github.com/pytorch/pytorch/issues/164297

				      # Torch should be importable and that's about it

				      pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd

				    fi

				    if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then

				      install_torchvision

				@ -405,7 +428,7 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];

				  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build

				  python tools/stats/export_test_times.py

				fi

				# don't do this for bazel or s390x as they don't use sccache

				if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

				# don't do this for bazel or s390x or riscv64 as they don't use sccache

				if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

				  print_sccache_stats

				fi

									
										21

.ci/pytorch/check_binary.sh
									
												View File
												
				@ -300,24 +300,3 @@ except RuntimeError as e:

				    exit 1

				  fi

				fi

				###############################################################################

				# Check for C++ ABI compatibility to GCC-11 - GCC 13

				###############################################################################

				if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then

				  pushd /tmp

				  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html

				  # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19

				  # gcc 11 - CUDA 11.8, xpu, rocm

				  # gcc 13 - CUDA 12.6, 12.8 and cpu

				  # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426

				  if [[ "$(uname -m)" == "s390x" ]]; then

				    cxx_abi="19"

				  elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then

				    cxx_abi="18"

				  else

				    cxx_abi="16"

				  fi

				  python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"

				  popd

				fi

									
										31

.ci/pytorch/common_utils.sh
									
												View File
												
				@ -149,6 +149,19 @@ function get_pinned_commit() {

				  cat .github/ci_commit_pins/"${1}".txt

				}

				function detect_cuda_arch() {

				  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then

				    if command -v nvidia-smi; then

				      TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)

				    elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then

				      # There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default

				      # minimum supported value here

				      TORCH_CUDA_ARCH_LIST=8.0

				    fi

				    export TORCH_CUDA_ARCH_LIST

				  fi

				}

				function install_torchaudio() {

				  local commit

				  commit=$(get_pinned_commit audio)

				@ -245,11 +258,19 @@ function install_torchrec_and_fbgemm() {

				      git clone --recursive https://github.com/pytorch/fbgemm

				      pushd fbgemm/fbgemm_gpu

				      git checkout "${fbgemm_commit}" --recurse-submodules

				      python setup.py bdist_wheel \

				        --build-variant=rocm \

				        -DHIP_ROOT_DIR="${ROCM_PATH}" \

				        -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \

				        -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"

				      # until the fbgemm_commit includes the tbb patch

				      patch <<'EOF'

				--- a/FbgemmGpu.cmake

				+++ b/FbgemmGpu.cmake

				@@ -184,5 +184,6 @@ gpu_cpp_library(

				     fbgemm_gpu_tbe_cache

				     fbgemm_gpu_tbe_optimizers

				     fbgemm_gpu_tbe_utils

				+    tbb

				   DESTINATION

				     fbgemm_gpu)

				EOF

				      python setup.py bdist_wheel --build-variant=rocm

				      popd

				      # Save the wheel before cleaning up

									
										2

.ci/pytorch/cpp_doc_push_script.sh
									
												View File
												
				@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \

				# Build the docs

				pushd docs/cpp

				time make VERBOSE=1 html -j

				time make VERBOSE=1 html

				popd

				popd

									
										40

.ci/pytorch/functorch_doc_push_script.sh
									
												View File
											
				@ -1,40 +0,0 @@

				#!/bin/bash

				# This is where the local pytorch install in the docker image is located

				pt_checkout="/var/lib/jenkins/workspace"

				source "$pt_checkout/.ci/pytorch/common_utils.sh"

				echo "functorch_doc_push_script.sh: Invoked with $*"

				set -ex -o pipefail

				version=${DOCS_VERSION:-nightly}

				echo "version: $version"

				# Build functorch docs

				pushd $pt_checkout/functorch/docs

				make html

				popd

				git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages

				pushd functorch_ghpages

				if [ "$version" == "main" ]; then

				  version=nightly

				fi

				git rm -rf "$version" || true

				mv "$pt_checkout/functorch/docs/build/html" "$version"

				git add "$version" || true

				git status

				git config user.email "soumith+bot@pytorch.org"

				git config user.name "pytorchbot"

				# If there aren't changes, don't make a commit; push is no-op

				git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true

				git status

				if [[ "${WITH_PUSH:-}" == true ]]; then

				  git push -u origin gh-pages

				fi

				popd

									
										4

.ci/pytorch/macos-build.sh
									
												View File
												
				@ -36,11 +36,11 @@ fi

				print_cmake_info

				if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then

				  # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls

				  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel

				  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation

				else

				  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests

				  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448

				  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64

				  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64

				fi

				if which sccache > /dev/null; then

				  print_sccache_stats

									
										58

.ci/pytorch/macos-test.sh
									
												View File
												
				@ -55,7 +55,7 @@ test_python_shard() {

				  setup_test_python

				  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS"

				  time python test/run_test.py --verbose --exclude-jit-executor --exclude-distributed-tests --exclude-quantization-tests --shard "$1" "$NUM_TEST_SHARDS"

				  assert_git_not_dirty

				}

				@ -174,10 +174,15 @@ checkout_install_torchbench() {

				    # to install and test other models

				    python install.py --continue_on_fail

				  fi

				  popd

				  pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt

				  # https://github.com/pytorch/pytorch/issues/160689 to remove torchao because

				  # its current version 0.12.0 doesn't work with transformers 4.54.0

				  pip uninstall -y torchao

				  echo "Print all dependencies after TorchBench is installed"

				  python -mpip freeze

				  popd

				}

				torchbench_setup_macos() {

				@ -190,7 +195,7 @@ torchbench_setup_macos() {

				  git checkout "$(cat ../.github/ci_commit_pins/vision.txt)"

				  git submodule update --init --recursive

				  python setup.py clean

				  python setup.py develop

				  python -m pip install -e . -v --no-build-isolation

				  popd

				  pushd torchaudio

				@ -199,7 +204,7 @@ torchbench_setup_macos() {

				  git submodule update --init --recursive

				  python setup.py clean

				  #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp

				  USE_OPENMP=0 python setup.py develop

				  USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation

				  popd

				  checkout_install_torchbench

				@ -251,7 +256,7 @@ test_torchbench_smoketest() {

				  local device=mps

				  local dtypes=(undefined float16 bfloat16 notset)

				  local dtype=${dtypes[$1]}

				  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)

				  local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)

				  for backend in eager inductor; do

				@ -297,6 +302,47 @@ test_torchbench_smoketest() {

				    fi

				  done

				  echo "Pytorch benchmark on mps device completed"

				}

				test_aoti_torchbench_smoketest() {

				  print_cmake_info

				  echo "Launching AOTInductor torchbench setup"

				  pip_benchmark_deps

				  # shellcheck disable=SC2119,SC2120

				  torchbench_setup_macos

				  TEST_REPORTS_DIR=$(pwd)/test/test-reports

				  mkdir -p "$TEST_REPORTS_DIR"

				  local device=mps

				  local dtypes=(undefined float16 bfloat16 notset)

				  local dtype=${dtypes[$1]}

				  local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)

				  echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"

				  local dtype_arg="--${dtype}"

				  if [ "$dtype" == notset ]; then

				      dtype_arg="--float32"

				  fi

				  touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv"

				  for model in "${models[@]}"; do

				    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \

				      --performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \

				      --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true

				    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \

				      --accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \

				      --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true

				  done

				  echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}"

				  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \

				    --performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \

				    --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true

				  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \

				    --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \

				    --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true

				  echo "Pytorch benchmark on mps device completed"

				}

				@ -345,6 +391,8 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then

				  test_timm_perf

				elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then

				  test_torchbench_smoketest "${SHARD_NUMBER}"

				elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then

				  test_aoti_torchbench_smoketest "${SHARD_NUMBER}"

				elif [[ $TEST_CONFIG == *"mps"* ]]; then

				  test_python_mps

				elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then

									
										2

.ci/pytorch/multigpu-test.sh
									
												View File
												
				@ -26,6 +26,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then

				    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo

				    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl

				    time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering

				    time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering

				    time python test/run_test.py --verbose -i distributed/test_store

				    time python test/run_test.py --verbose -i distributed/test_symmetric_memory

				    time python test/run_test.py --verbose -i distributed/test_pg_wrapper

				@ -45,6 +46,7 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then

				    # DTensor tests

				    time python test/run_test.py --verbose -i distributed/tensor/test_random_ops

				    time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile

				    time python test/run_test.py --verbose -i distributed/tensor/test_utils.py

				    # DeviceMesh test

				    time python test/run_test.py --verbose -i distributed/test_device_mesh

									
										25

.ci/pytorch/numba-cuda-13.patch
									
										Normal file
									
												View File
												
				@ -0,0 +1,25 @@

				From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001

				From: Michael Wang <13521008+isVoid@users.noreply.github.com>

				Date: Tue, 1 Apr 2025 17:28:05 -0700

				Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage

				 (#185)

				Co-authored-by: isVoid <isVoid@users.noreply.github.com>

				---

				 numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++

				 1 file changed, 3 insertions(+)

				diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py

				index 1641bf77..233e9ed7 100644

				--- a/numba_cuda/numba/cuda/cudadrv/driver.py

				+++ b/numba_cuda/numba/cuda/cudadrv/driver.py

				@@ -365,6 +365,9 @@ def _find_api(self, fname):

				         else:

				             variants = ('_v2', '')

				+        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):

				+            return getattr(self.lib, fname)

				+

				         for variant in variants:

				             try:

				                 return getattr(self.lib, f'{fname}{variant}')

Compare commits

3390 Commits codex-test ... ciflow/tru

15 .bc-linter.yml Normal file Unescape Escape View File

34 .ci/aarch64_linux/aarch64_ci_build.sh Unescape Escape View File

305 .ci/aarch64_linux/aarch64_wheel_ci_build.py Unescape Escape View File

80 .ci/aarch64_linux/build_aarch64_wheel.py Unescape Escape View File

4 .ci/docker/README.md Unescape Escape View File

9 .ci/docker/almalinux/Dockerfile Unescape Escape View File

6 .ci/docker/almalinux/build.sh Unescape Escape View File

137 .ci/docker/build.sh Unescape Escape View File

6 .ci/docker/centos-rocm/Dockerfile Unescape Escape View File

2 .ci/docker/ci_commit_pins/executorch.txt Unescape Escape View File

2 .ci/docker/ci_commit_pins/huggingface-requirements.txt Normal file Unescape Escape View File

1 .ci/docker/ci_commit_pins/huggingface.txt Unescape Escape View File

2 .ci/docker/ci_commit_pins/nccl-cu12.txt Unescape Escape View File

1 .ci/docker/ci_commit_pins/nccl-cu13.txt Normal file Unescape Escape View File

1 .ci/docker/ci_commit_pins/rocm-composable-kernel.txt Normal file Unescape Escape View File

2 .ci/docker/ci_commit_pins/torchbench.txt Unescape Escape View File

2 .ci/docker/ci_commit_pins/triton-xpu.txt Unescape Escape View File

2 .ci/docker/ci_commit_pins/triton.txt Unescape Escape View File

27 .ci/docker/common/install_acl.sh Normal file → Executable file Unescape Escape View File

9 .ci/docker/common/install_cpython.sh Unescape Escape View File

106 .ci/docker/common/install_cuda.sh Unescape Escape View File

10 .ci/docker/common/install_cusparselt.sh Unescape Escape View File

23 .ci/docker/common/install_executorch.sh Unescape Escape View File

11 .ci/docker/common/install_inductor_benchmark_deps.sh Unescape Escape View File

10 .ci/docker/common/install_mingw.sh Normal file Unescape Escape View File

2 .ci/docker/common/install_nccl.sh Unescape Escape View File

4 .ci/docker/common/install_onnx.sh Unescape Escape View File

12 .ci/docker/common/install_openblas.sh Normal file → Executable file Unescape Escape View File

15 .ci/docker/common/install_rocm.sh Unescape Escape View File

4 .ci/docker/common/install_rocm_magma.sh Unescape Escape View File

8 .ci/docker/common/install_triton.sh Unescape Escape View File

8 .ci/docker/common/install_ucc.sh Unescape Escape View File

61 .ci/docker/common/install_xpu.sh Unescape Escape View File

9 .ci/docker/common/patch_libstdc.sh Executable file Unescape Escape View File

13 .ci/docker/libtorch/Dockerfile Unescape Escape View File

12 .ci/docker/libtorch/build.sh Unescape Escape View File

5 .ci/docker/manywheel/Dockerfile_2_28 Unescape Escape View File

12 .ci/docker/manywheel/Dockerfile_2_28_aarch64 Unescape Escape View File

13 .ci/docker/manywheel/Dockerfile_cuda_aarch64 Unescape Escape View File

71 .ci/docker/manywheel/Dockerfile_cxx11-abi Unescape Escape View File

3 .ci/docker/manywheel/Dockerfile_s390x Unescape Escape View File

29 .ci/docker/manywheel/build.sh Unescape Escape View File

5 .ci/docker/manywheel/build_scripts/ssl-check.py Unescape Escape View File

73 .ci/docker/requirements-ci.txt Unescape Escape View File

9 .ci/docker/requirements-docs.txt Unescape Escape View File

2 .ci/docker/triton_version.txt Unescape Escape View File

2 .ci/docker/triton_xpu_version.txt Unescape Escape View File

155 .ci/docker/ubuntu-cross-riscv/Dockerfile Normal file Unescape Escape View File

10 .ci/docker/ubuntu-rocm/Dockerfile Unescape Escape View File

4 .ci/docker/ubuntu-xpu/Dockerfile Unescape Escape View File

11 .ci/docker/ubuntu/Dockerfile Unescape Escape View File

2 .ci/libtorch/build.sh Unescape Escape View File

31 .ci/lumen_cli/README.md Normal file Unescape Escape View File

0 test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once → .ci/lumen_cli/cli/build_cli/__init__.py Unescape Escape View File

37 .ci/lumen_cli/cli/build_cli/register_build.py Normal file Unescape Escape View File

0 test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex → .ci/lumen_cli/cli/lib/__init__.py Unescape Escape View File

71 .ci/lumen_cli/cli/lib/common/cli_helper.py Normal file Unescape Escape View File

42 .ci/lumen_cli/cli/lib/common/docker_helper.py Normal file Unescape Escape View File

110 .ci/lumen_cli/cli/lib/common/envs_helper.py Normal file Unescape Escape View File

143 .ci/lumen_cli/cli/lib/common/gh_summary.py Normal file Unescape Escape View File

69 .ci/lumen_cli/cli/lib/common/git_helper.py Normal file Unescape Escape View File

14 .ci/lumen_cli/cli/lib/common/logger.py Normal file Unescape Escape View File

62 .ci/lumen_cli/cli/lib/common/path_helper.py Normal file Unescape Escape View File

71 .ci/lumen_cli/cli/lib/common/pip_helper.py Normal file Unescape Escape View File

139 .ci/lumen_cli/cli/lib/common/utils.py Normal file Unescape Escape View File

292 .ci/lumen_cli/cli/lib/core/vllm/lib.py Normal file Unescape Escape View File

296 .ci/lumen_cli/cli/lib/core/vllm/vllm_build.py Normal file Unescape Escape View File

280 .ci/lumen_cli/cli/lib/core/vllm/vllm_test.py Normal file Unescape Escape View File

40 .ci/lumen_cli/cli/run.py Normal file Unescape Escape View File

0 test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_interpreter_convert_to_bool_raises → .ci/lumen_cli/cli/test_cli/__init__.py Unescape Escape View File

62 .ci/lumen_cli/cli/test_cli/register_test.py Normal file Unescape Escape View File

23 .ci/lumen_cli/pyproject.toml Normal file Unescape Escape View File

47 .ci/lumen_cli/tests/test_app.py Normal file Unescape Escape View File

115 .ci/lumen_cli/tests/test_cli_helper.py Normal file Unescape Escape View File

75 .ci/lumen_cli/tests/test_docker_helper.py Normal file Unescape Escape View File

149 .ci/lumen_cli/tests/test_envs_helper.py Normal file Unescape Escape View File

122 .ci/lumen_cli/tests/test_path_helper.py Normal file Unescape Escape View File

185 .ci/lumen_cli/tests/test_run_plan.py Normal file Unescape Escape View File

3390 Commits

codex-test ... ciflow/tru

15

.bc-linter.yml Normal file

View File

34

.ci/aarch64_linux/aarch64_ci_build.sh

View File

305

.ci/aarch64_linux/aarch64_wheel_ci_build.py

View File

80

.ci/aarch64_linux/build_aarch64_wheel.py

View File

4

.ci/docker/README.md

View File

9

.ci/docker/almalinux/Dockerfile

View File

6

.ci/docker/almalinux/build.sh

View File

137

.ci/docker/build.sh

View File

6

.ci/docker/centos-rocm/Dockerfile

View File

2

.ci/docker/ci_commit_pins/executorch.txt

View File

2

.ci/docker/ci_commit_pins/huggingface-requirements.txt Normal file

View File

1

.ci/docker/ci_commit_pins/huggingface.txt

View File

2

.ci/docker/ci_commit_pins/nccl-cu12.txt

View File

1

.ci/docker/ci_commit_pins/nccl-cu13.txt Normal file

View File

1

.ci/docker/ci_commit_pins/rocm-composable-kernel.txt Normal file

View File

2

.ci/docker/ci_commit_pins/torchbench.txt

View File

2

.ci/docker/ci_commit_pins/triton-xpu.txt

View File

2

.ci/docker/ci_commit_pins/triton.txt

View File

27

.ci/docker/common/install_acl.sh Normal file → Executable file

View File

9

.ci/docker/common/install_cpython.sh

View File

106

.ci/docker/common/install_cuda.sh

View File

10

.ci/docker/common/install_cusparselt.sh

View File

23

.ci/docker/common/install_executorch.sh

View File

11

.ci/docker/common/install_inductor_benchmark_deps.sh

View File

10

.ci/docker/common/install_mingw.sh Normal file

View File

2

.ci/docker/common/install_nccl.sh

View File

4

.ci/docker/common/install_onnx.sh

View File

12

.ci/docker/common/install_openblas.sh Normal file → Executable file

View File

15

.ci/docker/common/install_rocm.sh

View File

4

.ci/docker/common/install_rocm_magma.sh

View File

8

.ci/docker/common/install_triton.sh

View File

8

.ci/docker/common/install_ucc.sh

View File

61

.ci/docker/common/install_xpu.sh

View File

9

.ci/docker/common/patch_libstdc.sh Executable file

View File

13

.ci/docker/libtorch/Dockerfile

View File

12

.ci/docker/libtorch/build.sh

View File

5

.ci/docker/manywheel/Dockerfile_2_28

View File

12

.ci/docker/manywheel/Dockerfile_2_28_aarch64

View File

13

.ci/docker/manywheel/Dockerfile_cuda_aarch64

View File

71

.ci/docker/manywheel/Dockerfile_cxx11-abi

View File

3

.ci/docker/manywheel/Dockerfile_s390x

View File

29

.ci/docker/manywheel/build.sh

View File

5

.ci/docker/manywheel/build_scripts/ssl-check.py

View File

73

.ci/docker/requirements-ci.txt

View File

9

.ci/docker/requirements-docs.txt

View File

2

.ci/docker/triton_version.txt

View File

2

.ci/docker/triton_xpu_version.txt

View File

155

.ci/docker/ubuntu-cross-riscv/Dockerfile Normal file

View File

10

.ci/docker/ubuntu-rocm/Dockerfile

View File

4

.ci/docker/ubuntu-xpu/Dockerfile

View File

11

.ci/docker/ubuntu/Dockerfile

View File

2

.ci/libtorch/build.sh

View File

31

.ci/lumen_cli/README.md Normal file

View File

0

test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_bool_called_at_least_once → .ci/lumen_cli/cli/build_cli/init.py

View File

37

.ci/lumen_cli/cli/build_cli/register_build.py Normal file

View File

0

test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_complex → .ci/lumen_cli/cli/lib/init.py

View File

71

.ci/lumen_cli/cli/lib/common/cli_helper.py Normal file

View File

42

.ci/lumen_cli/cli/lib/common/docker_helper.py Normal file

View File

110

.ci/lumen_cli/cli/lib/common/envs_helper.py Normal file

View File

143

.ci/lumen_cli/cli/lib/common/gh_summary.py Normal file

View File

69

.ci/lumen_cli/cli/lib/common/git_helper.py Normal file

View File

14

.ci/lumen_cli/cli/lib/common/logger.py Normal file

View File

62

.ci/lumen_cli/cli/lib/common/path_helper.py Normal file

View File

71

.ci/lumen_cli/cli/lib/common/pip_helper.py Normal file

View File

139

.ci/lumen_cli/cli/lib/common/utils.py Normal file

View File

292

.ci/lumen_cli/cli/lib/core/vllm/lib.py Normal file

View File

296

.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py Normal file

View File

280

.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py Normal file

View File

40

.ci/lumen_cli/cli/run.py Normal file

View File

0

test/dynamo_expected_failures/CPython313-test_bool-BoolTest.test_interpreter_convert_to_bool_raises → .ci/lumen_cli/cli/test_cli/init.py

View File

62

.ci/lumen_cli/cli/test_cli/register_test.py Normal file

View File

23

.ci/lumen_cli/pyproject.toml Normal file

View File

47

.ci/lumen_cli/tests/test_app.py Normal file

View File

115

.ci/lumen_cli/tests/test_cli_helper.py Normal file

View File

75

.ci/lumen_cli/tests/test_docker_helper.py Normal file

View File

149

.ci/lumen_cli/tests/test_envs_helper.py Normal file

View File

122

.ci/lumen_cli/tests/test_path_helper.py Normal file

View File

185

.ci/lumen_cli/tests/test_run_plan.py Normal file

View File

143

.ci/lumen_cli/tests/test_utils.py Normal file

View File