Sun, Jiayi
e9d8973427
[Inductor] support masked vectorization for the tail_loop for float64 datatype (#163316)
**Summary:**
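Support masked vectorization of the tail loop for the float64 data type. Previously the leftover elements that do not fill a whole vector were processed by a scalar loop; they are now processed with a single vector load/store limited to the remaining element count.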
**Example:**
```
import torch

def fn(x):
    return x * x

x = torch.randn((22, 22), dtype=torch.double)
with torch.no_grad():
    compiled_fn = torch.compile(fn)
    compiled_fn(x)
```
**Generated code:**
- Before
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
for (int64_t x0_tail = static_cast<int64_t>(480L);x0_tail < static_cast<int64_t>(484L); x0_tail++)
{
auto tmp0 = in_ptr0[static_cast<int64_t>(x0_tail)];
auto tmp1 = double(tmp0 * tmp0);
out_ptr0[static_cast<int64_t>(x0_tail)] = tmp1;
}
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
- After
```
cpp_fused_mul_0 = async_compile.cpp_pybinding(['const double*', 'double*'], r'''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C" void kernel(const double* in_ptr0,
double* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(484L); x0+=static_cast<int64_t>(16L))
{
{
if(C10_LIKELY(x0 >= static_cast<int64_t>(0) && x0 < static_cast<int64_t>(480L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(16));
}
if(C10_UNLIKELY(x0 >= static_cast<int64_t>(480L) && x0 < static_cast<int64_t>(484L)))
{
auto tmp0 = at::vec::VectorizedN<double,2>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
auto tmp1 = tmp0 * tmp0;
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(4L));
}
}
}
}
}
''')
async_compile.wait(globals())
del async_compile
class Runner:
def __init__(self, partitions):
self.partitions = partitions
def recursively_apply_fns(self, fns):
new_callables = []
for fn, c in zip(fns, self.partitions):
new_callables.append(fn(c))
self.partitions = new_callables
def call(self, args):
arg0_1, = args
args.clear()
assert_size_stride(arg0_1, (22, 22), (22, 1))
buf0 = empty_strided_cpu((22, 22), (22, 1), torch.float64)
# [Provenance debug handles] cpp_fused_mul_0:1
cpp_fused_mul_0(arg0_1, buf0)
del arg0_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163316
Approved by: https://github.com/mingfeima, https://github.com/jansel
2025-10-20 01:41:38 +00:00
..
2025-10-18 07:36:18 +00:00
2024-12-13 22:13:12 +00:00
2025-07-17 12:08:33 +00:00
2025-06-21 18:33:38 +00:00
2025-10-17 18:32:39 +00:00
2025-06-24 04:53:54 +00:00
2025-10-17 18:32:39 +00:00
2025-03-29 01:39:13 +00:00
2025-07-25 02:37:30 +00:00
2025-10-19 19:20:45 +00:00
2025-10-18 07:36:18 +00:00
2025-10-19 21:12:19 +00:00
2025-10-17 17:44:45 +00:00
2025-10-01 21:32:59 +00:00
2025-10-15 17:47:23 +00:00
2025-10-17 22:35:50 +00:00
2025-10-09 15:42:51 +00:00
2025-10-18 07:36:18 +00:00
2025-10-17 07:27:11 +00:00
2025-10-17 22:35:50 +00:00
2025-10-20 01:41:38 +00:00
2025-09-10 04:39:20 +00:00
2025-01-27 18:12:39 +00:00
2025-10-18 07:36:18 +00:00
2025-03-29 01:39:13 +00:00
2025-10-11 01:03:55 +00:00
2025-10-17 07:27:11 +00:00
2025-10-18 07:36:18 +00:00
2025-10-18 07:36:18 +00:00
2025-10-18 07:36:18 +00:00
2025-08-06 02:26:10 +00:00
2025-10-18 07:36:18 +00:00
2025-10-18 07:36:18 +00:00
2025-07-29 03:26:09 +00:00
2025-02-22 03:44:53 +00:00
2025-10-17 07:27:11 +00:00
2025-09-02 16:53:55 +00:00
2025-09-19 07:37:14 +00:00
2025-10-09 15:42:51 +00:00
2025-08-03 20:53:58 +00:00
2025-10-02 22:22:04 +00:00
2024-11-04 18:30:29 +00:00
2025-10-14 19:19:03 +00:00
2025-05-12 18:30:52 +00:00
2025-10-13 11:47:32 +00:00
2025-08-08 17:41:22 +00:00
2025-09-18 16:08:13 +00:00
2025-04-26 18:10:58 +00:00
2025-10-16 23:08:27 +00:00
2025-07-30 19:30:55 +00:00
2025-10-17 07:27:11 +00:00
2025-10-13 01:48:55 +00:00
2025-06-04 14:38:13 +00:00
2025-10-08 07:27:17 +00:00
2024-11-22 20:54:55 +00:00
2025-10-08 07:27:17 +00:00
2025-09-09 15:49:21 +00:00
2025-08-04 20:37:39 +00:00
2025-09-29 15:15:10 +00:00
2025-09-29 17:50:12 +00:00
2025-04-10 21:02:14 +00:00
2025-04-25 20:15:04 +00:00
2025-01-04 14:17:20 +00:00
2025-09-09 15:49:21 +00:00
2025-10-09 03:24:50 +00:00
2025-09-23 22:15:10 +00:00
2025-01-04 10:47:51 +00:00
2024-12-18 23:02:30 +00:00
2025-10-19 15:34:44 +00:00
2025-10-14 14:18:42 +00:00
2025-10-18 07:36:18 +00:00
2025-10-18 07:36:18 +00:00
2025-10-17 07:27:11 +00:00
2025-06-14 03:37:38 +00:00
2025-10-14 17:17:11 +00:00
2025-10-18 07:36:18 +00:00
2025-08-04 20:37:39 +00:00
2025-01-23 00:31:39 +00:00
2025-09-26 15:45:02 +00:00
2024-12-18 23:02:30 +00:00
2025-10-09 18:03:12 +00:00
2025-10-06 22:42:01 +00:00
2024-12-18 23:02:30 +00:00
2025-02-08 00:55:20 +00:00
2025-01-04 10:47:51 +00:00
2025-07-09 11:02:23 +00:00
2025-07-09 11:02:23 +00:00
2025-10-15 03:18:57 +00:00
2025-07-09 11:02:23 +00:00
2025-07-10 06:34:46 +00:00
2025-10-17 07:27:11 +00:00
2025-02-28 00:47:03 +00:00
2025-06-04 14:38:13 +00:00
2025-10-18 07:36:18 +00:00
2025-10-17 07:27:11 +00:00
2025-10-02 22:22:04 +00:00
2025-10-18 07:36:18 +00:00
2025-10-02 22:22:04 +00:00
2025-10-02 22:22:04 +00:00
2024-12-18 23:02:30 +00:00
2025-10-09 03:24:50 +00:00
2025-10-18 07:36:18 +00:00
2024-12-18 23:02:30 +00:00
2025-09-26 18:26:56 +00:00
2025-07-09 11:02:23 +00:00
2025-07-09 11:02:23 +00:00
2025-10-15 22:26:47 +00:00
2025-07-09 11:02:23 +00:00
2025-07-21 21:44:49 +00:00
2025-10-18 07:36:18 +00:00
2025-09-17 20:29:12 +00:00
2024-12-18 23:02:30 +00:00
2025-01-04 10:47:51 +00:00
2025-07-17 08:57:34 +00:00
2024-12-18 23:02:30 +00:00
2025-09-10 14:19:34 +00:00
2024-12-18 23:02:30 +00:00
2025-09-12 15:02:40 +00:00
2024-12-18 23:02:30 +00:00
2025-10-18 07:36:18 +00:00
2025-10-16 19:34:10 +00:00
2025-10-13 01:48:55 +00:00
2025-09-26 17:41:00 +00:00
2025-01-04 10:47:51 +00:00
2024-12-06 21:45:18 +00:00
2025-10-13 17:59:18 +00:00
2025-10-17 07:27:11 +00:00
2025-01-04 10:47:51 +00:00
2025-10-18 07:36:18 +00:00
2025-09-26 18:26:56 +00:00
2025-10-17 07:27:11 +00:00
2025-10-14 20:21:04 +00:00
2025-10-08 09:09:16 +00:00
2024-12-18 23:02:30 +00:00
2025-03-18 16:09:39 +00:00
2025-07-09 11:02:23 +00:00
2025-09-29 09:08:04 +00:00
2025-09-16 12:07:50 +00:00
2024-12-27 07:58:44 +00:00
2025-09-13 07:52:50 +00:00
2025-07-09 11:02:23 +00:00
2025-09-19 19:41:33 +00:00
2025-10-01 21:32:59 +00:00
2025-10-08 18:42:37 +00:00
2025-10-17 07:27:11 +00:00
2025-10-07 20:51:22 +00:00
2025-10-15 17:24:50 +00:00
2025-09-05 20:15:29 +00:00
2025-10-19 00:59:28 +00:00
2025-10-18 07:36:18 +00:00
2025-07-17 01:27:44 +00:00
2025-10-19 21:42:01 +00:00
2025-09-15 06:50:00 +00:00
2025-08-10 18:35:42 +00:00
2025-10-17 07:27:11 +00:00
2025-10-18 07:36:18 +00:00
2025-02-25 03:47:40 +00:00
2025-08-14 17:06:27 +00:00
2025-10-18 07:36:18 +00:00
2025-09-23 07:52:00 +00:00
2025-10-18 09:04:42 +00:00
2025-06-04 01:58:52 +00:00
2025-07-09 11:02:23 +00:00
2025-10-18 07:36:18 +00:00
2024-12-18 23:02:30 +00:00
2025-09-29 01:42:01 +00:00
2025-10-17 07:27:11 +00:00
2025-10-18 07:36:18 +00:00
2025-10-18 07:36:18 +00:00
2025-09-29 01:42:01 +00:00
2025-01-26 03:37:20 +00:00
2025-10-18 07:36:18 +00:00
2025-10-17 07:27:11 +00:00
2025-10-09 21:58:54 +00:00
2025-10-08 07:27:17 +00:00
2025-10-08 07:27:17 +00:00
2025-10-13 01:48:55 +00:00
2025-07-09 11:02:23 +00:00
2025-09-15 05:44:15 +00:00
2025-02-05 19:40:10 +00:00
2024-12-12 01:18:34 +00:00
2025-10-04 15:25:45 +00:00
2025-10-15 19:45:55 +00:00
2025-10-18 07:36:18 +00:00
2024-12-18 23:02:30 +00:00
2025-07-09 11:02:23 +00:00
2025-10-18 07:36:18 +00:00
2025-10-12 12:11:57 +00:00