Compare commits


5 Commits

SHA1 Message Date
f78738eead Update triton.txt 2025-11-04 13:21:04 -05:00
7002824759 Update build_triton_wheel.py 2025-11-04 13:20:16 -05:00
2b3af4a4ba Update build_triton_wheel.py 2025-11-04 13:12:32 -05:00
63fb7949e7 Update triton.txt 2025-11-04 12:40:41 -05:00
c50594ff6f Update triton to 3.5.1 release 2025-11-04 12:39:15 -05:00
5 changed files with 20 additions and 6 deletions

View File

@@ -1 +1 @@
-7416ffcb92cdbe98d9f97e4e6f95247e46dfc9fd
+bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7

View File

@@ -1 +1 @@
-3.5.0
+3.5.1
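
As a quick sanity check after this bump, the installed Triton can be verified from Python (a minimal sketch, assuming a wheel built from this pin is installed):

import triton
# Should match the release pinned above.
print(triton.__version__)  # expected: 3.5.1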

View File

@@ -23,6 +23,8 @@ C10_DIAGNOSTIC_POP()
 #endif
 namespace at {
+namespace {
 /*
 These const variables defined the fp32 precisions for different backend
 We have "generic", "cuda", "mkldnn" backend now and we can choose fp32
@@ -39,6 +41,16 @@ namespace at {
 ->rnn
 */
+C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
+  TORCH_WARN_ONCE(
+      "Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' "
+      "or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, "
+      "torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see "
+      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"
+  );
+}
+} // namespace
 Float32Backend str2backend(const std::string& name) {
   if (name == "generic")
     return Float32Backend::GENERIC;
@@ -194,6 +206,7 @@ bool Context::allowTF32CuDNN(std::optional<Float32Op> op) const {
   } else {
     return float32Precision(Float32Backend::CUDA, op.value()) == Float32Precision::TF32;
   }
+  warn_deprecated_fp32_precision_api();
   return allow_tf32_cudnn;
 }
@@ -201,6 +214,7 @@ void Context::setAllowTF32CuDNN(bool b) {
   setFloat32Precision(Float32Backend::CUDA, Float32Op::RNN, b ? Float32Precision::TF32 : Float32Precision::NONE);
   setFloat32Precision(Float32Backend::CUDA, Float32Op::CONV, b ? Float32Precision::TF32 : Float32Precision::NONE);
   allow_tf32_cudnn = b;
+  warn_deprecated_fp32_precision_api();
 }
 void Context::setSDPPriorityOrder(const std::vector<int64_t>& order) {
@@ -311,6 +325,7 @@ bool Context::allowTF32CuBLAS() const {
       "Current status indicate that you have used mix of the legacy and new APIs to set the TF32 status for cublas matmul. ",
       "We suggest only using the new API to set the TF32 flag. See also: ",
       "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
+  warn_deprecated_fp32_precision_api();
   return allow_tf32_new;
 }
@@ -334,6 +349,7 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
       "Current status indicate that you have used mix of the legacy and new APIs to set the matmul precision. ",
       "We suggest only using the new API for matmul precision. See also: ",
      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
+  warn_deprecated_fp32_precision_api();
   return float32_matmul_precision;
 }
@@ -361,6 +377,7 @@ Float32Precision Context::float32Precision(Float32Backend backend, Float32Op op)
 void Context::setFloat32MatmulPrecision(const std::string &s) {
   auto match = [this](const std::string & s_) {
+    warn_deprecated_fp32_precision_api();
     // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
     if (s_ == "highest") {
       float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
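
For context, the warning added in this file steers callers from the legacy TF32 toggles to the per-backend, per-op fp32_precision settings. A minimal usage sketch in Python, using only the setting names quoted in the warning string above (the sketch itself is not part of this diff):

import torch

# Legacy toggles (deprecated after PyTorch 2.9, per the warning above):
#   torch.backends.cuda.matmul.allow_tf32 = True
#   torch.backends.cudnn.allow_tf32 = True

# New per-backend, per-op settings named in the warning message:
torch.backends.cudnn.conv.fp32_precision = "tf32"   # allow TF32 for cuDNN convolutions
torch.backends.cuda.matmul.fp32_precision = "ieee"  # keep full fp32 for cuBLAS matmuls

Mixing the old and new styles is what trips the "mix of the legacy and new APIs" warnings emitted by allowTF32CuBLAS() and float32MatmulPrecision() above.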

View File

@@ -179,9 +179,6 @@ def aot_stage1_graph_capture(
         )
     )
-    print(f"in aot_stage1_graph_capture. maybe_subclass_meta.fw_metadata.static_input_indices:{maybe_subclass_meta.fw_metadata.static_input_indices if maybe_subclass_meta is not None and maybe_subclass_meta.fw_metadata is not None else None}")
-    print(f"in aot_stage1_graph_capture. aot_state.fw_metadata.static_input_indices:{aot_state.fw_metadata.static_input_indices}")
     return AOTGraphCapture(
         wrappers=wrappers,
         graph_module=graph,

View File

@@ -2318,7 +2318,7 @@ def compile_fx_forward(
     # force the outputs of invoke_subgraph subgraph to follow the
     # original strides
     _recursive_record_user_visible_output_idxs(gm)
-    print(f"in compile_fx_foward. static_input_idxs:{get_static_input_idxs(fixed)}")
     return inner_compile(
         gm,
         example_inputs,