Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
[ROCm][tunableop] Modified Online Tuning Mode to add Instant Logging (#163965)
- Added instant logging in online tuning mode, so that each tuned GEMM is written to disk as soon as it is tuned.
- This preserves the tuning configurations gathered so far if the application crashes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163965
Approved by: https://github.com/naromero77amd, https://github.com/jeffdaily
Commit 7f9b745494 (parent 83f9baf413), committed by PyTorch MergeBot.
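In practice, instant logging means an online-tuning run leaves a usable results file behind even if the process never reaches a clean shutdown. A minimal sketch of such a session; the environment variables are the standard TunableOp switches, while the filename and shapes are illustrative:

```python
# Sketch of an online-tuning session with instant logging.
# Assumes a ROCm build of PyTorch with TunableOp support.
import os

# Enable TunableOp (tuning is on by default when enabled) and pick a results file.
os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"
os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "my_tunableop_results.csv"

import torch

a = torch.randn(512, 512, device="cuda")
b = torch.randn(512, 512, device="cuda")
c = a @ b  # the first GEMM of this shape triggers tuning

# With instant logging, the tuned result is already on disk here, even if
# the process were to crash before exit.
with open(torch.cuda.tunable.get_filename()) as f:
    print(f.read())  # Validator header lines, then one line per tuned GEMM
```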
@@ -175,8 +175,6 @@ All python APIs exist in the `torch.cuda.tunable` module.
 | get_filename() -> str | |
 | get_results() -> Tuple[str, str, str, float] | |
 | get_validators() -> Tuple[str, str] | |
-| write_file_on_exit(val: bool) -> None | Default is True. |
-| write_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). |
 | read_file(filename: Optional[str] = None) -> None | If filename not given, it will call get_filename(). |
 | tune_gemm_in_file(filename: str) -> None | read an untuned file and tune GEMMs in it. |
 | mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None | read one or more untuned files and tune all unique GEMMs on one or more GPUs. |
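The entry points that remain in the table can be exercised as follows; a hedged sketch assuming TunableOp is enabled and the untuned files exist (filenames are illustrative):

```python
import torch

# Load previously saved results; defaults to get_filename().
torch.cuda.tunable.read_file()

# Offline tuning: read a file of untuned GEMMs and tune each one.
torch.cuda.tunable.tune_gemm_in_file("tunableop_untuned0.csv")

# Distribute the unique GEMMs from one or more untuned files across GPUs.
torch.cuda.tunable.mgpu_tune_gemm_in_file("tunableop_untuned?.csv", 2)
```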
@@ -107,14 +107,30 @@ void TuningResultsManager::AddImpl(const std::string& op_signature,
 }
 
 void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) {
-  std::scoped_lock l{lock_};
-
-  auto it = results_.find(op_signature);
-  if (it == results_.end()) {
-    it = results_.insert({op_signature, {}}).first;
-  }
-
-  AddImpl(op_signature, params_signature, std::move(best), it->second);
+  bool is_new = false;
+  ResultEntry inserted = ResultEntry::Null();
+
+  // ---- mutate maps under results lock ----
+  {
+    std::scoped_lock l{lock_};
+    auto& km = results_[op_signature]; // creates if missing
+    is_new = (km.find(params_signature) == km.end());
+    AddImpl(op_signature, params_signature, std::move(best), km);
+    if (is_new) {
+      inserted = km.at(params_signature); // snapshot for I/O after unlocking
+    }
+  }
+  if (!is_new) return; // only write once per unique (op, params)
+
+  TuningContext* ctx = getTuningContext();
+  if (ctx->IsTuningEnabled() && !ctx->IsRecordUntunedEnabled()) {
+    InitRealtimeAppend(ctx->GetFilename(), ctx->GetTuningResultsValidator().GetAllValidators());
+
+    if (is_new && realtime_out_ && realtime_out_->good()) {
+      AppendResultLine(op_signature, params_signature, inserted);
+    }
+  }
 }
 
 void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
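The rewritten Add() follows a standard two-lock pattern: mutate the in-memory results map under the results lock, snapshot the inserted entry, then perform file I/O after unlocking, under a separate file mutex. A minimal Python sketch of the same pattern (all names here are illustrative, not PyTorch API):

```python
import threading

class ResultsStore:
    """Sketch of the mutate-under-lock / write-after-unlock pattern used
    by TuningResultsManager::Add. All names are illustrative."""

    def __init__(self, path):
        self._map_lock = threading.Lock()   # guards the in-memory map
        self._file_lock = threading.Lock()  # guards the append-only file
        self._results = {}                  # op_sig -> {param_sig: result}
        self._path = path

    def add(self, op_sig, param_sig, result):
        # ---- mutate the map under the map lock ----
        with self._map_lock:
            km = self._results.setdefault(op_sig, {})
            is_new = param_sig not in km
            km[param_sig] = result
            inserted = km[param_sig]  # snapshot for I/O after unlocking
        if not is_new:
            return  # only write once per unique (op, params) pair

        # ---- file I/O outside the map lock, under its own lock ----
        with self._file_lock:
            with open(self._path, "a") as f:
                f.write(f"{op_sig},{param_sig},{inserted}\n")
                f.flush()  # make the line durable immediately
```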
@@ -150,6 +166,77 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
   }
 }
 
+void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const std::unordered_map<std::string, std::string>& validators) {
+  std::scoped_lock fl{realtime_file_mutex_};
+
+  if (realtime_out_ && realtime_out_->good() && realtime_filename_ == filename) {
+    return;
+  }
+
+  if (realtime_out_ && realtime_filename_ != filename) {
+    realtime_out_->flush();
+    realtime_out_->close();
+    realtime_out_.reset();
+    validators_written_ = false;
+  }
+
+  bool file_exists = false;
+  bool file_empty = true;
+
+  {
+    std::ifstream check_file(filename);
+    if (check_file.good()) {
+      file_exists = true;
+      file_empty = (check_file.peek() == std::ifstream::traits_type::eof());
+    }
+  }
+
+  realtime_out_ = std::make_unique<std::ofstream>(filename, std::ios::out | std::ios::app);
+
+  if (!realtime_out_->good()) {
+    TORCH_WARN("TunableOp realtime append: failed to open '", filename, "'");
+    realtime_out_.reset();
+    return;
+  }
+
+  if (!file_exists || file_empty) {
+    for (const auto& [key, val] : validators) {
+      (*realtime_out_) << "Validator," << key << "," << val << std::endl;
+      realtime_out_->flush();
+    }
+    validators_written_ = true;
+
+    TUNABLE_LOG2("Wrote validators to realtime output file");
+  }
+
+  realtime_filename_ = filename;
+}
+
+void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std::string& param_sig, const ResultEntry& result) {
+  std::scoped_lock fl{realtime_file_mutex_};
+
+  if (!realtime_out_ || !realtime_out_->good()) {
+    return;
+  }
+
+  (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
+  realtime_out_->flush(); // ensure immediate write to disk
+
+  TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
+}
+
+void TuningResultsManager::CloseRealtimeAppend() {
+  std::scoped_lock fl{realtime_file_mutex_};
+
+  if (realtime_out_) {
+    realtime_out_->flush();
+    realtime_out_->close();
+    realtime_out_.reset();
+    TUNABLE_LOG2("Closed realtime output file");
+  }
+}
+
 void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) {
   std::scoped_lock l{lock_};
 
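InitRealtimeAppend is idempotent per filename and writes the Validator header only when the target file is new or empty, so appending to an existing results file never duplicates the header. A small Python sketch of that policy (names and the validator key are illustrative):

```python
import os

def open_for_append(path, validators):
    """Header-once append policy, mirroring InitRealtimeAppend: write the
    Validator lines only when the file is new or empty, then append results.
    All names here are illustrative, not PyTorch API."""
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    f = open(path, "a")
    if write_header:
        for key, val in validators.items():
            f.write(f"Validator,{key},{val}\n")
        f.flush()  # make the header durable before any result lines
    return f

# Re-opening the same file later appends results without duplicating the header.
out = open_for_append("results.csv", {"PT_VERSION": "2.6.0"})
out.write("op_sig,param_sig,result\n")
out.flush()
```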
@@ -396,7 +483,6 @@ TuningContext::TuningContext() :
     tuning_enable_{true},
     record_untuned_enable_{false},
     manager_initialized_{false},
-    write_file_on_exit_{true},
     numerics_check_enable_{false},
     max_tuning_duration_ms_{30},
     max_tuning_iterations_{100},
@@ -417,20 +503,8 @@ TuningContext::~TuningContext() {
     // but doesn't do any computation itself.
     return;
   }
-  auto filename = GetFilename();
-  if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty() && write_file_on_exit_) {
-    if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) {
-      if (results_count_from_input_file_ > 0) {
-        TUNABLE_LOG1("additional tuning results available, rewriting file ", filename);
-      }
-      else {
-        TUNABLE_LOG1("writing file ", filename);
-      }
-      if (!WriteFile(filename)) {
-        TUNABLE_LOG1("failed to write file ", filename);
-      }
-    }
-  }
+  TUNABLE_LOG1("Closing File");
+  GetTuningResultsManager().CloseRealtimeAppend(); // since we do instant logging by default now
 
   if (untuned_file_.good()) {
     untuned_file_.close();
@@ -511,9 +585,6 @@ std::ofstream& TuningContext::GetUntunedFile(){
   return untuned_file_;
 }
 
-void TuningContext::WriteFileOnExit(bool value) {
-  write_file_on_exit_ = value;
-}
-
 void TuningContext::EnableNumericsCheck(bool value) {
   numerics_check_enable_ = value;
@@ -634,11 +705,6 @@ TuningResultsManager& TuningContext::GetTuningResultsManager() {
     auto filename = GetFilename();
     if (!filename.empty() && !IsRecordUntunedEnabled()) {
       ReadFile(filename);
-      // attempt immediately to open file for writing to catch errors early
-      std::ofstream file(filename, std::ios::out | std::ios::app);
-      if (!file.good()) {
-        TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved");
-      }
     }
   });
   return manager_;
@@ -744,27 +810,6 @@ bool TuningContext::ReadFile(const std::string& filename_) {
   return true;
 }
 
-bool TuningContext::WriteFile(const std::string& filename_) {
-  std::string filename = filename_.empty() ? GetFilename() : filename_;
-  std::ofstream file(filename, std::ios::out | std::ios::trunc);
-  if (!file.good()) {
-    TUNABLE_LOG1("error opening tuning results file for writing ", filename);
-    return false;
-  }
-  auto validators = GetTuningResultsValidator().GetAllValidators();
-  for (const auto& [key, val] : validators) {
-    file << "Validator," << key << "," << val << std::endl;
-  }
-  auto results = GetTuningResultsManager().Dump();
-  for (const auto& [op_sig, kernelmap] : results) {
-    for (const auto& [param_sig, result] : kernelmap) {
-      file << op_sig << "," << param_sig << "," << result << std::endl;
-    }
-  }
-  file.close();
-  return true;
-}
-
 namespace {
 
 struct MaybeDelete {
@@ -103,10 +103,24 @@ class TORCH_CUDA_CPP_API TuningResultsManager {
 
   void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
     const std::string& params_signature, const std::string& blas_signature);
 
+  void InitRealtimeAppend(
+      const std::string& filename,
+      const std::unordered_map<std::string, std::string>& validators);
+
+  void AppendResultLine(const std::string& op_sig,
+      const std::string& param_sig,
+      const ResultEntry& result);
+
+  void CloseRealtimeAppend(); // for clean shutdown
+
  private:
   std::mutex lock_;
+  std::mutex realtime_file_mutex_;
+  std::unique_ptr<std::ofstream> realtime_out_;
+  std::string realtime_filename_;
   ResultsMap results_;
   UntunedMap untuned_results_;
+  bool validators_written_ = false;
+
 };
 
@@ -185,10 +199,7 @@ class TORCH_CUDA_CPP_API TuningContext {
   void SetFilename(const std::string& filename, bool insert_device_ordinal=false);
   std::string GetFilename() const;
 
-  void WriteFileOnExit(bool value);
-
   bool ReadFile(const std::string& filename={});
-  bool WriteFile(const std::string& filename={});
 
   template<class... Types>
   void Log(int level, Types... args) {
@@ -207,7 +218,6 @@ class TORCH_CUDA_CPP_API TuningContext {
   bool tuning_enable_;
   bool record_untuned_enable_;
   bool manager_initialized_;
-  bool write_file_on_exit_;
   bool numerics_check_enable_;
   int max_tuning_duration_ms_;
   int max_tuning_iterations_;
@@ -68,14 +68,6 @@
 .. autofunction:: get_validators
 ```
 
-```{eval-rst}
-.. autofunction:: write_file_on_exit
-```
-
-```{eval-rst}
-.. autofunction:: write_file
-```
-
 ```{eval-rst}
 .. autofunction:: read_file
 ```
@@ -4750,6 +4750,7 @@ class TestLinalg(TestCase):
     @dtypes(*floating_types_and(torch.half))
     @precisionOverride({torch.float16: 1e-1})  # TunableOp may occasionally find less precise solution
     def test_matmul_small_brute_force_tunableop(self, device, dtype):
+        import os
         # disable tunableop buffer rotation for all tests everywhere, it can be slow
         # We set the TunableOp numerical check environment variable here because it is
         # possible to hit some invalid numerical solutions due to the small matrix sizes.
@@ -4777,27 +4778,11 @@ class TestLinalg(TestCase):
 
             filename1 = torch.cuda.tunable.get_filename()
             unique_id = self.id().split(".")[-1]
-            filename2 = f"{filename1}_tmp1.csv"
-            filename3 = f"{filename1}_tmp2.csv"
             ordinal = torch.cuda.current_device()
             assert filename1 == f"tunableop_results_{unique_id}_{ordinal}.csv"
             assert len(torch.cuda.tunable.get_results()) > 0
 
-            assert torch.cuda.tunable.write_file()  # use default filename
-            assert torch.cuda.tunable.write_file(filename2)  # use custom, one-time filename
-            torch.cuda.tunable.set_filename(filename3)
-            assert torch.cuda.tunable.write_file()  # use previously set filename
-            assert torch.cuda.tunable.read_file()  # use previously set filename, will ignore duplicates and return True
-
-            with open(filename1) as file1:
-                file1_contents = file1.read()
-            with open(filename2) as file2:
-                file2_contents = file2.read()
-            with open(filename3) as file3:
-                file3_contents = file3.read()
-            assert file1_contents == file2_contents
-            assert file1_contents == file3_contents
+            self.assertTrue(os.path.exists(filename1))
 
             # We need to reset the filename to the default value so we can properly
             # clean up intermediate files
             self._set_tunableop_defaults()
||||||
@ -4806,6 +4791,7 @@ class TestLinalg(TestCase):
|
|||||||
@skipCUDAIfNotRocm
|
@skipCUDAIfNotRocm
|
||||||
@dtypes(torch.half)
|
@dtypes(torch.half)
|
||||||
def test_matmul_offline_tunableop(self, device, dtype):
|
def test_matmul_offline_tunableop(self, device, dtype):
|
||||||
|
import os
|
||||||
# Main offline tunableop test
|
# Main offline tunableop test
|
||||||
# NOTE: The offline tuning does not support certain tensor
|
# NOTE: The offline tuning does not support certain tensor
|
||||||
# shapes as noted below. Submatrics / matrix slices are
|
# shapes as noted below. Submatrics / matrix slices are
|
||||||
@@ -4916,7 +4902,9 @@ class TestLinalg(TestCase):
             new_results = len(torch.cuda.tunable.get_results())
 
             self.assertGreater(new_results - ref_results, 0)
-            self.assertTrue(torch.cuda.tunable.write_file())
+            results_filename = torch.cuda.tunable.get_filename()
+            self.assertTrue(os.path.exists(results_filename))
 
             # Compare Param Signature of untuned and tuned results
             ok = self._compare_untuned_tuned_entries()
@@ -4927,6 +4915,7 @@ class TestLinalg(TestCase):
     @runOnRocmArch(MI300_ARCH)
     @dtypes(torch.torch.float8_e4m3fnuz, torch.float8_e5m2fnuz)
     def test_scaled_gemm_offline_tunableop(self, device, dtype):
+        import os
         # This test is the offline version of test_scaled_gemm_tunableop
 
         with self._tunableop_ctx():
@@ -5006,7 +4995,8 @@ class TestLinalg(TestCase):
             count = 6
             self.assertEqual(total_num_results, count)
 
-            self.assertTrue(torch.cuda.tunable.write_file())
+            results_filename = torch.cuda.tunable.get_filename()
+            self.assertTrue(os.path.exists(results_filename))
 
             # Compare Param Signature of untuned and tuned results
             ok = self._compare_untuned_tuned_entries()
@@ -5381,6 +5371,7 @@ class TestLinalg(TestCase):
     @skipCUDAIfNotRocm
     @dtypes(torch.bfloat16)
     def test_gemm_bias_offline_tunableop(self, device, dtype):
+        import os
         # This test is the offline version of test_gemm_bias_tunableop
         ordinal = torch.cuda.current_device()
 
@@ -5431,7 +5422,8 @@ class TestLinalg(TestCase):
             # There must be new tuning results
             self.assertEqual(total_num_results, 2)
 
-            self.assertTrue(torch.cuda.tunable.write_file())
+            results_filename = torch.cuda.tunable.get_filename()
+            self.assertTrue(os.path.exists(results_filename))
 
             # Compare Param Signature of untuned and tuned results
             ok = self._compare_untuned_tuned_entries()
@@ -5632,7 +5624,8 @@ class TestLinalg(TestCase):
                 'nn_41_41_41_ld_41_41_41')
             self.assertTrue(found_result is not None)
 
-            self.assertTrue(torch.cuda.tunable.write_file())
+            results_filename = torch.cuda.tunable.get_filename()
+            self.assertTrue(os.path.exists(results_filename))
 
             # Compare Param Signature of untuned and tuned results
             ok = self._compare_untuned_tuned_entries()
@@ -5732,6 +5725,7 @@ class TestLinalg(TestCase):
     @skipCUDAIfNotRocm
     @dtypes(torch.float)
     def test_mm_submatrix_offline_tunableop(self, device, dtype):
+        import os
         # Test offline tuning with submatrices
         # Covers GEMM, ScaledGEMM, and GEMM+bias.
         ordinal = torch.cuda.current_device()
@@ -5862,12 +5856,97 @@ class TestLinalg(TestCase):
             # There must be new tuning results
             self.assertEqual(total_num_results, 10)
 
-            self.assertTrue(torch.cuda.tunable.write_file())
+            results_filename = torch.cuda.tunable.get_filename()
+            self.assertTrue(os.path.exists(results_filename))
 
             # Compare Param Signature of untuned and tuned results
             ok = self._compare_untuned_tuned_entries()
             self.assertTrue(ok)
 
+    @onlyCUDA
+    @skipCUDAIfNotRocm
+    @dtypes(torch.float32)
+    def test_ops_append_to_existing_file_tunableop(self, device, dtype):
+        """If a TunableOp results file already exists (with a matching Validator),
+        new results should be appended, not overwritten."""
+
+        with self._tunableop_ctx():
+            torch.cuda.tunable.set_rotating_buffer_size(0)
+
+            # Seed the existing results file with Validator lines + 1 result line
+            results_filename = torch.cuda.tunable.get_filename()
+            validators = torch.cuda.tunable.get_validators()  # Iterable[Tuple[str, str]]
+
+            seed_lines = []
+            # Each (k, v) becomes a "Validator" line
+            for k, v in validators:
+                seed_lines.append(f"Validator,{k},{v}")
+
+            # One arbitrary, plausible matmul result line
+            seed_lines.append(
+                "GemmAndBiasTunableOp_float_TN,tn_768_32_1024_ld_1024_1024_768,"
+                "Gemm_Hipblaslt_220580,0.0103395"
+            )
+
+            with open(results_filename, "w") as f:
+                f.write("\n".join(seed_lines) + "\n")
+
+            # Count initial (non-Validator) lines
+            with open(results_filename) as f:
+                initial_content = f.read()
+            initial_lines = [
+                l for l in initial_content.split("\n")
+                if l and not l.startswith("Validator")
+            ]
+            initial_count = len(initial_lines)
+            self.assertGreater(initial_count, 0)  # we seeded 1 result line
+
+            # Perform ONE simple matmul
+            A = torch.randn(37, 53, device=device, dtype=dtype)
+            B = torch.randn(53, 29, device=device, dtype=dtype)
+            _ = torch.matmul(A, B)
+
+            # Verify that new results were appended to the same file
+            with open(results_filename) as f:
+                final_content = f.read()
+            final_lines = [
+                l for l in final_content.split("\n")
+                if l and not l.startswith("Validator")
+            ]
+            final_count = len(final_lines)
+
+            self.assertGreater(final_count, initial_count)
+
+    @onlyCUDA
+    @skipCUDAIfNotRocm
+    @dtypes(torch.float32)
+    def test_matmul_empty_existing_file_tunableop(self, device, dtype):
+        """Test that if an existing results file is empty/corrupted, the default
+        behaviour should hold."""
+        with self._tunableop_ctx():
+            torch.cuda.tunable.set_rotating_buffer_size(0)
+            results_filename = torch.cuda.tunable.get_filename()
+
+            # Pre-create an empty results file
+            with open(results_filename, 'w') as f:
+                pass  # Empty file
+
+            # Use unique random inputs for this test
+            A = torch.randn(37, 53, device=device, dtype=dtype)
+            B = torch.randn(53, 29, device=device, dtype=dtype)
+
+            # Direct matmul
+            C = torch.matmul(A, B)
+
+            with open(results_filename) as f:
+                content = f.read()
+            self.assertIn("Validator", content)
+            result_lines = [l for l in content.split('\n')
+                            if l and not l.startswith('Validator')]
+            self.assertGreater(len(result_lines), 0)
+
     @onlyCUDA
     @skipCUDAIfNotRocm
     @runOnRocmArch(MI300_ARCH)
@@ -2197,9 +2197,7 @@ def _cuda_tunableop_set_filename(
     insert_device_ordinal: _bool | None,
 ) -> None: ...
 def _cuda_tunableop_get_filename() -> str: ...
-def _cuda_tunableop_write_file(filename: str | None) -> _bool: ...
 def _cuda_tunableop_read_file(filename: str | None) -> _bool: ...
-def _cuda_tunableop_write_file_on_exit(val: _bool) -> None: ...
 def _cuda_tunableop_get_results() -> tuple[str, str, str, _float]: ...
 def _cuda_tunableop_get_validators() -> tuple[str, str]: ...
 def _cuda_tunableop_set_rotating_buffer_size(buffer_size: _int) -> None: ...
@@ -1653,20 +1653,6 @@ PyObject* THCPModule_cuda_record_untuned_is_enabled(
   END_HANDLE_TH_ERRORS
 }
 
-PyObject* THCPModule_cuda_tunableop_write_file_on_exit(
-    PyObject* _unused,
-    PyObject* arg) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(
-      THPUtils_checkBool(arg),
-      "cuda_tunableop_write_file_on_exit expects a bool, but got ",
-      THPUtils_typename(arg));
-  at::cuda::tunable::getTuningContext()->WriteFileOnExit(
-      THPUtils_unpackBool(arg));
-  Py_RETURN_NONE;
-  END_HANDLE_TH_ERRORS
-}
-
 PyObject* THCPModule_cuda_tunableop_set_max_tuning_duration(
     PyObject* _unused,
     PyObject* arg) {
@@ -1748,32 +1734,6 @@ PyObject* THCPModule_cuda_tunableop_get_filename(
   END_HANDLE_TH_ERRORS
 }
 
-PyObject* THCPModule_cuda_tunableop_write_file(
-    PyObject* _unused,
-    PyObject* args) {
-  HANDLE_TH_ERRORS
-  PyObject* str = nullptr;
-  bool success = false;
-  if (!PyArg_ParseTuple(args, "|O", &str)) {
-  }
-  if (str) {
-    TORCH_CHECK(
-        THPUtils_checkString(str),
-        "cuda_tunableop_write_file expects a string, but got ",
-        THPUtils_typename(str));
-    auto filename = THPUtils_unpackString(str);
-    success = at::cuda::tunable::getTuningContext()->WriteFile(filename);
-  } else {
-    success = at::cuda::tunable::getTuningContext()->WriteFile();
-  }
-  if (success) {
-    Py_RETURN_TRUE;
-  } else {
-    Py_RETURN_FALSE;
-  }
-  END_HANDLE_TH_ERRORS
-}
-
 PyObject* THCPModule_cuda_tunableop_read_file(
     PyObject* _unused,
     PyObject* args) {
@@ -2127,10 +2087,6 @@ static struct PyMethodDef _THCPModule_methods[] = {
      THCPModule_cuda_record_untuned_is_enabled,
      METH_NOARGS,
      nullptr},
-    {"_cuda_tunableop_write_file_on_exit",
-     THCPModule_cuda_tunableop_write_file_on_exit,
-     METH_O,
-     nullptr},
     {"_cuda_tunableop_set_max_tuning_duration",
      THCPModule_cuda_tunableop_set_max_tuning_duration,
      METH_O,
@@ -2155,10 +2111,6 @@ static struct PyMethodDef _THCPModule_methods[] = {
      THCPModule_cuda_tunableop_get_filename,
      METH_NOARGS,
      nullptr},
-    {"_cuda_tunableop_write_file",
-     THCPModule_cuda_tunableop_write_file,
-     METH_VARARGS,
-     nullptr},
     {"_cuda_tunableop_read_file",
      THCPModule_cuda_tunableop_read_file,
      METH_VARARGS,
@@ -206,8 +206,6 @@ __all__ = [
     "get_filename",
     "get_results",
     "get_validators",
-    "write_file_on_exit",
-    "write_file",
     "read_file",
     "tune_gemm_in_file",
    "mgpu_tune_gemm_in_file",
@@ -306,25 +304,6 @@ def get_validators() -> tuple[str, str]:
     return torch._C._cuda_tunableop_get_validators()  # type: ignore[attr-defined]
 
 
-def write_file_on_exit(val: bool) -> None:
-    r"""During Tuning Context destruction, write file to disk.
-
-    This is useful as a final flush of your results to disk if your application
-    terminates as result of normal operation or an error. Manual flushing of
-    your results can be achieved by manually calling ``write_file()``."""
-    torch._C._cuda_tunableop_write_file_on_exit(val)  # type: ignore[attr-defined]
-
-
-def write_file(filename: Optional[str] = None) -> bool:
-    r"""Write results to a CSV file.
-
-    If :attr:`filename` is not given, ``get_filename()`` is called.
-    """
-    if filename is None:
-        filename = get_filename()
-    return torch._C._cuda_tunableop_write_file(filename)  # type: ignore[attr-defined]
-
-
 def read_file(filename: Optional[str] = None) -> bool:
     r"""Read results from a TunableOp CSV file.
 
@@ -787,7 +766,6 @@ def mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None:
     mp_context = mp.get_context("spawn")
 
     futures = []  # empty list to hold futures
-    flush_results = []  # empty list to hold futures
 
     # GEMMs are assigned to GPUs in a round robin manner
     h = 0
@@ -809,13 +787,6 @@ def mgpu_tune_gemm_in_file(filename_pattern: str, num_gpus: int) -> None:
         for future in concurrent.futures.as_completed(futures):
             future.result()
 
-        for g in range(num_gpus):
-            flush_result = executor.submit(write_file)
-            flush_results.append(flush_result)
-
-        for flush_result in concurrent.futures.as_completed(flush_results):
-            flush_result.result()
-
     torch.cuda.synchronize()
 
     _gather_tunableop_results()