[redone][pytorch] Moving torch.compile worker process logs to a dedicated rank based log directory (#160352)

Summary:
Writing torch.compile worker logs to /logs/dedicated_log_torch_compile_worker_rank{RANK} if we're running on MAST.
ref: D79456310 (got reverted because of linter)

Testing:
Refer differential Revision: D79917440

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160352
Approved by: https://github.com/masnesral
This commit is contained in:
Jovian Anthony Jaison
2025-08-12 16:49:05 +00:00
committed by PyTorch MergeBot
parent a7abf57aab
commit 94b91a8763
3 changed files with 47 additions and 4 deletions

View File

@ -1,6 +1,7 @@
# Owner(s): ["module: inductor"] # Owner(s): ["module: inductor"]
import operator import operator
import os import os
import tempfile
from torch._inductor.compile_worker.subproc_pool import ( from torch._inductor.compile_worker.subproc_pool import (
raise_testexc, raise_testexc,
@ -66,6 +67,19 @@ class TestCompileWorker(TestCase):
finally: finally:
pool.shutdown() pool.shutdown()
@skipIfWindows(msg="pass_fds not supported on Windows.")
def test_logging(self):
    """Worker stdout/stderr should be routed to the configured log path.

    Simulates a MAST job environment (MAST_HPC_JOB_NAME / ROLE_RANK) and
    forces the worker log path via TORCHINDUCTOR_WORKER_LOGPATH, then
    verifies the pool runs and the log file is present.
    """
    # Save prior values so this test does not leak env state into others.
    _env_keys = ("MAST_HPC_JOB_NAME", "ROLE_RANK", "TORCHINDUCTOR_WORKER_LOGPATH")
    saved = {k: os.environ.get(k) for k in _env_keys}
    os.environ["MAST_HPC_JOB_NAME"] = "test_job"
    os.environ["ROLE_RANK"] = "0"
    try:
        with tempfile.NamedTemporaryFile(delete=True) as temp_log:
            os.environ["TORCHINDUCTOR_WORKER_LOGPATH"] = temp_log.name
            pool = SubprocPool(2)
            try:
                # Block on the result so a worker has actually executed
                # (and opened the log) before we assert on the file.
                self.assertEqual(pool.submit(operator.add, 100, 1).result(), 101)
                self.assertEqual(os.path.exists(temp_log.name), True)
            finally:
                pool.shutdown()
    finally:
        # Restore the environment exactly as it was before the test.
        for k, v in saved.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v
if __name__ == "__main__": if __name__ == "__main__":
from torch._inductor.test_case import run_tests from torch._inductor.test_case import run_tests

View File

@ -145,10 +145,19 @@ class SubprocPool:
f"--write-fd={str(subproc_write_fd)}", f"--write-fd={str(subproc_write_fd)}",
f"--torch-key={torch_key_str}", f"--torch-key={torch_key_str}",
] ]
local = False log_path = None
self.log_file = None
if config.worker_suppress_logging: if config.worker_suppress_logging:
log_path = os.devnull
log.info("Suppressing compile worker output due to config") log.info("Suppressing compile worker output due to config")
local = True else:
log_path = config.torchinductor_worker_logpath
if not log_path:
log_path = config.get_worker_log_path()
if log_path:
self.log_file = open(log_path, "w")
self.process = subprocess.Popen( self.process = subprocess.Popen(
cmd, cmd,
@ -164,8 +173,8 @@ class SubprocPool:
"LD_LIBRARY_PATH": get_ld_library_path(), "LD_LIBRARY_PATH": get_ld_library_path(),
}, },
pass_fds=(subproc_read_fd, subproc_write_fd), pass_fds=(subproc_read_fd, subproc_write_fd),
stdout=subprocess.DEVNULL if local else None, stdout=self.log_file,
stderr=subprocess.DEVNULL if local else None, stderr=self.log_file,
) )
self.write_lock = threading.Lock() self.write_lock = threading.Lock()
self.read_thread = threading.Thread( self.read_thread = threading.Thread(
@ -262,6 +271,8 @@ class SubprocPool:
_send_msg(self.write_pipe, MsgHeader.SHUTDOWN) _send_msg(self.write_pipe, MsgHeader.SHUTDOWN)
self.write_pipe.close() self.write_pipe.close()
self.process.wait(300) self.process.wait(300)
if self.log_file:
self.log_file.close()
except OSError as e: except OSError as e:
log.warning("Ignored OSError in pool shutdown: %s", e) log.warning("Ignored OSError in pool shutdown: %s", e)
finally: finally:

View File

@ -1020,6 +1020,24 @@ enable_caching_generated_triton_templates: bool = True
autotune_lookup_table: dict[str, dict[str, Any]] = {} autotune_lookup_table: dict[str, dict[str, Any]] = {}
def get_worker_log_path() -> Optional[str]:
    """Return the dedicated compile-worker log file path, or None.

    Only meaningful in fbcode builds running under a MAST job (detected
    via the MAST_HPC_JOB_NAME env var); the path embeds the role rank so
    each rank writes to its own log file.
    """
    if not is_fbcode():
        return None
    if os.environ.get("MAST_HPC_JOB_NAME", None) is None:
        return None
    rank = os.environ.get("ROLE_RANK", "0")
    return f"/logs/dedicated_log_torch_compile_worker_rank{rank}"
torchinductor_worker_logpath: str = Config(
env_name_force="TORCHINDUCTOR_WORKER_LOGPATH",
default="",
)
# config specific to codegen/cpp.py # config specific to codegen/cpp.py
class cpp: class cpp:
""" """