mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Set kernel func name from AOT Compiler (#67229)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/67229 Right now, assembly code generated for a given method from the model is named wrapper or func by default. The function name is then replaced with a proper kernel_func_name after target specific assembly is generated. This PR propagates a desired kernel_func_name right from aotCompiler API so that the generated function has the needed name that doesn't need to be replaced later. Note: Most of this change was landed in https://github.com/pytorch/pytorch/pull/66337 which had to be reverted as it was breaking `test_profiler` in `test_jit_fuser_te` as it replaced the name generated for graph with the default kernel_func_name value. This PR fixes that as well. ``` (pytorch) ~/local/pytorch kname └─ $ python3 test/test_jit_fuser_te.py CUDA not available, skipping tests monkeytype is not installed. Skipping tests for Profile-Directed Typing ........................................<string>:3: UserWarning: torch.cholesky is deprecated in favor of torch.linalg.cholesky and will be removed in a future PyTorch release. L = torch.cholesky(A) should be replaced with L = torch.linalg.cholesky(A) and . . . ......................<string>:3: UserWarning: torch.symeig is deprecated in favor of torch.linalg.eigh and will be removed in a future PyTorch release. The default behavior has changed from using the upper triangular portion of the matrix by default to using the lower triangular portion. L, _ = torch.symeig(A, upper=upper) should be replaced with L = torch.linalg.eigvalsh(A, UPLO='U' if upper else 'L') and L, V = torch.symeig(A, eigenvectors=True) should be replaced with L, V = torch.linalg.eigh(A, UPLO='U' if upper else 'L') (Triggered internally at ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2492.) ......[W pybind_utils.cpp:35] Warning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. 
Please include the fact that the network is running sparse tensors in any bug reports submitted. (function operator()) /data/users/priyaramani/pytorch/torch/testing/_internal/common_utils.py:403: UserWarning: Using sparse tensors in TorchScript is experimental. Many optimization pathways have not been thoroughly tested with sparse tensors. Please include the fact that the network is running sparse tensors in any bug reports submitted. (Triggered internally at ../torch/csrc/jit/python/pybind_utils.h:691.) return callable(*args, **kwargs) .....................................................................[W Resize.cpp:23] Warning: An output with one or more elements was resized since it had shape [1], which does not match the required output shape [].This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (function resize_output_check) [W Resize.cpp:23] Warning: An output with one or more elements was resized since it had shape [1, 5], which does not match the required output shape [5].This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (function resize_output_check) ........................................................................s.......s...s.s....s......s..sss............................ ---------------------------------------------------------------------- Ran 503 tests in 37.536s OK (skipped=10) ``` Test Plan: Imported from OSS Reviewed By: navahgar, pbelevich Differential Revision: D31945713 Pulled By: priyaramani fbshipit-source-id: f2246946f0fd51afba5cb6186d9743051e3b096b
This commit is contained in:
committed by
Facebook GitHub Bot
parent
5347dab851
commit
fa70d72e95
@ -90,6 +90,10 @@ std::string getNncKernelId() {
|
||||
":" + version_token;
|
||||
}
|
||||
|
||||
std::string getNncKernelFuncName(const std::string& method_name) {
|
||||
return "nnc_" + FLAGS_model_name + "_" + FLAGS_model_version + "_" + method_name;
|
||||
}
|
||||
|
||||
void writeOutputLlvmAssembly(const std::string& asm_code) {
|
||||
std::string output_llvm_file_name = FLAGS_output_llvm;
|
||||
if (output_llvm_file_name.empty()) {
|
||||
@ -108,18 +112,13 @@ c10::IValue preprocess(
|
||||
const c10::Dict<c10::IValue, c10::IValue>& compile_spec,
|
||||
const torch::jit::BackendDebugHandleGenerator& generate_debug_handles) {
|
||||
|
||||
std::string output_llvm_file_name = FLAGS_output_llvm;
|
||||
if (output_llvm_file_name.empty()) {
|
||||
output_llvm_file_name =
|
||||
FLAGS_model.substr(0, FLAGS_model.find('.')) + ".compiled.ll";
|
||||
}
|
||||
|
||||
auto method = mod.get_method(FLAGS_method_name);
|
||||
auto graph = toGraphFunction(method.function()).graph()->copy();
|
||||
auto sizes = getInputSizes(compile_spec);
|
||||
auto kernel_func_name = getNncKernelFuncName(FLAGS_method_name);
|
||||
|
||||
std::string llvm_asm_code;
|
||||
auto compiled = torch::jit::mobile::nnc::aotCompile(FLAGS_method_name, graph, sizes);
|
||||
auto compiled = torch::jit::mobile::nnc::aotCompile(
|
||||
FLAGS_method_name, graph, sizes, kernel_func_name);
|
||||
writeOutputLlvmAssembly(compiled.second);
|
||||
|
||||
auto func = std::move(compiled.first);
|
||||
@ -141,8 +140,8 @@ int main(int argc, char** argv) {
|
||||
" --model=<model file>"
|
||||
" --model_name=<model name>"
|
||||
" --model_version=<model version>"
|
||||
" --input_dims='1,3,224,224'"
|
||||
" [--method_name=<mehhod name>]"
|
||||
" --input_dims=<input dimensions like '1,3,224,224;2,2'>"
|
||||
" [--method_name=<method name>]"
|
||||
" [--output_llvm=<llvm assembly output file path>]"
|
||||
" [--output_model=<output model file path>]");
|
||||
|
||||
@ -153,6 +152,9 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
CAFFE_ENFORCE(!FLAGS_model.empty(), c10::UsageMessage());
|
||||
CAFFE_ENFORCE(!FLAGS_model_name.empty(), c10::UsageMessage());
|
||||
CAFFE_ENFORCE(!FLAGS_model_version.empty(), c10::UsageMessage());
|
||||
CAFFE_ENFORCE(!FLAGS_input_dims.empty(), c10::UsageMessage());
|
||||
|
||||
std::string output_model_name = FLAGS_output_model;
|
||||
if (output_model_name.empty()) {
|
||||
|
@ -87,7 +87,8 @@ std::unique_ptr<Function> compileMethod(
|
||||
std::pair<std::unique_ptr<Function>, const std::string> aotCompile(
|
||||
const std::string& method_name,
|
||||
std::shared_ptr<Graph>& g,
|
||||
const std::vector<std::vector<int64_t>>& sizes) {
|
||||
const std::vector<std::vector<int64_t>>& sizes,
|
||||
const std::string& kernel_func_name) {
|
||||
GRAPH_DEBUG("Input sizes ", sizes);
|
||||
GRAPH_DEBUG("Method name ", method_name);
|
||||
|
||||
@ -111,7 +112,9 @@ std::pair<std::unique_ptr<Function>, const std::string> aotCompile(
|
||||
GRAPH_DUMP("graph after shape propagation ", g);
|
||||
|
||||
std::shared_ptr<tensorexpr::TensorExprKernel> kernel =
|
||||
std::make_shared<tensorexpr::TensorExprKernel>(g);
|
||||
std::make_shared<tensorexpr::TensorExprKernel>(
|
||||
TensorExprKernel(g, kernel_func_name));
|
||||
|
||||
const std::string compiled_assembly = kernel->getCodeText();
|
||||
|
||||
auto func = compileMethod(kernel, method_name, sizes);
|
||||
|
@ -14,7 +14,8 @@ namespace nnc {
|
||||
TORCH_API std::pair<std::unique_ptr<Function>, const std::string> aotCompile(
|
||||
const std::string& method_name,
|
||||
std::shared_ptr<Graph>& subgraph,
|
||||
const std::vector<std::vector<int64_t>>& sizes);
|
||||
const std::vector<std::vector<int64_t>>& sizes,
|
||||
const std::string& kernel_func_name = "func");
|
||||
|
||||
} // namespace nnc
|
||||
} // namespace mobile
|
||||
|
@ -7,7 +7,6 @@
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/string_utils.h>
|
||||
#include <torch/csrc/jit/jit_log.h>
|
||||
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
|
||||
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
||||
#include <torch/csrc/jit/tensorexpr/graph_opt.h>
|
||||
#include <torch/csrc/jit/tensorexpr/ir_printer.h>
|
||||
@ -1172,17 +1171,19 @@ void TensorExprKernel::compile() {
|
||||
stmt,
|
||||
bufferArgs_,
|
||||
device_,
|
||||
SubgraphUtils::generateNameForGraph(graph_));
|
||||
kernel_func_name_);
|
||||
}
|
||||
|
||||
TensorExprKernel::TensorExprKernel(
|
||||
const std::shared_ptr<Graph>& subgraph,
|
||||
const std::string& kernel_func_name,
|
||||
std::unordered_map<c10::Symbol, NNCLoweringFunction> custom_lowerings,
|
||||
bool pre_alloc /*= false*/)
|
||||
: graph_(subgraph),
|
||||
code_(subgraph, ""),
|
||||
custom_lowerings_(std::move(custom_lowerings)),
|
||||
pre_alloc_(pre_alloc) {
|
||||
pre_alloc_(pre_alloc),
|
||||
kernel_func_name_(kernel_func_name) {
|
||||
allow_fallback_ = fallbackAllowed();
|
||||
if (!allow_fallback_) {
|
||||
compile();
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <c10/util/variant.h>
|
||||
#include <torch/csrc/jit/ir/ir.h>
|
||||
#include <torch/csrc/jit/passes/utils/subgraph_utils.h>
|
||||
#include <torch/csrc/jit/runtime/interpreter.h>
|
||||
#include <torch/csrc/jit/tensorexpr/analysis.h>
|
||||
#include <torch/csrc/jit/tensorexpr/codegen.h>
|
||||
@ -91,10 +92,22 @@ class TORCH_API TensorExprKernel {
|
||||
public:
|
||||
explicit TensorExprKernel(
|
||||
const std::shared_ptr<Graph>& subgraph,
|
||||
const std::string& kernel_func_name,
|
||||
std::unordered_map<c10::Symbol, NNCLoweringFunction> custom_lowerings =
|
||||
{},
|
||||
bool pre_alloc = false);
|
||||
|
||||
explicit TensorExprKernel(
|
||||
const std::shared_ptr<Graph>& subgraph,
|
||||
std::unordered_map<c10::Symbol, NNCLoweringFunction> custom_lowerings =
|
||||
{},
|
||||
bool pre_alloc = false)
|
||||
: TensorExprKernel(
|
||||
subgraph,
|
||||
SubgraphUtils::generateNameForGraph(subgraph),
|
||||
custom_lowerings,
|
||||
pre_alloc) {}
|
||||
|
||||
void run(Stack& stack);
|
||||
void runFast(
|
||||
const std::vector<void*>& inputs,
|
||||
@ -235,6 +248,7 @@ class TORCH_API TensorExprKernel {
|
||||
|
||||
std::unordered_map<c10::Symbol, NNCLoweringFunction> custom_lowerings_;
|
||||
bool pre_alloc_{false};
|
||||
const std::string& kernel_func_name_;
|
||||
};
|
||||
|
||||
TORCH_API int& getTECudaPointwiseLoopLevels();
|
||||
|
Reference in New Issue
Block a user