[caffe2] Fix alias analysis for quantization compression ops (#74169)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/74169 Alias DB was being way too conservative about the semantics of exported Caffe2 ops - it thought some pure functions were writing to their inputs, which caused `ReplaceWithMaybeCopy` to fail. This in turn led to a huge decrease in out variant coverage and regressions in many models. I've extended the export macro to let the user specify an `AliasAnalysisKind` and marked all of the quantization compression ops as pure functions. ghstack-source-id: 151394133 Reviewed By: hlu1 Differential Revision: D34733630 fbshipit-source-id: e968812e052f14261c10f9a280abe1d910de1f2f (cherry picked from commit 5e9de49b98caff57be13e8bd101144ae2475b6b5)
2025-10-20 21:14:14 +08:00 · 2022-03-15 14:41:50 -07:00
parent ddb34e7b6a
commit 6bd4376c60
2 changed files with 71 additions and 50 deletions
--- a/caffe2/core/export_caffe2_op_to_c10.h
+++ b/caffe2/core/export_caffe2_op_to_c10.h
@ -4,12 +4,13 @@

 #if defined(EXPOSE_C2_OPS) || \
    !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
+#include <ATen/core/dispatch/OperatorOptions.h>
 #include <ATen/core/function_schema.h>
 #include <ATen/core/grad_mode.h>
 #include <ATen/core/op_registration/op_registration.h>
-#include <torch/csrc/jit/frontend/function_schema_parser.h>
 #include <c10/core/CompileTimeFunctionPointer.h>
 #include <c10/util/irange.h>
+#include <torch/csrc/jit/frontend/function_schema_parser.h>
 #include <torch/library.h>
 #include <vector>

@ -113,7 +114,9 @@ void call_caffe2_op_from_c10(
  _call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op<Caffe2Operator>);
 }

-inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
+inline FunctionSchema make_function_schema_for_c10(
+    const char* schema_str,
+    c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
 #if !defined(EXPOSE_C2_OPS) && \
    (defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
  throw std::logic_error(
@ -127,13 +130,17 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
      nullopt,
      IValue());

-  return FunctionSchema(
+  auto schema = FunctionSchema(
      parsed_schema.name(),
      parsed_schema.overload_name(),
      std::move(arguments),
      parsed_schema.returns(),
      parsed_schema.is_vararg(),
      parsed_schema.is_varret());
+  if (optional_alias_analysis_kind) {
+    schema.setAliasAnalysis(*optional_alias_analysis_kind);
+  }
+  return schema;
 #endif
 }

@ -169,7 +176,7 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
 *   caffe2.
 * - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and
 *   C10_EXPORT_CAFFE2_OP_TO_C10_CPU .
- * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted i f
+ * - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if
 *   you don't want to expose the operator for CUDA operations.
 * - caffe2 arguments must come after caffe2 inputs, in other words, any tensor
 *   inputs must precede any non-tensor inputs.
@ -178,73 +185,85 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
 * - If your operator has a variable number of input tensors, make the first (!)
 *   input an input of type TensorList. There must be no other tensor inputs.
 */
-#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)   \
-  namespace caffe2 {                                        \
-  namespace _c10_ops {                                      \
+#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)  \
+  namespace caffe2 {                                       \
+  namespace _c10_ops {                                     \
  TORCH_API const FunctionSchema& schema_##OperatorName(); \
-  }                                                         \
+  }                                                        \
  }

-#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \
-  /* Register the op schema with the c10 dispatcher */                        \
-  namespace caffe2 {                                                          \
-  namespace _c10_ops {                                                        \
-  C10_EXPORT const FunctionSchema& schema_##OperatorName() {                  \
-    static const FunctionSchema schema =                                      \
-        ::caffe2::detail::make_function_schema_for_c10(OperatorSchema);       \
-    return schema;                                                            \
-  }                                                                           \
-  TORCH_LIBRARY_FRAGMENT(_caffe2, m) {                                        \
-      m.def(::caffe2::detail::make_function_schema_for_c10(OperatorSchema));  \
-  }                                                                           \
-  }                                                                           \
+#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(             \
+    OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \
+  /* Register the op schema with the c10 dispatcher */       \
+  namespace caffe2 {                                         \
+  namespace _c10_ops {                                       \
+  C10_EXPORT const FunctionSchema& schema_##OperatorName() { \
+    static const FunctionSchema schema =                     \
+        ::caffe2::detail::make_function_schema_for_c10(      \
+            OperatorSchema, OptionalAliasAnalysisKind);      \
+    return schema;                                           \
+  }                                                          \
+  TORCH_LIBRARY_FRAGMENT(_caffe2, m) {                       \
+    m.def(::caffe2::detail::make_function_schema_for_c10(    \
+        OperatorSchema, OptionalAliasAnalysisKind));         \
+  }                                                          \
+  }                                                          \
  }

 #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(                         \
    OperatorName, OperatorClass)                                             \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-    TORCH_LIBRARY_IMPL(_caffe2, CPU, m) {                                    \
-        m.impl("_caffe2::" #OperatorName,                                    \
-            torch::CppFunction::makeFromBoxedFunction<                       \
-                ::caffe2::detail::call_caffe2_op_from_c10<                   \
-                    ::caffe2::_c10_ops::schema_##OperatorName,               \
-                    OperatorClass>>());                                      \
-    }
+  TORCH_LIBRARY_IMPL(_caffe2, CPU, m) {                                      \
+    m.impl(                                                                  \
+        "_caffe2::" #OperatorName,                                           \
+        torch::CppFunction::makeFromBoxedFunction<                           \
+            ::caffe2::detail::call_caffe2_op_from_c10<                       \
+                ::caffe2::_c10_ops::schema_##OperatorName,                   \
+                OperatorClass>>());                                          \
+  }

-#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(                                     \
-    OperatorName, OperatorSchema, OperatorClass)                             \
-  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema)      \
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU(          \
+    OperatorName, OperatorSchema, OperatorClass)  \
+  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(        \
+      OperatorName, OperatorSchema, c10::nullopt) \
+  C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
+
+#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_WITH_ALIAS_ANALYSIS(                \
+    OperatorName, OperatorSchema, OperatorClass, OptionalAliasAnalysisKind) \
+  C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(                                  \
+      OperatorName, OperatorSchema, OptionalAliasAnalysisKind)              \
  C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)

 #define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass)        \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-    TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) {                                   \
-        m.impl("_caffe2::" #OperatorName,                                    \
-            torch::CppFunction::makeFromBoxedFunction<                       \
-                ::caffe2::detail::call_caffe2_op_from_c10<                   \
-                    ::caffe2::_c10_ops::schema_##OperatorName,               \
-                    OperatorClass>>());                                      \
-    }
-
+  TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) {                                     \
+    m.impl(                                                                  \
+        "_caffe2::" #OperatorName,                                           \
+        torch::CppFunction::makeFromBoxedFunction<                           \
+            ::caffe2::detail::call_caffe2_op_from_c10<                       \
+                ::caffe2::_c10_ops::schema_##OperatorName,                   \
+                OperatorClass>>());                                          \
+  }

 // You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro .
 // The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically
 // rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify .
 #define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass)         \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
-    TORCH_LIBRARY_IMPL(_caffe2, HIP, m) {                                    \
-        m.impl("_caffe2::" #OperatorName,                                    \
-            torch::CppFunction::makeFromBoxedFunction<                       \
-                ::caffe2::detail::call_caffe2_op_from_c10<                   \
-                    ::caffe2::_c10_ops::schema_##OperatorName,               \
-                    OperatorClass>>());                                      \
-    }
-
+  TORCH_LIBRARY_IMPL(_caffe2, HIP, m) {                                      \
+    m.impl(                                                                  \
+        "_caffe2::" #OperatorName,                                           \
+        torch::CppFunction::makeFromBoxedFunction<                           \
+            ::caffe2::detail::call_caffe2_op_from_c10<                       \
+                ::caffe2::_c10_ops::schema_##OperatorName,                   \
+                OperatorClass>>());                                          \
+  }

 #else
 // Don't use c10 dispatcher on mobile because of binary size
 #define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)
-#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema)
+#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \
+    OperatorName, OperatorSchema, OptionalAliasAnalysisKind)
 #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
 #define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \
    OperatorName, OperatorSchema, OperatorClass)
--- a/caffe2/operators/copy_op.cc
+++ b/caffe2/operators/copy_op.cc
@ -200,8 +200,10 @@ REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);

 C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(
    CopyGPUToCPU,
-    "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor");
+    "_caffe2::CopyGPUToCPU(Tensor input) -> Tensor",
+    /*optional_alias_analysis_kind=*/c10::nullopt);

 C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(
    CopyCPUToGPU,
-    "_caffe2::CopyCPUToGPU(Tensor input) -> Tensor");
+    "_caffe2::CopyCPUToGPU(Tensor input) -> Tensor",
+    /*optional_alias_analysis_kind=*/c10::nullopt);