[caffe2] Fix alias analysis for quantization compression ops (#74169)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74169

Alias DB was being way too conservative about the semantics of exported Caffe2 ops - it thought some pure functions were writing to their inputs, which caused `ReplaceWithMaybeCopy` to fail. This in turn led to a huge decrease in out variant coverage and regressions in many models.

I've extended the export macro to let the user specify an `AliasAnalysisKind` and marked all of the quantization compression ops as pure functions.
ghstack-source-id: 151394133

Reviewed By: hlu1

Differential Revision: D34733630

fbshipit-source-id: e968812e052f14261c10f9a280abe1d910de1f2f
(cherry picked from commit 5e9de49b98caff57be13e8bd101144ae2475b6b5)
This commit is contained in:
Mike Iovine
2022-03-15 14:41:50 -07:00
committed by PyTorch MergeBot
parent ddb34e7b6a
commit 6bd4376c60
2 changed files with 71 additions and 50 deletions

View File

@ -4,12 +4,13 @@
#if defined(EXPOSE_C2_OPS) || \
!defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
#include <ATen/core/dispatch/OperatorOptions.h>
#include <ATen/core/function_schema.h>
#include <ATen/core/grad_mode.h>
#include <ATen/core/op_registration/op_registration.h>
#include <torch/csrc/jit/frontend/function_schema_parser.h>
#include <c10/core/CompileTimeFunctionPointer.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/frontend/function_schema_parser.h>
#include <torch/library.h>
#include <vector>
@ -113,7 +114,9 @@ void call_caffe2_op_from_c10(
_call_caffe2_op_from_c10(stack, Schema(), &_call_caffe2_op<Caffe2Operator>);
}
inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
inline FunctionSchema make_function_schema_for_c10(
const char* schema_str,
c10::optional<c10::AliasAnalysisKind> optional_alias_analysis_kind) {
#if !defined(EXPOSE_C2_OPS) && \
(defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE))
throw std::logic_error(
@ -127,13 +130,17 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
nullopt,
IValue());
return FunctionSchema(
auto schema = FunctionSchema(
parsed_schema.name(),
parsed_schema.overload_name(),
std::move(arguments),
parsed_schema.returns(),
parsed_schema.is_vararg(),
parsed_schema.is_varret());
if (optional_alias_analysis_kind) {
schema.setAliasAnalysis(*optional_alias_analysis_kind);
}
return schema;
#endif
}
@ -169,7 +176,7 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
* caffe2.
* - all operators must call C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10 and
* C10_EXPORT_CAFFE2_OP_TO_C10_CPU .
* - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted i f
* - calling C10_EXPORT_CAFFE2_OP_TO_C10_CUDA is optional and can be omitted if
* you don't want to expose the operator for CUDA operations.
* - caffe2 arguments must come after caffe2 inputs, in other words, any tensor
* inputs must precede any non-tensor inputs.
@ -178,73 +185,85 @@ inline FunctionSchema make_function_schema_for_c10(const char* schema_str) {
* - If your operator has a variable number of input tensors, make the first (!)
* input an input of type TensorList. There must be no other tensor inputs.
*/
#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) \
namespace caffe2 { \
namespace _c10_ops { \
#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName) \
namespace caffe2 { \
namespace _c10_ops { \
TORCH_API const FunctionSchema& schema_##OperatorName(); \
} \
} \
}
#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \
/* Register the op schema with the c10 dispatcher */ \
namespace caffe2 { \
namespace _c10_ops { \
C10_EXPORT const FunctionSchema& schema_##OperatorName() { \
static const FunctionSchema schema = \
::caffe2::detail::make_function_schema_for_c10(OperatorSchema); \
return schema; \
} \
TORCH_LIBRARY_FRAGMENT(_caffe2, m) { \
m.def(::caffe2::detail::make_function_schema_for_c10(OperatorSchema)); \
} \
} \
#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \
OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \
/* Register the op schema with the c10 dispatcher */ \
namespace caffe2 { \
namespace _c10_ops { \
C10_EXPORT const FunctionSchema& schema_##OperatorName() { \
static const FunctionSchema schema = \
::caffe2::detail::make_function_schema_for_c10( \
OperatorSchema, OptionalAliasAnalysisKind); \
return schema; \
} \
TORCH_LIBRARY_FRAGMENT(_caffe2, m) { \
m.def(::caffe2::detail::make_function_schema_for_c10( \
OperatorSchema, OptionalAliasAnalysisKind)); \
} \
} \
}
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY( \
OperatorName, OperatorClass) \
/* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
TORCH_LIBRARY_IMPL(_caffe2, CPU, m) { \
m.impl("_caffe2::" #OperatorName, \
torch::CppFunction::makeFromBoxedFunction< \
::caffe2::detail::call_caffe2_op_from_c10< \
::caffe2::_c10_ops::schema_##OperatorName, \
OperatorClass>>()); \
}
TORCH_LIBRARY_IMPL(_caffe2, CPU, m) { \
m.impl( \
"_caffe2::" #OperatorName, \
torch::CppFunction::makeFromBoxedFunction< \
::caffe2::detail::call_caffe2_op_from_c10< \
::caffe2::_c10_ops::schema_##OperatorName, \
OperatorClass>>()); \
}
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \
OperatorName, OperatorSchema, OperatorClass) \
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema) \
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \
OperatorName, OperatorSchema, OperatorClass) \
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \
OperatorName, OperatorSchema, c10::nullopt) \
C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_WITH_ALIAS_ANALYSIS( \
OperatorName, OperatorSchema, OperatorClass, OptionalAliasAnalysisKind) \
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \
OperatorName, OperatorSchema, OptionalAliasAnalysisKind) \
C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
#define C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(OperatorName, OperatorClass) \
/* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) { \
m.impl("_caffe2::" #OperatorName, \
torch::CppFunction::makeFromBoxedFunction< \
::caffe2::detail::call_caffe2_op_from_c10< \
::caffe2::_c10_ops::schema_##OperatorName, \
OperatorClass>>()); \
}
TORCH_LIBRARY_IMPL(_caffe2, CUDA, m) { \
m.impl( \
"_caffe2::" #OperatorName, \
torch::CppFunction::makeFromBoxedFunction< \
::caffe2::detail::call_caffe2_op_from_c10< \
::caffe2::_c10_ops::schema_##OperatorName, \
OperatorClass>>()); \
}
// You should never manually call the C10_EXPORT_CAFFE2_OP_TO_C10_HIP macro.
// The C10_EXPORT_CAFFE2_OP_TO_C10_CUDA macro from above will be automatically
// rewritten to C10_EXPORT_CAFFE2_OP_TO_C10_HIP by hipify.
#define C10_EXPORT_CAFFE2_OP_TO_C10_HIP(OperatorName, OperatorClass) \
/* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
TORCH_LIBRARY_IMPL(_caffe2, HIP, m) { \
m.impl("_caffe2::" #OperatorName, \
torch::CppFunction::makeFromBoxedFunction< \
::caffe2::detail::call_caffe2_op_from_c10< \
::caffe2::_c10_ops::schema_##OperatorName, \
OperatorClass>>()); \
}
TORCH_LIBRARY_IMPL(_caffe2, HIP, m) { \
m.impl( \
"_caffe2::" #OperatorName, \
torch::CppFunction::makeFromBoxedFunction< \
::caffe2::detail::call_caffe2_op_from_c10< \
::caffe2::_c10_ops::schema_##OperatorName, \
OperatorClass>>()); \
}
#else
// Don't use c10 dispatcher on mobile because of binary size
#define C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(OperatorName)
#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(OperatorName, OperatorSchema)
#define C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY( \
OperatorName, OperatorSchema, OptionalAliasAnalysisKind)
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU_KERNEL_ONLY(OperatorName, OperatorClass)
#define C10_EXPORT_CAFFE2_OP_TO_C10_CPU( \
OperatorName, OperatorSchema, OperatorClass)

View File

@ -200,8 +200,10 @@ REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(
CopyGPUToCPU,
"_caffe2::CopyGPUToCPU(Tensor input) -> Tensor");
"_caffe2::CopyGPUToCPU(Tensor input) -> Tensor",
/*optional_alias_analysis_kind=*/c10::nullopt);
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(
CopyCPUToGPU,
"_caffe2::CopyCPUToGPU(Tensor input) -> Tensor");
"_caffe2::CopyCPUToGPU(Tensor input) -> Tensor",
/*optional_alias_analysis_kind=*/c10::nullopt);