Make C10_MOBILE consistent with how feature macros are usually used (#17481)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17481 Usually, feature macros are either defined or undefined and checked accordingly. C10_MOBILE was a weird special case that was always defined but either defined to 1 or to 0. This caused a lot of confusion for me when trying to disable something from mobile build and it also disabled it from the server build (because I was using ifdef). Also, I found a place in the existing code base that made that wrong assumption and used the macro wrongly, see https://fburl.com/y4icohts Reviewed By: dzhulgakov Differential Revision: D14214825 fbshipit-source-id: f3a155b6d43d334e8839e2b2e3c40ed2c773eab6
2025-10-20 21:14:14 +08:00 · 2019-02-27 17:54:51 -08:00
parent 7c5ffc4120
commit 6706e9af19
20 changed files with 36 additions and 51 deletions
--- a/aten/src/ATen/core/LegacyTypeDispatch.cpp
+++ b/aten/src/ATen/core/LegacyTypeDispatch.cpp
@ -26,7 +26,7 @@ namespace at {
 /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting,
 /// thread_local is not supported. In that case, we don't provide
 /// `at::NonVariableTypeMode`.
-#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
+#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)

 thread_local bool NonVariableTypeMode_enabled = false;

@ -38,7 +38,7 @@ void NonVariableTypeMode::set_enabled(bool enabled) {
  NonVariableTypeMode_enabled = enabled;
 }

-#else // C10_MOBILE || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
+#else // defined(C10_MOBILE) || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)

 bool NonVariableTypeMode::is_enabled() {
  throw std::runtime_error("NonVariableTypeMode is not supported on mobile");
--- a/aten/src/ATen/core/aten_interned_strings.h
+++ b/aten/src/ATen/core/aten_interned_strings.h
@ -8,7 +8,7 @@
 // To explicitly use interned strings as symbols in your code, you must add
 // them to this list.

-#if !C10_MOBILE
+#ifndef C10_MOBILE
 #define FORALL_ATEN_BASE_SYMBOLS(_) \
 _(aten, __and__) \
 _(aten, __iand__) \
--- a/aten/src/ATen/core/interned_strings.h
+++ b/aten/src/ATen/core/interned_strings.h
@ -10,7 +10,7 @@

 namespace c10 {

-#if !C10_MOBILE
+#ifndef C10_MOBILE
 #define FORALL_NS_SYMBOLS(_)       \
  _(namespaces, prim)              \
  _(namespaces, aten)              \
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -156,7 +156,7 @@ auto ConvParams::use_nnpack(const at::Tensor& input) const -> bool {
         !is_dilated() && // or dilation
         !transposed &&   // or transposed tensors
         input.ndimension() == 4 // must be in NCHW format
-#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
+#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
         && input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
 #endif
     ;
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@ -110,8 +110,8 @@ namespace at { namespace cuda { using namespace c10::hip; }}
 #define C10_DEVICE __device__
 #define C10_HOST __host__
 // constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
-// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5) 
-// but 2048 for previous architectures. You'll get warnings if you exceed these constants. 
+// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
+// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
 // Hence, the following macros adjust the input values from the user to resolve potential warnings.
 #if __CUDA_ARCH__ >= 750
 constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
@ -121,17 +121,17 @@ constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
 // CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently
 constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
 // CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block size.
-// 256 is a good number for this fallback and should give good occupancy and 
+// 256 is a good number for this fallback and should give good occupancy and
 // versatility across all architectures.
 constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 // NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it
-//       turns out that although __launch_bounds__ can take constexpr, it 
-//       can't take a constexpr that has anything to do with templates. 
-//       Currently we use launch_bounds that depend on template arguments in 
-//       Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and 
+//       turns out that although __launch_bounds__ can take constexpr, it
+//       can't take a constexpr that has anything to do with templates.
+//       Currently we use launch_bounds that depend on template arguments in
+//       Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
 //       C10_MIN_BLOCKS_PER_SM are kept as macros.
-// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU. 
-// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), 
+// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
+// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
 // which will also properly respect limits on old architectures.
 #define C10_MAX_THREADS_PER_BLOCK(val) (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) : CUDA_THREADS_PER_BLOCK_FALLBACK)
 #define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) ((((threads_per_block)*(blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) ? (blocks_per_sm) : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / (threads_per_block))))
@ -164,9 +164,6 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
 #define C10_MOBILE 1
 #elif (defined(__APPLE__) && TARGET_OS_MAC)
 #define C10_IOS 1
-#define C10_MOBILE 0
-#else
-#define C10_MOBILE 0
 #endif // ANDROID / IOS / MACOS

 // Portably determine if a type T is trivially copyable or not.
--- a/c10/util/numa.cpp
+++ b/c10/util/numa.cpp
@ -2,7 +2,7 @@

 C10_DEFINE_bool(caffe2_cpu_numa_enabled, false, "Use NUMA whenever possible.");

-#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && C10_MOBILE == 0
+#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && !defined(C10_MOBILE)
 #include <numa.h>
 #include <numaif.h>
 #include <unistd.h>
--- a/caffe2/core/c10_operator.h
+++ b/caffe2/core/c10_operator.h
@ -133,7 +133,7 @@ inline c10::FunctionSchema make_function_schema_for_c10(const char* OperatorName
 * - calling C10_REGISTER_CAFFE2_OPERATOR_CUDA is optional and can be omitted if
 * you don't want to expose the operator for CUDA operations.
 */
-#if !C10_MOBILE
+#ifndef C10_MOBILE
 #define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \
  namespace caffe2 {                              \
  namespace _c10_ops {                            \
--- a/caffe2/core/operator_c10wrapper.h
+++ b/caffe2/core/operator_c10wrapper.h
@ -180,7 +180,7 @@ C10_DECLARE_REGISTRY(
    Workspace*);

 // TODO Also register c10 operators on mobile
-#if !C10_MOBILE
+#ifndef C10_MOBILE
 // TODO Currently we only register the CPU variant. This is going to be fixed
 //      once the tensor detemplatization lands.
 #define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OperatorHandle, Name, NumOutputParameters)  \
--- a/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
+++ b/caffe2/mobile/contrib/ios/ios_caffe_predictor.cc
@ -2,7 +2,7 @@
 #include "caffe2/core/flags.h"
 #include "caffe2/core/tensor.h"

-#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
+#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
 #include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
 #endif

@ -14,7 +14,7 @@ Caffe2IOSPredictor* Caffe2IOSPredictor::NewCaffe2IOSPredictor(const caffe2::NetD
                                                              bool allowMetalOperators) {
  caffe2::NetDef metal_predict_net;
  bool usingMetalOperators = false;
-#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
+#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
  if (allowMetalOperators) {
    caffe2::dumpDef(predict_net);
    if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
@ -38,7 +38,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
                                       bool disableMultithreadProcessing,
                                       bool usingMetalOperators)
    : usingMetalOperators(usingMetalOperators), predictor_(init_net, predict_net) {
-#if C10_MOBILE
+#ifdef C10_MOBILE
  if (disableMultithreadProcessing) {
    caffe2::ThreadPool* threadpool = predictor_.ws()->GetThreadPool();
    if (threadpool != nullptr) {
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn.mm
@ -1,7 +1,7 @@
 #include "caffe2/core/common.h"
 #include "caffe2/core/context.h"

-#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
+#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)

 #include "caffe2/core/operator.h"
 #include "caffe2/core/timer.h"
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_context.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_context.mm
@ -1,7 +1,7 @@

 #include "caffe2/core/common.h"

-#if C10_MOBILE
+#ifdef C10_MOBILE

 #include "mpscnn_context.h"
 #include "mpscnn_kernels.h"
--- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
+++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_test.mm
@ -1,6 +1,6 @@
 #include "caffe2/core/common.h"

-#if C10_MOBILE && defined(CAFFE2_USE_MPSCNN_TEST)
+#if defined(C10_MOBILE) && defined(CAFFE2_USE_MPSCNN_TEST)

 #include "mpscnn_context.h"
 #include "mpscnn_graph_mask.h"
--- a/caffe2/mobile/contrib/ulp2/ulp.cc
+++ b/caffe2/mobile/contrib/ulp2/ulp.cc
@ -280,7 +280,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
  //   r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
  // }
  state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
-#if C10_MOBILE
+#ifdef C10_MOBILE
    ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
 #else
    for (size_t v = 0; v < range; ++v) {
--- a/caffe2/onnx/backend.cc
+++ b/caffe2/onnx/backend.cc
@ -6,7 +6,7 @@
 #include "caffe2/utils/map_utils.h"
 #include "caffe2/utils/proto_utils.h"

-#if !C10_MOBILE
+#ifndef C10_MOBILE
 #include "onnx/checker.h"
 #include "onnx/optimizer/optimize.h"
 #endif
@ -69,7 +69,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) {
  return d;
 }

-#if !C10_MOBILE
+#ifndef C10_MOBILE
 ModelProto OptimizeOnnx(const ModelProto& input, bool init) {
  std::vector<std::string> passes{"fuse_consecutive_transposes",
                                  "eliminate_nop_transpose",
@ -1422,7 +1422,7 @@ void Caffe2Backend::OnnxToCaffe2(
    const std::vector<Caffe2Ops>& extras) {
  auto device_option = GetDeviceOption(Device(device));

-#if !C10_MOBILE
+#ifndef C10_MOBILE
  ModelProto init_model = OptimizeOnnx(onnx_model, true);
  ModelProto pred_model = OptimizeOnnx(onnx_model, false);
 #else
@ -1526,7 +1526,7 @@ Caffe2BackendRep* Caffe2Backend::Prepare(
  ModelProto onnx_model;
  ParseProtoFromLargeString(onnx_model_str, &onnx_model);

-#if !C10_MOBILE
+#ifndef C10_MOBILE
  ::ONNX_NAMESPACE::checker::check_model(onnx_model);
 #endif

--- a/caffe2/operators/conv_transpose_op_mobile.cc
+++ b/caffe2/operators/conv_transpose_op_mobile.cc
@ -3,11 +3,7 @@

 namespace caffe2 {

-#ifndef C10_MOBILE
-#error "mobile build state not defined"
-#endif
-
-#if C10_MOBILE
+#ifdef C10_MOBILE
 // mobile-only implementation (tiled + vectorized + multithreaded)
 REGISTER_CPU_OPERATOR_WITH_ENGINE(
    ConvTranspose,
--- a/caffe2/operators/conv_transpose_op_mobile.h
+++ b/caffe2/operators/conv_transpose_op_mobile.h
@ -3,11 +3,7 @@

 #include "caffe2/core/common.h"

-#ifndef C10_MOBILE
-#error "mobile build state not defined"
-#endif
-
-#if C10_MOBILE
+#ifdef C10_MOBILE

 #include "caffe2/core/context.h"
 #include "caffe2/core/operator.h"
--- a/caffe2/operators/conv_transpose_op_mobile_impl.h
+++ b/caffe2/operators/conv_transpose_op_mobile_impl.h
@ -5,11 +5,7 @@

 #include "caffe2/core/common.h"

-#ifndef C10_MOBILE
-#error "mobile build state not defined"
-#endif
-
-#if C10_MOBILE
+#ifdef C10_MOBILE

 #include "caffe2/core/logging.h"
 #include "caffe2/operators/conv_op_shared.h"
--- a/caffe2/predictor/predictor_config.cc
+++ b/caffe2/predictor/predictor_config.cc
@ -71,7 +71,7 @@ PredictorConfig makePredictorConfig(
  if (run_init) {
    CAFFE_ENFORCE(ws.RunNetOnce(init_net));
  }
-#if C10_MOBILE
+#ifdef C10_MOBILE
  GlobalInit();
 #endif
  if (optimization &&
--- a/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
+++ b/caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc
@ -489,7 +489,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {

    Timer t;

-#if C10_MOBILE
+#ifdef C10_MOBILE
    ws_->GetThreadPool()->run(
        [&](int, int n_g) {
          const int g = n_g / N;
--- a/modules/observers/perf_observer.cc
+++ b/modules/observers/perf_observer.cc
@ -1,6 +1,6 @@
 #include "observers/perf_observer.h"
 #include "observers/observer_config.h"
-#if !C10_MOBILE
+#ifndef C10_MOBILE
 #include "caffe2/core/flags.h"
 #include "observers/net_observer_reporter_print.h"
 #endif
@ -10,7 +10,7 @@
 #include "caffe2/core/init.h"
 #include "caffe2/core/operator.h"

-#if !C10_MOBILE
+#ifndef C10_MOBILE
 C10_DEFINE_int64(
    aiBench_netInitSampleRate,
    0,
@ -45,7 +45,7 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
    return caffe2::make_unique<PerfNetObserver>(subject);
  });

-#if !C10_MOBILE
+#if !defined(C10_MOBILE)
  // for aibench usage
  caffe2::ObserverConfig::setReporter(
      caffe2::make_unique<caffe2::NetObserverReporterPrint>());