Make C10_MOBILE consistent with how feature macros are usually used (#17481)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/17481

Usually, feature macros are either defined or undefined and checked accordingly.
C10_MOBILE was a weird special case that was always defined but either defined to 1 or to 0.

This caused a lot of confusion for me when trying to disable something from mobile build and it also disabled it
from the server build (because I was using ifdef). Also, I found a place in the existing code base that made
that wrong assumption and used the macro wrongly, see https://fburl.com/y4icohts

Reviewed By: dzhulgakov

Differential Revision: D14214825

fbshipit-source-id: f3a155b6d43d334e8839e2b2e3c40ed2c773eab6
This commit is contained in:
Sebastian Messmer
2019-02-27 17:54:51 -08:00
committed by Facebook Github Bot
parent 7c5ffc4120
commit 6706e9af19
20 changed files with 36 additions and 51 deletions

View File

@ -26,7 +26,7 @@ namespace at {
/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting,
/// thread_local is not supported. In that case, we don't provide
/// `at::NonVariableTypeMode`.
#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
thread_local bool NonVariableTypeMode_enabled = false;
@ -38,7 +38,7 @@ void NonVariableTypeMode::set_enabled(bool enabled) {
NonVariableTypeMode_enabled = enabled;
}
#else // C10_MOBILE || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
#else // defined(C10_MOBILE) || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
bool NonVariableTypeMode::is_enabled() {
throw std::runtime_error("NonVariableTypeMode is not supported on mobile");

View File

@ -8,7 +8,7 @@
// To explicitly use interned strings as symbols in your code, you must add
// them to this list.
#if !C10_MOBILE
#ifndef C10_MOBILE
#define FORALL_ATEN_BASE_SYMBOLS(_) \
_(aten, __and__) \
_(aten, __iand__) \

View File

@ -10,7 +10,7 @@
namespace c10 {
#if !C10_MOBILE
#ifndef C10_MOBILE
#define FORALL_NS_SYMBOLS(_) \
_(namespaces, prim) \
_(namespaces, aten) \

View File

@ -156,7 +156,7 @@ auto ConvParams::use_nnpack(const at::Tensor& input) const -> bool {
!is_dilated() && // or dilation
!transposed && // or transposed tensors
input.ndimension() == 4 // must be in NCHW format
#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
&& input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
#endif
;

View File

@ -110,8 +110,8 @@ namespace at { namespace cuda { using namespace c10::hip; }}
#define C10_DEVICE __device__
#define C10_HOST __host__
// constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
// Hence, the following macros adjust the input values from the user to resolve potential warnings.
#if __CUDA_ARCH__ >= 750
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
@ -121,17 +121,17 @@ constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently
constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block size.
// 256 is a good number for this fallback and should give good occupancy and
// 256 is a good number for this fallback and should give good occupancy and
// versatility across all architectures.
constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it
// turns out that although __launch_bounds__ can take constexpr, it
// can't take a constexpr that has anything to do with templates.
// Currently we use launch_bounds that depend on template arguments in
// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
// turns out that although __launch_bounds__ can take constexpr, it
// can't take a constexpr that has anything to do with templates.
// Currently we use launch_bounds that depend on template arguments in
// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
// C10_MIN_BLOCKS_PER_SM are kept as macros.
// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
// which will also properly respect limits on old architectures.
#define C10_MAX_THREADS_PER_BLOCK(val) (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) : CUDA_THREADS_PER_BLOCK_FALLBACK)
#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) ((((threads_per_block)*(blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) ? (blocks_per_sm) : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / (threads_per_block))))
@ -164,9 +164,6 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#define C10_MOBILE 1
#elif (defined(__APPLE__) && TARGET_OS_MAC)
#define C10_IOS 1
#define C10_MOBILE 0
#else
#define C10_MOBILE 0
#endif // ANDROID / IOS / MACOS
// Portably determine if a type T is trivially copyable or not.

View File

@ -2,7 +2,7 @@
C10_DEFINE_bool(caffe2_cpu_numa_enabled, false, "Use NUMA whenever possible.");
#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && C10_MOBILE == 0
#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && !defined(C10_MOBILE)
#include <numa.h>
#include <numaif.h>
#include <unistd.h>

View File

@ -133,7 +133,7 @@ inline c10::FunctionSchema make_function_schema_for_c10(const char* OperatorName
* - calling C10_REGISTER_CAFFE2_OPERATOR_CUDA is optional and can be omitted if
* you don't want to expose the operator for CUDA operations.
*/
#if !C10_MOBILE
#ifndef C10_MOBILE
#define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \
namespace caffe2 { \
namespace _c10_ops { \

View File

@ -180,7 +180,7 @@ C10_DECLARE_REGISTRY(
Workspace*);
// TODO Also register c10 operators on mobile
#if !C10_MOBILE
#ifndef C10_MOBILE
// TODO Currently we only register the CPU variant. This is going to be fixed
// once the tensor detemplatization lands.
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OperatorHandle, Name, NumOutputParameters) \

View File

@ -2,7 +2,7 @@
#include "caffe2/core/flags.h"
#include "caffe2/core/tensor.h"
#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
#endif
@ -14,7 +14,7 @@ Caffe2IOSPredictor* Caffe2IOSPredictor::NewCaffe2IOSPredictor(const caffe2::NetD
bool allowMetalOperators) {
caffe2::NetDef metal_predict_net;
bool usingMetalOperators = false;
#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
if (allowMetalOperators) {
caffe2::dumpDef(predict_net);
if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
@ -38,7 +38,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
bool disableMultithreadProcessing,
bool usingMetalOperators)
: usingMetalOperators(usingMetalOperators), predictor_(init_net, predict_net) {
#if C10_MOBILE
#ifdef C10_MOBILE
if (disableMultithreadProcessing) {
caffe2::ThreadPool* threadpool = predictor_.ws()->GetThreadPool();
if (threadpool != nullptr) {

View File

@ -1,7 +1,7 @@
#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"

View File

@ -1,7 +1,7 @@
#include "caffe2/core/common.h"
#if C10_MOBILE
#ifdef C10_MOBILE
#include "mpscnn_context.h"
#include "mpscnn_kernels.h"

View File

@ -1,6 +1,6 @@
#include "caffe2/core/common.h"
#if C10_MOBILE && defined(CAFFE2_USE_MPSCNN_TEST)
#if defined(C10_MOBILE) && defined(CAFFE2_USE_MPSCNN_TEST)
#include "mpscnn_context.h"
#include "mpscnn_graph_mask.h"

View File

@ -280,7 +280,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
// r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
// }
state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
#if C10_MOBILE
#ifdef C10_MOBILE
ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
#else
for (size_t v = 0; v < range; ++v) {

View File

@ -6,7 +6,7 @@
#include "caffe2/utils/map_utils.h"
#include "caffe2/utils/proto_utils.h"
#if !C10_MOBILE
#ifndef C10_MOBILE
#include "onnx/checker.h"
#include "onnx/optimizer/optimize.h"
#endif
@ -69,7 +69,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) {
return d;
}
#if !C10_MOBILE
#ifndef C10_MOBILE
ModelProto OptimizeOnnx(const ModelProto& input, bool init) {
std::vector<std::string> passes{"fuse_consecutive_transposes",
"eliminate_nop_transpose",
@ -1422,7 +1422,7 @@ void Caffe2Backend::OnnxToCaffe2(
const std::vector<Caffe2Ops>& extras) {
auto device_option = GetDeviceOption(Device(device));
#if !C10_MOBILE
#ifndef C10_MOBILE
ModelProto init_model = OptimizeOnnx(onnx_model, true);
ModelProto pred_model = OptimizeOnnx(onnx_model, false);
#else
@ -1526,7 +1526,7 @@ Caffe2BackendRep* Caffe2Backend::Prepare(
ModelProto onnx_model;
ParseProtoFromLargeString(onnx_model_str, &onnx_model);
#if !C10_MOBILE
#ifndef C10_MOBILE
::ONNX_NAMESPACE::checker::check_model(onnx_model);
#endif

View File

@ -3,11 +3,7 @@
namespace caffe2 {
#ifndef C10_MOBILE
#error "mobile build state not defined"
#endif
#if C10_MOBILE
#ifdef C10_MOBILE
// mobile-only implementation (tiled + vectorized + multithreaded)
REGISTER_CPU_OPERATOR_WITH_ENGINE(
ConvTranspose,

View File

@ -3,11 +3,7 @@
#include "caffe2/core/common.h"
#ifndef C10_MOBILE
#error "mobile build state not defined"
#endif
#if C10_MOBILE
#ifdef C10_MOBILE
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"

View File

@ -5,11 +5,7 @@
#include "caffe2/core/common.h"
#ifndef C10_MOBILE
#error "mobile build state not defined"
#endif
#if C10_MOBILE
#ifdef C10_MOBILE
#include "caffe2/core/logging.h"
#include "caffe2/operators/conv_op_shared.h"

View File

@ -71,7 +71,7 @@ PredictorConfig makePredictorConfig(
if (run_init) {
CAFFE_ENFORCE(ws.RunNetOnce(init_net));
}
#if C10_MOBILE
#ifdef C10_MOBILE
GlobalInit();
#endif
if (optimization &&

View File

@ -489,7 +489,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
Timer t;
#if C10_MOBILE
#ifdef C10_MOBILE
ws_->GetThreadPool()->run(
[&](int, int n_g) {
const int g = n_g / N;

View File

@ -1,6 +1,6 @@
#include "observers/perf_observer.h"
#include "observers/observer_config.h"
#if !C10_MOBILE
#ifndef C10_MOBILE
#include "caffe2/core/flags.h"
#include "observers/net_observer_reporter_print.h"
#endif
@ -10,7 +10,7 @@
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#if !C10_MOBILE
#ifndef C10_MOBILE
C10_DEFINE_int64(
aiBench_netInitSampleRate,
0,
@ -45,7 +45,7 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
return caffe2::make_unique<PerfNetObserver>(subject);
});
#if !C10_MOBILE
#if !defined(C10_MOBILE)
// for aibench usage
caffe2::ObserverConfig::setReporter(
caffe2::make_unique<caffe2::NetObserverReporterPrint>());