mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Make C10_MOBILE consistent with how feature macros are usually used (#17481)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/17481 Usually, feature macros are either defined or undefined and checked accordingly. C10_MOBILE was a weird special case that was always defined but either defined to 1 or to 0. This caused a lot of confusion for me when trying to disable something from mobile build and it also disabled it from the server build (because I was using ifdef). Also, I found a place in the existing code base that made that wrong assumption and used the macro wrongly, see https://fburl.com/y4icohts Reviewed By: dzhulgakov Differential Revision: D14214825 fbshipit-source-id: f3a155b6d43d334e8839e2b2e3c40ed2c773eab6
This commit is contained in:
committed by
Facebook Github Bot
parent
7c5ffc4120
commit
6706e9af19
@ -26,7 +26,7 @@ namespace at {
|
||||
/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting,
|
||||
/// thread_local is not supported. In that case, we don't provide
|
||||
/// `at::NonVariableTypeMode`.
|
||||
#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
|
||||
thread_local bool NonVariableTypeMode_enabled = false;
|
||||
|
||||
@ -38,7 +38,7 @@ void NonVariableTypeMode::set_enabled(bool enabled) {
|
||||
NonVariableTypeMode_enabled = enabled;
|
||||
}
|
||||
|
||||
#else // C10_MOBILE || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
#else // defined(C10_MOBILE) || defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
|
||||
bool NonVariableTypeMode::is_enabled() {
|
||||
throw std::runtime_error("NonVariableTypeMode is not supported on mobile");
|
||||
|
@ -8,7 +8,7 @@
|
||||
// To explicitly use interned strings as symbols in your code, you must add
|
||||
// them to this list.
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
#define FORALL_ATEN_BASE_SYMBOLS(_) \
|
||||
_(aten, __and__) \
|
||||
_(aten, __iand__) \
|
||||
|
@ -10,7 +10,7 @@
|
||||
|
||||
namespace c10 {
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
#define FORALL_NS_SYMBOLS(_) \
|
||||
_(namespaces, prim) \
|
||||
_(namespaces, aten) \
|
||||
|
@ -156,7 +156,7 @@ auto ConvParams::use_nnpack(const at::Tensor& input) const -> bool {
|
||||
!is_dilated() && // or dilation
|
||||
!transposed && // or transposed tensors
|
||||
input.ndimension() == 4 // must be in NCHW format
|
||||
#if !C10_MOBILE && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
#if !defined(C10_MOBILE) && !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
&& input.size(0) >= 16 // ensure large enough batch size to ensure perf, tuneable
|
||||
#endif
|
||||
;
|
||||
|
@ -110,8 +110,8 @@ namespace at { namespace cuda { using namespace c10::hip; }}
|
||||
#define C10_DEVICE __device__
|
||||
#define C10_HOST __host__
|
||||
// constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
|
||||
// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
|
||||
// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
|
||||
// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
|
||||
// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
|
||||
// Hence, the following macros adjust the input values from the user to resolve potential warnings.
|
||||
#if __CUDA_ARCH__ >= 750
|
||||
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
|
||||
@ -121,17 +121,17 @@ constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
|
||||
// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently
|
||||
constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
|
||||
// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block size.
|
||||
// 256 is a good number for this fallback and should give good occupancy and
|
||||
// 256 is a good number for this fallback and should give good occupancy and
|
||||
// versatility across all architectures.
|
||||
constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
|
||||
// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it
|
||||
// turns out that although __launch_bounds__ can take constexpr, it
|
||||
// can't take a constexpr that has anything to do with templates.
|
||||
// Currently we use launch_bounds that depend on template arguments in
|
||||
// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
|
||||
// turns out that although __launch_bounds__ can take constexpr, it
|
||||
// can't take a constexpr that has anything to do with templates.
|
||||
// Currently we use launch_bounds that depend on template arguments in
|
||||
// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
|
||||
// C10_MIN_BLOCKS_PER_SM are kept as macros.
|
||||
// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
|
||||
// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
|
||||
// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
|
||||
// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
|
||||
// which will also properly respect limits on old architectures.
|
||||
#define C10_MAX_THREADS_PER_BLOCK(val) (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) : CUDA_THREADS_PER_BLOCK_FALLBACK)
|
||||
#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) ((((threads_per_block)*(blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) ? (blocks_per_sm) : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / (threads_per_block))))
|
||||
@ -164,9 +164,6 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
|
||||
#define C10_MOBILE 1
|
||||
#elif (defined(__APPLE__) && TARGET_OS_MAC)
|
||||
#define C10_IOS 1
|
||||
#define C10_MOBILE 0
|
||||
#else
|
||||
#define C10_MOBILE 0
|
||||
#endif // ANDROID / IOS / MACOS
|
||||
|
||||
// Portably determine if a type T is trivially copyable or not.
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
C10_DEFINE_bool(caffe2_cpu_numa_enabled, false, "Use NUMA whenever possible.");
|
||||
|
||||
#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && C10_MOBILE == 0
|
||||
#if defined(__linux__) && !defined(C10_DISABLE_NUMA) && !defined(C10_MOBILE)
|
||||
#include <numa.h>
|
||||
#include <numaif.h>
|
||||
#include <unistd.h>
|
||||
|
@ -133,7 +133,7 @@ inline c10::FunctionSchema make_function_schema_for_c10(const char* OperatorName
|
||||
* - calling C10_REGISTER_CAFFE2_OPERATOR_CUDA is optional and can be omitted if
|
||||
* you don't want to expose the operator for CUDA operations.
|
||||
*/
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
#define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \
|
||||
namespace caffe2 { \
|
||||
namespace _c10_ops { \
|
||||
|
@ -180,7 +180,7 @@ C10_DECLARE_REGISTRY(
|
||||
Workspace*);
|
||||
|
||||
// TODO Also register c10 operators on mobile
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
// TODO Currently we only register the CPU variant. This is going to be fixed
|
||||
// once the tensor detemplatization lands.
|
||||
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH(OperatorHandle, Name, NumOutputParameters) \
|
||||
|
@ -2,7 +2,7 @@
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "caffe2/core/tensor.h"
|
||||
|
||||
#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
|
||||
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
|
||||
#include "caffe2/mobile/contrib/ios/mpscnn/mpscnn.h"
|
||||
#endif
|
||||
|
||||
@ -14,7 +14,7 @@ Caffe2IOSPredictor* Caffe2IOSPredictor::NewCaffe2IOSPredictor(const caffe2::NetD
|
||||
bool allowMetalOperators) {
|
||||
caffe2::NetDef metal_predict_net;
|
||||
bool usingMetalOperators = false;
|
||||
#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
|
||||
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
|
||||
if (allowMetalOperators) {
|
||||
caffe2::dumpDef(predict_net);
|
||||
if (caffe2::tryConvertToMPSCNN(init_net, predict_net, &metal_predict_net)) {
|
||||
@ -38,7 +38,7 @@ Caffe2IOSPredictor::Caffe2IOSPredictor(const caffe2::NetDef& init_net,
|
||||
bool disableMultithreadProcessing,
|
||||
bool usingMetalOperators)
|
||||
: usingMetalOperators(usingMetalOperators), predictor_(init_net, predict_net) {
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
if (disableMultithreadProcessing) {
|
||||
caffe2::ThreadPool* threadpool = predictor_.ws()->GetThreadPool();
|
||||
if (threadpool != nullptr) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/context.h"
|
||||
|
||||
#if defined(CAFFE2_USE_MPSCNN) && C10_MOBILE
|
||||
#if defined(CAFFE2_USE_MPSCNN) && defined(C10_MOBILE)
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/core/timer.h"
|
||||
|
@ -1,7 +1,7 @@
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
|
||||
#include "mpscnn_context.h"
|
||||
#include "mpscnn_kernels.h"
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#if C10_MOBILE && defined(CAFFE2_USE_MPSCNN_TEST)
|
||||
#if defined(C10_MOBILE) && defined(CAFFE2_USE_MPSCNN_TEST)
|
||||
|
||||
#include "mpscnn_context.h"
|
||||
#include "mpscnn_graph_mask.h"
|
||||
|
@ -280,7 +280,7 @@ std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
|
||||
// r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
|
||||
// }
|
||||
state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
|
||||
#else
|
||||
for (size_t v = 0; v < range; ++v) {
|
||||
|
@ -6,7 +6,7 @@
|
||||
#include "caffe2/utils/map_utils.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
#include "onnx/checker.h"
|
||||
#include "onnx/optimizer/optimize.h"
|
||||
#endif
|
||||
@ -69,7 +69,7 @@ caffe2::DeviceOption GetDeviceOption(const Device& onnx_device) {
|
||||
return d;
|
||||
}
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
ModelProto OptimizeOnnx(const ModelProto& input, bool init) {
|
||||
std::vector<std::string> passes{"fuse_consecutive_transposes",
|
||||
"eliminate_nop_transpose",
|
||||
@ -1422,7 +1422,7 @@ void Caffe2Backend::OnnxToCaffe2(
|
||||
const std::vector<Caffe2Ops>& extras) {
|
||||
auto device_option = GetDeviceOption(Device(device));
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
ModelProto init_model = OptimizeOnnx(onnx_model, true);
|
||||
ModelProto pred_model = OptimizeOnnx(onnx_model, false);
|
||||
#else
|
||||
@ -1526,7 +1526,7 @@ Caffe2BackendRep* Caffe2Backend::Prepare(
|
||||
ModelProto onnx_model;
|
||||
ParseProtoFromLargeString(onnx_model_str, &onnx_model);
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
::ONNX_NAMESPACE::checker::check_model(onnx_model);
|
||||
#endif
|
||||
|
||||
|
@ -3,11 +3,7 @@
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
// mobile-only implementation (tiled + vectorized + multithreaded)
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(
|
||||
ConvTranspose,
|
||||
|
@ -3,11 +3,7 @@
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
@ -5,11 +5,7 @@
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/operators/conv_op_shared.h"
|
||||
|
@ -71,7 +71,7 @@ PredictorConfig makePredictorConfig(
|
||||
if (run_init) {
|
||||
CAFFE_ENFORCE(ws.RunNetOnce(init_net));
|
||||
}
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
GlobalInit();
|
||||
#endif
|
||||
if (optimization &&
|
||||
|
@ -489,7 +489,7 @@ class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
|
||||
|
||||
Timer t;
|
||||
|
||||
#if C10_MOBILE
|
||||
#ifdef C10_MOBILE
|
||||
ws_->GetThreadPool()->run(
|
||||
[&](int, int n_g) {
|
||||
const int g = n_g / N;
|
||||
|
@ -1,6 +1,6 @@
|
||||
#include "observers/perf_observer.h"
|
||||
#include "observers/observer_config.h"
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
#include "caffe2/core/flags.h"
|
||||
#include "observers/net_observer_reporter_print.h"
|
||||
#endif
|
||||
@ -10,7 +10,7 @@
|
||||
#include "caffe2/core/init.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
|
||||
#if !C10_MOBILE
|
||||
#ifndef C10_MOBILE
|
||||
C10_DEFINE_int64(
|
||||
aiBench_netInitSampleRate,
|
||||
0,
|
||||
@ -45,7 +45,7 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
|
||||
return caffe2::make_unique<PerfNetObserver>(subject);
|
||||
});
|
||||
|
||||
#if !C10_MOBILE
|
||||
#if !defined(C10_MOBILE)
|
||||
// for aibench usage
|
||||
caffe2::ObserverConfig::setReporter(
|
||||
caffe2::make_unique<caffe2::NetObserverReporterPrint>());
|
||||
|
Reference in New Issue
Block a user