diff --git a/android/build_defs.bzl b/android/build_defs.bzl new file mode 100644 index 000000000000..5e8497a6923b --- /dev/null +++ b/android/build_defs.bzl @@ -0,0 +1,19 @@ +load("@fbsource//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") +load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_build_from_deps_query", "pt_operator_registry") + +DEFAULT_PT_OP_DEPS = [ + "fbsource//xplat/caffe2:torch_mobile_ops_full_dev", +] + +def pt_xplat_cxx_test(name, deps = [], pt_op_deps = DEFAULT_PT_OP_DEPS, **kwargs): + code_gen_lib = [] + if get_build_from_deps_query(): + lib_name = name + "_lib" + pt_operator_registry(lib_name, preferred_linkage = "static", template_select = False, deps = pt_op_deps) + code_gen_lib = [":" + lib_name] + deps = deps + code_gen_lib + fb_xplat_cxx_test( + name = name, + deps = deps, + **kwargs + ) diff --git a/c10/c10_defs.bzl b/c10/c10_defs.bzl new file mode 100644 index 000000000000..55fb9fc35e5d --- /dev/null +++ b/c10/c10_defs.bzl @@ -0,0 +1,29 @@ +load("@fbsource//tools/build_defs:expect.bzl", "expect") +load( + "@fbsource//tools/build_defs/apple:build_mode_defs.bzl", + "is_production_build", +) + +############################################################################### +# Check if we need to strip glog. +def _get_strip_glog_config(): + c2_strip_glog = native.read_config("caffe2", "strip_glog", "1") + expect( + c2_strip_glog in ("0", "1"), + c2_strip_glog, + ) + return bool(int(c2_strip_glog)) + +# For iOS production builds (and all Android builds), strip GLOG logging to +# save size. We can disable by setting caffe2.strip_glog=0 in .buckconfig.local. +def get_fbobjc_strip_glog_flags(): + if is_production_build() or _get_strip_glog_config(): + return ["-UGOOGLE_STRIP_LOG", "-DGOOGLE_STRIP_LOG=3"] + else: + return ["-UGOOGLE_STRIP_LOG"] + +def get_fbandroid_strip_glog_flags(): + if _get_strip_glog_config(): + return ["-UGOOGLE_STRIP_LOG", "-DGOOGLE_STRIP_LOG=1"] + else: + return [] diff --git a/c10/defs_hip.bzl b/c10/defs_hip.bzl new file mode 100644 index 000000000000..5084758b62e6 --- /dev/null +++ b/c10/defs_hip.bzl @@ -0,0 +1,126 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load("//caffe2:defs_hip.bzl", "get_hip_file_path") + +gpu_file_extensions = [".cu", ".c", ".cc", ".cpp"] +gpu_header_extensions = [".cuh", ".h", ".hpp"] + +def is_test_files(filepath): + if filepath.startswith("test"): + return True + else: + return False + +def get_c10_hip_srcs(): + gpu_file_pattern = [ + base + suffix + for base in c10_includes + for suffix in gpu_file_extensions + ] + native_gpu_files = native.glob(gpu_file_pattern) + + gpu_files = [] + hip_files = [] + for name in native_gpu_files: + # exclude the test folder + if is_test_files(name): + continue + + gpu_files.append(name) + hip_file_name = get_hip_file_path(paths.join("cuda/", name)) + hip_files.append(hip_file_name) + + # there will be some native hip files that needs suffix changed + native_hip_pattern = [ + "hip/**/*.hip", + ] + native_hip_files = native.glob(native_hip_pattern) + + gpu_files += native_hip_files + hip_files += native_hip_files + + # we run hipify script under the caffe2 folder; therefore we need to + # prepend c10 to the path so that buck can find the hipified file + real_hip_files = [] + for filename in hip_files: + real_hip_files.append(paths.join("c10", filename)) + + # return the src and output_gen files + return gpu_files, real_hip_files + +def get_c10_hip_headers(): + gpu_file_pattern = [ + base + suffix + for base in c10_includes + for suffix in gpu_header_extensions + ] + 
native_gpu_files = native.glob(gpu_file_pattern) + + # store the original + gpu_files = [] + hip_files = [] + for name in native_gpu_files: + if is_test_files(name): + continue + + gpu_files.append(name) + hip_file_name = get_hip_file_path(paths.join("cuda/", name)) + hip_files.append(hip_file_name) + + # there will be some native hip files that needs suffix changed + native_hip_pattern = [ + "hip/**/*" + suffix + for suffix in gpu_header_extensions + ] + native_hip_files = native.glob(native_hip_pattern) + + gpu_files += native_hip_files + hip_files += native_hip_files + + # we run hipify script under the caffe2 folder; therefore we need to + # prepend c10 to the path so that buck can find the hipified file + real_hip_files = [] + for filename in hip_files: + real_hip_files.append(paths.join("c10", filename)) + + # return the src and output_gen files + return gpu_files, real_hip_files + +def get_c10_hip_test_files(): + gpu_file_pattern = [ + base + suffix + for base in c10_includes + for suffix in gpu_file_extensions + ] + native_gpu_files = native.glob(gpu_file_pattern) + + # store the original + gpu_files = [] + hip_files = [] + for name in native_gpu_files: + if not is_test_files(name): + continue + + gpu_files.append(name) + hip_file_name = get_hip_file_path(paths.join("cuda/", name)) + hip_files.append(hip_file_name) + + # there will be some native hip files that needs suffix changed + native_hip_pattern = [ + "hip/test/**/*" + suffix + for suffix in gpu_header_extensions + ] + native_hip_files = native.glob(native_hip_pattern) + + gpu_files += native_hip_files + hip_files += native_hip_files + + # we run hipify script under the caffe2 folder; therefore we need to + # prepend c10 to the path so that buck can find the hipified file + real_hip_files = [] + for filename in hip_files: + real_hip_files.append(paths.join("c10", filename)) + + # return the src and output_gen files + return gpu_files, real_hip_files + +c10_includes = ["**/*"] diff --git a/c10/ovrsource_defs.bzl b/c10/ovrsource_defs.bzl new file mode 100644 index 000000000000..8d23920007a0 --- /dev/null +++ b/c10/ovrsource_defs.bzl @@ -0,0 +1,276 @@ +load("//arvr/tools/build_defs:genrule_utils.bzl", "gen_cmake_header") +load("//arvr/tools/build_defs:oxx.bzl", "oxx_static_library") + +cpu_supported_platforms = [ + "ovr_config//os:android", + "ovr_config//os:iphoneos", + "ovr_config//os:linux-x86_64", + "ovr_config//os:macos", + "ovr_config//os:windows-x86_64", + "ovr_config//runtime:arm64-linux-ubuntu-neon", +] + +cuda_supported_platforms = [ + "ovr_config//os:linux-cuda", + "ovr_config//os:windows-cuda", +] + +def define_c10_ovrsource(name, is_mobile): + if is_mobile: + pp_flags = ["-DC10_MOBILE=1"] + else: + pp_flags = [] + + oxx_static_library( + name = name, + srcs = native.glob([ + "core/*.cpp", + "core/impl/*.cpp", + "mobile/*.cpp", + "util/*.cpp", + ]), + compatible_with = cpu_supported_platforms, + compiler_flags = select({ + "DEFAULT": [], + "ovr_config//compiler:cl": [ + "/w", + ], + "ovr_config//toolchain/clang:win": [ + "-Wno-error", + "-Wno-shadow", + "-Wno-undef", + "-Wno-unused-variable", + ], + }), + include_directories = [".."], + preprocessor_flags = [ + "-DNO_EXPORT", + "-DC10_BUILD_MAIN_LIB=1", + "-DSUPPORTS_BACKTRACE=0", + ], + public_include_directories = [".."], + public_preprocessor_flags = pp_flags, + public_raw_headers = native.glob([ + "core/*.h", + "macros/*.h", + "mobile/*.h", + "test/util/*.h", # some external tests use this + "util/*.h", + ]), + raw_headers = native.glob([ + "core/impl/*.h", + 
]), + reexport_all_header_dependencies = False, + # tests = C10_CPU_TEST_TARGETS, + visibility = [ + "//xplat/caffe2/c10:c10_ovrsource", + ], + deps = select({ + "DEFAULT": [], + "ovr_config//os:linux": [ + "//third-party/numactl:numactl", + ], + }), + exported_deps = [ + ":ovrsource_c10_cmake_macros.h", + "//arvr/third-party/gflags:gflags", + "//third-party/glog:glog", + "//third-party/fmt:fmt", + ], + ) + +def define_ovrsource_targets(): + # C10_CPU_TEST_FILES = native.glob([ + # "test/core/*.cpp", + # "test/util/*.cpp", + # ]) + + # C10_GPU_TEST_FILES = native.glob([ + # "cuda/test/**/*.cpp", + # ]) + + # C10_CPU_TEST_TARGETS = [ + # ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource" + # for test in C10_CPU_TEST_FILES + # ] + + # C10_GPU_TEST_TARGETS = [ + # ":" + paths.basename(test)[:-len(".cpp")] + "_ovrsource" + # for test in C10_GPU_TEST_FILES + # ] + + common_c10_cmake_defines = [ + ("#cmakedefine C10_BUILD_SHARED_LIBS", ""), + ("#cmakedefine C10_DISABLE_NUMA", ""), + ("#cmakedefine C10_USE_NUMA", ""), + ("#cmakedefine C10_USE_MSVC_STATIC_RUNTIME", ""), + ] + + mobile_c10_cmake_defines = [ + ("#cmakedefine C10_USE_GLOG", ""), + ("#cmakedefine C10_USE_GFLAGS", ""), + ] + + non_mobile_c10_cmake_defines = [ + ("#cmakedefine C10_USE_GLOG", "#define C10_USE_GLOG 1"), + ("#cmakedefine C10_USE_GFLAGS", "#define C10_USE_GFLAGS 1"), + ] + + gen_cmake_header( + src = "macros/cmake_macros.h.in", + defines = common_c10_cmake_defines + mobile_c10_cmake_defines, + header = "c10/macros/cmake_macros.h", + prefix = "ovrsource_c10_mobile_", + ) + + gen_cmake_header( + src = "macros/cmake_macros.h.in", + defines = common_c10_cmake_defines + non_mobile_c10_cmake_defines, + header = "c10/macros/cmake_macros.h", + prefix = "ovrsource_c10_non_mobile_", + ) + + oxx_static_library( + name = "ovrsource_c10_cmake_macros.h", + compatible_with = [ + "ovr_config//os:android", + "ovr_config//os:iphoneos", + "ovr_config//os:linux", + "ovr_config//os:macos", + "ovr_config//os:windows", + ], + deps = select({ + "ovr_config//os:android": [":ovrsource_c10_mobile_cmake_macros.h"], + "ovr_config//os:iphoneos": [":ovrsource_c10_mobile_cmake_macros.h"], + "ovr_config//os:linux": [":ovrsource_c10_non_mobile_cmake_macros.h"], + "ovr_config//os:macos": [":ovrsource_c10_non_mobile_cmake_macros.h"], + "ovr_config//os:windows": [":ovrsource_c10_non_mobile_cmake_macros.h"], + }), + ) + + c10_cuda_macros = gen_cmake_header( + src = "cuda/impl/cuda_cmake_macros.h.in", + defines = [ + ("#cmakedefine C10_CUDA_BUILD_SHARED_LIBS", ""), + ], + header = "c10/cuda/impl/cuda_cmake_macros.h", + prefix = "ovrsource", + ) + + oxx_static_library( + name = "c10_ovrsource", + compatible_with = cpu_supported_platforms, + exported_deps = select({ + "DEFAULT": [":c10_full_ovrsource"], + "ovr_config//os:android": [":c10_mobile_ovrsource"], + "ovr_config//os:iphoneos": [":c10_mobile_ovrsource"], + }), + visibility = ["PUBLIC"], + ) + + """ + Most users should use c10_ovrsource, not these targets directly. 
+ """ + define_c10_ovrsource("c10_mobile_ovrsource", True) + define_c10_ovrsource("c10_full_ovrsource", False) + + oxx_static_library( + name = "c10_cuda_ovrsource", + srcs = native.glob([ + "cuda/*.cpp", + "cuda/impl/*.cpp", + ]), + compatible_with = cuda_supported_platforms, + compiler_flags = select({ + "DEFAULT": [], + "ovr_config//compiler:cl": [ + "/w", + ], + "ovr_config//toolchain/clang:win": [ + "-Wno-error", + "-Wno-shadow", + "-Wno-undef", + "-Wno-unused-variable", + ], + }), + link_whole = True, + preprocessor_flags = [ + "-DNO_EXPORT", + "-DC10_CUDA_BUILD_MAIN_LIB=1", + ], + raw_headers = native.glob([ + "cuda/*.h", + "cuda/impl/*.h", + ]), + reexport_all_header_dependencies = False, + # tests = C10_GPU_TEST_TARGETS, + visibility = ["PUBLIC"], + deps = [ + "//third-party/cuda:libcuda", + "//third-party/cuda:libcudart", + ], + exported_deps = c10_cuda_macros + [ + ":c10_ovrsource", + ], + ) + + # [ + # oxx_test( + # name = paths.basename(test)[:-len(".cpp")] + "_ovrsource", + # srcs = [test], + # compatible_with = cpu_supported_platforms, + # compiler_flags = select({ + # "DEFAULT": [], + # "ovr_config//compiler:cl": [ + # "/w", + # ], + # "ovr_config//compiler:clang": [ + # "-Wno-error", + # "-Wno-self-assign-overloaded", + # "-Wno-self-move", + # "-Wno-shadow", + # "-Wno-undef", + # "-Wno-unused-function", + # "-Wno-unused-variable", + # ], + # }), + # framework = "gtest", + # oncall = "ovrsource_pytorch", + # raw_headers = native.glob([ + # "test/**/*.h", + # ]), + # deps = [ + # ":c10_ovrsource", + # ], + # ) + # for test in C10_CPU_TEST_FILES + # ] + + # [ + # oxx_test( + # name = paths.basename(test)[:-len(".cpp")] + "_ovrsource", + # srcs = [test], + # compatible_with = cuda_supported_platforms, + # compiler_flags = select({ + # "DEFAULT": [], + # "ovr_config//compiler:cl": [ + # "/w", + # ], + # "ovr_config//compiler:clang": [ + # "-Wno-error", + # ], + # }), + # framework = "gtest", + # oncall = "ovrsource_pytorch", + # raw_headers = native.glob([ + # "test/**/*.h", + # ]), + # runtime_shared_libraries = [ + # "//third-party/cuda:cudart", + # ], + # deps = [ + # ":c10_cuda_ovrsource", + # ], + # ) + # for test in C10_GPU_TEST_FILES + # ] diff --git a/c2_defs.bzl b/c2_defs.bzl new file mode 100644 index 000000000000..01ec0c6d1642 --- /dev/null +++ b/c2_defs.bzl @@ -0,0 +1,549 @@ +load("@bazel_skylib//lib:collections.bzl", "collections") +load("@bazel_skylib//lib:paths.bzl", "paths") +load("@fbcode_macros//build_defs:native_rules.bzl", "buck_genrule") +load("@fbsource//tools/build_defs:default_platform_defs.bzl", "compose_platform_setting_list") +load("@fbsource//tools/build_defs:dict_defs.bzl", "dict_defs") +load("@fbsource//tools/build_defs:expect.bzl", "expect") +load("@fbsource//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library") +load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode") +load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX", "WINDOWS") +load("@fbsource//tools/build_defs/apple:build_mode_defs.bzl", "is_production_build") +load("@fbsource//tools/build_defs/apple:config_utils_defs.bzl", "STATIC_LIBRARY_IOS_CONFIG", "STATIC_LIBRARY_MAC_CONFIG", "fbobjc_configs") +load("@fbsource//tools/build_defs/apple:focus_config.bzl", "is_focus_enabled") +load("@fbsource//xplat/pfh/Msgr/Mobile/ProductInfra:DEFS.bzl", "Msgr_Mobile_ProductInfra") + +def get_c2_expose_op_to_c10(): + c2_op_to_c10 = native.read_config("caffe2", "expose_op_to_c10", "0") + + expect( + c2_op_to_c10 in ("0", "1"), + 
c2_op_to_c10, + ) + + return bool(int(c2_op_to_c10)) + +def get_c2_mpscnn(): + c2_mpscnn = native.read_config("caffe2", "enable_mpscnn", "1") + + expect( + c2_mpscnn in ("0", "1"), + c2_mpscnn, + ) + + return bool(int(c2_mpscnn)) + +def get_c2_mpscnn_test(): + c2_mpscnn_test = native.read_config("caffe2", "enable_mpscnn_test", "0") + + expect( + c2_mpscnn_test in ("0", "1"), + c2_mpscnn_test, + ) + + return bool(int(c2_mpscnn_test)) + +def get_c2_nomnigraph(): + c2_nomnigraph = native.read_config("caffe2", "enable_nomnigraph", "1") + + expect( + c2_nomnigraph in ("0", "1"), + c2_nomnigraph, + ) + + return bool(int(c2_nomnigraph)) + +def get_c2_qpl(): + c2_qpl = native.read_config("caffe2", "enable_qpl", "1") + + expect( + c2_qpl in ("0", "1"), + c2_qpl, + ) + + return bool(int(c2_qpl)) + +def get_c2_strip_debug_info(): + c2_strip_debug_info = native.read_config("caffe2", "strip_debug_info", "0") + + expect( + c2_strip_debug_info in ("0", "1"), + c2_strip_debug_info, + ) + + return bool(int(c2_strip_debug_info)) + +def get_c2_strip_glog(): + c2_strip_glog = native.read_config("caffe2", "strip_glog", "1") + + expect( + c2_strip_glog in ("0", "1"), + c2_strip_glog, + ) + + return bool(int(c2_strip_glog)) + +def get_c2_tvm(): + c2_tvm = native.read_config("caffe2", "enable_tvm", "1") + + expect( + c2_tvm in ("0", "1"), + c2_tvm, + ) + + return bool(int(c2_tvm)) + +_C2_XPLAT_NO_HPTT_PREPROCESSOR_FLAGS = [ + "-fexceptions", + "-frtti", + "-Wno-shadow", + "-Wno-unknown-pragmas", + "-Wno-unused-variable", + "-Wno-sign-compare", + "-Icaffe2", + "-Imodules", + "-DEIGEN_NO_DEBUG", + "-DCAFFE2_USE_LITE_PROTO", + "-DCAFFE2_USE_GOOGLE_GLOG", + "-DCAFFE2_RNN_NO_TEXT_FORMAT", + "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK=1", + "-DCAFFE2_IS_XPLAT_BUILD", + "-DSTRIP_ERROR_MESSAGES", + "-DUSE_INTERNAL_PTHREADPOOL_IMPL", +] + +def get_c2_xplat_no_hptt_preprocessor_flags(): + flags = [] + flags += _C2_XPLAT_NO_HPTT_PREPROCESSOR_FLAGS + if is_arvr_mode() and get_c2_strip_glog(): + flags += ["-UGOOGLE_STRIP_LOG", "-DGOOGLE_STRIP_LOG=1"] + if get_c2_expose_op_to_c10(): + flags += ["-DEXPOSE_C2_OPS", "-frtti"] + return flags + +C2_XPLAT_SERVER_PREPROCESSOR_FLAGS = [ + "-DCAFFE2_USE_EIGEN_FOR_BLAS", + "-DC10_DISABLE_SIGNAL_HANDLERS", + "-DCAFFE2_DISABLE_NUMA", +] + +C2_XPLAT_HPTT_PREPROCESSOR_FLAGS = [ + "-DCAFFE2_USE_HPTT", +] + +def get_c2_xplat_preprocessor_flags(): + flags = get_c2_xplat_no_hptt_preprocessor_flags() + C2_XPLAT_HPTT_PREPROCESSOR_FLAGS + if get_c2_nomnigraph(): + flags.append("-DCAFFE2_OPTIMIZER") + return flags + +def get_c2_xplat_no_hptt_compiler_flags(): + return [ + "-Os", + ] + get_c2_xplat_no_hptt_preprocessor_flags() + +def get_c2_xplat_compiler_flags(): + return get_c2_xplat_no_hptt_compiler_flags() + C2_XPLAT_HPTT_PREPROCESSOR_FLAGS + +def get_c2_fbobjc_xplat_compiler_flags(): + flags = [] + + if is_production_build(): + flags.append("-DCAFFE2_NO_OPERATOR_SCHEMA") + + flags.append("-DCAFFE2_NO_GRADIENT_OPS") + + # For iOS production builds (and all Android builds), strip GLOG logging to + # save size. We can disable by setting caffe2.strip_glog=0 in .buckconfig.local. 
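# Illustrative sketch, not part of the change above. The caffe2.strip_glog
# knob that get_c2_strip_glog() reads comes from Buck configuration; the
# override mentioned in the preceding comment would look roughly like this in
# .buckconfig.local (INI-style section and key; the value must be the string
# "0" or "1" that the expect() checks accept):
#
#   [caffe2]
#     strip_glog = 0
#
# A minimal sketch of how the branch that follows resolves the flag set,
# assuming only the two inputs shown here:
def _example_fbobjc_glog_flags(production_build, strip_glog):
    # Strip GLOG down to level 3 for production builds or when the knob is on.
    if production_build or strip_glog:
        return ["-UGOOGLE_STRIP_LOG", "-DGOOGLE_STRIP_LOG=3"]

    # Otherwise keep logging, but clear any inherited strip level.
    return ["-UGOOGLE_STRIP_LOG"]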
+ if is_production_build() or get_c2_strip_glog(): + flags += ["-UGOOGLE_STRIP_LOG", "-DGOOGLE_STRIP_LOG=3"] + else: + flags.append("-UGOOGLE_STRIP_LOG") + + return flags + +def get_c2_fbandroid_xplat_compiler_flags(): + flags = [ + # T95767731 -- remove this once all builds are on at least llvm-13 + "-Wno-unknown-warning-option", + "-Wno-unused-but-set-variable", + ] + + if get_c2_strip_glog(): + flags += ["-UGOOGLE_STRIP_LOG", "-DGOOGLE_STRIP_LOG=1"] + + if get_c2_strip_debug_info(): + flags.append("-g0") + + return flags + +_C2_FBOBJC_COMPILER_FLAGS = [ + "-Wno-missing-prototypes", + "-Wno-global-constructors", + "-Wno-unknown-pragmas", + "-Wno-invalid-partial-specialization", + "-Wno-missing-braces", + "-Wno-range-loop-analysis", +] + +def get_c2_fbobjc_compiler_flags(): + flags = list(_C2_FBOBJC_COMPILER_FLAGS) + + # Avoid linking Accelerate on MacOS because we have + # inconsistent LAPACK headers (see problems in D19257077). + flags.append("-DCAFFE2_USE_ACCELERATE" if not is_arvr_mode() else "-DCAFFE2_USE_EIGEN_FOR_BLAS") + if get_c2_mpscnn(): + flags.append( + # TODO(t19120552) - fix this. MPSCNNConvolutionDescriptor.strideInPixelsX + # is marked as iOS 11+, but it's been available since iOS 10. + "-Wno-unguarded-availability", + ) + return flags + +C2_FBOBJC_MACOSX_COMPILER_FLAGS = [ + "-msse4.2", +] + +C2_FBOBJC_IPHONE_COMPILER_FLAGS = [ + "-mfpu=neon-fp16", +] + +def get_c2_fbobjc_frameworks(): + frameworks = [] + if not is_arvr_mode(): + frameworks.append( + # On iOS, presumably Accelerate is a faster BLAS + "$SDKROOT/System/Library/Frameworks/Accelerate.framework", + ) + return frameworks + +def get_c2_fbobjc_ios_frameworks(): + frameworks = [] + + if get_c2_mpscnn(): + frameworks.append( + "$SDKROOT/System/Library/Frameworks/Metal.framework", + ) + + return frameworks + +def get_c2_fbobjc_linker_flags(): + flags = [] + + if get_c2_mpscnn(): + # Need linker flags as no platform_frameworks exist, and we can't + # use MPSCNN on x86_64. 
+ # We use weak_framework as it's iOS 10 + flags = [ + "-L$SDKROOT/System/Library/Frameworks/MetalPerformanceShaders.framework", + "-weak_framework", + "MetalPerformanceShaders", + ] + return flags + +def get_c2_fbobjc_exported_preprocessor_flags(): + flags = [] + + if get_c2_mpscnn(): + flags.append("-DCAFFE2_USE_MPSCNN") + + if get_c2_mpscnn_test(): + flags.append("-DCAFFE2_USE_MPSCNN_TEST") + + return flags + +def get_c2_fbandroid_exported_preprocessor_flags(): + flags = [] + + BUILD_MODE_DO_NOT_USE_WITHOUT_ASKING_SERIOUSLY = native.read_config( + "fbandroid", + "build_mode", + "dev", + ) + if BUILD_MODE_DO_NOT_USE_WITHOUT_ASKING_SERIOUSLY == "opt": + flags.append("-DCAFFE2_NO_OPERATOR_SCHEMA") + + flags.append("-DCAFFE2_NO_GRADIENT_OPS") + + return flags + +C2_FBANDROID_COMPILER_FLAGS = [ + "-DCAFFE2_USE_EIGEN_FOR_BLAS", + "-Wno-unknown-pragmas", + "-Wno-deprecated-declarations", + "-Wno-invalid-partial-specialization", + "-Wno-missing-braces", +] + +C2_FBANDROID_ARMV7_COMPILER_FLAGS = [ + "-mfpu=neon-fp16", +] + +C2_FBANDROID_X86_COMPILER_FLAGS = [ + "-mssse3", +] + +C2_FBANDROID_LINKER_FLAGS = [] + +C2_FBOBJC_EXTRA_TARGET_CONFIG = { + "MTL_LANGUAGE_REVISION": "Metal12", +} + +def get_c2_default_cxx_args(): + return dict( + header_namespace = "", + apple_sdks = (IOS, MACOSX), + compiler_flags = get_c2_xplat_compiler_flags(), + fbandroid_compiler_flags = C2_FBANDROID_COMPILER_FLAGS + get_c2_fbandroid_xplat_compiler_flags(), + fbandroid_exported_platform_preprocessor_flags = [ + ( + "android-armv7", + get_c2_fbandroid_exported_preprocessor_flags(), + ), + ], + fbandroid_linker_flags = C2_FBANDROID_LINKER_FLAGS, + fbandroid_platform_compiler_flags = [ + ("android-armv7", C2_FBANDROID_ARMV7_COMPILER_FLAGS), + (".*x86.*", C2_FBANDROID_X86_COMPILER_FLAGS), + ], + fbobjc_compiler_flags = get_c2_fbobjc_compiler_flags() + get_c2_fbobjc_xplat_compiler_flags(), + fbobjc_configs = fbobjc_configs( + STATIC_LIBRARY_IOS_CONFIG, + extra_target_config = C2_FBOBJC_EXTRA_TARGET_CONFIG, + ), + fbobjc_exported_platform_linker_flags = [ + ( + "iphoneos", + get_c2_fbobjc_linker_flags(), + ), + ], + fbobjc_exported_platform_preprocessor_flags = [ + ( + "iphoneos", + get_c2_fbobjc_exported_preprocessor_flags(), + ), + ], + fbobjc_frameworks = get_c2_fbobjc_frameworks() + get_c2_fbobjc_ios_frameworks(), + fbobjc_platform_compiler_flags = [ + ("iphoneos", C2_FBOBJC_IPHONE_COMPILER_FLAGS), + ], + macosx_compiler_flags = C2_FBOBJC_MACOSX_COMPILER_FLAGS, + fbobjc_macosx_configs_override = fbobjc_configs( + STATIC_LIBRARY_MAC_CONFIG, + ), + macosx_frameworks_override = get_c2_fbobjc_frameworks(), + preprocessor_flags = [ + # Use the internal pthreadpool impl for all Caffe2 targets on all + # platforms but do not export the preprocessor flag downstream. + "-DUSE_INTERNAL_PTHREADPOOL_IMPL", + ], + visibility = ["PUBLIC"], + windows_preferred_linkage = "static" if is_arvr_mode() else None, + xcode_public_headers_symlinks = True, + ) + +def get_c2_aten_cpu_fbobjc_macosx_deps(): + if is_focus_enabled(): + # focus2 is broken when using platform deps (T80070498) so in the case + # where it's focus2 we just add fbgemm as a standard dep. Otherwise we + # use platform deps to select correctly for arm64. 
+ return [ + "fbsource//xplat/deeplearning/fbgemm:fbgemm", + "fbsource//xplat/caffe2:cpukernel_avx2", + ] + else: + return [] + +def get_c2_aten_cpu_fbobjc_macosx_platform_deps(): + if is_focus_enabled(): + # focus2 is broken when using platform deps (T80070498) so in the case + # where it's focus2 we just add fbgemm as a standard dep. Otherwise we + # use platform deps to select correctly for arm64. + return [] + else: + return compose_platform_setting_list([ + { + "cpu": "x86_64", + "flags": [ + "fbsource//xplat/deeplearning/fbgemm:fbgemmAppleMac", + ] + ([ + "fbsource//xplat/caffe2:cpukernel_avx2AppleMac", + ] if not is_arvr_mode() else []), + "os": "macosx", + }, + { + "cpu": "arm64", + "flags": ["fbsource//xplat/third-party/XNNPACK:XNNPACKAppleMac"], + "os": "macosx", + }, + ]) + +def c2_cxx_library(**kwargs): + args = get_c2_default_cxx_args() + args.update(kwargs) + args.setdefault("platforms", (ANDROID, APPLE, CXX, WINDOWS)) + fb_xplat_cxx_library( + labels = [ + "supermodule:android/default/caffe2", + "supermodule:ios/default/public.caffe2", + ], + feature = Msgr_Mobile_ProductInfra, + **args + ) + +def c2_protobuf_rule(protos): + cpps = [] + headers = {} + raw_headers = {} + for p in protos: + proto = paths.basename(p) + if native.host_info().os.is_windows: + protocexe = "$(exe fbsource//third-party/protobuf:protoc-host)" if is_arvr_mode() else "$(location fbsource//xplat/third-party/protobuf:protoc.Windows)" + protocmd = "powershell.exe -file $(location fbsource//xplat/caffe2/scripts:proto)\\proto.ps1 -Protoc {} -Unprocessed $SRCDIR/{} -Processed $SRCDIR/{} -out $OUT -srcdir $SRCDIR".format(protocexe, p, proto) + else: + protocmd = ("cp $SRCDIR/{} $SRCDIR/{} && chmod +w $SRCDIR/{} && echo \"option optimize_for = LITE_RUNTIME;\" >> $SRCDIR/{} && ".format(p, proto, proto, proto) + + "cp $SRCDIR/caffe2/proto/caffe2.proto $SRCDIR/caffe2.proto && chmod +w $SRCDIR/caffe2.proto && echo \"option optimize_for = LITE_RUNTIME;\" >> $SRCDIR/caffe2.proto && " + + "sed -i -e 's/caffe2\\/proto\\/caffe2.proto/caffe2.proto/g' $SRCDIR/{} && ".format(proto) + + ("$(exe fbsource//third-party/protobuf:protoc-host) " if is_arvr_mode() else "$(exe fbsource//xplat/third-party/protobuf:protoc) --osx $(location fbsource//xplat/third-party/protobuf:protoc.Darwin) --linux $(location fbsource//xplat/third-party/protobuf:protoc.Linux) ") + + "-I $SRCDIR --cpp_out=$OUT $SRCDIR/{}".format(proto)) + buck_genrule( + name = proto, + srcs = sorted(collections.uniq([p, "caffe2/proto/caffe2.proto"])), + cmd_exe = protocmd, + bash = protocmd, + out = ".", + ) + (name, _) = paths.split_extension(proto) + cpp = name + ".pb.cc" + h = name + ".pb.h" + buck_genrule( + name = h, + cmd_exe = "@powershell -Command \" & { " + "(Get-Content $(location :{})\\{}".format(proto, h) + ") -replace \\\"caffe2.pb.h\\\", \\\"caffe2/proto/caffe2.pb.h\\\" | Set-Content $OUT } \"", + bash = "cp -f $(location :{})/{} $OUT && ".format(proto, h) + + "sed -i -e 's/caffe2.pb.h/caffe2\\/proto\\/caffe2.pb.h/g' $OUT", + out = h, + ) + headers["caffe2/proto/" + h] = ":{}".format(h) + raw_headers[h] = ":{}".format(h) + buck_genrule( + name = cpp, + cmd_exe = "@powershell -Command copy $(location :{})/{} $OUT".format(proto, cpp), + bash = "cp -f $(location :{})/{} $OUT".format(proto, cpp), + out = cpp, + ) + cpps.append(":{}".format(cpp)) + return (cpps, headers, raw_headers) + +# C2 uses lite version of protobuf while torch/jit uses some method only exists +# in full protobuf. This is a temporary workaround to enable experiment build. 
+# DO NOT USE IT IN PRODUCTION BUILD! +def c2_full_protobuf_rule(protos): + prefix = "full_" + cpps = [] + headers = {} + raw_headers = {} + for p in protos: + proto = paths.basename(p) + if native.host_info().os.is_windows: + protocexe = "$(exe fbsource//third-party/protobuf:protoc-host)" if is_arvr_mode() else "$(location fbsource//xplat/third-party/protobuf:protoc.Windows)" + protocmd = "powershell.exe -file $(location fbsource//xplat/caffe2/scripts:proto)\\proto.ps1 -Protoc {} -Unprocessed $SRCDIR/{} -Processed $SRCDIR/{} -out $OUT -srcdir $SRCDIR".format(protocexe, p, proto) + else: + protocmd = ("cp $SRCDIR/{} $SRCDIR/{} && ".format(p, proto) + + "cp $SRCDIR/caffe2/proto/caffe2.proto $SRCDIR/caffe2.proto && " + + "sed -i -e 's/caffe2\\/proto\\/caffe2.proto/caffe2.proto/g' $SRCDIR/{} && ".format(proto) + + ("$(exe fbsource//third-party/protobuf:protoc-host) " if is_arvr_mode() else "$(exe fbsource//xplat/third-party/protobuf:protoc) --osx $(location fbsource//xplat/third-party/protobuf:protoc.Darwin) --linux $(location fbsource//xplat/third-party/protobuf:protoc.Linux) ") + + "-I $SRCDIR --cpp_out=$OUT $SRCDIR/{}".format(proto)) + buck_genrule( + name = prefix + proto, + srcs = sorted(collections.uniq([p, "caffe2/proto/caffe2.proto"])), + cmd = protocmd, + out = ".", + ) + (name, _) = paths.split_extension(proto) + cpp = name + ".pb.cc" + h = name + ".pb.h" + buck_genrule( + name = prefix + h, + cmd_exe = "@powershell -Command \" & { " + "(Get-Content $(location :{})\\{}".format(prefix + proto, h) + ") -replace \\\"caffe2.pb.h\\\", \\\"caffe2/proto/caffe2.pb.h\\\" | Set-Content $OUT } \"", + bash = "cp -f $(location :{})/{} $OUT && ".format(prefix + proto, h) + + "sed -i -e 's/caffe2.pb.h/caffe2\\/proto\\/caffe2.pb.h/g' $OUT", + out = h, + ) + headers["caffe2/proto/" + h] = ":{}".format(prefix + h) + raw_headers[h] = ":{}".format(prefix + h) + buck_genrule( + name = prefix + cpp, + cmd_exe = "@powershell -Command copy $(location :{})/{} $OUT".format(prefix + proto, cpp), + bash = "cp -f $(location :{})/{} $OUT".format(prefix + proto, cpp), + out = cpp, + ) + cpps.append(":{}".format(prefix + cpp)) + return (cpps, headers, raw_headers) + +def libcaffe2_cxx_library(name, use_hptt, **kwargs): + c2_cxx_library( + name = name, + exported_deps = [ + "fbsource//xplat/caffe2/c10:c10", + "fbsource//third-party/protobuf:libprotobuf" if is_arvr_mode() else "fbsource//xplat/third-party/protobuf:fb-protobuf-lite", + ":caffe2_protobuf_headers", + ":pthreadpool", + ":common_core", + ":caffe2_proto_types", + ], + compiler_flags = get_c2_xplat_compiler_flags() if use_hptt else get_c2_xplat_no_hptt_compiler_flags(), + exported_preprocessor_flags = get_c2_xplat_preprocessor_flags() if use_hptt else get_c2_xplat_no_hptt_preprocessor_flags(), + cxx_preprocessor_flags = C2_XPLAT_SERVER_PREPROCESSOR_FLAGS, + fbandroid_exported_preprocessor_flags = get_c2_fbandroid_xplat_compiler_flags(), + fbobjc_exported_preprocessor_flags = get_c2_fbobjc_xplat_compiler_flags(), + # Hack to work around lack of platform_srcs support in Xcode project generation. + macosx_extra_xcode_sources_override = [], + link_whole = True, + **kwargs + ) + +def c2_operator_library(name, **kwargs): + dict_defs.key_extend( + kwargs, + "deps", + [ + "fbsource//xplat/folly:molly", + "fbsource//third-party/glog:glog", + ":caffe2", + ] + ([":aten_cpu"] if get_c2_expose_op_to_c10() else []), + ) + + # NOTE: Currently operators can "depend" on other operators, which is used + # so that loading one will implicitly load the dependencies. 
So, make sure + # that no `--as-needed` flags pulled in from dependencies cause these + # operator deps to get dropped. + linker_flags = [ + "-Wl,--no-as-needed", + ] + c2_cxx_library( + name = name, + soname = "lib" + name + ".$(ext)", + fbandroid_compiler_flags = get_c2_default_cxx_args()["fbandroid_compiler_flags"] + ["-Os"], + fbobjc_compiler_flags = get_c2_default_cxx_args()["fbobjc_compiler_flags"] + ["-Oz", "-DCOMPILING_FOR_MIN_SIZE=1"], + link_whole = True, + cxx_exported_linker_flags = linker_flags, + fbandroid_exported_linker_flags = linker_flags, + exported_deps = [ + ":caffe2", + ], + **kwargs + ) + +def c2_genrule(genrule, genfiles, prefix = "", src_path = "", header_namespace = ""): + headers = {} + srcs = [] + for generated_filename in genfiles: + buck_genrule( + name = prefix + generated_filename, + bash = "cp -f $(location :{})/{} $OUT".format(genrule, src_path + generated_filename), + cmd_exe = "@powershell -Command copy $(location :{})/{} $OUT".format(genrule, src_path + generated_filename), + out = generated_filename, + ) + rule = ":{}{}".format(prefix, generated_filename) + headers[header_namespace + generated_filename] = rule + srcs.append(rule) + return {"headers": headers, "srcs": srcs} diff --git a/c2_test_defs.bzl b/c2_test_defs.bzl new file mode 100644 index 000000000000..8ef83073d6fa --- /dev/null +++ b/c2_test_defs.bzl @@ -0,0 +1,20 @@ +load("@fbsource//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") +load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "APPLE", "CXX", "IOS", "MACOSX") +load("@fbsource//xplat/caffe2:c2_defs.bzl", "get_c2_default_cxx_args") + +def c2_cxx_test(**kwargs): + args = get_c2_default_cxx_args() + args.update(kwargs) + args["fbandroid_use_instrumentation_test"] = True + for flag in [ + "macosx_compiler_flags", + "fbobjc_macosx_configs_override", + "macosx_frameworks_override", + "xcode_public_headers_symlinks", + "macosx_inherited_buck_flags_override", + ]: + args.pop(flag, None) + args["apple_sdks"] = (IOS, MACOSX) + args["platforms"] = (CXX, APPLE, ANDROID) + args["contacts"] = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"] + fb_xplat_cxx_test(**args) diff --git a/caffe2/BUILD_MODE.bzl b/caffe2/BUILD_MODE.bzl new file mode 100644 index 000000000000..1fbd3e6f7a47 --- /dev/null +++ b/caffe2/BUILD_MODE.bzl @@ -0,0 +1,23 @@ +""" build mode definitions for caffe2/caffe2 """ + +load("@fbcode//:BUILD_MODE.bzl", get_parent_modes = "all_modes_keep_gpu_sections_all_modes_use_lld") +load("@fbcode_macros//build_defs:create_build_mode.bzl", "extend_build_mode") + +def update_mode_struct(name, mode_struct): + if name == "dev": + return extend_build_mode( + mode_struct, + # TODO(ipbrady): Modules introduce floating point inaccuracies (T43879333) + cxx_modules = False, + ) + else: + return mode_struct + +_modes = { + mode_name: update_mode_struct(mode_name, mode_struct) + for mode_name, mode_struct in get_parent_modes().items() +} + +def get_modes(): + """ Return modes for this file """ + return _modes diff --git a/caffe2/defs.bzl b/caffe2/defs.bzl new file mode 100644 index 000000000000..39f4b1b5d93d --- /dev/null +++ b/caffe2/defs.bzl @@ -0,0 +1,89 @@ +# useful command for debugging which files are included: +# buck targets caffe2/caffe2: --json | jq -r "map(select(.srcs)) | map({key: .name, value: .srcs | sort}) | from_entries" +load("@fbsource//tools/build_defs:type_defs.bzl", "is_list") +load("//tools/build/buck:flags.bzl", "get_flags") + +flags = get_flags() + +_BASE_PATHS = ( + "core/*", + "core/boxing/*", + 
"core/boxing/impl/*", + "core/dispatch/*", + "core/op_registration/*", + "cuda_rtc/*", + "db/*", + "experiments/operators/*", + "ideep/**/*", + "observers/*", + "onnx/**/*", + "operators/**/*", + "observers/*", + "predictor/*", + "queue/*", + "sgd/*", + "share/contrib/zstd/*", + "transforms/*", + "utils/**/*", +) + +_BASE_SGX_PATHS = ( + "core/*", + "core/boxing/*", + "core/boxing/impl/*", + "core/dispatch/*", + "core/op_registration/*", + "cuda_rtc/*", + "db/*", + "experiments/operators/*", + "observers/*", + "onnx/**/*", + "operators/**/*", + "observers/*", + "predictor/*", + "queue/*", + "sgd/*", + "serialize/*", + "share/contrib/zstd/*", + "transforms/*", + "utils/**/*", +) + +def get_sgx_patterns(ext): + if not is_list(ext): + ext = [ext] + return [path + e for path in _BASE_SGX_PATHS for e in ext] + +def get_patterns(ext): + if not is_list(ext): + ext = [ext] + return [path + e for path in _BASE_PATHS for e in ext] + +def get_simd_preprocessor_flags(): + return [ + "-DUSE_FBGEMM", + ] + +def get_simd_compiler_flags(): + if flags.USE_SSE_ONLY: + return ["-mno-avx"] + + simd_compiler_flags = [ + "-mavx", + ] + get_simd_preprocessor_flags() + + # Every uarch with AVX512 support has AVX2 support + if (flags.USE_AVX2 or flags.USE_AVX512): + simd_compiler_flags += [ + "-mavx2", + "-mfma", + ] + + if flags.USE_AVX512: + simd_compiler_flags += [ + "-mavx512f", + "-mavx512dq", + "-mavx512vl", + ] + + return simd_compiler_flags diff --git a/caffe2/defs_hip.bzl b/caffe2/defs_hip.bzl new file mode 100644 index 000000000000..a93fe3569060 --- /dev/null +++ b/caffe2/defs_hip.bzl @@ -0,0 +1,149 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load( + "//caffe2:defs_hip.bzl", + "caffe2_includes", + "caffe2_video_image_includes", + "get_hip_file_path", +) + +gpu_file_extensions = [".cu", ".c", ".cc", ".cpp"] +gpu_header_extensions = [".cuh", ".h", ".hpp"] + +def is_caffe2_gpu_file(filepath): + # those files are needed since they define placeholders + if "/native/cudnn/" in filepath: + return True + + # files that are already compatible with hip + if "/hip/" in filepath: + return False + + # exclude all cudnn and nvrtc implementations except for nvrtc_stub + if "/nvrtc_stub/" in filepath: + return True + if any([keyword in filepath for keyword in ("cudnn", "nvrtc", "NVRTC")]): + return False + + if "/cuda/" in filepath: + return True + + filename = paths.basename(filepath) + _, ext = paths.split_extension(filename) + + if "gpu" in filename or ext in [".cu", ".cuh"]: + return True + + return False + +def get_caffe2_hip_srcs( + include_patterns = caffe2_includes, + include_files = [], + project_dir = "caffe2"): + gpu_file_pattern = [ + base + suffix + for base in include_patterns + for suffix in gpu_file_extensions + ] + native_gpu_files = native.glob(gpu_file_pattern) + include_files + + # store the original + gpu_files = [] + hip_files = [] + for name in native_gpu_files: + # exclude test files + if "_test" in paths.basename(name) or not is_caffe2_gpu_file(name): + continue + + gpu_files.append(name) + hip_file_name = get_hip_file_path(name, is_caffe2 = True) + hip_files.append(hip_file_name) + + # there will be some native hip files that needs suffix changed + native_hip_pattern = [ + base[:-1] + "hip/*.hip" + for base in include_patterns + ] + native_hip_files = native.glob(native_hip_pattern) + + gpu_files += native_hip_files + hip_files += native_hip_files + + # we run hipify script under the caffe2 folder; therefore we need to + # prepend caffe2 to the path so that buck can find the hipified 
file + real_hip_files = [] + for filename in hip_files: + real_hip_files.append(paths.join(project_dir, filename)) + + # return the src and output_gen files + return gpu_files, real_hip_files + +def get_caffe2_hip_headers( + include_patterns = caffe2_includes, + include_files = [], + project_dir = "caffe2"): + header_pattern = [ + base + suffix + for base in include_patterns + for suffix in gpu_header_extensions + ] + native_header_files = native.glob(header_pattern) + include_files + + header_files = [] + hip_headers = [] + for name in native_header_files: + # exclude test files + # if the caller directly specifies files via include_files, follow it + if not name in include_files and ("_test" in paths.basename(name) or not is_caffe2_gpu_file(name)): + continue + + header_files.append(name) + hip_header_name = get_hip_file_path(name, is_caffe2 = True) + hip_headers.append(hip_header_name) + + # we run hipify script under the caffe2 folder; therefore we need to + # prepend caffe2 to the path so that buck can find the hipified file + real_hip_headers = [] + for filename in hip_headers: + real_hip_headers.append(paths.join(project_dir, filename)) + + # return the src and output_gen files + return header_files, real_hip_headers + +def get_caffe2_hip_video_image_srcs(): + return get_caffe2_hip_srcs(include_patterns = caffe2_video_image_includes) + +def get_caffe2_hip_video_image_headers(): + return get_caffe2_hip_headers(include_patterns = caffe2_video_image_includes) + +def get_caffe2_hip_test_files(): + test_includes = [ + "**/*_gpu_test.cc", + ] + + # let's ignores the mpi test and fb-internal tests for now + test_ignores = [ + "mpi/mpi_gpu_test.cc", + # "operators/roi_align_op_gpu_test.cc", + "**/fb/**/*_gpu_test.cc", + ] + + native_test_files = native.glob(test_includes, exclude = test_ignores) + + test_files = [] + hip_test_files = [] + for name in native_test_files: + if not is_caffe2_gpu_file(name): + continue + + test_files.append(name) + hip_file_name = get_hip_file_path(name, is_caffe2 = True) + hip_test_files.append(hip_file_name) + + # we run hipify script under the caffe2 folder; therefore we need to + # prepend caffe2 to the path so that buck can find the hipified file + real_hip_test_files = [] + for filename in hip_test_files: + real_hip_test_files.append(paths.join("caffe2", filename)) + + # return the src and output_gen files + return test_files, real_hip_test_files diff --git a/defs.bzl b/defs.bzl new file mode 100644 index 000000000000..c81f59274c1c --- /dev/null +++ b/defs.bzl @@ -0,0 +1,89 @@ +def get_sleef_deps(): + return [("sleef", None, "sleef")] if not (host_info().arch.is_aarch64) else [] + +def get_blas_gomp_deps(): + if host_info().arch.is_x86_64: + return [( + "IntelComposerXE", + None, + native.read_config("fbcode", "mkl_lp64", "mkl_lp64_omp"), + )] + if host_info().arch.is_aarch64: + return [ + ("OpenBLAS", None, "OpenBLAS"), + ("openmp", None, "omp"), + ] + fail("Unsupported architecture") + +default_compiler_flags = [ + "-Wall", + "-Wextra", + "-Wno-unused-function", + "-Wno-unused-parameter", + "-Wno-error=strict-aliasing", + "-Wno-unused-local-typedefs", + "-Wno-shadow-compatible-local", + "-Wno-maybe-uninitialized", # aten is built with gcc as part of HHVM + "-Wno-unknown-pragmas", + "-Wno-strict-overflow", + # See https://fb.facebook.com/groups/fbcode/permalink/1813348245368673/ + # These trigger on platform007 + "-Wno-stringop-overflow", + "-Wno-class-memaccess", + "-DHAVE_MMAP", + "-DUSE_GCC_ATOMICS=1", + "-D_FILE_OFFSET_BITS=64", + "-DHAVE_SHM_OPEN=1", 
+ "-DHAVE_SHM_UNLINK=1", + "-DHAVE_MALLOC_USABLE_SIZE=1", + "-DTH_HAVE_THREAD", + "-DCPU_CAPABILITY_DEFAULT", + "-DTH_INDEX_BASE=0", + "-DMAGMA_V2", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DUSE_FBGEMM", + "-DUSE_QNNPACK", + "-DUSE_PYTORCH_QNNPACK", + # The dynamically loaded NVRTC trick doesn't work in fbcode, + # and it's not necessary anyway, because we have a stub + # nvrtc library which we load canonically anyway + "-DUSE_DIRECT_NVRTC", + "-DUSE_RUY_QMATMUL", +] + ([] if native.host_info().os.is_windows else [ + # XNNPACK depends on an updated version of pthreadpool interface, whose implementation + # includes - a header not available on Windows. + "-DUSE_XNNPACK", +]) + (["-O1"] if native.read_config("fbcode", "build_mode_test_label", "") == "dev-nosan" else []) + +compiler_specific_flags = { + "clang": [ + "-Wno-absolute-value", + "-Wno-pass-failed", + "-Wno-braced-scalar-init", + ], + "gcc": [ + "-Wno-error=array-bounds", + ], +} + +def get_cpu_parallel_backend_flags(): + parallel_backend = native.read_config("pytorch", "parallel_backend", "openmp") + defs = [] + if parallel_backend == "openmp": + defs.append("-DAT_PARALLEL_OPENMP_FBCODE=1") + elif parallel_backend == "tbb": + defs.append("-DAT_PARALLEL_NATIVE_TBB_FBCODE=1") + elif parallel_backend == "native": + defs.append("-DAT_PARALLEL_NATIVE_FBCODE=1") + else: + fail("Unsupported parallel backend: " + parallel_backend) + if native.read_config("pytorch", "exp_single_thread_pool", "0") == "1": + defs.append("-DAT_EXPERIMENTAL_SINGLE_THREAD_POOL=1") + mkl_ver = native.read_config("fbcode", "mkl_lp64", "mkl_lp64_omp") + if mkl_ver == "mkl_lp64_seq": + defs.append("-DATEN_MKL_SEQUENTIAL_FBCODE=1") + return defs + +def is_cpu_static_dispatch_build(): + mode = native.read_config("fbcode", "caffe2_static_dispatch_mode", "none") + return mode == "cpu" diff --git a/defs_gpu.bzl b/defs_gpu.bzl new file mode 100644 index 000000000000..3d6cae883089 --- /dev/null +++ b/defs_gpu.bzl @@ -0,0 +1,166 @@ +load("@fbcode_macros//build_defs:native_rules.bzl", "buck_genrule") +load( + "//caffe2/caffe2:defs_hip.bzl", + "get_caffe2_hip_headers", + "get_caffe2_hip_srcs", +) +load(":ufunc_defs.bzl", "aten_ufunc_names") + +ATEN_CUDA_H_PATTERN = [ + "aten/src/ATen/cuda/*.h", + "aten/src/ATen/cuda/detail/*.h", + "aten/src/ATen/cuda/nvrtc_stub/*.h", + "aten/src/ATen/cuda/*.cuh", + "aten/src/ATen/cuda/detail/*.cuh", +] + +ATEN_CUDA_CPP_PATTERN = [ + "aten/src/ATen/cuda/*.cpp", + "aten/src/ATen/cuda/detail/*.cpp", + "aten/src/ATen/cuda/nvrtc_stub/*.cpp", +] + +ATEN_CUDA_CU_PATTERN = [ + "aten/src/ATen/cuda/*.cu", + "aten/src/ATen/cuda/detail/*.cu", +] + +ATEN_CUDNN_H_PATTERN = [ + "aten/src/ATen/cudnn/*.h", + "aten/src/ATen/cudnn/*.cuh", +] + +ATEN_CUDNN_CPP_PATTERN = ["aten/src/ATen/cudnn/*.cpp"] + +ATEN_MIOPEN_H_PATTERN = [ + "aten/src/ATen/miopen/*.h", + "aten/src/ATen/miopen/*.cuh", +] + +ATEN_MIOPEN_CPP_PATTERN = ["aten/src/ATen/miopen/*.cpp"] + +ATEN_NATIVE_CUDNN_CPP_PATTERN = ["aten/src/ATen/native/cudnn/*.cpp"] + +ATEN_NATIVE_MIOPEN_CPP_PATTERN = ["aten/src/ATen/native/miopen/*.cpp"] + +ATEN_NATIVE_CUDA_CU_PATTERN = [ + "aten/src/ATen/native/cuda/*.cu", + "aten/src/ATen/native/nested/cuda/*.cu", + "aten/src/ATen/native/quantized/cuda/*.cu", + "aten/src/ATen/native/sparse/cuda/*.cu", + "aten/src/ATen/native/transformers/**/*.cu", +] + +ATEN_NATIVE_CUDA_CPP_PATTERN = [ + "aten/src/ATen/native/cuda/*.cpp", + "aten/src/ATen/native/cuda/linalg/*.cpp", + "aten/src/ATen/native/nested/cuda/*.cpp", + "aten/src/ATen/native/sparse/cuda/*.cpp", + 
"aten/src/ATen/native/transformers/cuda/*.cpp", +] + +ATEN_NATIVE_CUDA_H_PATTERN = [ + "aten/src/ATen/native/cudnn/**/*.h", + "aten/src/ATen/native/cuda/**/*.h", + "aten/src/ATen/native/cuda/**/*.cuh", + "aten/src/ATen/native/sparse/cuda/*.h", + "aten/src/ATen/native/sparse/cuda/*.cuh", + "aten/src/ATen/native/quantized/cuda/*.h", + "aten/src/ATen/native/transformers/cuda/*.h", + "aten/src/ATen/native/transformers/**/*.cuh", +] + +# T66678203: Clang CUDA rollout +ATEN_CUDA_CLANG_CU_PATTERN = [ + "aten/src/ATen/native/cuda/DistributionBernoulli.cu", +] + +### Cuda Files +def get_aten_cuda_headers(): + ATEN_CUDA_H = native.glob(ATEN_CUDA_H_PATTERN) + ATEN_NATIVE_CUDA_H = native.glob(ATEN_NATIVE_CUDA_H_PATTERN) + ATEN_CUDNN_H = native.glob(ATEN_CUDNN_H_PATTERN) + return ATEN_CUDA_H + ATEN_NATIVE_CUDA_H + ATEN_CUDNN_H + +def get_aten_cuda_srcs(): + ATEN_CUDA_CU = native.glob(ATEN_CUDA_CU_PATTERN) + ATEN_NATIVE_CUDA_CU = native.glob( + ATEN_NATIVE_CUDA_CU_PATTERN, + exclude = ATEN_CUDA_CLANG_CU_PATTERN, + ) + return ATEN_CUDA_CU + ATEN_NATIVE_CUDA_CU + +def get_aten_cuda_clang_srcs(): + return native.glob(ATEN_CUDA_CLANG_CU_PATTERN) + +# CPU+CUDA file +# Note that these sources and headers include the CPU lists too +def get_all_cuda_srcs(): + ATEN_NATIVE_CUDNN_CPP = native.glob(ATEN_NATIVE_CUDNN_CPP_PATTERN) + ATEN_CUDNN_CPP = native.glob(ATEN_CUDNN_CPP_PATTERN) + ATEN_NATIVE_MIOPEN_CPP = native.glob(ATEN_NATIVE_MIOPEN_CPP_PATTERN) + ATEN_CUDA_CPP = native.glob(ATEN_CUDA_CPP_PATTERN) + ATEN_NATIVE_CUDA_CPP = native.glob(ATEN_NATIVE_CUDA_CPP_PATTERN) + + return ATEN_NATIVE_CUDNN_CPP + ATEN_CUDNN_CPP + ATEN_NATIVE_MIOPEN_CPP + ATEN_CUDA_CPP + ATEN_NATIVE_CUDA_CPP + get_aten_cuda_srcs() + +### HIP files +# Files that must be hipified +def get_aten_hip_srcs(): + ## CU -> HIP files + ATEN_CUDA_CU = native.glob(ATEN_CUDA_CU_PATTERN) + + # HIP does not use clang for ATEN_CUDA_CLANG_CU_PATTERN + ATEN_NATIVE_CUDA_CU = native.glob(ATEN_NATIVE_CUDA_CU_PATTERN) + + ## CPU files + ATEN_NATIVE_CUDNN_CPP = native.glob(ATEN_NATIVE_CUDNN_CPP_PATTERN) + ATEN_CUDNN_CPP = native.glob(ATEN_CUDNN_CPP_PATTERN) + ATEN_CUDA_CPP = native.glob(ATEN_CUDA_CPP_PATTERN) + ATEN_NATIVE_CUDA_CPP = native.glob(ATEN_NATIVE_CUDA_CPP_PATTERN) + + # Get hipified file names (before, after) + srcs = ATEN_CUDA_CU + ATEN_NATIVE_CUDA_CU + ATEN_NATIVE_CUDNN_CPP + ATEN_CUDNN_CPP + ATEN_CUDA_CPP + ATEN_NATIVE_CUDA_CPP + ret = get_caffe2_hip_srcs(include_patterns = [], include_files = srcs, project_dir = "") + return (ret[0], [f.replace("aten/src/", "") for f in ret[1]]) + +def get_aten_hip_headers(): + ATEN_CUDA_H = native.glob(ATEN_CUDA_H_PATTERN) + ATEN_NATIVE_CUDA_H = native.glob(ATEN_NATIVE_CUDA_H_PATTERN) + ATEN_CUDNN_H = [] # native.glob(ATEN_CUDNN_H_PATTERN) + + # Get hipified file names (before, after) + srcs = ATEN_CUDA_H + ATEN_NATIVE_CUDA_H + ATEN_CUDNN_H + ret = get_caffe2_hip_headers(include_patterns = [], include_files = ATEN_CUDA_H + ATEN_NATIVE_CUDA_H + ATEN_CUDNN_H, project_dir = "") + return ret[0], [f.replace("aten/src/", "") for f in ret[1]] + +# Native HIP-aware files +def get_aten_hip_native_srcs(): + HIP_IMPL_CPP = native.glob(["aten/src/ATen/hip/impl/*.cpp"]) + ATEN_MIOPEN_CPP = native.glob(ATEN_MIOPEN_CPP_PATTERN) + ATEN_NATIVE_MIOPEN_CPP = native.glob(ATEN_NATIVE_MIOPEN_CPP_PATTERN) + return HIP_IMPL_CPP + ATEN_MIOPEN_CPP + ATEN_NATIVE_MIOPEN_CPP + +def get_aten_hip_native_headers(): + HIP_IMPL_H = native.glob(["aten/src/ATen/hip/impl/*.h"]) + ATEN_MIOPEN_H = native.glob(ATEN_MIOPEN_H_PATTERN) + return HIP_IMPL_H + 
ATEN_MIOPEN_H + +def get_aten_hip_ufunc_generated_cuda_sources(gencode_pattern = "{}"): + # Contents of these CUDA files do not need to be hipified at this point, + # but they must be renamed from ".cu" to ".hip" because, unlike OSS, a compiler + # is selected based on a file extension. + + renamed_rules = [] + for n in aten_ufunc_names: + cuda_name = "UfuncCUDA_{}.cu".format(n) + hip_name = "UfuncCUDA_{}.hip".format(n) + buck_genrule( + name = "aten_ufunc_hip_renamed_{}".format(n), + srcs = [gencode_pattern.format(cuda_name)], + bash = 'cp "$SRCDIR/{}" "$OUT"'.format(cuda_name), + out = hip_name, + default_outs = [], + ) + renamed_rules.append(":aten_ufunc_hip_renamed_{}".format(n)) + return renamed_rules diff --git a/defs_hip.bzl b/defs_hip.bzl new file mode 100644 index 000000000000..061f7fe2157f --- /dev/null +++ b/defs_hip.bzl @@ -0,0 +1,136 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load("@fbcode//tools/build/buck:rocm_flags.bzl", "get_rocm_arch_args") + +caffe2_includes = [ + "operators/**/*", + "operators/*", + "sgd/*", + "transforms/*", + # distributed folder is managed by its own TARGETS file + # "distributed/*", + "queue/*", + # "binaries/*", + "**/*_test*", + "core/*", + "db/*", + "utils/**/*", +] + +caffe2_video_image_includes = [ + "image/*", + "video/*", +] + +pytorch_includes = [ + "aten/src/ATen/cuda/*", + "aten/src/ATen/native/cuda/*", + "aten/src/ATen/native/cuda/linalg/*", + "aten/src/ATen/native/cudnn/*", + "aten/src/ATen/native/nested/cuda/*", + "aten/src/ATen/native/sparse/cuda/*", + "aten/src/ATen/native/transformers/cuda/*", + "aten/src/THC/*", + "aten/src/ATen/test/*", + "torch/*", +] + +gpu_file_extensions = [".cu", ".c", ".cc", ".cpp"] +gpu_header_extensions = [".cuh", ".h", ".hpp"] + +hip_external_deps = [ + ("rocm", None, "amdhip64-lazy"), + ("rocm", None, "MIOpen-lazy"), + ("rocm", None, "rccl-lazy"), + ("rocm", None, "roctracer64-lazy"), +] + +hip_pp_flags = [ + # HIP 4.4.21432 -> TORCH_HIP_VERSION=404 + "-DTORCH_HIP_VERSION=(FB_HIP_VERSION/100000)", + # ROCm 4.5.2 -> ROCM_VERSION=40502 + "-DROCM_VERSION=FB_ROCM_VERSION", + "-DUSE_ROCM=1", + "-D__HIP_PLATFORM_HCC__=1", + "-D__HIP_NO_HALF_OPERATORS__=1", + "-D__HIP_NO_HALF_CONVERSIONS__=1", + "-DCUDA_HAS_FP16=1", + "-DCAFFE2_USE_MIOPEN", + # The c10/cuda/impl/cuda_cmake_macros.h is not generated for the + # hip build yet. + "-DC10_HIP_NO_CMAKE_CONFIGURE_FILE", + # clang with -fopenmp=libgomp (gcc's OpenMP runtime library) produces + # single threaded code and doesn't define -D_OPENMP by default. + # clang with -fopenmp or -fopenmp=libomp (llvm's OpenMP runtime library) + # produces multi-threaded code and defines -D_OPENMP by default. + # + # hcc currently don't have llvm openmp runtime project builtin. + # wrap_hip.py also drops -D_OPENMP if explicitly specified. + "-U_OPENMP", +] + +def get_hip_flags(): + return [ + # Caffe2 cannot be compiled with NDEBUG using ROCm 4.5.2. + # TODO: The issue should be fixed properly. 
+ "-UNDEBUG", + "-Wno-error=absolute-value", + "-Wno-macro-redefined", + "-Wno-inconsistent-missing-override", + "-Wno-exceptions", + "-Wno-shift-count-negative", + "-Wno-shift-count-overflow", + "-Wno-duplicate-decl-specifier", + "-Wno-implicit-int-float-conversion", + "-Wno-unused-result", + "-Wno-pass-failed", + "-Wno-unknown-pragmas", + "-Wno-cuda-compat", + ] + get_rocm_arch_args() + +def get_hip_file_path(filepath, is_caffe2 = False): + """ + this function should be in sync with the hipified script in + third-party/hipify_torch/hipify/hipify_python.py + unfortunately because it's a normal python (instead of Starlark) + we cannot simply import from there + + The general rule of converting file names from cuda to hip is: + - If there is a directory component named "cuda", replace + it with "hip", AND + + - If the file name contains "CUDA", replace it with "HIP", AND + + If NONE of the above occurred, then insert "hip" in the file path + as the direct parent folder of the file + + Furthermore, ALWAYS replace '.cu' with '.hip', because those files + contain CUDA kernels that needs to be hipified and processed with + hcc compile + """ + dirpath = paths.dirname(filepath) + filename = paths.basename(filepath) + filename, ext = paths.split_extension(filename) + + if ext == ".cu": + ext = ".hip" + + orig_dirpath = dirpath + + dirpath = dirpath.replace("cuda", "hip") + dirpath = dirpath.replace("THC", "THH") + + filename = filename.replace("cuda", "hip") + filename = filename.replace("CUDA", "HIP") + + # Special case to handle caffe2/core/THCCachingAllocator + if not (is_caffe2 and dirpath == "core"): + filename = filename.replace("THC", "THH") + + # if the path doesn't change (e.g., path doesn't include "cuda" so we + # cannot differentiate), insert "hip" as the direct parent folder + # special case for utils/cub_namespace, because it is first used and hipified when used + # from core, it doesn't end up in hip directory + if dirpath == orig_dirpath and not filename == "cub_namespace": + dirpath = paths.join(dirpath, "hip") + + return paths.join(dirpath, filename + ext) diff --git a/ios/METADATA.bzl b/ios/METADATA.bzl new file mode 100644 index 000000000000..467644b22773 --- /dev/null +++ b/ios/METADATA.bzl @@ -0,0 +1,10 @@ +# THIS FILE IS AUTOMATICALLY GENERATED FROM INFORMATION STORED IN +# THIRD-PARTY METADATA SERVICE. YOUR MANUAL CHANGES TO THIS FILE WILL +# BE PRESERVED AND WILL SERVE AS THE SOURCE OF TRUTH FOR METADATA OF +# THIS PACKAGE. +# TPMS-GENERATED: b832a8f526016b30c557d8a58fc89d9338a51cff +METADATA = { + "name": "LibTorch", + "owner": "ai_infra_mobile_platform", + "version": "1.11.0", +} diff --git a/ios/TestApp/METADATA.bzl b/ios/TestApp/METADATA.bzl new file mode 100644 index 000000000000..6ab0710d6660 --- /dev/null +++ b/ios/TestApp/METADATA.bzl @@ -0,0 +1,10 @@ +# THIS FILE IS AUTOMATICALLY GENERATED FROM INFORMATION STORED IN +# THIRD-PARTY METADATA SERVICE. YOUR MANUAL CHANGES TO THIS FILE WILL +# BE PRESERVED AND WILL SERVE AS THE SOURCE OF TRUTH FOR METADATA OF +# THIS PACKAGE. 
+# TPMS-GENERATED: ba55575493b7ad21fde900f05f93c501b2715a09 +METADATA = { + "name": "unf_ext", + "owner": "ai_infra_mobile_platform", + "version": "0.0.7.6", +} diff --git a/ovrsource_aten_gen_defs.bzl b/ovrsource_aten_gen_defs.bzl new file mode 100644 index 000000000000..0a56c32e579a --- /dev/null +++ b/ovrsource_aten_gen_defs.bzl @@ -0,0 +1,83 @@ +# @nolint +load("//arvr/tools/build_defs:genrule_utils.bzl", "gen_cmake_header") +load("//arvr/tools/build_defs:oxx.bzl", "oxx_static_library") +load( + "@fbsource//xplat/caffe2:pt_defs.bzl", + "gen_aten_files", + "get_aten_codegen_extra_params", +) + +def define_aten_gen(): + backends = [ + "CPU", + "SparseCPU", + "SparseCsrCPU", + # "MkldnnCPU", + "CUDA", + "SparseCUDA", + "SparseCsrCUDA", + "QuantizedCPU", + "QuantizedCUDA", + "Meta", + "ZeroTensor" + ] + + gen_aten_files( + name = "gen_aten_ovrsource", + extra_flags = get_aten_codegen_extra_params(backends), + visibility = ["PUBLIC"], + ) + + oxx_static_library( + name = "ovrsource_aten_generated_cuda_headers", + header_namespace = "ATen", + public_generated_headers = { + "CUDAFunctions.h": ":gen_aten_ovrsource[CUDAFunctions.h]", + "CUDAFunctions_inl.h": ":gen_aten_ovrsource[CUDAFunctions_inl.h]", + }, + visibility = ["PUBLIC"], + ) + + oxx_static_library( + name = "ovrsource_aten_generated_meta_headers", + header_namespace = "ATen", + public_generated_headers = { + "MetaFunctions.h": ":gen_aten_ovrsource[MetaFunctions.h]", + "MetaFunctions_inl.h": ":gen_aten_ovrsource[MetaFunctions_inl.h]", + }, + visibility = ["PUBLIC"], + ) + + gen_cmake_header( + src = "aten/src/ATen/Config.h.in", + defines = [ + ("@AT_MKLDNN_ENABLED@", "0"), + ("@AT_MKL_ENABLED@", "0"), + ("@AT_MKL_SEQUENTIAL@", "0"), + ("@AT_FFTW_ENABLED@", "0"), + ("@AT_NNPACK_ENABLED@", "0"), + ("@AT_PARALLEL_OPENMP@", "0"), + ("@AT_PARALLEL_NATIVE@", "1"), + ("@AT_PARALLEL_NATIVE_TBB@", "0"), + ("@AT_POCKETFFT_ENABLED@", "0"), + ("@CAFFE2_STATIC_LINK_CUDA_INT@", "1"), + ("@AT_BUILD_WITH_BLAS@", "1"), + ("@AT_BUILD_WITH_LAPACK@", "1"), + ("@AT_BLAS_F2C@", "1"), + ("@AT_BLAS_USE_CBLAS_DOT@", "0") + ], + header = "ATen/Config.h", + prefix = "ovrsource_aten_", + ) + + gen_cmake_header( + src = "aten/src/ATen/cuda/CUDAConfig.h.in", + defines = [ + ("@AT_CUDNN_ENABLED@", "1"), + ("@AT_ROCM_ENABLED@", "0"), + ("@NVCC_FLAGS_EXTRA@", " "), + ("@AT_MAGMA_ENABLED@", "0") + ], + header = "ATen/cuda/CUDAConfig.h", + prefix = "ovrsource_aten_", + ) diff --git a/ovrsource_caffe2_perfkernels_defs.bzl b/ovrsource_caffe2_perfkernels_defs.bzl new file mode 100644 index 000000000000..bcfeb6490a01 --- /dev/null +++ b/ovrsource_caffe2_perfkernels_defs.bzl @@ -0,0 +1,87 @@ +# @nolint +load("//arvr/tools/build_defs:oxx.bzl", "oxx_static_library") +load("@fbsource//xplat/caffe2/c10:ovrsource_defs.bzl", "cpu_supported_platforms") + +def define_caffe2_perfkernels(): + [ + oxx_static_library( + name = "perfkernels_{}_ovrsource".format(arch), + srcs = native.glob(["caffe2/perfkernels/*_{}.cc".format(arch)]), + compatible_with = ["ovr_config//cpu:x86_64"], + compiler_flags = select({ + "DEFAULT": [], + "ovr_config//compiler:cl": [ + "/arch:AVX2", + "/w", + ], + "ovr_config//compiler:clang": [ + "-Wno-error", + "-mf16c", + ] + (["-mf16c", "-mavx"] if arch == "avx" else ["-mfma", "-mavx2"] if arch == "avx2" else ["-mavx512f"]), + }), + raw_headers = native.glob([ + "caffe2/core/*.h", + "caffe2/perfkernels/*.h", + "caffe2/proto/*.h", + "caffe2/utils/*.h", + ], exclude = [ + "caffe2/core/macros.h", + ]), + reexport_all_header_dependencies = False, + deps = [ + 
":caffe2_proto_ovrsource", + ":ovrsource_caffe2_macros.h", + "@fbsource//xplat/caffe2/c10:c10_ovrsource", + ], + ) + for arch in ["avx", "avx2", "avx512"] + ] + + oxx_static_library( + name = "perfkernels_ovrsource", + srcs = native.glob([ + "caffe2/perfkernels/*.cc", + ], exclude = [ + "**/*_avx*", + ]), + compatible_with = cpu_supported_platforms, + compiler_flags = select({ + "DEFAULT": [], + "ovr_config//compiler:cl": [ + "/w", + ], + "ovr_config//compiler:clang": [ + "-Wno-macro-redefined", + "-Wno-shadow", + "-Wno-undef", + "-Wno-unused-function", + "-Wno-unused-local-typedef", + "-Wno-unused-variable", + ], + }), + public_include_directories = [], + public_raw_headers = native.glob([ + "caffe2/perfkernels/*.h", + ]), + raw_headers = native.glob([ + "caffe2/core/*.h", + "caffe2/proto/*.h", + "caffe2/utils/*.h", + ], exclude = [ + "caffe2/core/macros.h", + ]), + reexport_all_header_dependencies = False, + deps = [ + ":caffe2_proto_ovrsource", + ":ovrsource_caffe2_macros.h", + "//third-party/cpuinfo:cpuinfo", + "@fbsource//xplat/caffe2/c10:c10_ovrsource", + "//third-party/protobuf:libprotobuf", + ] + select({ + "DEFAULT": [], + "ovr_config//cpu:x86_64": [ + ":perfkernels_avx_ovrsource", + ":perfkernels_avx2_ovrsource", + ], + }), + ) diff --git a/ovrsource_caffe2_proto_defs.bzl b/ovrsource_caffe2_proto_defs.bzl new file mode 100644 index 000000000000..579e807dcf20 --- /dev/null +++ b/ovrsource_caffe2_proto_defs.bzl @@ -0,0 +1,20 @@ +# @nolint +load("//arvr/tools/build_defs:oxx.bzl", "oxx_static_library", "oxx_test") +load("//arvr/tools/build_defs:oxx_python.bzl", "oxx_python_binary", "oxx_python_library") +load("//arvr/tools/build_defs:genrule_utils.bzl", "gen_cmake_header") +load("//arvr/tools/build_defs:protobuf.bzl", "proto_cxx_library") +load("@bazel_skylib//lib:paths.bzl", "paths") + +def define_caffe2_proto(): + proto_cxx_library( + name = "caffe2_proto_ovrsource", + protos = [ + "caffe2/proto/caffe2.proto", + "caffe2/proto/caffe2_legacy.proto", + "caffe2/proto/hsm.proto", + "caffe2/proto/metanet.proto", + "caffe2/proto/predictor_consts.proto", + "caffe2/proto/prof_dag.proto", + "caffe2/proto/torch.proto", + ], + ) diff --git a/ovrsource_nomnigraph_defs.bzl b/ovrsource_nomnigraph_defs.bzl new file mode 100644 index 000000000000..2a378f231230 --- /dev/null +++ b/ovrsource_nomnigraph_defs.bzl @@ -0,0 +1,101 @@ +# @nolint +load("//arvr/tools/build_defs:oxx.bzl", "oxx_static_library", "oxx_test") +load("//arvr/tools/build_defs:oxx_python.bzl", "oxx_python_binary", "oxx_python_library") +load("//arvr/tools/build_defs:genrule_utils.bzl", "gen_cmake_header") +load("@bazel_skylib//lib:paths.bzl", "paths") + +def define_nomnigraph(): + oxx_python_binary( + name = "nomnigraph_gen_py_ovrsource", + main_module = "caffe2.core.nomnigraph.op_gen", + deps = [":nomnigraph_gen_py_main_ovrsource"], + ) + + oxx_python_library( + name = "nomnigraph_gen_py_main_ovrsource", + srcs = native.glob(["caffe2/core/nomnigraph/*.py"]), + base_module = "", + ) + + nomnigraph_gen_py_cmd = " ".join([ + "--install_dir=$OUT", + "--source_def=caffe2/core/nomnigraph/ops.def", + # "--source_def=caffe2/core/nomnigraph/fb/ops.def", + ]) + + native.genrule( + name = "nomnigraph_gen_ovrsource", + srcs = [ + # "caffe2/core/nomnigraph/fb/ops.def", + "caffe2/core/nomnigraph/op_gen.py", + "caffe2/core/nomnigraph/ops.def", + ], + cmd_exe = "mkdir $OUT && $(exe :nomnigraph_gen_py_ovrsource) " + nomnigraph_gen_py_cmd, + out = "gen", + ) + + TEST_SRCS = native.glob([ + "caffe2/core/nomnigraph/tests/*.cc", + ], exclude = [ + 
"caffe2/core/nomnigraph/tests/GraphTest.cc", # fails because debug iterator check + ]) + + oxx_static_library( + name = "nomnigraph_ovrsource", + srcs = [ + "caffe2/core/nomnigraph/Representations/NeuralNet.cc", + ], + compiler_flags = select({ + "ovr_config//compiler:clang": [ + "-Wno-undef", + "-Wno-shadow", + "-Wno-macro-redefined", + "-Wno-unused-variable", + "-Wno-unused-local-typedef", + "-Wno-unused-function", + ], + "DEFAULT": [], + }), + public_include_directories = ["caffe2/core/nomnigraph/include"], + public_raw_headers = native.glob([ + "caffe2/core/nomnigraph/include/**/*.h", + ]), + raw_headers = ["caffe2/core/common.h"], + reexport_all_header_dependencies = False, + tests = [ + ":" + paths.basename(filename)[:-len(".cc")] + "_ovrsource" + for filename in TEST_SRCS + ], + deps = [ + ":ovrsource_caffe2_macros.h", + "@fbsource//xplat/caffe2/c10:c10_ovrsource", + ], + ) + + [ + oxx_test( + name = paths.basename(filename)[:-len(".cc")] + "_ovrsource", + srcs = [ + filename, + "caffe2/core/nomnigraph/tests/test_util.cc", + ], + compiler_flags = select({ + "ovr_config//compiler:clang": [ + "-Wno-macro-redefined", + "-Wno-shadow", + "-Wno-undef", + "-Wno-unused-variable", + ], + "DEFAULT": [], + }), + framework = "gtest", + oncall = "frl_gemini", + raw_headers = native.glob([ + "caffe2/core/nomnigraph/tests/*.h", + ]), + deps = [ + ":nomnigraph_ovrsource", + ], + ) + for filename in TEST_SRCS + ] diff --git a/pt_template_srcs.bzl b/pt_template_srcs.bzl new file mode 100644 index 000000000000..b2cbf6cbf5e9 --- /dev/null +++ b/pt_template_srcs.bzl @@ -0,0 +1,239 @@ +# This file keeps a list of PyTorch source files that are used for templated selective build. +# NB: as this is PyTorch Edge selective build, we assume only CPU targets are +# being built + +load("@bazel_skylib//lib:paths.bzl", "paths") +load("@fbsource//tools/build_defs:fbsource_utils.bzl", "is_arvr_mode") +load(":build_variables.bzl", "aten_native_source_list") +load( + ":ufunc_defs.bzl", + "aten_ufunc_generated_cpu_kernel_sources", + "aten_ufunc_generated_cpu_sources", +) + +# Files in this list are supposed to be built separately for each app, +# for different operator allow lists. 
+TEMPLATE_SOURCE_LIST = [ + "torch/csrc/jit/runtime/register_prim_ops.cpp", + "torch/csrc/jit/runtime/register_special_ops.cpp", +] + aten_native_source_list + +# For selective build, we can lump the CPU and CPU kernel sources altogether +# because there is only ever one vectorization variant that is compiled +def aten_ufunc_generated_all_cpu_sources(gencode_pattern = "{}"): + return ( + aten_ufunc_generated_cpu_sources(gencode_pattern) + + aten_ufunc_generated_cpu_kernel_sources(gencode_pattern) + ) + +TEMPLATE_MASKRCNN_SOURCE_LIST = [ + "register_maskrcnn_ops.cpp", +] + +TEMPLATE_BATCH_BOX_COX_SOURCE_LIST = [ + "register_batch_box_cox_ops.cpp", +] + +METAL_SOURCE_LIST = [ + "aten/src/ATen/native/metal/MetalAten.mm", + "aten/src/ATen/native/metal/MetalGuardImpl.cpp", + "aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp", + "aten/src/ATen/native/metal/MetalCommandBuffer.mm", + "aten/src/ATen/native/metal/MetalContext.mm", + "aten/src/ATen/native/metal/MetalConvParams.mm", + "aten/src/ATen/native/metal/MetalTensorImplStorage.mm", + "aten/src/ATen/native/metal/MetalTensorUtils.mm", + "aten/src/ATen/native/metal/mpscnn/MPSCNNClampOp.mm", + "aten/src/ATen/native/metal/mpscnn/MPSCNNConvOp.mm", + "aten/src/ATen/native/metal/mpscnn/MPSCNNFullyConnectedOp.mm", + "aten/src/ATen/native/metal/mpscnn/MPSCNNNeuronOp.mm", + "aten/src/ATen/native/metal/mpscnn/MPSCNNUtils.mm", + "aten/src/ATen/native/metal/mpscnn/MPSImage+Tensor.mm", + "aten/src/ATen/native/metal/mpscnn/MPSImageUtils.mm", + "aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm", + "aten/src/ATen/native/metal/ops/MetalAddmm.mm", + "aten/src/ATen/native/metal/ops/MetalBinaryElementwise.mm", + "aten/src/ATen/native/metal/ops/MetalChunk.mm", + "aten/src/ATen/native/metal/ops/MetalClamp.mm", + "aten/src/ATen/native/metal/ops/MetalConcat.mm", + "aten/src/ATen/native/metal/ops/MetalConvolution.mm", + "aten/src/ATen/native/metal/ops/MetalCopy.mm", + "aten/src/ATen/native/metal/ops/MetalHardswish.mm", + "aten/src/ATen/native/metal/ops/MetalLeakyReLU.mm", + "aten/src/ATen/native/metal/ops/MetalNeurons.mm", + "aten/src/ATen/native/metal/ops/MetalPadding.mm", + "aten/src/ATen/native/metal/ops/MetalPooling.mm", + "aten/src/ATen/native/metal/ops/MetalReduce.mm", + "aten/src/ATen/native/metal/ops/MetalReshape.mm", + "aten/src/ATen/native/metal/ops/MetalSoftmax.mm", + "aten/src/ATen/native/metal/ops/MetalTranspose.mm", + "aten/src/ATen/native/metal/ops/MetalUpsamplingNearest.mm", +] + +UNET_METAL_PREPACK_SOURCE_LIST = [ + "unet_metal_prepack.cpp", + "unet_metal_prepack.mm", +] + +METAL_MASKRCNN_SOURCE_LIST = [ + "maskrcnn/srcs/GenerateProposals.mm", + "maskrcnn/srcs/RoIAlign.mm", +] + +# The get_template_source_dict() returns a dict containing a path prefix +# and a list of .cpp source files containing operator definitions and +# registrations that should get selected via templated selective build. +# The file selected_mobile_ops.h has the list of selected top level +# operators. 
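+# A hypothetical slice of the returned dict, for illustration only:
+#   {"torch/csrc/jit/runtime": ["torch/csrc/jit/runtime/register_prim_ops.cpp",
+#                               "torch/csrc/jit/runtime/register_special_ops.cpp"], ...}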
+# NB: doesn't include generated files; copy_template_registration_files +# handles those specially +def get_template_source_dict(): + ret = {} + for file_path in TEMPLATE_SOURCE_LIST: + path_prefix = paths.dirname(file_path) + if path_prefix not in ret: + ret[path_prefix] = [] + ret[path_prefix].append(file_path) + return ret + +def get_gen_oplist_outs(): + return { + "SupportedMobileModelsRegistration.cpp": [ + "SupportedMobileModelsRegistration.cpp", + ], + "selected_mobile_ops.h": [ + "selected_mobile_ops.h", + ], + "selected_operators.yaml": [ + "selected_operators.yaml", + ], + } + +def get_generate_code_bin_outs(): + outs = { + "autograd/generated/ADInplaceOrViewTypeEverything.cpp": ["autograd/generated/ADInplaceOrViewTypeEverything.cpp"], + "autograd/generated/ADInplaceOrViewType_0.cpp": ["autograd/generated/ADInplaceOrViewType_0.cpp"], + "autograd/generated/ADInplaceOrViewType_1.cpp": ["autograd/generated/ADInplaceOrViewType_1.cpp"], + "autograd/generated/Functions.cpp": ["autograd/generated/Functions.cpp"], + "autograd/generated/Functions.h": ["autograd/generated/Functions.h"], + "autograd/generated/TraceTypeEverything.cpp": ["autograd/generated/TraceTypeEverything.cpp"], + "autograd/generated/TraceType_0.cpp": ["autograd/generated/TraceType_0.cpp"], + "autograd/generated/TraceType_1.cpp": ["autograd/generated/TraceType_1.cpp"], + "autograd/generated/TraceType_2.cpp": ["autograd/generated/TraceType_2.cpp"], + "autograd/generated/TraceType_3.cpp": ["autograd/generated/TraceType_3.cpp"], + "autograd/generated/TraceType_4.cpp": ["autograd/generated/TraceType_4.cpp"], + "autograd/generated/VariableType.h": ["autograd/generated/VariableType.h"], + "autograd/generated/VariableTypeEverything.cpp": ["autograd/generated/VariableTypeEverything.cpp"], + "autograd/generated/VariableType_0.cpp": ["autograd/generated/VariableType_0.cpp"], + "autograd/generated/VariableType_1.cpp": ["autograd/generated/VariableType_1.cpp"], + "autograd/generated/VariableType_2.cpp": ["autograd/generated/VariableType_2.cpp"], + "autograd/generated/VariableType_3.cpp": ["autograd/generated/VariableType_3.cpp"], + "autograd/generated/VariableType_4.cpp": ["autograd/generated/VariableType_4.cpp"], + "autograd/generated/variable_factories.h": ["autograd/generated/variable_factories.h"], + } + + if is_arvr_mode(): + outs.update({ + "autograd/generated/python_fft_functions.cpp": ["autograd/generated/python_fft_functions.cpp"], + "autograd/generated/python_functions.h": ["autograd/generated/python_functions.h"], + "autograd/generated/python_functions_0.cpp": ["autograd/generated/python_functions_0.cpp"], + "autograd/generated/python_functions_1.cpp": ["autograd/generated/python_functions_1.cpp"], + "autograd/generated/python_functions_2.cpp": ["autograd/generated/python_functions_2.cpp"], + "autograd/generated/python_functions_3.cpp": ["autograd/generated/python_functions_3.cpp"], + "autograd/generated/python_functions_4.cpp": ["autograd/generated/python_functions_4.cpp"], + "autograd/generated/python_linalg_functions.cpp": ["autograd/generated/python_linalg_functions.cpp"], + "autograd/generated/python_nn_functions.cpp": ["autograd/generated/python_nn_functions.cpp"], + "autograd/generated/python_return_types.cpp": ["autograd/generated/python_return_types.cpp"], + "autograd/generated/python_sparse_functions.cpp": ["autograd/generated/python_sparse_functions.cpp"], + "autograd/generated/python_special_functions.cpp": ["autograd/generated/python_special_functions.cpp"], + 
"autograd/generated/python_torch_functions_0.cpp": ["autograd/generated/python_torch_functions_0.cpp"], + "autograd/generated/python_torch_functions_1.cpp": ["autograd/generated/python_torch_functions_1.cpp"], + "autograd/generated/python_torch_functions_2.cpp": ["autograd/generated/python_torch_functions_2.cpp"], + "autograd/generated/python_variable_methods.cpp": ["autograd/generated/python_variable_methods.cpp"], + }) + return outs + +def get_template_registration_files_outs(): + outs = {} + for file_path in TEMPLATE_MASKRCNN_SOURCE_LIST: + outs[file_path] = [file_path] + + for file_path in TEMPLATE_BATCH_BOX_COX_SOURCE_LIST: + outs[file_path] = [file_path] + + for file_path in TEMPLATE_SOURCE_LIST: + outs[file_path] = [file_path] + + for base_name in aten_ufunc_generated_all_cpu_sources(): + file_path = "aten/src/ATen/{}".format(base_name) + outs[file_path] = [file_path] + + return outs + +def get_template_registration_file_rules(rule_name): + rules = [] + for file_path in TEMPLATE_SOURCE_LIST + TEMPLATE_MASKRCNN_SOURCE_LIST + TEMPLATE_BATCH_BOX_COX_SOURCE_LIST: + rules.append(":{}[{}]".format(rule_name, file_path)) + for file_path in aten_ufunc_generated_all_cpu_sources(): + rules.append(":{}[aten/src/ATen/{}]".format(rule_name, file_path)) + + return rules + +# ---------------------METAL RULES--------------------- +def get_metal_source_dict(): + ret = {} + for file_path in METAL_SOURCE_LIST: + path_prefix = paths.dirname(file_path) + if path_prefix not in ret: + ret[path_prefix] = [] + ret[path_prefix].append(file_path) + return ret + +def get_metal_registration_files_outs(): + outs = {} + for file_path in METAL_SOURCE_LIST: + outs[file_path] = [file_path] + + for file_path in UNET_METAL_PREPACK_SOURCE_LIST: + outs[file_path] = [file_path] + + for file_path in METAL_MASKRCNN_SOURCE_LIST: + outs[file_path] = [file_path] + return outs + +# There is a really weird issue with the arvr windows builds where +# the custom op files are breaking them. See https://fburl.com/za87443c +# The hack is just to not build them for that platform and pray they arent needed. 
+def get_metal_registration_files_outs_windows(): + outs = {} + for file_path in METAL_SOURCE_LIST: + outs[file_path] = [file_path] + return outs + +def get_metal_registration_files_rules(rule_name): + ret = {} + objc_rules = [] + cxx_rules = [] + + for file_path in METAL_SOURCE_LIST + METAL_MASKRCNN_SOURCE_LIST + UNET_METAL_PREPACK_SOURCE_LIST: + if ".cpp" not in file_path: + objc_rules.append(":{}[{}]".format(rule_name, file_path)) + else: + cxx_rules.append(":{}[{}]".format(rule_name, file_path)) + ret["objc"] = objc_rules + ret["cxx"] = cxx_rules + return ret + +def get_metal_registration_files_rules_windows(rule_name): + ret = {} + objc_rules = [] + cxx_rules = [] + + for file_path in METAL_SOURCE_LIST: + if ".cpp" not in file_path: + objc_rules.append(":{}[{}]".format(rule_name, file_path)) + else: + cxx_rules.append(":{}[{}]".format(rule_name, file_path)) + ret["objc"] = objc_rules + ret["cxx"] = cxx_rules + return ret diff --git a/test/defs.bzl b/test/defs.bzl new file mode 100644 index 000000000000..0e92326402dd --- /dev/null +++ b/test/defs.bzl @@ -0,0 +1,112 @@ +load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") +load("@fbsource//tools/build_defs/sandcastle:sandcastle_defs.bzl", "is_sandcastle_machine") + +def define_python_unittest(pytest = False, **kwargs): + build_mode = native.read_config("fbcode", "build_mode_test_label") + enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None)) + + PYTORCH_TEST_WITH_ASAN = "1" if ("asan" in build_mode or build_mode == "dev") else "0" + + PYTORCH_TEST_WITH_DEV_DBG_ASAN = "1" if (build_mode == "dev" or "dev-asan" in build_mode or "dbg-asan" in build_mode or "dbgo-asan" in build_mode) else "0" + + PYTORCH_TEST_WITH_TSAN = "1" if ("tsan" in build_mode) else "0" + + PYTORCH_TEST_WITH_UBSAN = "1" if ("ubsan" in build_mode or build_mode == "dev") else "0" + + NO_MULTIPROCESSING_SPAWN = "1" if is_sandcastle_machine() else "0" + + ENABLE_FLATBUFFER = "1" if enable_flatbuffer else "0" + + # indicates we are running in test env. + # "deepcopy" the 'env: Dict[str, str]' + kwargs["env"] = dict(kwargs.get("env", {})) + kwargs["env"]["PYTORCH_TEST"] = "1" + kwargs["env"]["PYTORCH_TEST_FBCODE"] = "1" + kwargs["env"]["PYTORCH_TEST_WITH_ASAN"] = PYTORCH_TEST_WITH_ASAN + kwargs["env"]["PYTORCH_TEST_WITH_DEV_DBG_ASAN"] = PYTORCH_TEST_WITH_DEV_DBG_ASAN + kwargs["env"]["PYTORCH_TEST_WITH_TSAN"] = PYTORCH_TEST_WITH_TSAN + kwargs["env"]["PYTORCH_TEST_WITH_UBSAN"] = PYTORCH_TEST_WITH_UBSAN + kwargs["env"]["NO_MULTIPROCESSING_SPAWN"] = NO_MULTIPROCESSING_SPAWN + kwargs["env"]["ENABLE_FLATBUFFER"] = ENABLE_FLATBUFFER + + # To speed up TP tests. + kwargs["env"]["TENSORPIPE_TLS_DATACENTER"] = "test_dc" + + # Run CUDA tests on GPUs + if kwargs.get("name").endswith("cuda"): + # "deepcopy" the 'tags: List[str]' + kwargs["tags"] = list(kwargs.get("tags", [])) + kwargs["tags"].extend([ + "re_opts_capabilities={\"platform\": \"gpu-remote-execution\", \"subplatform\": \"P100\"}", + "supports_remote_execution", + "run_as_bundle", + "tpx:experimental-shard-size-for-bundle=100", + ]) + kwargs["env"]["PYTORCH_TEST_REMOTE_GPU"] = "1" + + if pytest: + python_pytest( + **kwargs + ) + else: + python_unittest( + **kwargs + ) + +def define_mp_tests(tests, additional_deps = None, pytest = False, **kwargs): + # LeakSanitizer doesn't work for python multiprocessing. 
+ # See https://fb.workplace.com/groups/fbcode/posts/2625521060818050/ + # and https://fb.workplace.com/groups/101100140348621/posts/1278688645923092/ + extra_env = { + "ASAN_OPTIONS": "detect_leaks=0", + "CUDA_INJECTION64_PATH": "0", # resolve kineto TSAN flakiness + } + + # Serialize test cases since multiple tests running on same GPUs can + # deadlock or there can be port conflicts. + if "tags" not in kwargs: + kwargs["tags"] = [] + if "serialize_test_cases" not in kwargs["tags"]: + kwargs["tags"].append("serialize_test_cases") + define_tests(tests, additional_deps, pytest, extra_env, **kwargs) + +def define_q_distributed_test(tests, env = None, additional_deps = None, pytest = False, **kwargs): + define_tests(tests, additional_deps, pytest, env, **kwargs) + +def define_tests(tests, additional_deps = None, pytest = False, extra_env = {}, **kwargs): + if additional_deps == None: + additional_deps = {} + + provided_tags = kwargs.pop("tags", []) + + env = { + "DOCS_SRC_DIR": "$(location //caffe2/docs/source:doc_files)", + "MKL_NUM_THREADS": "1", + "OMP_NUM_THREADS": "1", + "SKIP_TEST_BOTTLENECK": "1", + } + env.update(extra_env) + for name, srcs in tests.items(): + tags = list(provided_tags) + + test_deps = ["//caffe2:test-lib"] + additional_deps.get(name, []) + define_python_unittest( + pytest, + name = name, + srcs = srcs, + base_module = "", + compile = "with-source", + env = env, + py_version = ">=3.5", + strip_libpar = True, + tags = tags, + deps = test_deps, + # Depend directly on :libtorch so that tests won't be pruned by the + # rdep distance heuristic. + cpp_deps = ["//caffe2:libtorch"], + runtime_deps = [ + "//caffe2/docs/source:doc_files", + ], + **kwargs + ) diff --git a/test/distributed/defs.bzl b/test/distributed/defs.bzl new file mode 100644 index 000000000000..d3b3040ea4c3 --- /dev/null +++ b/test/distributed/defs.bzl @@ -0,0 +1,39 @@ +load("@fbsource//tools/build_defs:testpilot_defs.bzl", "special_tags") +load( + "//caffe2/test:defs.bzl", + "define_python_unittest", +) + +# These distributed tests need custom environment variables +def define_distributed_test(**kwargs): + # LeakSanitizer doesn't work for python multiprocessing. 
+ # See https://fb.workplace.com/groups/fbcode/posts/2625521060818050/ + # and https://fb.workplace.com/groups/101100140348621/posts/1278688645923092/ + kwargs["env"]["ASAN_OPTIONS"] = "detect_leaks=0" + + # Resolve kineto TSAN flakiness + kwargs["env"]["CUDA_INJECTION64_PATH"] = "0" + define_python_unittest( + base_module = "", + main_module = "fb.test_distributed_trap", + py_version = ">=3.5", + tags = [special_tags.run_as_bundle], + deps = [ + "//caffe2:test-lib", + "//caffe2:torch", + "//caffe2/torch/fb/rendezvous:zeus", + "//pytorch/vision:torchvision", + ], + external_deps = [ + ("numpy", None), + ("scipy", None), + ], + **kwargs + ) + +def define_c10d_distributed_test(srcs, **kwargs): + srcs.extend(["fb/test_distributed_trap.py"]) + define_distributed_test( + srcs = srcs + native.glob(["data/*.py"]), + **kwargs + ) diff --git a/test/distributed/fsdp/defs.bzl b/test/distributed/fsdp/defs.bzl new file mode 100644 index 000000000000..2e496838c807 --- /dev/null +++ b/test/distributed/fsdp/defs.bzl @@ -0,0 +1,22 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load( + "//caffe2/test:defs.bzl", + "define_mp_tests", +) + +def define_fsdp_tests(): + test_files = native.glob(["**/test_*.py"]) + + TESTS = {} + + additional_deps = {} + for test_file in test_files: + test_file_name = paths.basename(test_file) + test_name = test_file_name.replace("test_", "").replace(".py", "") + TESTS[test_name] = [test_file] + additional_deps[test_name] = ["//pytorch/vision:torchvision"] + + define_mp_tests( + tests = TESTS, + additional_deps = additional_deps, + ) diff --git a/test/distributed/pipeline/sync/defs.bzl b/test/distributed/pipeline/sync/defs.bzl new file mode 100644 index 000000000000..0de277bddaef --- /dev/null +++ b/test/distributed/pipeline/sync/defs.bzl @@ -0,0 +1,22 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") +load( + "//caffe2/test:defs.bzl", + "define_tests", +) + +def define_pipeline_tests(): + test_files = native.glob(["**/test_*.py"]) + + TESTS = {} + + for test_file in test_files: + test_file_name = paths.basename(test_file) + test_name = test_file_name.replace("test_", "").replace(".py", "") + TESTS[test_name] = [test_file] + + define_tests( + pytest = True, + tests = TESTS, + external_deps = [("pytest", None)], + resources = ["conftest.py"], + ) diff --git a/third_party/tensorflow_cuda_bazel_build/cuda/build_defs.bzl b/third_party/tensorflow_cuda_bazel_build/cuda/build_defs.bzl new file mode 100755 index 000000000000..a394b6ce9204 --- /dev/null +++ b/third_party/tensorflow_cuda_bazel_build/cuda/build_defs.bzl @@ -0,0 +1,31 @@ +# Macros for building CUDA code. +def if_cuda(if_true, if_false = []): + """Shorthand for select()'ing on whether we're building with CUDA. + + Returns a select statement which evaluates to if_true if we're building + with CUDA enabled. Otherwise, the select statement evaluates to if_false. + + """ + return select({ + "@local_config_cuda//cuda:using_clang": if_true, + "@local_config_cuda//cuda:using_nvcc": if_true, + "//conditions:default": if_false, + }) + +def cuda_default_copts(): + """Default options for all CUDA compilations.""" + return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + []) + +def cuda_is_configured(): + """Returns true if CUDA was enabled during the configure process.""" + return True + +def if_cuda_is_configured(x): + """Tests if the CUDA was enabled during the configure process. + + Unlike if_cuda(), this does not require that we are building with + --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. 
+ """ + if cuda_is_configured(): + return x + return [] diff --git a/tools/cpuinfo_target_definition.bzl b/tools/cpuinfo_target_definition.bzl new file mode 100644 index 000000000000..27b1c7bb272d --- /dev/null +++ b/tools/cpuinfo_target_definition.bzl @@ -0,0 +1,12 @@ +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("//caffe2/tools:sgx_target_definitions.bzl", "is_sgx") + +def add_cpuinfo_lib(): + cpp_library( + name = "cpuinfo", + exported_deps = [ + "fbsource//third-party/cpuinfo_sgx:cpuinfo_coffeelake", + ] if is_sgx else [ + "fbsource//third-party/cpuinfo:cpuinfo", + ], + ) diff --git a/tools/miniz_target_definition.bzl b/tools/miniz_target_definition.bzl new file mode 100644 index 000000000000..7040ff6beaa1 --- /dev/null +++ b/tools/miniz_target_definition.bzl @@ -0,0 +1,25 @@ +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("//caffe2/tools:sgx_target_definitions.bzl", "is_sgx") + +def add_miniz_lib(): + cpp_library( + name = "miniz", + srcs = [ + "third_party/miniz-2.0.8/fb/FollyCrcPlugin.cpp", + "third_party/miniz-2.0.8/fb/miniz-fb.c", + ], + headers = { + "caffe2/third_party/miniz-2.0.8/miniz.c": "third_party/miniz-2.0.8/miniz.c", + "miniz-fb.h": "third_party/miniz-2.0.8/fb/miniz-fb.h", + "miniz.h": "third_party/miniz-2.0.8/miniz.h", + }, + header_namespace = "", + # -fexceptions is required, otherwise, when we use @mode/opt-clang-thinlto, + # c functions become noexcept, and we may not be able to catch exceptions + # during model loading. + compiler_flags = ["-DUSE_EXTERNAL_MZCRC", "-fexceptions"] + (["-DMINIZ_NO_STDIO"] if is_sgx else []), + # folly is only required as a dependency if USE_EXTERNAL_MZCRC + # above is defined, and FollyCrcPlugin.cpp is added. + # Neither are strictly needed, but run significantly faster. 
+ exported_deps = ["//folly/hash:checksum"], + ) diff --git a/tools/perf_kernel_defs.bzl b/tools/perf_kernel_defs.bzl new file mode 100644 index 000000000000..2a699840c8bf --- /dev/null +++ b/tools/perf_kernel_defs.bzl @@ -0,0 +1,54 @@ +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") + +is_dbg_build = native.read_config("fbcode", "build_mode", "").find("dbg") != -1 +is_sanitizer = native.read_config("fbcode", "sanitizer", "") != "" + +def define_perf_kernels(prefix, levels_and_flags, compiler_common_flags, dependencies, external_deps): + vectorize_flags = ([ + # "-Rpass=loop-vectorize", # Add vectorization information to output + "-DENABLE_VECTORIZATION=1", + "-fveclib=SVML", + ] if not is_dbg_build and not is_sanitizer else []) + + compiler_specific_flags = { + "clang": vectorize_flags, + "gcc": [], + } + + compiler_specific_flags["clang"] += ["-Wno-pass-failed"] + + common_srcs = native.glob( + ["**/*.cc"], + exclude = [ + "**/*_avx512.cc", + "**/*_avx2.cc", + "**/*_avx.cc", + ], + ) + + cpp_headers = native.glob( + ["**/*.h"], + ) + + kernel_targets = [] + for level, flags in levels_and_flags: + cpp_library( + name = prefix + "perfkernels_" + level, + srcs = native.glob(["**/*_" + level + ".cc"]), + headers = cpp_headers, + compiler_flags = compiler_common_flags + flags, + compiler_specific_flags = compiler_specific_flags, + exported_deps = dependencies, + exported_external_deps = external_deps, + ) + kernel_targets.append(":" + prefix + "perfkernels_" + level) + + cpp_library( + name = prefix + "perfkernels", + srcs = common_srcs, + headers = cpp_headers, + compiler_flags = compiler_common_flags, + compiler_specific_flags = compiler_specific_flags, + link_whole = True, + exported_deps = kernel_targets + dependencies, + ) diff --git a/tools/rules/METADATA.bzl b/tools/rules/METADATA.bzl new file mode 100644 index 000000000000..a1e9c277630c --- /dev/null +++ b/tools/rules/METADATA.bzl @@ -0,0 +1,9 @@ +# THIS FILE IS AUTOMATICALLY GENERATED FROM INFORMATION STORED IN +# THIRD-PARTY METADATA SERVICE. YOUR MANUAL CHANGES TO THIS FILE WILL +# BE PRESERVED AND WILL SERVE AS THE SOURCE OF TRUTH FOR METADATA OF +# THIS PACKAGE. 
+# TPMS-GENERATED: b3448f8fd2a893772f944f37627e63917b77dede +METADATA = { + "name": "rules", + "owner": "pytorch_dev_infra", +} diff --git a/tools/sgx_aten_target_definitions.bzl b/tools/sgx_aten_target_definitions.bzl new file mode 100644 index 000000000000..48886ae16fe2 --- /dev/null +++ b/tools/sgx_aten_target_definitions.bzl @@ -0,0 +1,261 @@ +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("@fbcode_macros//build_defs:custom_rule.bzl", "custom_rule") +load("//caffe2:build.bzl", "GENERATED_CPP") +load("//caffe2:build_variables.bzl", "jit_core_headers", "jit_core_sources") +load("//caffe2/tools:sgx_target_definitions.bzl", "is_sgx") + +default_compiler_flags = [ + "-Wno-error=strict-aliasing", + "-Wno-unused-local-typedefs", + "-Wno-shadow-compatible-local", + "-Wno-maybe-uninitialized", # aten is built with gcc as part of HHVM + "-Wno-unknown-pragmas", + "-Wno-strict-overflow", + # See https://fb.facebook.com/groups/fbcode/permalink/1813348245368673/ + # These trigger on platform007 + "-Wno-stringop-overflow", + "-Wno-class-memaccess", + "-DHAVE_MMAP", + "-DUSE_GCC_ATOMICS=1", + "-D_FILE_OFFSET_BITS=64", + "-DHAVE_SHM_OPEN=1", + "-DHAVE_SHM_UNLINK=1", + "-DHAVE_MALLOC_USABLE_SIZE=1", + "-DTH_HAVE_THREAD", + "-DCPU_CAPABILITY_DEFAULT", + "-DTH_INDEX_BASE=0", + "-DMAGMA_V2", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DUSE_QNNPACK", + "-DUSE_PYTORCH_QNNPACK", + # The dynamically loaded NVRTC trick doesn't work in fbcode, + # and it's not necessary anyway, because we have a stub + # nvrtc library which we load canonically anyway + "-DUSE_DIRECT_NVRTC", + "-DUSE_XNNPACK", + "-Wno-error=uninitialized", +] + +compiler_specific_flags = { + "clang": [ + "-Wno-absolute-value", + "-Wno-pass-failed", + "-Wno-braced-scalar-init", + ], + "gcc": [ + "-Wno-error=array-bounds", + ], +} + +def add_sgx_aten_libs(ATEN_HEADERS_CPU_MKL, ATEN_SRCS_CPU_MKL, ATEN_CORE_CPP): + # we do not need to define these targets if we are in not SGX mode + if not is_sgx: + return + + x64_compiler_flags = [ + "-DUSE_SSE2", + "-DUSE_SSE3", + "-DUSE_SSE4_1", + "-DUSE_SSE4_2", + # dont enable AVX2 because we dont have runtime dispatch + "-DCPU_CAPABILITY_DEFAULT", + "-DCPU_CAPABILITY=DEFAULT", + "-DTH_INDEX_BASE=0", + "-DTH_INDEX_BASE=0", + "-msse", + "-msse2", + "-msse3", + "-msse4", + "-msse4.1", + "-msse4.2", + "-mavx", + "-mavx2", + ] + + cpu_preprocessor_flags = [ + "-DATEN_MKLDNN_ENABLED_FBCODE=0", + "-DATEN_NNPACK_ENABLED_FBCODE=0", + "-DATEN_MKL_ENABLED_FBCODE=0", + "-DAT_BUILD_WITH_BLAS_FBCODE=1", + "-DAT_BLAS_USE_CBLAS_DOT_FBCODE=1", + "-DAT_BLAS_F2C_FBCODE=0", + "-DATEN_CUDNN_ENABLED_FBCODE=1", + "-DATEN_ROCM_ENABLED_FBCODE=0", + "-DC10_MOBILE", + "-DAT_PARALLEL_NATIVE_FBCODE=1", + ] + + custom_rule( + name = "generate-sgx-config", + srcs = [ + "src/ATen/Config.h.in", + ], + build_args = " ".join([ + "--input-file", + "src/ATen/Config.h.in", + "--output-file", + "Config.h", + "--replace", + "@AT_MKLDNN_ENABLED@", + "0", + "--replace", + "@AT_MKL_ENABLED@", + "0", + "--replace", + "@AT_MKL_SEQUENTIAL@", + "0", + "--replace", + "@AT_FFTW_ENABLED@", + "0", + "--replace", + "@AT_POCKETFFT_ENABLED@", + "0", + "--replace", + "@AT_NNPACK_ENABLED@", + "ATEN_NNPACK_ENABLED_FBCODE", + "--replace", + "@AT_BUILD_WITH_BLAS@", + "1", + "--replace", + "@AT_BUILD_WITH_LAPACK@", + "0", + "--replace", + "@CAFFE2_STATIC_LINK_CUDA_INT@", + "0", + "--replace", + "@AT_BLAS_F2C@", + "AT_BLAS_F2C_FBCODE", + "--replace", + "@AT_BLAS_USE_CBLAS_DOT@", + "AT_BLAS_USE_CBLAS_DOT_FBCODE", + "--replace", + "@AT_PARALLEL_OPENMP@", + 
"0", + "--replace", + "@AT_PARALLEL_NATIVE@", + "1", + "--replace", + "@AT_PARALLEL_NATIVE_TBB@", + "0", + ]), + build_script_dep = "//caffe2:substitute", + output_gen_files = ["Config.h"], + ) + + cpp_library( + name = "generated-sgx-config-header", + headers = [":generate-sgx-config=Config.h"], + header_namespace = "ATen", + ) + + ATEN_CORE_H = native.glob([ + "src/ATen/core/*.h", + "src/ATen/core/boxing/*.h", + "src/ATen/core/boxing/impl/*.h", + "src/ATen/core/dispatch/*.h", + "src/ATen/core/op_registration/*.h", + ]) + [ + "src/ATen/CPUGeneratorImpl.h", + "src/ATen/NumericUtils.h", + ] + + cpp_library( + name = "ATen-core-sgx-headers", + headers = ATEN_CORE_H, + propagated_pp_flags = [ + "-Icaffe2/aten/src", + ], + exported_deps = [ + "//caffe2:generated-aten-headers-core", + "//caffe2/c10:c10", + ], + ) + + cpp_library( + name = "ATen-sgx-core", + # Sorry, this is duped with GENERATED_CPP_CORE. I was too lazy to refactor + # the list into a bzl file + srcs = ATEN_CORE_CPP + [ + ":gen_aten=Operators_0.cpp", + ":gen_aten=Operators_1.cpp", + ":gen_aten=Operators_2.cpp", + ":gen_aten=Operators_3.cpp", + ":gen_aten=Operators_4.cpp", + ":gen_aten=core/ATenOpList.cpp", + ":gen_aten=core/TensorMethods.cpp", + ], + headers = native.glob([ + "src/ATen/*.h", + "src/ATen/ops/*.h", + "src/ATen/quantized/*.h", + ]), + compiler_flags = default_compiler_flags, + compiler_specific_flags = compiler_specific_flags, + link_whole = True, + # Tests that fail in CPU static dispatch mode because they require + # the dispatcher in order to work can be gated out with `#ifndef + # ATEN_CPU_STATIC_DISPATCH`. + propagated_pp_flags = [], + # Must be linked with caffe2_core + undefined_symbols = True, + exported_deps = [ + ":ATen-core-sgx-headers", + "//caffe2:jit-core-sgx", + ], + ) + + cpp_library( + name = "ATen-sgx-cpu", + srcs = ATEN_SRCS_CPU_MKL + [":gen_aten=" + x for x in GENERATED_CPP], + headers = ATEN_HEADERS_CPU_MKL, + arch_compiler_flags = {"x86_64": x64_compiler_flags}, + compiler_flags = default_compiler_flags, + compiler_specific_flags = compiler_specific_flags, + include_directories = [ + "src", + "src/TH", + ], + link_whole = True, + propagated_pp_flags = cpu_preprocessor_flags, + exported_deps = [ + "fbsource//third-party/cpuinfo_sgx:cpuinfo_coffeelake", + ":ATen-sgx-core", + ":aten-headers-cpu", + ":generated-aten-headers-cpu", + ":generated-sgx-config-header", + ":generated-sgx-th-general-header", + ":generated-sgx-th-general-header-no-prefix", + "//caffe2/caffe2:caffe2_sgx_core", + "//caffe2/caffe2/perfkernels:sgx_perfkernels", + "//xplat/third-party/XNNPACK:XNNPACK", + ], + exported_external_deps = [ + ("OpenBLAS", None, "OpenBLAS"), + ], + deps = [ + "//caffe2/aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack", + ], + ) + +def add_sgx_aten_jit_libs(): + # we do not need to define these targets if we are in not SGX mode + if not is_sgx: + return + + cpp_library( + name = "jit-core-sgx", + # Sorry, this is duped with GENERATED_CPP_CORE. 
I was too lazy to refactor + # the list into a bzl file + srcs = jit_core_sources, + headers = jit_core_headers, + compiler_flags = default_compiler_flags, + compiler_specific_flags = compiler_specific_flags, + include_directories = [""], + link_whole = True, + # Must be linked with caffe2_core + undefined_symbols = True, + exported_deps = [ + "//caffe2:ATen-core-sgx-headers", + "//caffe2/c10:c10", + ], + ) diff --git a/tools/sgx_caffe2_target_definitions.bzl b/tools/sgx_caffe2_target_definitions.bzl new file mode 100644 index 000000000000..551244fe8c96 --- /dev/null +++ b/tools/sgx_caffe2_target_definitions.bzl @@ -0,0 +1,253 @@ +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("//caffe2/caffe2:defs.bzl", "get_sgx_patterns") +load("//caffe2/tools:perf_kernel_defs.bzl", "define_perf_kernels") +load("//caffe2/tools:sgx_target_definitions.bzl", "is_sgx") + +def add_sgx_caffe_libs(): + # we do not need to define these targets if we are in not SGX mode + if not is_sgx: + return + + core_file_patterns = [ + "core/allocator.cc", + "core/logging.cc", + "core/flags.cc", + "core/common.cc", + "core/context.cc", + "core/event.cc", + "core/context_base.cc", + "core/numa.cc", + "core/blob_serialization.cc", + "core/tensor.cc", + "core/types.cc", + "core/blob_stats.cc", + "opt/converter.cc", + "opt/annotations.cc", + "utils/cpuid.cc", + "utils/threadpool/ThreadPool.cc", + "utils/threadpool/pthreadpool-cpp.cc", + "utils/threadpool/thread_pool_guard.cpp", + "utils/proto_utils.cc", + ] + + core_srcs = native.glob( + core_file_patterns, + ) + + core_external_deps = [ + "protobuf", + "glog", + "sparsehash", + "zstd", + ] + + core_internal_deps = [ + "fbsource//third-party/fmt:fmt", + "//caffe/proto:fb_protobuf", + "//caffe2/caffe2/proto:fb_protobuf", + "//caffe2/c10:c10", + "//common/base:exception", + "//common/logging:logging", + ] + + internal_deps = core_internal_deps + [ + # "//libfb/py/mkl:mkl_dep_handle_lp64", + "//onnx/onnx:onnx_lib", + "//foxi:foxi_loader", + "//caffe2/caffe2/fb/onnxifi:fbonnxifi_loader_stub", + # "//rocksdb:rocksdb", + "//caffe2:cpuinfo", + "//xplat/QNNPACK:QNNPACK", + "//folly/experimental/symbolizer:symbolizer", + "//folly/hash:hash", + "//folly/io:iobuf", + "//folly:conv", + "//folly:dynamic", + "//folly:executor", + "//folly:format", + "//folly:json", + "//folly:map_util", + "//folly:memory", + "//folly:mpmc_queue", + "//folly:optional", + "//folly:random", + "//folly:range", + "//folly/synchronization:rw_spin_lock", + "//folly:singleton", + "//folly:string", + "//folly:synchronized", + "//folly:thread_local", + "//folly:traits", + "//caffe2:ATen-core-headers", + # important dependency to claim space for future refactorings + "//caffe2:ATen-cpu", + "//caffe2/caffe2/perfkernels:perfkernels", + "//xplat/third-party/FP16:FP16", + "fbsource//third-party/neon2sse:neon2sse", + ] + + exclude = [ + # hip files are obtained from defs_hip.bzl + # do not include in the cpu/cuda build + "**/hip/**/*", + "test/caffe2_gtest_main.cc", + "quantization/server/**/*", + "fb/async/comm/**/*", + "fb/monitoring/**/*", + "fb/session/**/*", + # utils/knobs.cc and utils/knob_patcher.cc are only used in the open-source build + # The internal build uses versions from fb/utils/ instead. 
+ "utils/knobs.cc", + "utils/knob_patcher.cc", + ] + + core_file_patterns = [ + "core/allocator.cc", + "core/logging.cc", + "core/flags.cc", + "core/common.cc", + "core/context.cc", + "core/event.cc", + "core/context_base.cc", + "core/numa.cc", + "core/blob_serialization.cc", + "core/tensor.cc", + "core/types.cc", + "core/blob_stats.cc", + "opt/converter.cc", + "opt/annotations.cc", + "utils/cpuid.cc", + "utils/threadpool/ThreadPool.cc", + "utils/threadpool/pthreadpool-cpp.cc", + "utils/threadpool/thread_pool_guard.cpp", + "utils/proto_utils.cc", + ] + + test_file_patterns = get_sgx_patterns([ + "_test.cc", + "_test.cpp", + ]) + + gpu_file_patterns = get_sgx_patterns([ + "_gpu.cc", + "_cudnn.cc", + ]) + + cpu_file_patterns = get_sgx_patterns([ + ".cc", + ".cpp", + ]) + + cpp_srcs = native.glob( + cpu_file_patterns, + exclude = exclude + gpu_file_patterns + test_file_patterns + core_file_patterns, + ) + + pp_flags = [ + "-Icaffe2", + "-Imodules", + "-DEIGEN_NO_DEBUG", + "-DCAFFE2_USE_GOOGLE_GLOG", + "-DCAFFE2_NO_CROSS_ARCH_WARNING", + "-DCAFFE2_USE_EXCEPTION_PTR", + # Work-around for incompatible thread pools in Caffe2 and NNPACK + "-DFBCODE_CAFFE2", + "-DUSE_PTHREADPOOL", + "-DC10_MOBILE", + ] + + compiler_flags = [ + "-Wno-unknown-pragmas", + "-Wno-narrowing", + "-Wno-missing-braces", + "-Wno-strict-overflow", + "-mno-avx", + "-Wno-error=unused-result", + ] + + cpu_header_patterns = [ + "**/*.h", + ] + + cpp_headers = native.glob( + cpu_header_patterns, + exclude = exclude, + ) + + cpp_library( + name = "caffe2_sgx_headers", + headers = cpp_headers, + propagated_pp_flags = pp_flags, + exported_deps = core_internal_deps + [ + "//folly/io/async:async_base", + "//caffe2/aten:ATen-core-sgx-headers", + ], + exported_external_deps = core_external_deps, + ) + + cpp_library( + name = "caffe2_sgx_core", + srcs = core_srcs + [ + "serialize/inline_container.cc", + "serialize/crc.cc", + "serialize/file_adapter.cc", + "serialize/istream_adapter.cc", + "serialize/read_adapter_interface.cc", + ], + compiler_flags = compiler_flags, + link_whole = True, + propagated_pp_flags = pp_flags, + exported_deps = core_internal_deps + [ + "//caffe2/aten:ATen-sgx-core", + "//caffe2/caffe2/core/nomnigraph:nomnigraph", + "//xplat/third-party/pthreadpool:pthreadpool", + "//caffe2:miniz", + ], + exported_external_deps = core_external_deps, + ) + +def add_sgx_perf_kernel_libs(): + # we do not need to define these targets if we are in not SGX mode + if not is_sgx: + return + + dependencies = [ + "//caffe2/caffe2:caffe2_sgx_headers", + "//caffe2/aten:ATen-core-sgx-headers", + ] + + compiler_common_flags = [ + "-DCAFFE2_PERF_WITH_AVX2", + "-DCAFFE2_PERF_WITH_AVX", + ] + + external_deps = [] + + # these are esentially disabled for hte sgx build but we still need them + # to avoid linking issues + levels_and_flags = ([ + ( + "avx2", + [ + "-mavx2", + "-mfma", + "-mavx", + "-mf16c", + ], + ), + ( + "avx", + [ + "-mavx", + "-mf16c", + ], + ), + ]) + + define_perf_kernels( + prefix = "sgx_", + levels_and_flags = levels_and_flags, + compiler_common_flags = compiler_common_flags, + dependencies = dependencies, + external_deps = external_deps, + ) diff --git a/tools/sgx_target_definitions.bzl b/tools/sgx_target_definitions.bzl new file mode 100644 index 000000000000..2cb816e1cc9b --- /dev/null +++ b/tools/sgx_target_definitions.bzl @@ -0,0 +1,96 @@ +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("@fbsource//tools/build_defs:buckconfig.bzl", "read_bool") +load( + "//caffe2:build_variables.bzl", + 
"core_sources_common", + "core_sources_full_mobile", + "core_trainer_sources", + "libtorch_extra_sources", + "libtorch_generated_sources", +) + +is_sgx = read_bool("fbcode", "sgx_mode", False) + +def libtorch_sgx_sources(gencode_pattern = ":generate-code[{}]"): + libtorch_core_mobile_sources = sorted(core_sources_common + core_sources_full_mobile + core_trainer_sources) + + sgx_sources_to_exclude = [ + "torch/csrc/jit/tensorexpr/llvm_codegen.cpp", + "torch/csrc/jit/tensorexpr/llvm_jit.cpp", + "torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp", + ] + + return libtorch_generated_sources(gencode_pattern) + [i for i in libtorch_core_mobile_sources if i not in sgx_sources_to_exclude] + [i for i in libtorch_extra_sources if i not in sgx_sources_to_exclude] + +def add_sgx_torch_libs(): + # we do not need to define these targets if we are in not SGX mode + if not is_sgx: + return + + compiler_flags_cpu = [ + "-DNO_CUDNN_DESTROY_HANDLE", + "-DPYTORCH_ONNX_CAFFE2_BUNDLE", + "-DTORCH_ENABLE_LLVM", + "-Wno-write-strings", + "-Wno-format", + "-Wno-strict-aliasing", + "-Wno-non-virtual-dtor", + "-Wno-shadow-compatible-local", + "-Wno-empty-body", + "-DUSE_XNNPACK", + ] + + propagated_pp_flags_cpu = [ + "-DSYMBOLICATE_MOBILE_DEBUG_HANDLE", + "-DC10_MOBILE", + ] + + include_directories = [ + "..", + ".", + "torch/csrc/api/include", + "torch/csrc", + "torch/csrc/nn", + "torch/lib", + ] + + common_flags = { + "compiler_specific_flags": { + "clang": [ + "-Wno-absolute-value", + "-Wno-expansion-to-defined", + "-Wno-pessimizing-move", + "-Wno-return-type-c-linkage", + "-Wno-unknown-pragmas", + ], + }, + "headers": native.glob(["torch/csrc/**/*.h", "torch/csrc/generic/*.cpp", "test/cpp/jit/*.h", "test/cpp/tensorexpr/*.h"]), + } + + _libtorch_sgx_sources = list(libtorch_sgx_sources()) + + cpp_library( + name = "libtorch-sgx", + srcs = _libtorch_sgx_sources + [ + "fb/supported_mobile_models/SupportedMobileModels.cpp", + "torch/csrc/jit/mobile/function.cpp", + "torch/csrc/jit/mobile/import.cpp", + "torch/csrc/jit/mobile/interpreter.cpp", + "torch/csrc/jit/mobile/module.cpp", # this is only needed to load the model from caffe2/test/cpp/lite_interpreter_runtime/delegate_test.ptl + ], + link_whole = True, + include_directories = include_directories, + propagated_pp_flags = propagated_pp_flags_cpu, + exported_deps = [ + ":generated-autograd-headers", + ":generated-version-header", + "//caffe2/aten:ATen-sgx-cpu", + "//caffe2/caffe2:caffe2_sgx_core", + "//onnx/onnx:onnx_lib", + ], + exported_external_deps = [ + ("protobuf", None), + ], + compiler_flags = compiler_flags_cpu, + **common_flags + ) diff --git a/tools/target_definitions.bzl b/tools/target_definitions.bzl new file mode 100644 index 000000000000..66b2659050f3 --- /dev/null +++ b/tools/target_definitions.bzl @@ -0,0 +1,568 @@ +# @lint-ignore-every BUCKLINT supress the warning for using native +load("@bazel_skylib//lib:paths.bzl", "paths") +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("@fbcode_macros//build_defs:cpp_python_extension.bzl", "cpp_python_extension") +load("@fbcode_macros//build_defs:custom_rule.bzl", "custom_rule") +load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary") +load("@fbsource//tools/build_defs:glob_defs.bzl", "glob") +load( + "//caffe2:build_variables.bzl", + "glob_libtorch_python_sources", + "libtorch_cuda_sources", + "libtorch_nvfuser_generated_headers", + "libtorch_nvfuser_runtime_sources", + "libtorch_python_cuda_sources", + "libtorch_sources", + "torch_cpp_srcs", +) +load( + 
"//caffe2:defs_hip.bzl", + "get_hip_flags", + "hip_external_deps", + "hip_pp_flags", +) +load("//caffe2/caffe2/fb:defs_gpu.bzl", "gpu_library_selector", "gpu_library_targets", "is_amd_build") +load("//tools/build/buck:nccl_deps.bzl", "get_nccl_dependency") + +def _path_to_filename(fname): + return paths.split_extension(paths.basename(fname))[0] + +def use_kineto(): + return native.host_info().os.is_linux and native.host_info().arch.is_x86_64 and not is_amd_build() + +def add_torch_libs(): + r = {} + + torch_cpp_headers = glob(["torch/csrc/api/include/**/*.h"]) + ["torch/script.h"] + libtorch_python_sources = glob_libtorch_python_sources() + + use_mpi = native.read_config("fbcode", "caffe2_use_mpi", None) + enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None)) + + compiler_flags_cpu = [ + "-DUSE_C10D", + "-DUSE_NUMPY", + "-DUSE_SCALARS", + "-DNO_CUDNN_DESTROY_HANDLE", + "-DBUILD_CAFFE2", + "-DTORCH_ENABLE_LLVM", + "-Wno-write-strings", + "-Wno-format", + "-Wno-strict-aliasing", + "-Wno-non-virtual-dtor", + "-Wno-shadow-compatible-local", + "-Wno-empty-body", + ] + ([] if native.host_info().os.is_windows else [ + # XNNPACK depends on an updated version of pthreadpool interface, whose implementation + # includes - a header not available on Windows. + "-DUSE_XNNPACK", + ]) + + # We should really include preprocessor flags here + # instead of compiler_flags + propagated_pp_flags_cpu = [ + "-DSYMBOLICATE_MOBILE_DEBUG_HANDLE", + "-DUSE_DISTRIBUTED", + "-DUSE_C10D_GLOO", + "-DUSE_RPC", + "-DUSE_TENSORPIPE", + ] + ( + ["-DUSE_C10D_MPI"] if use_mpi else [] + ) + ( + ["-DUSE_KINETO", "-DUSE_KINETO_UPDATED"] if use_kineto() else [] + ) + ( + ["-DENABLE_LIBKINETO_CLIENT"] if native.read_config("kineto", "enable_libkineto_client", "1") == "1" else [] + ) + + compiler_flags_cuda = [ + "-DUSE_CUDNN", + "-DUSE_NCCL", + ] + + compiler_flags_hip = [] + + propagated_pp_flags_cuda = [ + "-DUSE_CUDA", + "-DUSE_C10D_NCCL", + ] + + common_headers = glob([ + "torch/csrc/**/*.h", + # c10d used to be a separate library whose includes ended in .hpp. + "torch/csrc/distributed/c10d/*.hpp", + "torch/csrc/generic/*.cpp", + ]) + [ + "torch/csrc/deploy/Exception.h", + "torch/csrc/deploy/deploy.h", + "torch/csrc/deploy/elf_file.h", + "torch/csrc/deploy/environment.h", + "torch/csrc/deploy/interpreter/builtin_registry.h", + "torch/csrc/deploy/interpreter/interpreter_impl.h", + "torch/csrc/deploy/loader.h", + "torch/csrc/deploy/mem_file.h", + "torch/csrc/deploy/noop_environment.h", + "torch/csrc/deploy/path_environment.h", + "torch/csrc/deploy/unity/tests/test_unity.h", + "torch/csrc/deploy/unity/xar_environment.h", + "torch/csrc/distributed/rpc/metrics/RpcMetricsHandler.h", + "test/cpp/jit/test_custom_class_registrations.h", + "test/cpp/jit/test_utils.h", + "test/cpp/tensorexpr/gtest_assert_float_eq.h", + "test/cpp/tensorexpr/padded_buffer.h", + "test/cpp/tensorexpr/test_base.h", + "test/cpp/tensorexpr/test_utils.h", + ] + common_headers.remove("torch/csrc/jit/serialization/mobile_bytecode_generated.h") + + common_flags = { + "compiler_specific_flags": { + "clang": [ + "-Wno-absolute-value", + "-Wno-expansion-to-defined", + "-Wno-pessimizing-move", + "-Wno-return-type-c-linkage", + "-Wno-unknown-pragmas", + ], + }, + "headers": common_headers, + } + + include_directories = [ + "..", + ".", + "torch/csrc/api/include", + "torch/csrc", + # c10d used to be a separate library and its includes were c10d/Foo.hpp, + # hence we now need this hack to keep supporting them. 
+ "torch/csrc/distributed", + "torch/csrc/nn", + ] + + _libtorch_sources = list(libtorch_sources()) + + # Add the Gloo and TensorPipe backends specific to Facebook networking. + _libtorch_sources.append("torch/csrc/distributed/c10d/fb/GlooDeviceFactory.cpp") + _libtorch_sources.append("torch/csrc/distributed/rpc/fb/tensorpipe_agent.cpp") + + cpp_library( + name = "libtorch", + srcs = _libtorch_sources + ([ + "torch/csrc/jit/serialization/flatbuffer_serializer.cpp", + "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp", + "torch/csrc/jit/mobile/flatbuffer_loader.cpp", + ] if enable_flatbuffer else []), + link_whole = True, + include_directories = include_directories, + propagated_pp_flags = propagated_pp_flags_cpu + (["-DENABLE_FLATBUFFER"] if enable_flatbuffer else []), + exported_deps = ( + [ + ":ATen-cpu", + ":generated-autograd-headers", + ":generated-lazy-headers", + "//caffe2:version_cpp", + "//caffe2/caffe2:caffe2_cpu", + "//caffe2/caffe2/quantization/server:dnnlowp_ops", + "//caffe2/caffe2/serialize:inline_container", + "//caffe2/torch/lib/libshm:libshm", + "//gloo:gloo", + "//gloo/fb/transport/tls:tls", + "//gloo/transport/tcp:tcp", + "//tensorpipe:tensorpipe_cpu", + ] + (["//kineto/libkineto:kineto"] if use_kineto() else []) + + (["//caffe2:mobile_bytecode"] if enable_flatbuffer else []) + ), + exported_external_deps = [ + ("nanopb", None, "protobuf-nanopb"), + ("protobuf", None), + ("llvm-fb", None, "LLVMAnalysis"), + ("llvm-fb", None, "LLVMBPFAsmParser"), + ("llvm-fb", None, "LLVMBPFCodeGen"), + ("llvm-fb", None, "LLVMCodeGen"), + ("llvm-fb", None, "LLVMCore"), + ("llvm-fb", None, "LLVMExecutionEngine"), + ("llvm-fb", None, "LLVMIRReader"), + ("llvm-fb", None, "LLVMInstCombine"), + ("llvm-fb", None, "LLVMInterpreter"), + ("llvm-fb", None, "LLVMMC"), + ("llvm-fb", None, "LLVMNVPTXCodeGen"), + ("llvm-fb", None, "LLVMOrcJIT"), + ("llvm-fb", None, "LLVMRISCVAsmParser"), + ("llvm-fb", None, "LLVMRISCVCodeGen"), + ("llvm-fb", None, "LLVMScalarOpts"), + ("llvm-fb", None, "LLVMSupport"), + ("llvm-fb", None, "LLVMTarget"), + ("llvm-fb", None, "LLVMTransformUtils"), + ("llvm-fb", None, "LLVMVectorize"), + ("llvm-fb", None, "LLVMWebAssemblyAsmParser"), + ("llvm-fb", None, "LLVMWebAssemblyCodeGen"), + ("llvm-fb", None, "LLVMWebAssemblyInfo"), + ("llvm-fb", None, "LLVMX86AsmParser"), + ("llvm-fb", None, "LLVMX86CodeGen"), + ("llvm-fb", None, "LLVMipo"), + ] + ([("openmpi", None, "openmpi")] if use_mpi else []), + compiler_flags = compiler_flags_cpu, + **common_flags + ) + + # Below rules are used to stringify NVfuser runtime library into a header files + python_binary( + name = "nvfuser-stringify", + srcs = ["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"], + base_module = "", + main_module = "torch.csrc.jit.codegen.cuda.tools.stringify_file", + ) + + # files in libtorch_nvfuser_runtime_sources that are violating package boundaries + # are mapped to their corresponding export_file rules. 
+ violation_paths_to_rule = { + "aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh": ":aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh", + "aten/src/ATen/cuda/detail/UnpackRaw.cuh": ":aten/src/ATen/cuda/detail/UnpackRaw.cuh", + } + + for name in libtorch_nvfuser_runtime_sources: + src_path = violation_paths_to_rule.get(name, name) + filename = _path_to_filename(src_path) + native.genrule( + name = "gen-nvfuser-hdr={}.h".format(filename), + srcs = {name: src_path}, + bash = "$(exe :nvfuser-stringify) -i $SRCDIR/{} -o $OUT".format(name), + out = "{}.h".format(filename), + ) + cpp_library( + name = "generated-nvfuser-headers", + headers = [":gen-nvfuser-hdr=" + x for x in libtorch_nvfuser_generated_headers], + header_namespace = "nvfuser_resources", + ) + + _libtorch_cuda_sources = list(libtorch_cuda_sources) + cpp_library( + name = "libtorch_cuda", + srcs = _libtorch_cuda_sources, + link_whole = True, + include_directories = include_directories, + # TODO: putting USE_CUDA in propagated_pp_flags is error-prone + propagated_pp_flags = propagated_pp_flags_cuda, + exported_deps = [ + ":ATen", + ":generated-aten-headers-cuda", + ":generated-autograd-headers", + ":generated-nvfuser-headers", + ":libtorch", + "//caffe2/caffe2:caffe2_cpu", + "//caffe2/caffe2:caffe2_gpu", + "//caffe2/torch/lib/libshm:libshm", + "//gloo:gloo_gpu_cuda", + "//tensorpipe:tensorpipe_cuda", + ], + exported_external_deps = [ + ("cudnn", None, "cudnn-lazy"), + ("cuda", None, "nvToolsExt-lazy"), + ("cuda", None, "nvrtc-lazy"), + ("cuda", None, "nvrtc-builtins-lazy"), + ] + get_nccl_dependency(), + compiler_flags = compiler_flags_cpu + compiler_flags_cuda, + **common_flags + ) + + # (original_paths, hipified_paths) + libtorch_hip_headers_filter = torch_cpp_headers + [h for h in common_headers if any([h.startswith(d) for d in [ + # headers in the following directories are added to libtorch_hip_headers_filter + # so that they are not hipified. 
+ "torch/csrc/deploy/", + "torch/csrc/distributed/rpc/metrics/", + "torch/csrc/jit/serialization/", + "torch/cpp/jit/", + "torch/cpp/tensorexpr/", + ]])] + libtorch_hip_sources = (libtorch_cuda_sources, [f.replace(".cu", ".hip") for f in libtorch_cuda_sources]) + libtorch_hip_headers = ([f for f in common_headers if f not in libtorch_hip_headers_filter],) * 2 + + custom_rule( + name = "fb_libtorch_hipify_gen", + srcs = libtorch_hip_sources[0] + libtorch_hip_headers[0], + build_args = "--source-dir= --hipify-dir= --copy-dir= --rewrite-cu-ext", + build_script_dep = "//caffe2:fb_caffe2_hipify", + output_gen_files = libtorch_hip_sources[1] + libtorch_hip_headers[1], + ) + + cpp_library( + name = "libtorch_hip_headers", + headers = [":fb_libtorch_hipify_gen={}".format(f) for f in libtorch_hip_headers[1]], + header_namespace = "", + ) + + cpp_library( + name = "libtorch_hip", + srcs = [":fb_libtorch_hipify_gen={}".format(f) for f in libtorch_hip_sources[1]], + headers = [f for f in common_headers if f in libtorch_hip_headers_filter], + link_whole = True, + propagated_pp_flags = hip_pp_flags, + exported_deps = [ + ":generated-aten-headers-hip", + ":generated-autograd-headers", + ":generated-nvfuser-headers", + ":libtorch", + ":libtorch_hip_headers", + "//caffe2:ATen-hip", + "//caffe2/caffe2:caffe2_cpu", + "//caffe2/caffe2:caffe2_gpu_hip", + "//caffe2/torch/lib/libshm:libshm", + "//gloo:gloo_gpu_hip", + "//tensorpipe:tensorpipe_cpu", # TODO: include a HIP version once it's developed + ], + exported_external_deps = hip_external_deps, + compiler_flags = compiler_flags_cpu + compiler_flags_hip + [ + "-Wno-unused-result", + ], + hip_flags = ["-Wno-unused-result"] + get_hip_flags(), + compiler_specific_flags = common_flags["compiler_specific_flags"], + ) + + gpu_library_targets( + name = "libtorch_gpu", + deps_cpu = [ + ":libtorch", + ], + deps_cuda = [ + ":libtorch_cuda", + ], + deps_hip = [ + ":libtorch_hip", + ], + exclude_hip_target = False, + extra_external_deps = [], + ) + + # torch-cpp is still conditionally compiled based on USE_CUDA. Ideally we'd + # separate it out as an additive library instead. + gpu_library_selector( + name = "torch-cpp", + deps_cpu = [":torch-cpp-cpu"], + deps_cuda = [":torch-cpp-cuda"], + deps_hip = [":torch-cpp-hip"], + merge_cpu_deps = False, + exclude_hip_target = False, + ) + + # USE_CUDA flag is propagated through propagated_pp_flags on libtorch + cpp_library( + name = "torch-cpp-cuda", + srcs = torch_cpp_srcs, + headers = torch_cpp_headers, + include_directories = [ + ".", + "torch/csrc/api/include/", + ], + exported_deps = [ + ":libtorch_cuda", + "//caffe2/torch/fb/init:init", + ], + exported_external_deps = [ + ("cuda", None, "cuda-lazy"), + ("cudnn", None, "cudnn-lazy"), + ], + ) + + cpp_library( + name = "torch-cpp-hip", + srcs = torch_cpp_srcs, + headers = torch_cpp_headers, + include_directories = [ + ".", + "torch/csrc/api/include/", + ], + exported_deps = [ + ":libtorch_hip", + "//caffe2/torch/fb/init:init", + ], + exported_external_deps = hip_external_deps, + ) + + cpp_library( + name = "torch-cpp-cpu", + srcs = torch_cpp_srcs, + headers = torch_cpp_headers, + include_directories = [ + ".", + "torch/csrc/api/include/", + ], + exported_deps = [ + ":libtorch", + "//caffe2/torch/fb/init:init", + ], + ) + + # _C_impl is still conditionally compiled based on USE_CUDA. Ideally we'd + # separate it out as an additive library instead. 
+ # TODO: split it into cpp and cuda parts similarly to libtorch + gpu_library_selector( + name = "_C_impl", + deps_cpu = [":_C_impl_cpu"], + deps_cuda = [":_C_impl_cuda"], + deps_hip = [":_C_impl_hip"], + merge_cpu_deps = False, + exclude_hip_target = False, + ) + + cpp_library( + name = "_C_impl_cpu", + srcs = libtorch_python_sources, + link_whole = True, + exported_deps = [ + "fbsource//third-party/fmt:fmt", + ":torch-cpp-cpu", + "//caffe2/torch/fb/init:init", + "//caffe2/torch/lib/libshm:libshm", + ], + exported_external_deps = [ + ("numpy", None, "cpp"), + ("pybind11", None), + ("python", None), + ], + compiler_flags = compiler_flags_cpu, + compiler_specific_flags = common_flags["compiler_specific_flags"], + ) + + # This target is used to help get headers for compile-time deps for torch::deploy + # libinterpreter.so build _without_ getting link-time deps, which are supplied + # separately by the application that dlopens libinterpreter.so. + # + # We make use of the buck auto-generated #headers flavor of a target to accomplish this. + # + # However, since #headers flavor of target with srcs can't be used in all build modes, we + # work around this limitation by using this 'pass-through' target, which has a usable + # #headers flavor in all build modes. + cpp_library( + name = "headers_for_torch_python_deps", + exported_deps = [ + ":_C_impl_cpu", + ], + ) + cpp_library( + name = "headers_for_torch_python_cuda_deps", + exported_deps = [ + ":_C_impl_cuda", + ], + ) + + # This target compiles torch_python bindings, but skips the deps on actual + # torch and python since those will be integrated specially in the wrapper for + # libinterpreter.so used in torch::deploy + cpp_library( + name = "torch_python_without_torch", + srcs = libtorch_python_sources + torch_cpp_srcs, + undefined_symbols = True, + preferred_linkage = "static", + exported_deps = [ + ":headers_for_torch_python_deps#headers", + ], + exported_external_deps = [ + ("pybind11", None), + ("frozenpython", None, "python-headers"), + ], + compiler_flags = compiler_flags_cpu + [ + # some code in the Python bindings compiles differently + # when you are deploy + "-DUSE_DEPLOY", + ], + compiler_specific_flags = common_flags["compiler_specific_flags"], + ) + + cpp_library( + name = "torch_python_cuda_without_torch", + srcs = libtorch_python_sources + torch_cpp_srcs + libtorch_python_cuda_sources, + undefined_symbols = True, + preferred_linkage = "static", + exported_deps = [ + ":headers_for_torch_python_cuda_deps#headers", + ], + exported_external_deps = [ + ("pybind11", None), + ("frozenpython", None, "python-headers"), + ], + compiler_flags = compiler_flags_cpu + [ + "-DUSE_CUDA", + # some code in the Python bindings compiles differently + # when you are deploy + "-DUSE_DEPLOY", + ], + compiler_specific_flags = common_flags["compiler_specific_flags"], + ) + + cpp_library( + name = "_C_impl_cuda", + srcs = libtorch_python_sources + libtorch_python_cuda_sources, + link_whole = True, + exported_deps = [ + "fbsource//third-party/fmt:fmt", + ":torch-cpp-cuda", + "//caffe2/torch/fb/init:init", + "//caffe2/torch/lib/libshm:libshm", + ], + exported_external_deps = [ + ("numpy", None, "cpp"), + ("pybind11", None), + ("python", None), + ], + compiler_flags = compiler_flags_cpu + compiler_flags_cuda, + compiler_specific_flags = common_flags["compiler_specific_flags"], + ) + + # Autogenerated files whose rules contain ":" are not hipified. 
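+    # i.e. libtorch_python_hip_sources keeps the ":"-referenced generated sources as-is,
+    # while libtorch_python_hip_sources_hipified is the set run through fb_C_impl_hipify_gen.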
+ libtorch_python_hip_sources = [f for f in (libtorch_python_sources + libtorch_python_cuda_sources) if ":" in f] + libtorch_python_hip_sources_hipified = [f for f in (libtorch_python_sources + libtorch_python_cuda_sources) if not ":" in f] + + custom_rule( + name = "fb_C_impl_hipify_gen", + srcs = libtorch_python_hip_sources_hipified, + build_args = "--source-dir= --hipify-dir= --copy-dir=", + build_script_dep = "//caffe2:fb_caffe2_hipify", + output_gen_files = libtorch_python_hip_sources_hipified, + ) + + cpp_library( + name = "_C_impl_hip", + srcs = [":fb_C_impl_hipify_gen={}".format(f) for f in (libtorch_python_hip_sources_hipified)] + libtorch_python_hip_sources, + link_whole = True, + exported_deps = [ + "fbsource//third-party/fmt:fmt", + ":torch-cpp-hip", + "//caffe2/torch/fb/init:init", + "//caffe2/torch/lib/libshm:libshm", + ], + exported_external_deps = [ + ("numpy", None, "cpp"), + ("pybind11", None), + ("python", None), + ], + compiler_flags = compiler_flags_cpu + compiler_flags_hip + ["-Wno-unused-result"], + compiler_specific_flags = common_flags["compiler_specific_flags"], + ) + + cpp_python_extension( + name = "_C", + srcs = [ + "torch/csrc/stub.c", + ], + base_module = "torch", + deps = [ + ":_C_impl", + "//caffe2:flatbuffer_loader", + ], + ) + + cpp_python_extension( + name = "_C_flatbuffer", + srcs = [ + "torch/csrc/stub_with_flatbuffer.c", + "torch/csrc/init_flatbuffer_module.cpp", + ], + base_module = "torch", + deps = [ + ":_C_impl", + "//caffe2:flatbuffer_loader", + "//caffe2:flatbuffer_serializer", + ], + ) + + return r diff --git a/torch/csrc/deploy/interpreter/defs.bzl b/torch/csrc/deploy/interpreter/defs.bzl new file mode 100644 index 000000000000..719155cf7da0 --- /dev/null +++ b/torch/csrc/deploy/interpreter/defs.bzl @@ -0,0 +1,117 @@ +load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary") +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load("@fbcode_macros//build_defs:native_rules.bzl", "cxx_genrule") + +# @lint-ignore-every BUCKLINT +load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native") + +def embedded_interpreter(name, suffix, legacy = False, exported_deps = [], exported_external_deps = []): + final_name = name + is_all = suffix == "all" + is_cuda = suffix == "cuda" or is_all + platform_static_lib = [] + for platform in ["platform009", "platform010"]: + name = platform + "_" + final_name + so_name = name + ".so" + cpp_binary( + name = so_name, + srcs = [ + "interpreter_impl.cpp", + ] + (["import_find_sharedfuncptr.cpp"] if is_all else []), + headers = [ + "Optional.hpp", + "interpreter_impl.h", + ], + header_namespace = "torch/csrc/deploy", + dlopen_enabled = True, + linker_flags = ([ + # This ensures only the intended interface symbols are public/global + # the rest are hidden, regardless of how they were compiled + # (e.g. fvisibility=hidden is NOT important for the component + # objs in this library, since we override here.) 
+                "--version-script=$(location :hide_symbols.script)",
+            ] if not is_all else []),
+            deps = [
+                "fbsource//third-party/fmt:fmt",
+            ] + ([
+                ":builtin_registry_cuda",
+                "//caffe2:torch_python_cuda_without_torch",
+                "//deeplearning/trt/python:frozen_tensorrt",
+            ] if is_cuda else [
+                ":builtin_registry",
+                "//caffe2:torch_python_without_torch",
+            ]),
+            external_deps =
+                [
+                    # needed for interpreter.cpp itself; it currently uses pybind
+                    ("frozenpython", None, "python-frozen"),
+                    ("frozenpython", None, "python"),
+                ],
+            fbcode_platform = platform,
+        )
+
+        # We build torch::deploy with two embedded binaries: one with only cpu py
+        # bindings, the other with cpu+cuda py bindings. This unfortunately wastes
+        # some binary size, but at least only one of them is loaded at runtime.
+        #
+        # This is because of two reasons:
+        # (1) applications such as predictor want to depend on torch::deploy in a
+        # cuda-agnostic way, i.e. they don't choose cpu vs. cuda themselves; a
+        # binary/app that depends on predictor chooses whether or not to include a
+        # dep on cuda.
+        #
+        # (2) because of the way the embedded binary is created and loaded, it only
+        # exposes a small set of interface symbols globally, for creating a new
+        # interpreter, and hides its other symbols (esp. the python ones) so they
+        # don't conflict with other interpreters. This prevents dividing the cpu and
+        # cuda portions of the bindings into _separate_ libs and loading the cuda
+        # part additively. Hence, to achieve requirement (1), we bundle two complete
+        # interpreter libs, one with and one without cuda.
+
+        cp_cmd = "$(location //caffe2/torch/csrc/deploy:remove_dt_needed)" if suffix == "all" else "cp"
+
+        build_name = "build_" + name
+        if not legacy:
+            cxx_genrule(
+                name = build_name,
+                out = "embedded_interpreter_" + suffix + ".a",
+                cmd = """\
+                """ + cp_cmd + """ $(location :""" + so_name + """) libtorch_deployinterpreter_internal_""" + suffix + """.so
+                ld -r -b binary -o ${TMP}/embedded_interpreter_""" + suffix + """.o libtorch_deployinterpreter_internal_""" + suffix + """.so
+                objcopy --rename-section .data=.torch_deploy_payload.interpreter_""" + suffix + """,readonly,contents -N _binary_libtorch_deployinterpreter_""" + suffix + """_so_start -N _binary_libtorch_deployinterpreter_""" + suffix + """_so_end ${TMP}/embedded_interpreter_""" + suffix + """.o
+                ar rcs ${OUT} ${TMP}/embedded_interpreter_""" + suffix + """.o
+                """,
+            )
+        else:
+            cxx_genrule(
+                name = build_name,
+                out = "embedded_interpreter_cuda_legacy.a",
+                cmd = """\
+                cp $(location :""" + so_name + """) libtorch_deployinterpreter_cuda.so
+                ld -r -b binary -o ${TMP}/embedded_interpreter_cuda.o libtorch_deployinterpreter_cuda.so
+                ar rcs ${OUT} ${TMP}/embedded_interpreter_cuda.o
+                """,
+            )
+        platform_static_lib.append(["^" + platform, ":" + build_name])
+
+    internal_name = final_name + "_internal"
+    fb_native.prebuilt_cxx_library(
+        preferred_linkage = "static",
+        name = internal_name,
+        visibility = ["PUBLIC"],
+        link_whole = True,
+        platform_static_lib = platform_static_lib,
+    )
+
+    # A thin wrapper around :embedded_interpreter_internal that adds the
+    # --export-dynamic linker flag. The flag is propagated to the consuming
+    # cpp_binary, so we no longer require each cpp_binary to enable
+    # --export-dynamic explicitly; new use cases often forgot to do so, which
+    # caused "interpreter not found" crashes.
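+    # (In other words, a hypothetical consumer only needs something like
+    #     deps = ["//caffe2/torch/csrc/deploy/interpreter:<final_name>"],
+    # in its cpp_binary; no extra linker flags are required.)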
+    cpp_library(
+        name = final_name,
+        linker_flags = [
+            "--export-dynamic",
+        ],
+        exported_deps = [
+            ":" + internal_name,
+        ] + exported_deps,
+        exported_external_deps = exported_external_deps,
+    )
diff --git a/torch/csrc/deploy/unity/unity.bzl b/torch/csrc/deploy/unity/unity.bzl
new file mode 100644
index 000000000000..8431356a4df9
--- /dev/null
+++ b/torch/csrc/deploy/unity/unity.bzl
@@ -0,0 +1,46 @@
+load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
+load("@fbcode_macros//build_defs:native_rules.bzl", "cxx_genrule")
+load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary")
+
+# @lint-ignore-every BUCKLINT
+load("@fbsource//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
+
+def build_unity(name, **kwargs):
+    python_binary(name = name, **kwargs)
+
+    cxx_genrule(
+        name = "{}_build_python_app_lib".format(name),
+        out = "python_app.a",
+        cmd = """\
+        cp $(location :""" + name + """) python_app
+        ld -r -b binary -o ${TMP}/python_app.o python_app
+        # Rename the .data section to .torch_deploy_payload.unity.
+        # Don't set the alloc/load flags for the section, so it is not mapped or
+        # relocated at load time.
+        # Also strip the _binary_python_app_start/end/size symbols to avoid
+        # confusion.
+        objcopy --rename-section .data=.torch_deploy_payload.unity,readonly,contents -N _binary_python_app_start -N _binary_python_app_end -N _binary_python_app_size ${TMP}/python_app.o
+        ar rcs ${OUT} ${TMP}/python_app.o
+        """,
+    )
+
+    fb_native.prebuilt_cxx_library(
+        name = "{}_python_app_lib".format(name),
+        visibility = ["PUBLIC"],
+        link_whole = True,
+        preferred_linkage = "static",
+        static_lib = ":{}_build_python_app_lib".format(name),
+    )
+
+    cpp_library(
+        name = "{}_unity_lib".format(name),
+        srcs = [
+        ],
+        linker_flags = [
+            "--export-dynamic",
+        ],
+        exported_deps = [
+            "//caffe2/torch/csrc/deploy/unity:unity_core",
+            ":{}_python_app_lib".format(name),
+        ],
+    )
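+
+# Purely illustrative usage (hypothetical target and module names; every other
+# kwarg is forwarded to python_binary unchanged):
+#
+#     build_unity(
+#         name = "example_app",
+#         main_module = "example.app.main",
+#         deps = ["//example/app:lib"],
+#     )
+#
+# This defines :example_app (the python_binary itself) plus :example_app_unity_lib,
+# which links the packaged app into the .torch_deploy_payload.unity section
+# alongside //caffe2/torch/csrc/deploy/unity:unity_core.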