# @lint-ignore-every BUCKLINT suppress the warning for using native
load("@bazel_skylib//lib:paths.bzl", "paths")
load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library")
load("@fbcode_macros//build_defs:cpp_python_extension.bzl", "cpp_python_extension")
load("@fbcode_macros//build_defs:custom_rule.bzl", "custom_rule")
load("@fbcode_macros//build_defs:python_binary.bzl", "python_binary")
load("@fbsource//tools/build_defs:glob_defs.bzl", "glob")
load(
    "//caffe2:build_variables.bzl",
    "glob_libtorch_python_sources",
    "libtorch_cuda_sources",
    "libtorch_nvfuser_generated_headers",
    "libtorch_nvfuser_runtime_sources",
    "libtorch_python_cuda_sources",
    "libtorch_sources",
    "torch_cpp_srcs",
)
load(
    "//caffe2:defs_hip.bzl",
    "get_hip_flags",
    "hip_external_deps",
    "hip_pp_flags",
)
load("//caffe2/caffe2/fb:defs_gpu.bzl", "gpu_library_selector", "gpu_library_targets", "is_amd_build")
load("//tools/build/buck:nccl_deps.bzl", "get_nccl_dependency")
def _path_to_filename(fname):
    return paths.split_extension(paths.basename(fname))[0]
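# Kineto (the profiler backend) is only built on x86-64 Linux hosts, and never
# for AMD/HIP builds.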
def use_kineto():
    return native.host_info().os.is_linux and native.host_info().arch.is_x86_64 and not is_amd_build()
def add_torch_libs():
    r = {}
    torch_cpp_headers = glob(["torch/csrc/api/include/**/*.h"]) + ["torch/script.h"]
    libtorch_python_sources = glob_libtorch_python_sources()
    use_mpi = native.read_config("fbcode", "caffe2_use_mpi", None)
    enable_flatbuffer = bool(native.read_config("fbcode", "caffe2_enable_flatbuffer", None))
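    # These read_config() values come from .buckconfig or the command line,
    # so e.g. passing `-c fbcode.caffe2_enable_flatbuffer=1` to buck should
    # enable the flatbuffer targets below.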
    compiler_flags_cpu = [
        "-DUSE_C10D",
        "-DUSE_NUMPY",
        "-DUSE_SCALARS",
        "-DNO_CUDNN_DESTROY_HANDLE",
        "-DBUILD_CAFFE2",
        "-DTORCH_ENABLE_LLVM",
        "-Wno-write-strings",
        "-Wno-format",
        "-Wno-strict-aliasing",
        "-Wno-non-virtual-dtor",
        "-Wno-shadow-compatible-local",
        "-Wno-empty-body",
    ] + ([] if native.host_info().os.is_windows else [
        # XNNPACK depends on an updated version of the pthreadpool interface,
        # whose implementation includes <pthread.h> - a header not available
        # on Windows.
        "-DUSE_XNNPACK",
    ])
    # Ideally the -D defines above would be preprocessor flags here rather
    # than compiler_flags.
    propagated_pp_flags_cpu = [
        "-DSYMBOLICATE_MOBILE_DEBUG_HANDLE",
        "-DUSE_DISTRIBUTED",
        "-DUSE_C10D_GLOO",
        "-DUSE_RPC",
        "-DUSE_TENSORPIPE",
    ] + (
        ["-DUSE_C10D_MPI"] if use_mpi else []
    ) + (
        ["-DUSE_KINETO", "-DUSE_KINETO_UPDATED"] if use_kineto() else []
    ) + (
        ["-DENABLE_LIBKINETO_CLIENT"] if native.read_config("kineto", "enable_libkineto_client", "1") == "1" else []
    )
    compiler_flags_cuda = [
        "-DUSE_CUDNN",
        "-DUSE_NCCL",
    ]
    compiler_flags_hip = []
    propagated_pp_flags_cuda = [
        "-DUSE_CUDA",
        "-DUSE_C10D_NCCL",
    ]
    common_headers = glob([
        "torch/csrc/**/*.h",
        # c10d used to be a separate library whose includes ended in .hpp.
        "torch/csrc/distributed/c10d/*.hpp",
        "torch/csrc/generic/*.cpp",
    ]) + [
        "torch/csrc/deploy/Exception.h",
        "torch/csrc/deploy/deploy.h",
        "torch/csrc/deploy/elf_file.h",
        "torch/csrc/deploy/environment.h",
        "torch/csrc/deploy/interpreter/builtin_registry.h",
        "torch/csrc/deploy/interpreter/interpreter_impl.h",
        "torch/csrc/deploy/loader.h",
        "torch/csrc/deploy/mem_file.h",
        "torch/csrc/deploy/noop_environment.h",
        "torch/csrc/deploy/path_environment.h",
        "torch/csrc/deploy/unity/tests/test_unity.h",
        "torch/csrc/deploy/unity/xar_environment.h",
        "torch/csrc/distributed/rpc/metrics/RpcMetricsHandler.h",
        "test/cpp/jit/test_custom_class_registrations.h",
        "test/cpp/jit/test_utils.h",
        "test/cpp/tensorexpr/gtest_assert_float_eq.h",
        "test/cpp/tensorexpr/padded_buffer.h",
        "test/cpp/tensorexpr/test_base.h",
        "test/cpp/tensorexpr/test_utils.h",
    ]
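    # Drop the checked-in flatbuffer-generated header from the globbed list,
    # presumably because when flatbuffer support is enabled it is supplied by
    # the //caffe2:mobile_bytecode target instead (see exported_deps below).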
    common_headers.remove("torch/csrc/jit/serialization/mobile_bytecode_generated.h")
    common_flags = {
        "compiler_specific_flags": {
            "clang": [
                "-Wno-absolute-value",
                "-Wno-expansion-to-defined",
                "-Wno-pessimizing-move",
                "-Wno-return-type-c-linkage",
                "-Wno-unknown-pragmas",
            ],
        },
        "headers": common_headers,
    }
    include_directories = [
        "..",
        ".",
        "torch/csrc/api/include",
        "torch/csrc",
        # c10d used to be a separate library and its includes were c10d/Foo.hpp,
        # hence we now need this hack to keep supporting them.
        "torch/csrc/distributed",
        "torch/csrc/nn",
    ]
    _libtorch_sources = list(libtorch_sources())

    # Add the Gloo and TensorPipe backends specific to Facebook networking.
    _libtorch_sources.append("torch/csrc/distributed/c10d/fb/GlooDeviceFactory.cpp")
    _libtorch_sources.append("torch/csrc/distributed/rpc/fb/tensorpipe_agent.cpp")
    cpp_library(
        name = "libtorch",
        srcs = _libtorch_sources + ([
            "torch/csrc/jit/serialization/flatbuffer_serializer.cpp",
            "torch/csrc/jit/serialization/flatbuffer_serializer_jit.cpp",
            "torch/csrc/jit/mobile/flatbuffer_loader.cpp",
        ] if enable_flatbuffer else []),
        link_whole = True,
        include_directories = include_directories,
        propagated_pp_flags = propagated_pp_flags_cpu + (["-DENABLE_FLATBUFFER"] if enable_flatbuffer else []),
        exported_deps = (
            [
                ":ATen-cpu",
                ":generated-autograd-headers",
                ":generated-lazy-headers",
                "//caffe2:version_cpp",
                "//caffe2/caffe2:caffe2_cpu",
                "//caffe2/caffe2/quantization/server:dnnlowp_ops",
                "//caffe2/caffe2/serialize:inline_container",
                "//caffe2/torch/lib/libshm:libshm",
                "//gloo:gloo",
                "//gloo/fb/transport/tls:tls",
                "//gloo/transport/tcp:tcp",
                "//tensorpipe:tensorpipe_cpu",
            ] + (["//kineto/libkineto:kineto"] if use_kineto() else []) +
            (["//caffe2:mobile_bytecode"] if enable_flatbuffer else [])
        ),
        exported_external_deps = [
            ("nanopb", None, "protobuf-nanopb"),
            ("protobuf", None),
            ("llvm-fb", None, "LLVMAnalysis"),
            ("llvm-fb", None, "LLVMBPFAsmParser"),
            ("llvm-fb", None, "LLVMBPFCodeGen"),
            ("llvm-fb", None, "LLVMCodeGen"),
            ("llvm-fb", None, "LLVMCore"),
            ("llvm-fb", None, "LLVMExecutionEngine"),
            ("llvm-fb", None, "LLVMIRReader"),
            ("llvm-fb", None, "LLVMInstCombine"),
            ("llvm-fb", None, "LLVMInterpreter"),
            ("llvm-fb", None, "LLVMMC"),
            ("llvm-fb", None, "LLVMNVPTXCodeGen"),
            ("llvm-fb", None, "LLVMOrcJIT"),
            ("llvm-fb", None, "LLVMRISCVAsmParser"),
            ("llvm-fb", None, "LLVMRISCVCodeGen"),
            ("llvm-fb", None, "LLVMScalarOpts"),
            ("llvm-fb", None, "LLVMSupport"),
            ("llvm-fb", None, "LLVMTarget"),
            ("llvm-fb", None, "LLVMTransformUtils"),
            ("llvm-fb", None, "LLVMVectorize"),
            ("llvm-fb", None, "LLVMWebAssemblyAsmParser"),
            ("llvm-fb", None, "LLVMWebAssemblyCodeGen"),
            ("llvm-fb", None, "LLVMWebAssemblyInfo"),
            ("llvm-fb", None, "LLVMX86AsmParser"),
            ("llvm-fb", None, "LLVMX86CodeGen"),
            ("llvm-fb", None, "LLVMipo"),
        ] + ([("openmpi", None, "openmpi")] if use_mpi else []),
        compiler_flags = compiler_flags_cpu,
        **common_flags
    )
    # The rules below stringify the nvfuser runtime library into header files.
    python_binary(
        name = "nvfuser-stringify",
        srcs = ["torch/csrc/jit/codegen/cuda/tools/stringify_file.py"],
        base_module = "",
        main_module = "torch.csrc.jit.codegen.cuda.tools.stringify_file",
    )
    # Files in libtorch_nvfuser_runtime_sources that violate package boundaries
    # are mapped to their corresponding export_file rules.
    violation_paths_to_rule = {
        "aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh": ":aten/src/ATen/cuda/detail/PhiloxCudaStateRaw.cuh",
        "aten/src/ATen/cuda/detail/UnpackRaw.cuh": ":aten/src/ATen/cuda/detail/UnpackRaw.cuh",
    }
    for name in libtorch_nvfuser_runtime_sources:
        src_path = violation_paths_to_rule.get(name, name)
        filename = _path_to_filename(src_path)
        native.genrule(
            name = "gen-nvfuser-hdr={}.h".format(filename),
            srcs = {name: src_path},
            bash = "$(exe :nvfuser-stringify) -i $SRCDIR/{} -o $OUT".format(name),
            out = "{}.h".format(filename),
        )
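    # For example, the runtime source "aten/src/ATen/cuda/detail/UnpackRaw.cuh"
    # yields a rule named ":gen-nvfuser-hdr=UnpackRaw.h" whose output file is
    # "UnpackRaw.h".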
    cpp_library(
        name = "generated-nvfuser-headers",
        headers = [":gen-nvfuser-hdr=" + x for x in libtorch_nvfuser_generated_headers],
        header_namespace = "nvfuser_resources",
    )
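    # Because of header_namespace, consumers include the stringified sources as,
    # e.g., #include <nvfuser_resources/UnpackRaw.h> (file name used for
    # illustration); each such header is expected to expose the runtime source
    # as a string constant.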
    _libtorch_cuda_sources = list(libtorch_cuda_sources)
    cpp_library(
        name = "libtorch_cuda",
        srcs = _libtorch_cuda_sources,
        link_whole = True,
        include_directories = include_directories,
        # TODO: putting USE_CUDA in propagated_pp_flags is error-prone
        propagated_pp_flags = propagated_pp_flags_cuda,
        exported_deps = [
            ":ATen",
            ":generated-aten-headers-cuda",
            ":generated-autograd-headers",
            ":generated-nvfuser-headers",
            ":libtorch",
            "//caffe2/caffe2:caffe2_cpu",
            "//caffe2/caffe2:caffe2_gpu",
            "//caffe2/torch/lib/libshm:libshm",
            "//gloo:gloo_gpu_cuda",
            "//tensorpipe:tensorpipe_cuda",
        ],
        exported_external_deps = [
            ("cudnn", None, "cudnn-lazy"),
            ("cuda", None, "nvToolsExt-lazy"),
            ("cuda", None, "nvrtc-lazy"),
            ("cuda", None, "nvrtc-builtins-lazy"),
        ] + get_nccl_dependency(),
        compiler_flags = compiler_flags_cpu + compiler_flags_cuda,
        **common_flags
    )
    libtorch_hip_headers_filter = torch_cpp_headers + [h for h in common_headers if any([h.startswith(d) for d in [
        # Headers in the following directories are added to
        # libtorch_hip_headers_filter so that they are not hipified.
        "torch/csrc/deploy/",
        "torch/csrc/distributed/rpc/metrics/",
        "torch/csrc/jit/serialization/",
        "test/cpp/jit/",
        "test/cpp/tensorexpr/",
    ]])]
    # (original_paths, hipified_paths)
    libtorch_hip_sources = (libtorch_cuda_sources, [f.replace(".cu", ".hip") for f in libtorch_cuda_sources])
    libtorch_hip_headers = ([f for f in common_headers if f not in libtorch_hip_headers_filter],) * 2
    custom_rule(
        name = "fb_libtorch_hipify_gen",
        srcs = libtorch_hip_sources[0] + libtorch_hip_headers[0],
        build_args = "--source-dir= --hipify-dir= --copy-dir= --rewrite-cu-ext",
        build_script_dep = "//caffe2:fb_caffe2_hipify",
        output_gen_files = libtorch_hip_sources[1] + libtorch_hip_headers[1],
    )
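    # The hipify script rewrites the CUDA sources for ROCm: with
    # --rewrite-cu-ext each .cu source comes out renamed to .hip (matching
    # libtorch_hip_sources[1] above), while the non-filtered headers are
    # hipified in place under their original names.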
    cpp_library(
        name = "libtorch_hip_headers",
        headers = [":fb_libtorch_hipify_gen={}".format(f) for f in libtorch_hip_headers[1]],
        header_namespace = "",
    )
    cpp_library(
        name = "libtorch_hip",
        srcs = [":fb_libtorch_hipify_gen={}".format(f) for f in libtorch_hip_sources[1]],
        headers = [f for f in common_headers if f in libtorch_hip_headers_filter],
        link_whole = True,
        propagated_pp_flags = hip_pp_flags,
        exported_deps = [
            ":generated-aten-headers-hip",
            ":generated-autograd-headers",
            ":generated-nvfuser-headers",
            ":libtorch",
            ":libtorch_hip_headers",
            "//caffe2:ATen-hip",
            "//caffe2/caffe2:caffe2_cpu",
            "//caffe2/caffe2:caffe2_gpu_hip",
            "//caffe2/torch/lib/libshm:libshm",
            "//gloo:gloo_gpu_hip",
            "//tensorpipe:tensorpipe_cpu",  # TODO: include a HIP version once it's developed
        ],
        exported_external_deps = hip_external_deps,
        compiler_flags = compiler_flags_cpu + compiler_flags_hip + [
            "-Wno-unused-result",
        ],
        hip_flags = ["-Wno-unused-result"] + get_hip_flags(),
        compiler_specific_flags = common_flags["compiler_specific_flags"],
    )
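    # gpu_library_targets and gpu_library_selector (from defs_gpu.bzl) are
    # expected to pick the CPU, CUDA, or HIP dependency set that matches the
    # build's GPU flavor.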
    gpu_library_targets(
        name = "libtorch_gpu",
        deps_cpu = [
            ":libtorch",
        ],
        deps_cuda = [
            ":libtorch_cuda",
        ],
        deps_hip = [
            ":libtorch_hip",
        ],
        exclude_hip_target = False,
        extra_external_deps = [],
    )
    # torch-cpp is still conditionally compiled based on USE_CUDA. Ideally we'd
    # separate it out as an additive library instead.
    gpu_library_selector(
        name = "torch-cpp",
        deps_cpu = [":torch-cpp-cpu"],
        deps_cuda = [":torch-cpp-cuda"],
        deps_hip = [":torch-cpp-hip"],
        merge_cpu_deps = False,
        exclude_hip_target = False,
    )
    # The USE_CUDA flag is propagated through propagated_pp_flags on libtorch_cuda.
    cpp_library(
        name = "torch-cpp-cuda",
        srcs = torch_cpp_srcs,
        headers = torch_cpp_headers,
        include_directories = [
            ".",
            "torch/csrc/api/include/",
        ],
        exported_deps = [
            ":libtorch_cuda",
            "//caffe2/torch/fb/init:init",
        ],
        exported_external_deps = [
            ("cuda", None, "cuda-lazy"),
            ("cudnn", None, "cudnn-lazy"),
        ],
    )
    cpp_library(
        name = "torch-cpp-hip",
        srcs = torch_cpp_srcs,
        headers = torch_cpp_headers,
        include_directories = [
            ".",
            "torch/csrc/api/include/",
        ],
        exported_deps = [
            ":libtorch_hip",
            "//caffe2/torch/fb/init:init",
        ],
        exported_external_deps = hip_external_deps,
    )
    cpp_library(
        name = "torch-cpp-cpu",
        srcs = torch_cpp_srcs,
        headers = torch_cpp_headers,
        include_directories = [
            ".",
            "torch/csrc/api/include/",
        ],
        exported_deps = [
            ":libtorch",
            "//caffe2/torch/fb/init:init",
        ],
    )
    # _C_impl is still conditionally compiled based on USE_CUDA. Ideally we'd
    # separate it out as an additive library instead.
    # TODO: split it into cpp and cuda parts similarly to libtorch
    gpu_library_selector(
        name = "_C_impl",
        deps_cpu = [":_C_impl_cpu"],
        deps_cuda = [":_C_impl_cuda"],
        deps_hip = [":_C_impl_hip"],
        merge_cpu_deps = False,
        exclude_hip_target = False,
    )
    cpp_library(
        name = "_C_impl_cpu",
        srcs = libtorch_python_sources,
        link_whole = True,
        exported_deps = [
            "fbsource//third-party/fmt:fmt",
            ":torch-cpp-cpu",
            "//caffe2/torch/fb/init:init",
            "//caffe2/torch/lib/libshm:libshm",
        ],
        exported_external_deps = [
            ("numpy", None, "cpp"),
            ("pybind11", None),
            ("python", None),
        ],
        compiler_flags = compiler_flags_cpu,
        compiler_specific_flags = common_flags["compiler_specific_flags"],
    )
    # This target is used to help get headers for compile-time deps for the
    # torch::deploy libinterpreter.so build _without_ getting link-time deps,
    # which are supplied separately by the application that dlopens
    # libinterpreter.so.
    #
    # We make use of the buck auto-generated #headers flavor of a target to
    # accomplish this.
    #
    # However, since the #headers flavor of a target with srcs can't be used in
    # all build modes, we work around this limitation by using this
    # 'pass-through' target, which has a usable #headers flavor in all build
    # modes.
    cpp_library(
        name = "headers_for_torch_python_deps",
        exported_deps = [
            ":_C_impl_cpu",
        ],
    )
    cpp_library(
        name = "headers_for_torch_python_cuda_deps",
        exported_deps = [
            ":_C_impl_cuda",
        ],
    )
    # This target compiles the torch_python bindings, but skips the deps on the
    # actual torch and python libraries, since those are integrated specially
    # in the wrapper for the libinterpreter.so used by torch::deploy.
    cpp_library(
        name = "torch_python_without_torch",
        srcs = libtorch_python_sources + torch_cpp_srcs,
        undefined_symbols = True,
        preferred_linkage = "static",
        exported_deps = [
            ":headers_for_torch_python_deps#headers",
        ],
        exported_external_deps = [
            ("pybind11", None),
            ("frozenpython", None, "python-headers"),
        ],
        compiler_flags = compiler_flags_cpu + [
            # Some code in the Python bindings compiles differently when
            # building for torch::deploy.
            "-DUSE_DEPLOY",
        ],
        compiler_specific_flags = common_flags["compiler_specific_flags"],
    )
    cpp_library(
        name = "torch_python_cuda_without_torch",
        srcs = libtorch_python_sources + torch_cpp_srcs + libtorch_python_cuda_sources,
        undefined_symbols = True,
        preferred_linkage = "static",
        exported_deps = [
            ":headers_for_torch_python_cuda_deps#headers",
        ],
        exported_external_deps = [
            ("pybind11", None),
            ("frozenpython", None, "python-headers"),
        ],
        compiler_flags = compiler_flags_cpu + [
            "-DUSE_CUDA",
            # Some code in the Python bindings compiles differently when
            # building for torch::deploy.
            "-DUSE_DEPLOY",
        ],
        compiler_specific_flags = common_flags["compiler_specific_flags"],
    )
    cpp_library(
        name = "_C_impl_cuda",
        srcs = libtorch_python_sources + libtorch_python_cuda_sources,
        link_whole = True,
        exported_deps = [
            "fbsource//third-party/fmt:fmt",
            ":torch-cpp-cuda",
            "//caffe2/torch/fb/init:init",
            "//caffe2/torch/lib/libshm:libshm",
        ],
        exported_external_deps = [
            ("numpy", None, "cpp"),
            ("pybind11", None),
            ("python", None),
        ],
        compiler_flags = compiler_flags_cpu + compiler_flags_cuda,
        compiler_specific_flags = common_flags["compiler_specific_flags"],
    )
    # Autogenerated files, whose "rules" contain ":", are not hipified; all
    # other sources are run through the hipify script.
    libtorch_python_hip_sources = [f for f in (libtorch_python_sources + libtorch_python_cuda_sources) if ":" in f]
    libtorch_python_hip_sources_hipified = [f for f in (libtorch_python_sources + libtorch_python_cuda_sources) if ":" not in f]
    custom_rule(
        name = "fb_C_impl_hipify_gen",
        srcs = libtorch_python_hip_sources_hipified,
        build_args = "--source-dir= --hipify-dir= --copy-dir=",
        build_script_dep = "//caffe2:fb_caffe2_hipify",
        output_gen_files = libtorch_python_hip_sources_hipified,
    )
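    # Unlike fb_libtorch_hipify_gen above, no --rewrite-cu-ext is passed here:
    # these are .cpp files, so they are hipified in place and keep their names
    # (hence srcs and output_gen_files are the same list).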
    cpp_library(
        name = "_C_impl_hip",
        srcs = [":fb_C_impl_hipify_gen={}".format(f) for f in libtorch_python_hip_sources_hipified] + libtorch_python_hip_sources,
        link_whole = True,
        exported_deps = [
            "fbsource//third-party/fmt:fmt",
            ":torch-cpp-hip",
            "//caffe2/torch/fb/init:init",
            "//caffe2/torch/lib/libshm:libshm",
        ],
        exported_external_deps = [
            ("numpy", None, "cpp"),
            ("pybind11", None),
            ("python", None),
        ],
        compiler_flags = compiler_flags_cpu + compiler_flags_hip + ["-Wno-unused-result"],
        compiler_specific_flags = common_flags["compiler_specific_flags"],
    )
    cpp_python_extension(
        name = "_C",
        srcs = [
            "torch/csrc/stub.c",
        ],
        base_module = "torch",
        deps = [
            ":_C_impl",
            "//caffe2:flatbuffer_loader",
        ],
    )
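    # torch/csrc/stub.c is essentially just the module entry point: it defines
    # PyInit__C, which delegates to the initialization code linked in from
    # :_C_impl, where the actual bindings live.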
    cpp_python_extension(
        name = "_C_flatbuffer",
        srcs = [
            "torch/csrc/stub_with_flatbuffer.c",
            "torch/csrc/init_flatbuffer_module.cpp",
        ],
        base_module = "torch",
        deps = [
            ":_C_impl",
            "//caffe2:flatbuffer_loader",
            "//caffe2:flatbuffer_serializer",
        ],
    )

    return r