Consistently use TORCH_CUDA_API for all files that live in cuda targets. (#29158)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/29158

My plan is to split out libtorch_cuda.so from libtorch.so.  To do this,
I need accurate _API annotations for the files in the CUDA directories
that will move.

I determined the correct set of annotations by looking at
tools/build_variables.py and making sure every file that was a member
of the libtorch_cuda/ATen-cu targets had these annotations.  (torch-cpp-cuda
doesn't count since that's going to be where the stuff that has explicit
USE_CUDA lives, so it's going to be in a separate dynamic library).
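
(For readers unfamiliar with the annotations: on ELF platforms they
boil down to symbol visibility. A rough sketch, simplified from
c10/macros/Export.h — the Windows __declspec(dllexport)/__declspec(dllimport)
branch is omitted:

    // Sketch of what an _API annotation expands to.
    #define C10_EXPORT __attribute__((visibility("default")))
    #define C10_IMPORT
    // CAFFE2_BUILD_MAIN_LIB is defined only while building the library
    // itself, so annotated symbols are exported there and seen as
    // imports by every consumer.
    #ifdef CAFFE2_BUILD_MAIN_LIB
    #define TORCH_CUDA_API C10_EXPORT
    #else
    #define TORCH_CUDA_API C10_IMPORT
    #endif

Once the libraries are split, symbols annotated this way can be resolved
across the libtorch.so/libtorch_cuda.so boundary.)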

As future work, it would be good to set up a lint rule that helps people
understand which _API annotation to use in a given file; it would also
be good to reorganize the folder structure so that the library structure
is clearer.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Differential Revision: D18309593

Pulled By: ezyang

fbshipit-source-id: de710e721b6013a09dad17b35f9a358c95a91030
Author: Edward Yang
Date: 2019-11-06 15:00:18 -08:00
Committed by: Facebook Github Bot
Parent: a5d356cb39
Commit: adb7df7117
16 changed files with 56 additions and 47 deletions

View File

@@ -6,4 +6,4 @@
 #include <c10/macros/Export.h>
-#define AT_CUDA_API CAFFE2_API
+// Use TORCH_CUDA_API for exports from this folder

View File

@@ -52,17 +52,17 @@ inline bool is_available() {
   return c10::cuda::device_count() > 0;
 }
-CAFFE2_API cudaDeviceProp* getCurrentDeviceProperties();
+TORCH_CUDA_API cudaDeviceProp* getCurrentDeviceProperties();
-CAFFE2_API int warp_size();
+TORCH_CUDA_API int warp_size();
-CAFFE2_API cudaDeviceProp* getDeviceProperties(int64_t device);
+TORCH_CUDA_API cudaDeviceProp* getDeviceProperties(int64_t device);
-CAFFE2_API Allocator* getCUDADeviceAllocator();
+TORCH_CUDA_API Allocator* getCUDADeviceAllocator();
 /* Handles */
-CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle();
-CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle();
+TORCH_CUDA_API cusparseHandle_t getCurrentCUDASparseHandle();
+TORCH_CUDA_API cublasHandle_t getCurrentCUDABlasHandle();
 } // namespace cuda
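
A caller in another translation unit (and eventually, once the split
lands, in another shared library) consumes these declarations as
imports. A minimal usage sketch, assuming a program linked against a
CUDA-enabled build of libtorch:

    // Hedged sketch: query device properties through the exported API.
    #include <ATen/cuda/CUDAContext.h>
    #include <iostream>

    int main() {
      if (!at::cuda::is_available()) return 0;  // no CUDA device present
      cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
      std::cout << prop->name << ", warp size "
                << at::cuda::warp_size() << "\n";
    }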

View File

@@ -24,7 +24,7 @@ namespace at { namespace cuda {
 * called before the event is ever recorded, it will use the current device.
 * Later streams that record the event must match this device.
 */
-struct AT_CUDA_API CUDAEvent {
+struct TORCH_CUDA_API CUDAEvent {
   // Constructors
   // Default value for `flags` is specified below - it's cudaEventDisableTiming
   CUDAEvent() {}

View File

@@ -4,6 +4,6 @@
 namespace at { namespace cuda {
-CAFFE2_API at::Allocator* getPinnedMemoryAllocator();
+TORCH_CUDA_API at::Allocator* getPinnedMemoryAllocator();
 }} // namespace at::cuda

View File

@@ -8,8 +8,8 @@ namespace at {
 namespace cuda {
 namespace detail {
-CAFFE2_API bool maybeOverlappingIndices(const at::Tensor& t);
-CAFFE2_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
+TORCH_CUDA_API bool maybeOverlappingIndices(const at::Tensor& t);
+TORCH_CUDA_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
 template <typename scalar, typename IndexType>
 TensorInfo<scalar, IndexType>

View File

@@ -89,6 +89,6 @@ extern "C" typedef struct NVRTC {
 #undef CREATE_MEMBER
 } NVRTC;
-extern "C" AT_CUDA_API NVRTC* load_nvrtc();
+extern "C" TORCH_CUDA_API NVRTC* load_nvrtc();
 }} // at::cuda

View File

@@ -64,7 +64,7 @@ struct DescriptorDeleter {
 // initialized the first time you call set() or any other initializing
 // function.
 template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
-class AT_CUDA_API Descriptor
+class TORCH_CUDA_API Descriptor
 {
 public:
   // TODO: Figure out why const-correctness doesn't work here
@@ -93,7 +93,7 @@ private:
   std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
 };
-class AT_CUDA_API TensorDescriptor
+class TORCH_CUDA_API TensorDescriptor
   : public Descriptor<cudnnTensorStruct,
                       &cudnnCreateTensorDescriptor,
                       &cudnnDestroyTensorDescriptor>
@@ -145,7 +145,7 @@ private:
   }
 };
-struct AT_CUDA_API ConvolutionDescriptor
+struct TORCH_CUDA_API ConvolutionDescriptor
   : public Descriptor<cudnnConvolutionStruct,
                       &cudnnCreateConvolutionDescriptor,
                       &cudnnDestroyConvolutionDescriptor>
@@ -164,7 +164,7 @@ struct AT_CUDA_API ConvolutionDescriptor
   }
 };
-struct AT_CUDA_API SpatialTransformerDescriptor
+struct TORCH_CUDA_API SpatialTransformerDescriptor
   : public Descriptor<cudnnSpatialTransformerStruct,
                       &cudnnCreateSpatialTransformerDescriptor,
                       &cudnnDestroySpatialTransformerDescriptor>
@@ -174,7 +174,7 @@ struct AT_CUDA_API SpatialTransformerDescriptor
   }
 };
-struct AT_CUDA_API DropoutDescriptor
+struct TORCH_CUDA_API DropoutDescriptor
   : public Descriptor<cudnnDropoutStruct,
                       &cudnnCreateDropoutDescriptor,
                       &cudnnDestroyDropoutDescriptor>
@@ -216,7 +216,7 @@ struct AT_CUDA_API DropoutDescriptor
   }
 };
-struct AT_CUDA_API RNNDescriptor
+struct TORCH_CUDA_API RNNDescriptor
   : public Descriptor<cudnnRNNStruct,
                       &cudnnCreateRNNDescriptor,
                       &cudnnDestroyRNNDescriptor>
@@ -252,7 +252,7 @@ struct AT_CUDA_API RNNDescriptor
   }
 };
-struct AT_CUDA_API CTCLossDescriptor
+struct TORCH_CUDA_API CTCLossDescriptor
   : public Descriptor<cudnnCTCLossStruct,
                       &cudnnCreateCTCLossDescriptor,
                       &cudnnDestroyCTCLossDescriptor>

View File

@@ -5,6 +5,6 @@
 namespace at { namespace native {
-AT_CUDA_API cudnnHandle_t getCudnnHandle();
+TORCH_CUDA_API cudnnHandle_t getCudnnHandle();
 }} // namespace

View File

@@ -4,21 +4,21 @@
 namespace at { namespace native { namespace sparse { namespace cuda {
-AT_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
+TORCH_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
 /* Level 3 */
-AT_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
-AT_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
+TORCH_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
+TORCH_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
 // overloaded version
 inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { Scsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); }
 inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { Dcsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); }
 /* format conversion */
-AT_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
-AT_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
-AT_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
-AT_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
-AT_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
+TORCH_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
+TORCH_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
+TORCH_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
+TORCH_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
+TORCH_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
 }}}} // namespace at::native::sparse::cuda

View File

@@ -5,7 +5,7 @@
 // IPC doesn't support (re)allocation
-class CAFFE2_API THCIpcDeleter {
+class TORCH_CUDA_API THCIpcDeleter {
 public:
   THCIpcDeleter(std::shared_ptr<void> basePtr);
   ~THCIpcDeleter();

View File

@@ -17,8 +17,9 @@
 # define THC_EXTERNC extern "C"
 // TH & THC are now part of the same library as ATen and Caffe2
-#define THC_API THC_EXTERNC CAFFE2_API
-#define THC_CLASS CAFFE2_API
+// NB: However, we are planning to split it out to a torch_cuda library
+#define THC_API THC_EXTERNC TORCH_CUDA_API
+#define THC_CLASS TORCH_CUDA_API
 #ifndef THAssert
 #define THAssert(exp) \

View File

@@ -91,11 +91,20 @@
 #define C10_API C10_IMPORT
 #endif
-// This one is being used by libcaffe2.so
+// This one is being used by libtorch.so
+// TODO: rename this to TORCH_API
 #ifdef CAFFE2_BUILD_MAIN_LIB
 #define CAFFE2_API C10_EXPORT
 #else
 #define CAFFE2_API C10_IMPORT
 #endif
+// This one will eventually be used by libtorch_cuda.so, but for
+// now it has the same function as CAFFE2_API
+#ifdef CAFFE2_BUILD_MAIN_LIB
+#define TORCH_CUDA_API C10_EXPORT
+#else
+#define TORCH_CUDA_API C10_IMPORT
+#endif
 #endif // C10_MACROS_MACROS_H_
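
As the comment in the hunk above notes, TORCH_CUDA_API currently
behaves exactly like CAFFE2_API. Once libtorch_cuda.so is actually
split out, the natural end state is for it to key off a build flag
defined only while building that library. A hedged sketch of where
this is headed — the flag name TORCH_CUDA_BUILD_MAIN_LIB is an
assumption, not part of this commit:

    // Hypothetical future definition: only libtorch_cuda.so's own
    // build defines TORCH_CUDA_BUILD_MAIN_LIB, so CUDA symbols are
    // exported from exactly one shared library and imported
    // everywhere else.
    #ifdef TORCH_CUDA_BUILD_MAIN_LIB
    #define TORCH_CUDA_API C10_EXPORT
    #else
    #define TORCH_CUDA_API C10_IMPORT
    #endif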

View File

@@ -14,8 +14,7 @@
 namespace torch {
 namespace autograd {
-//TODO: change it to TORCH_API when we merge the libs
-struct AT_CUDA_API Scatter : public Node {
+struct TORCH_CUDA_API Scatter : public Node {
   explicit Scatter(
       std::vector<at::Device> devices,
       const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
@@ -34,7 +33,7 @@ struct AT_CUDA_API Scatter : public Node {
   bool unsqueeze_scalars_;
 };
-struct AT_CUDA_API Gather : public Node {
+struct TORCH_CUDA_API Gather : public Node {
   explicit Gather(const at::Device& destination_device, int64_t dim = 0);
   ~Gather() override;

View File

@@ -13,11 +13,11 @@ namespace torch { namespace cuda {
 using tensor_list2d = std::vector<std::vector<at::Tensor>>;
-TORCH_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
-TORCH_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
+TORCH_CUDA_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
+TORCH_CUDA_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
                                          size_t buffer_size);
-TORCH_API std::vector<at::Tensor> scatter(
+TORCH_CUDA_API std::vector<at::Tensor> scatter(
     const at::Tensor& tensor,
     at::IntArrayRef devices,
     const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
@@ -25,7 +25,7 @@ TORCH_API std::vector<at::Tensor> scatter(
     const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
         c10::nullopt);
-TORCH_API at::Tensor gather(
+TORCH_CUDA_API at::Tensor gather(
     at::TensorList tensors,
     int64_t dim,
     c10::optional<int32_t> destination_index);

View File

@@ -42,31 +42,31 @@ struct AutoNcclGroup {
   }
 };
-TORCH_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
-TORCH_API void check_inputs(
+TORCH_CUDA_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
+TORCH_CUDA_API void check_inputs(
     at::TensorList inputs,
     at::TensorList outputs,
     int input_multiplier,
     int output_multiplier);
-TORCH_API ncclDataType_t get_data_type(const at::Tensor& t);
+TORCH_CUDA_API ncclDataType_t get_data_type(const at::Tensor& t);
 } // namespace detail
 using comm_list = std::vector<ncclComm_t>;
 using stream_list = std::vector<c10::optional<at::cuda::CUDAStream>>;
-TORCH_API std::uint64_t version();
+TORCH_CUDA_API std::uint64_t version();
 bool is_available(at::TensorList tensors);
-TORCH_API void broadcast(
+TORCH_CUDA_API void broadcast(
     at::TensorList tensors,
     const stream_list& streams = {},
     const comm_list& user_comms = {});
 size_t get_max_count();
-TORCH_API void reduce(
+TORCH_CUDA_API void reduce(
     const std::vector<at::Tensor>& inputs,
     std::vector<at::Tensor>& outputs,
     int32_t root = 0,
@@ -74,7 +74,7 @@ TORCH_API void reduce(
     const stream_list& streams = {},
     const comm_list& user_comms = {});
-TORCH_API void reduce(
+TORCH_CUDA_API void reduce(
     std::vector<at::Tensor>& inputs,
     int32_t root = 0,
     int32_t op = ncclSum,

View File

@@ -19,7 +19,7 @@ namespace cuda {
 // A class holding metadata for an actual CUDA function.
 // Note: CUDA functions are per device.
-struct TORCH_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
+struct TORCH_CUDA_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
   FusedKernelCUDA(
       int16_t device,
       std::string name,