Consistently use TORCH_CUDA_API for all files that live in cuda targets. (#29158)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/29158

My plan is to split out libtorch_cuda.so from libtorch.so.  To do this,
I need accurate _API annotations for the files in the CUDA directories
that will move.

I determined the correct set of annotations by looking at
tools/build_variables.py and making sure every file that was a member
of the libtorch_cuda/ATen-cu targets had these annotations.  (torch-cpp-cuda
doesn't count since that's going to be where the stuff that has explicit
USE_CUDA lives, so it's going to be in a separate dynamic library).
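
(For readers unfamiliar with the annotations: on ELF platforms they
boil down to symbol visibility. A rough sketch, simplified from
c10/macros/Export.h — the Windows __declspec(dllexport)/__declspec(dllimport)
branch is omitted:

    // Sketch of what an _API annotation expands to.
    #define C10_EXPORT __attribute__((visibility("default")))
    #define C10_IMPORT
    // CAFFE2_BUILD_MAIN_LIB is defined only while building the library
    // itself, so annotated symbols are exported there and seen as
    // imports by every consumer.
    #ifdef CAFFE2_BUILD_MAIN_LIB
    #define TORCH_CUDA_API C10_EXPORT
    #else
    #define TORCH_CUDA_API C10_IMPORT
    #endif

Once the libraries are split, symbols annotated this way can be resolved
across the libtorch.so/libtorch_cuda.so boundary.)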

As future work, it would be good to set up a lint rule that helps people
understand which _API annotation to use in a given file; it would also
be good to reorganize the folder structure so that the library structure
is clearer.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>

Test Plan: Imported from OSS

Differential Revision: D18309593

Pulled By: ezyang

fbshipit-source-id: de710e721b6013a09dad17b35f9a358c95a91030
Author: Edward Yang
Date: 2019-11-06 15:00:18 -08:00
Committed by: Facebook Github Bot
Parent: a5d356cb39
Commit: adb7df7117
16 changed files with 56 additions and 47 deletions

View File

@@ -6,4 +6,4 @@
 #include <c10/macros/Export.h>
-#define AT_CUDA_API CAFFE2_API
+// Use TORCH_CUDA_API for exports from this folder

View File

@@ -52,17 +52,17 @@ inline bool is_available() {
   return c10::cuda::device_count() > 0;
 }
-CAFFE2_API cudaDeviceProp* getCurrentDeviceProperties();
+TORCH_CUDA_API cudaDeviceProp* getCurrentDeviceProperties();
-CAFFE2_API int warp_size();
+TORCH_CUDA_API int warp_size();
-CAFFE2_API cudaDeviceProp* getDeviceProperties(int64_t device);
+TORCH_CUDA_API cudaDeviceProp* getDeviceProperties(int64_t device);
-CAFFE2_API Allocator* getCUDADeviceAllocator();
+TORCH_CUDA_API Allocator* getCUDADeviceAllocator();
 /* Handles */
-CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle();
-CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle();
+TORCH_CUDA_API cusparseHandle_t getCurrentCUDASparseHandle();
+TORCH_CUDA_API cublasHandle_t getCurrentCUDABlasHandle();
 } // namespace cuda
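
A caller in another translation unit (and eventually, once the split
lands, in another shared library) consumes these declarations as
imports. A minimal usage sketch, assuming a program linked against a
CUDA-enabled build of libtorch:

    // Hedged sketch: query device properties through the exported API.
    #include <ATen/cuda/CUDAContext.h>
    #include <iostream>

    int main() {
      if (!at::cuda::is_available()) return 0;  // no CUDA device present
      cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
      std::cout << prop->name << ", warp size "
                << at::cuda::warp_size() << "\n";
    }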

View File

@@ -24,7 +24,7 @@ namespace at { namespace cuda {
 * called before the event is ever recorded, it will use the current device.
 * Later streams that record the event must match this device.
 */
-struct AT_CUDA_API CUDAEvent {
+struct TORCH_CUDA_API CUDAEvent {
   // Constructors
   // Default value for `flags` is specified below - it's cudaEventDisableTiming
   CUDAEvent() {}

View File

@@ -4,6 +4,6 @@
 namespace at { namespace cuda {
-CAFFE2_API at::Allocator* getPinnedMemoryAllocator();
+TORCH_CUDA_API at::Allocator* getPinnedMemoryAllocator();
 }} // namespace at::cuda

View File

@@ -8,8 +8,8 @@ namespace at {
 namespace cuda {
 namespace detail {
-CAFFE2_API bool maybeOverlappingIndices(const at::Tensor& t);
-CAFFE2_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
+TORCH_CUDA_API bool maybeOverlappingIndices(const at::Tensor& t);
+TORCH_CUDA_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
 template <typename scalar, typename IndexType>
 TensorInfo<scalar, IndexType>

View File

@@ -89,6 +89,6 @@ extern "C" typedef struct NVRTC {
 #undef CREATE_MEMBER
 } NVRTC;
-extern "C" AT_CUDA_API NVRTC* load_nvrtc();
+extern "C" TORCH_CUDA_API NVRTC* load_nvrtc();
 }} // at::cuda

View File

@@ -64,7 +64,7 @@ struct DescriptorDeleter {
 // initialized the first time you call set() or any other initializing
 // function.
 template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
-class AT_CUDA_API Descriptor
+class TORCH_CUDA_API Descriptor
 {
 public:
   // TODO: Figure out why const-correctness doesn't work here
@@ -93,7 +93,7 @@ private:
   std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
 };
-class AT_CUDA_API TensorDescriptor
+class TORCH_CUDA_API TensorDescriptor
   : public Descriptor<cudnnTensorStruct,
                       &cudnnCreateTensorDescriptor,
                       &cudnnDestroyTensorDescriptor>
@@ -145,7 +145,7 @@ private:
   }
 };
-struct AT_CUDA_API ConvolutionDescriptor
+struct TORCH_CUDA_API ConvolutionDescriptor
   : public Descriptor<cudnnConvolutionStruct,
                       &cudnnCreateConvolutionDescriptor,
                       &cudnnDestroyConvolutionDescriptor>
@@ -164,7 +164,7 @@ struct AT_CUDA_API ConvolutionDescriptor
   }
 };
-struct AT_CUDA_API SpatialTransformerDescriptor
+struct TORCH_CUDA_API SpatialTransformerDescriptor
   : public Descriptor<cudnnSpatialTransformerStruct,
                       &cudnnCreateSpatialTransformerDescriptor,
                       &cudnnDestroySpatialTransformerDescriptor>
@@ -174,7 +174,7 @@ struct AT_CUDA_API SpatialTransformerDescriptor
   }
 };
-struct AT_CUDA_API DropoutDescriptor
+struct TORCH_CUDA_API DropoutDescriptor
   : public Descriptor<cudnnDropoutStruct,
                       &cudnnCreateDropoutDescriptor,
                       &cudnnDestroyDropoutDescriptor>
@@ -216,7 +216,7 @@ struct AT_CUDA_API DropoutDescriptor
   }
 };
-struct AT_CUDA_API RNNDescriptor
+struct TORCH_CUDA_API RNNDescriptor
   : public Descriptor<cudnnRNNStruct,
                       &cudnnCreateRNNDescriptor,
                       &cudnnDestroyRNNDescriptor>
@@ -252,7 +252,7 @@ struct AT_CUDA_API RNNDescriptor
   }
 };
-struct AT_CUDA_API CTCLossDescriptor
+struct TORCH_CUDA_API CTCLossDescriptor
   : public Descriptor<cudnnCTCLossStruct,
                       &cudnnCreateCTCLossDescriptor,
                       &cudnnDestroyCTCLossDescriptor>

View File

@@ -5,6 +5,6 @@
 namespace at { namespace native {
-AT_CUDA_API cudnnHandle_t getCudnnHandle();
+TORCH_CUDA_API cudnnHandle_t getCudnnHandle();
 }} // namespace

View File

@@ -4,21 +4,21 @@
 namespace at { namespace native { namespace sparse { namespace cuda {
-AT_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
+TORCH_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
 /* Level 3 */
-AT_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
-AT_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
+TORCH_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
+TORCH_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
 // overloaded version
 inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc) { Scsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); }
 inline void csrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc) { Dcsrmm2(transa, transb, m, n, k, nnz, alpha, csrvala, csrrowptra, csrcolinda, b, ldb, beta, c, ldc); }
 /* format conversion */
-AT_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
-AT_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
-AT_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
-AT_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
-AT_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
+TORCH_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
+TORCH_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
+TORCH_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
+TORCH_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
+TORCH_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
 }}}} // namespace at::native::sparse::cuda

View File

@@ -5,7 +5,7 @@
 // IPC doesn't support (re)allocation
-class CAFFE2_API THCIpcDeleter {
+class TORCH_CUDA_API THCIpcDeleter {
 public:
   THCIpcDeleter(std::shared_ptr<void> basePtr);
   ~THCIpcDeleter();

View File

@@ -17,8 +17,9 @@
 # define THC_EXTERNC extern "C"
 // TH & THC are now part of the same library as ATen and Caffe2
-#define THC_API THC_EXTERNC CAFFE2_API
-#define THC_CLASS CAFFE2_API
+// NB: However, we are planning to split it out to a torch_cuda library
+#define THC_API THC_EXTERNC TORCH_CUDA_API
+#define THC_CLASS TORCH_CUDA_API
 #ifndef THAssert
 #define THAssert(exp) \

View File

@@ -91,11 +91,20 @@
 #define C10_API C10_IMPORT
 #endif
-// This one is being used by libcaffe2.so
+// This one is being used by libtorch.so
+// TODO: rename this to TORCH_API
 #ifdef CAFFE2_BUILD_MAIN_LIB
 #define CAFFE2_API C10_EXPORT
 #else
 #define CAFFE2_API C10_IMPORT
 #endif
+// This one will eventually be used by libtorch_cuda.so, but for
+// now it has the same function as CAFFE2_API
+#ifdef CAFFE2_BUILD_MAIN_LIB
+#define TORCH_CUDA_API C10_EXPORT
+#else
+#define TORCH_CUDA_API C10_IMPORT
+#endif
 #endif // C10_MACROS_MACROS_H_
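
As the comment in the hunk above notes, TORCH_CUDA_API currently
behaves exactly like CAFFE2_API. Once libtorch_cuda.so is actually
split out, the natural end state is for it to key off a build flag
defined only while building that library. A hedged sketch of where
this is headed — the flag name TORCH_CUDA_BUILD_MAIN_LIB is an
assumption, not part of this commit:

    // Hypothetical future definition: only libtorch_cuda.so's own
    // build defines TORCH_CUDA_BUILD_MAIN_LIB, so CUDA symbols are
    // exported from exactly one shared library and imported
    // everywhere else.
    #ifdef TORCH_CUDA_BUILD_MAIN_LIB
    #define TORCH_CUDA_API C10_EXPORT
    #else
    #define TORCH_CUDA_API C10_IMPORT
    #endif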

View File

@@ -14,8 +14,7 @@
 namespace torch {
 namespace autograd {
-//TODO: change it to TORCH_API when we merge the libs
-struct AT_CUDA_API Scatter : public Node {
+struct TORCH_CUDA_API Scatter : public Node {
   explicit Scatter(
       std::vector<at::Device> devices,
       const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
@@ -34,7 +33,7 @@ struct AT_CUDA_API Scatter : public Node {
   bool unsqueeze_scalars_;
 };
-struct AT_CUDA_API Gather : public Node {
+struct TORCH_CUDA_API Gather : public Node {
   explicit Gather(const at::Device& destination_device, int64_t dim = 0);
   ~Gather() override;

View File

@@ -13,11 +13,11 @@ namespace torch { namespace cuda {
 using tensor_list2d = std::vector<std::vector<at::Tensor>>;
-TORCH_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
-TORCH_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
+TORCH_CUDA_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
+TORCH_CUDA_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
                                          size_t buffer_size);
-TORCH_API std::vector<at::Tensor> scatter(
+TORCH_CUDA_API std::vector<at::Tensor> scatter(
     const at::Tensor& tensor,
     at::IntArrayRef devices,
     const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
@@ -25,7 +25,7 @@ TORCH_API std::vector<at::Tensor> scatter(
     const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
         c10::nullopt);
-TORCH_API at::Tensor gather(
+TORCH_CUDA_API at::Tensor gather(
     at::TensorList tensors,
     int64_t dim,
     c10::optional<int32_t> destination_index);

View File

@@ -42,31 +42,31 @@ struct AutoNcclGroup {
   }
 };
-TORCH_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
-TORCH_API void check_inputs(
+TORCH_CUDA_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
+TORCH_CUDA_API void check_inputs(
     at::TensorList inputs,
     at::TensorList outputs,
     int input_multiplier,
     int output_multiplier);
-TORCH_API ncclDataType_t get_data_type(const at::Tensor& t);
+TORCH_CUDA_API ncclDataType_t get_data_type(const at::Tensor& t);
 } // namespace detail
 using comm_list = std::vector<ncclComm_t>;
 using stream_list = std::vector<c10::optional<at::cuda::CUDAStream>>;
-TORCH_API std::uint64_t version();
+TORCH_CUDA_API std::uint64_t version();
 bool is_available(at::TensorList tensors);
-TORCH_API void broadcast(
+TORCH_CUDA_API void broadcast(
     at::TensorList tensors,
     const stream_list& streams = {},
     const comm_list& user_comms = {});
 size_t get_max_count();
-TORCH_API void reduce(
+TORCH_CUDA_API void reduce(
     const std::vector<at::Tensor>& inputs,
     std::vector<at::Tensor>& outputs,
     int32_t root = 0,
@@ -74,7 +74,7 @@ TORCH_API void reduce(
     const stream_list& streams = {},
     const comm_list& user_comms = {});
-TORCH_API void reduce(
+TORCH_CUDA_API void reduce(
     std::vector<at::Tensor>& inputs,
     int32_t root = 0,
     int32_t op = ncclSum,

View File

@@ -19,7 +19,7 @@ namespace cuda {
 // A class holding metadata for an actual CUDA function.
 // Note: CUDA functions are per device.
-struct TORCH_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
+struct TORCH_CUDA_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
   FusedKernelCUDA(
       int16_t device,
       std::string name,