mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Consistently use TORCH_CUDA_API for all files that live in cuda targets. (#29158)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/29158 My plan is to split out libtorch_cuda.so from libtorch.so. To do this, I need accurate _API annotations for files in these directories. I determined the correct set of annotations by looking at tools/build_variables.py and making sure every file that was a member of the libtorch_cuda/ATen-cu targets had these annotations. (torch-cpp-cuda doesn't count since that's going to be where the stuff that has explicit USE_CUDA lives, so it's going to be in a separate dynamic library). As future work, it would be good to set up a lint rule to help people understand what the correct _API annotation to use in a file is; it would also be good to reorganize the folder structure so that the library structure is clearer. Signed-off-by: Edward Z. Yang <ezyang@fb.com> Test Plan: Imported from OSS Differential Revision: D18309593 Pulled By: ezyang fbshipit-source-id: de710e721b6013a09dad17b35f9a358c95a91030
This commit is contained in:
committed by
Facebook Github Bot
parent
a5d356cb39
commit
adb7df7117
@ -6,4 +6,4 @@
|
||||
|
||||
#include <c10/macros/Export.h>
|
||||
|
||||
#define AT_CUDA_API CAFFE2_API
|
||||
// Use TORCH_CUDA_API for exports from this folder
|
||||
|
@ -52,17 +52,17 @@ inline bool is_available() {
|
||||
return c10::cuda::device_count() > 0;
|
||||
}
|
||||
|
||||
CAFFE2_API cudaDeviceProp* getCurrentDeviceProperties();
|
||||
TORCH_CUDA_API cudaDeviceProp* getCurrentDeviceProperties();
|
||||
|
||||
CAFFE2_API int warp_size();
|
||||
TORCH_CUDA_API int warp_size();
|
||||
|
||||
CAFFE2_API cudaDeviceProp* getDeviceProperties(int64_t device);
|
||||
TORCH_CUDA_API cudaDeviceProp* getDeviceProperties(int64_t device);
|
||||
|
||||
CAFFE2_API Allocator* getCUDADeviceAllocator();
|
||||
TORCH_CUDA_API Allocator* getCUDADeviceAllocator();
|
||||
|
||||
/* Handles */
|
||||
CAFFE2_API cusparseHandle_t getCurrentCUDASparseHandle();
|
||||
CAFFE2_API cublasHandle_t getCurrentCUDABlasHandle();
|
||||
TORCH_CUDA_API cusparseHandle_t getCurrentCUDASparseHandle();
|
||||
TORCH_CUDA_API cublasHandle_t getCurrentCUDABlasHandle();
|
||||
|
||||
|
||||
} // namespace cuda
|
||||
|
@ -24,7 +24,7 @@ namespace at { namespace cuda {
|
||||
* called before the event is ever recorded, it will use the current device.
|
||||
* Later streams that record the event must match this device.
|
||||
*/
|
||||
struct AT_CUDA_API CUDAEvent {
|
||||
struct TORCH_CUDA_API CUDAEvent {
|
||||
// Constructors
|
||||
// Default value for `flags` is specified below - it's cudaEventDisableTiming
|
||||
CUDAEvent() {}
|
||||
|
@ -4,6 +4,6 @@
|
||||
|
||||
namespace at { namespace cuda {
|
||||
|
||||
CAFFE2_API at::Allocator* getPinnedMemoryAllocator();
|
||||
TORCH_CUDA_API at::Allocator* getPinnedMemoryAllocator();
|
||||
|
||||
}} // namespace at::cuda
|
||||
|
@ -8,8 +8,8 @@ namespace at {
|
||||
namespace cuda {
|
||||
namespace detail {
|
||||
|
||||
CAFFE2_API bool maybeOverlappingIndices(const at::Tensor& t);
|
||||
CAFFE2_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
|
||||
TORCH_CUDA_API bool maybeOverlappingIndices(const at::Tensor& t);
|
||||
TORCH_CUDA_API bool canUse32BitIndexMath(const at::Tensor &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
|
||||
|
||||
template <typename scalar, typename IndexType>
|
||||
TensorInfo<scalar, IndexType>
|
||||
|
@ -89,6 +89,6 @@ extern "C" typedef struct NVRTC {
|
||||
#undef CREATE_MEMBER
|
||||
} NVRTC;
|
||||
|
||||
extern "C" AT_CUDA_API NVRTC* load_nvrtc();
|
||||
extern "C" TORCH_CUDA_API NVRTC* load_nvrtc();
|
||||
|
||||
}} // at::cuda
|
||||
|
@ -64,7 +64,7 @@ struct DescriptorDeleter {
|
||||
// initialized the first time you call set() or any other initializing
|
||||
// function.
|
||||
template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
|
||||
class AT_CUDA_API Descriptor
|
||||
class TORCH_CUDA_API Descriptor
|
||||
{
|
||||
public:
|
||||
// TODO: Figure out why const-correctness doesn't work here
|
||||
@ -93,7 +93,7 @@ private:
|
||||
std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
|
||||
};
|
||||
|
||||
class AT_CUDA_API TensorDescriptor
|
||||
class TORCH_CUDA_API TensorDescriptor
|
||||
: public Descriptor<cudnnTensorStruct,
|
||||
&cudnnCreateTensorDescriptor,
|
||||
&cudnnDestroyTensorDescriptor>
|
||||
@ -145,7 +145,7 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
struct AT_CUDA_API ConvolutionDescriptor
|
||||
struct TORCH_CUDA_API ConvolutionDescriptor
|
||||
: public Descriptor<cudnnConvolutionStruct,
|
||||
&cudnnCreateConvolutionDescriptor,
|
||||
&cudnnDestroyConvolutionDescriptor>
|
||||
@ -164,7 +164,7 @@ struct AT_CUDA_API ConvolutionDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
struct AT_CUDA_API SpatialTransformerDescriptor
|
||||
struct TORCH_CUDA_API SpatialTransformerDescriptor
|
||||
: public Descriptor<cudnnSpatialTransformerStruct,
|
||||
&cudnnCreateSpatialTransformerDescriptor,
|
||||
&cudnnDestroySpatialTransformerDescriptor>
|
||||
@ -174,7 +174,7 @@ struct AT_CUDA_API SpatialTransformerDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
struct AT_CUDA_API DropoutDescriptor
|
||||
struct TORCH_CUDA_API DropoutDescriptor
|
||||
: public Descriptor<cudnnDropoutStruct,
|
||||
&cudnnCreateDropoutDescriptor,
|
||||
&cudnnDestroyDropoutDescriptor>
|
||||
@ -216,7 +216,7 @@ struct AT_CUDA_API DropoutDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
struct AT_CUDA_API RNNDescriptor
|
||||
struct TORCH_CUDA_API RNNDescriptor
|
||||
: public Descriptor<cudnnRNNStruct,
|
||||
&cudnnCreateRNNDescriptor,
|
||||
&cudnnDestroyRNNDescriptor>
|
||||
@ -252,7 +252,7 @@ struct AT_CUDA_API RNNDescriptor
|
||||
}
|
||||
};
|
||||
|
||||
struct AT_CUDA_API CTCLossDescriptor
|
||||
struct TORCH_CUDA_API CTCLossDescriptor
|
||||
: public Descriptor<cudnnCTCLossStruct,
|
||||
&cudnnCreateCTCLossDescriptor,
|
||||
&cudnnDestroyCTCLossDescriptor>
|
||||
|
@ -5,6 +5,6 @@
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
AT_CUDA_API cudnnHandle_t getCudnnHandle();
|
||||
TORCH_CUDA_API cudnnHandle_t getCudnnHandle();
|
||||
|
||||
}} // namespace
|
||||
|
@ -4,21 +4,21 @@
|
||||
|
||||
namespace at { namespace native { namespace sparse { namespace cuda {
|
||||
|
||||
AT_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
|
||||
TORCH_CUDA_API void Xcoo2csr(const int *coorowind, int64_t nnz, int64_t m, int *csrrowptr);
|
||||
|
||||
/* Level 3 */
|
||||
AT_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
|
||||
AT_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
|
||||
TORCH_CUDA_API void Scsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, float alpha, float *csrvala, int *csrrowptra, int *csrcolinda, float *b, int64_t ldb, float beta, float *c, int64_t ldc);
|
||||
TORCH_CUDA_API void Dcsrmm2(char transa, char transb, int64_t m, int64_t n, int64_t k, int64_t nnz, double alpha, double *csrvala, int *csrrowptra, int *csrcolinda, double *b, int64_t ldb, double beta, double *c, int64_t ldc);
|
||||
|
||||
// overloaded version
|
||||
// float overload of csrmm2: forwards every argument, unchanged and in the
// same order, to Scsrmm2 so callers can use one name for both precisions.
inline void csrmm2(
    char transa, char transb,
    int64_t m, int64_t n, int64_t k, int64_t nnz,
    float alpha, float *csrvala, int *csrrowptra, int *csrcolinda,
    float *b, int64_t ldb,
    float beta, float *c, int64_t ldc)
{
  Scsrmm2(transa, transb, m, n, k, nnz,
          alpha, csrvala, csrrowptra, csrcolinda,
          b, ldb,
          beta, c, ldc);
}
|
||||
// double overload of csrmm2: forwards every argument, unchanged and in the
// same order, to Dcsrmm2 so callers can use one name for both precisions.
inline void csrmm2(
    char transa, char transb,
    int64_t m, int64_t n, int64_t k, int64_t nnz,
    double alpha, double *csrvala, int *csrrowptra, int *csrcolinda,
    double *b, int64_t ldb,
    double beta, double *c, int64_t ldc)
{
  Dcsrmm2(transa, transb, m, n, k, nnz,
          alpha, csrvala, csrrowptra, csrcolinda,
          b, ldb,
          beta, c, ldc);
}
|
||||
|
||||
/* format conversion */
|
||||
AT_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
|
||||
AT_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
|
||||
AT_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
|
||||
AT_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
|
||||
AT_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
|
||||
TORCH_CUDA_API void CreateIdentityPermutation(int64_t nnz, int *P);
|
||||
TORCH_CUDA_API void Xcsrsort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, const int *csrColInd, size_t *pBufferSizeInBytes);
|
||||
TORCH_CUDA_API void Xcsrsort(int64_t m, int64_t n, int64_t nnz, const int *csrRowPtr, int *csrColInd, int *P, void *pBuffer);
|
||||
TORCH_CUDA_API void Xcoosort_bufferSizeExt(int64_t m, int64_t n, int64_t nnz, const int *cooRows, const int *cooCols, size_t *pBufferSizeInBytes);
|
||||
TORCH_CUDA_API void XcoosortByRow(int64_t m, int64_t n, int64_t nnz, int *cooRows, int *cooCols, int *P, void *pBuffer);
|
||||
|
||||
}}}} // namespace at::native::sparse::cuda
|
||||
|
@ -5,7 +5,7 @@
|
||||
|
||||
// IPC doesn't support (re)allocation
|
||||
|
||||
class CAFFE2_API THCIpcDeleter {
|
||||
class TORCH_CUDA_API THCIpcDeleter {
|
||||
public:
|
||||
THCIpcDeleter(std::shared_ptr<void> basePtr);
|
||||
~THCIpcDeleter();
|
||||
|
@ -17,8 +17,9 @@
|
||||
# define THC_EXTERNC extern "C"
|
||||
|
||||
// TH & THC are now part of the same library as ATen and Caffe2
|
||||
#define THC_API THC_EXTERNC CAFFE2_API
|
||||
#define THC_CLASS CAFFE2_API
|
||||
// NB: However, we are planning to split it out to a torch_cuda library
|
||||
#define THC_API THC_EXTERNC TORCH_CUDA_API
|
||||
#define THC_CLASS TORCH_CUDA_API
|
||||
|
||||
#ifndef THAssert
|
||||
#define THAssert(exp) \
|
||||
|
@ -91,11 +91,20 @@
|
||||
#define C10_API C10_IMPORT
|
||||
#endif
|
||||
|
||||
// This one is being used by libcaffe2.so
|
||||
// This one is being used by libtorch.so
|
||||
// TODO: rename this to TORCH_API
|
||||
#ifdef CAFFE2_BUILD_MAIN_LIB
|
||||
#define CAFFE2_API C10_EXPORT
|
||||
#else
|
||||
#define CAFFE2_API C10_IMPORT
|
||||
#endif
|
||||
|
||||
// This one will eventually be used by libtorch_cuda.so, but for
|
||||
// now it has the same function as CAFFE2_API
|
||||
#ifdef CAFFE2_BUILD_MAIN_LIB
|
||||
#define TORCH_CUDA_API C10_EXPORT
|
||||
#else
|
||||
#define TORCH_CUDA_API C10_IMPORT
|
||||
#endif
|
||||
|
||||
#endif // C10_MACROS_MACROS_H_
|
||||
|
@ -14,8 +14,7 @@
|
||||
namespace torch {
|
||||
namespace autograd {
|
||||
|
||||
//TODO: change it to TORCH_API when we merge the libs
|
||||
struct AT_CUDA_API Scatter : public Node {
|
||||
struct TORCH_CUDA_API Scatter : public Node {
|
||||
explicit Scatter(
|
||||
std::vector<at::Device> devices,
|
||||
const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
|
||||
@ -34,7 +33,7 @@ struct AT_CUDA_API Scatter : public Node {
|
||||
bool unsqueeze_scalars_;
|
||||
};
|
||||
|
||||
struct AT_CUDA_API Gather : public Node {
|
||||
struct TORCH_CUDA_API Gather : public Node {
|
||||
explicit Gather(const at::Device& destination_device, int64_t dim = 0);
|
||||
~Gather() override;
|
||||
|
||||
|
@ -13,11 +13,11 @@ namespace torch { namespace cuda {
|
||||
|
||||
using tensor_list2d = std::vector<std::vector<at::Tensor>>;
|
||||
|
||||
TORCH_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
|
||||
TORCH_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
|
||||
TORCH_CUDA_API std::vector<at::Tensor> broadcast(const at::Tensor& tensor, at::IntArrayRef devices);
|
||||
TORCH_CUDA_API tensor_list2d broadcast_coalesced(at::TensorList tensors, at::IntArrayRef devices,
|
||||
size_t buffer_size);
|
||||
|
||||
TORCH_API std::vector<at::Tensor> scatter(
|
||||
TORCH_CUDA_API std::vector<at::Tensor> scatter(
|
||||
const at::Tensor& tensor,
|
||||
at::IntArrayRef devices,
|
||||
const c10::optional<std::vector<int64_t>>& chunk_sizes = c10::nullopt,
|
||||
@ -25,7 +25,7 @@ TORCH_API std::vector<at::Tensor> scatter(
|
||||
const c10::optional<std::vector<c10::optional<at::cuda::CUDAStream>>>& streams =
|
||||
c10::nullopt);
|
||||
|
||||
TORCH_API at::Tensor gather(
|
||||
TORCH_CUDA_API at::Tensor gather(
|
||||
at::TensorList tensors,
|
||||
int64_t dim,
|
||||
c10::optional<int32_t> destination_index);
|
||||
|
@ -42,31 +42,31 @@ struct AutoNcclGroup {
|
||||
}
|
||||
};
|
||||
|
||||
TORCH_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
|
||||
TORCH_API void check_inputs(
|
||||
TORCH_CUDA_API at::ArrayRef<ncclComm_t> get_communicators(at::TensorList inputs);
|
||||
TORCH_CUDA_API void check_inputs(
|
||||
at::TensorList inputs,
|
||||
at::TensorList outputs,
|
||||
int input_multiplier,
|
||||
int output_multiplier);
|
||||
TORCH_API ncclDataType_t get_data_type(const at::Tensor& t);
|
||||
TORCH_CUDA_API ncclDataType_t get_data_type(const at::Tensor& t);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
using comm_list = std::vector<ncclComm_t>;
|
||||
using stream_list = std::vector<c10::optional<at::cuda::CUDAStream>>;
|
||||
|
||||
TORCH_API std::uint64_t version();
|
||||
TORCH_CUDA_API std::uint64_t version();
|
||||
|
||||
bool is_available(at::TensorList tensors);
|
||||
|
||||
TORCH_API void broadcast(
|
||||
TORCH_CUDA_API void broadcast(
|
||||
at::TensorList tensors,
|
||||
const stream_list& streams = {},
|
||||
const comm_list& user_comms = {});
|
||||
|
||||
size_t get_max_count();
|
||||
|
||||
TORCH_API void reduce(
|
||||
TORCH_CUDA_API void reduce(
|
||||
const std::vector<at::Tensor>& inputs,
|
||||
std::vector<at::Tensor>& outputs,
|
||||
int32_t root = 0,
|
||||
@ -74,7 +74,7 @@ TORCH_API void reduce(
|
||||
const stream_list& streams = {},
|
||||
const comm_list& user_comms = {});
|
||||
|
||||
TORCH_API void reduce(
|
||||
TORCH_CUDA_API void reduce(
|
||||
std::vector<at::Tensor>& inputs,
|
||||
int32_t root = 0,
|
||||
int32_t op = ncclSum,
|
||||
|
@ -19,7 +19,7 @@ namespace cuda {
|
||||
|
||||
// A class holding metadata for an actual CUDA function.
|
||||
// Note: CUDA functions are per device.
|
||||
struct TORCH_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
|
||||
struct TORCH_CUDA_API FusedKernelCUDA : public ::torch::jit::fuser::FusedKernel {
|
||||
FusedKernelCUDA(
|
||||
int16_t device,
|
||||
std::string name,
|
||||
|
Reference in New Issue
Block a user