Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
More build updates:

(1) nccl and cnmem submodules
(2) MPI ops fallback test
(3) a bit more Blob interface
(4) fixed tests
(5) renamed caffe2.python.io to caffe2.python.dataio, to avoid name conflicts with the standard io module
(6) in the build system, autogenerate __init__.py instead of keeping manual rules whose only job was to copy over an empty __init__.py
.gitmodules (vendored, 6 lines changed)
@@ -1,3 +1,9 @@
 [submodule "third_party/pybind11"]
     path = third_party/pybind11
     url = https://github.com/pybind/pybind11.git
+[submodule "third_party/nccl"]
+    path = third_party/nccl
+    url = https://github.com/nvidia/nccl.git
+[submodule "third_party/cnmem"]
+    path = third_party/cnmem
+    url = https://github.com/nvidia/cnmem.git
Makefile (2 lines changed)
@@ -16,6 +16,6 @@ lint:
     @find caffe2 -type f -exec python brewtool/cpplint.py {} \;
 
 linecount:
-    @cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
+    @cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
     echo "Cloc is not available on the machine. You can install cloc with " && \
     echo " sudo apt-get install cloc"
build.py (1 line changed)
@@ -108,6 +108,7 @@ class Config(object):
         'arch=compute_30,code=sm_30',
         'arch=compute_35,code=sm_35',
         'arch=compute_50,code=sm_50',
+        'arch=compute_61,code=sm_61',
     ]
     # additional CUDA cflags to pass to nvcc.
     CUDA_CFLAGS = []
caffe/BREW:
@@ -1,4 +0,0 @@
-filegroup(
-    name = "caffe_python",
-    srcs = ["__init__.py"],
-)
caffe/proto/BREW:
@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe_proto',
     srcs = ['caffe.proto'],
 )
-
-filegroup(
-    name = "caffe_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe:caffe_python",
-    ]
-)
caffe2/BREW:
@@ -26,7 +26,7 @@ cc_library(
     deps = [
         ":core",
         ":core_gpu_cu",
-        "//third_party/cnmem:cnmem",
+        "//third_party:cnmem",
         "//third_party:cuda",
     ],
     whole_archive = True,
@@ -48,6 +48,7 @@ cc_test(
         excludes=["*gpu_test*"]),
     deps = [
         ":core",
         "//caffe2/operators:core_ops",
         "//third_party:gtest",
+        "//caffe2/test:caffe2_gtest_main",
     ],
@@ -63,11 +64,6 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "caffe2_python",
-    srcs = ["__init__.py"],
-)
-
 cc_library(
     name = "all_available_ops",
     srcs = [],
@@ -79,6 +75,7 @@ cc_library(
     optional_deps = [
         "//caffe2/operators:core_ops_gpu",
         "//caffe2/operators:core_ops_cudnn",
+        "//caffe2/contrib/nccl:nccl_ops",
         "//caffe2/cuda_rtc:rtc_ops",
         "//caffe2/db:db_gpu",
         "//caffe2/image:image_ops",
caffe2/__init__.py:
@@ -1,5 +0,0 @@
-"""
-Caffe2: A General Tool for Neural Networks.
-"""
-
-__author__ = 'Yangqing Jia'
caffe2/contrib/nccl/BREW (new file, 10 lines)
@@ -0,0 +1,10 @@
+cc_library(
+    name = "nccl_ops",
+    srcs = Glob(["*.cc"]),
+    hdrs = Glob(["*.h"]),
+    deps = [
+        "//caffe2:core_gpu",
+        "//third_party:nccl",
+    ],
+    whole_archive = True,
+)
caffe2/core/blob.h:
@@ -58,6 +58,9 @@ class Blob {
     return *static_cast<const T*>(pointer_);
   }
 
+  const void* GetRaw() const { return pointer_; }
+  void* GetRaw() { return pointer_; }
+
   /**
    * @brief Gets a mutable pointer to the stored object.
    *
@@ -73,6 +76,7 @@ class Blob {
       return static_cast<T*>(pointer_);
     } else {
       if (is_new_object) *is_new_object = true;
+      VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
       return Reset<T>(new T());
     }
   }
@@ -87,28 +91,53 @@ class Blob {
    */
   template <class T>
   T* Reset(T* allocated) {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
+    VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
     meta_ = TypeMeta::Make<T>();
     pointer_ = static_cast<void*>(allocated);
     destroy_ = &Destroy<T>;
     return allocated;
   }
 
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * Unlike Reset, this does not take over the ownership of the pointer and the
+   * caller is responsible for making sure that the lifetime of the allocated
+   * blob outlasts the lifetime of any access to this blob, until another Reset
+   * call is made or the blob is destructed.
+   */
+  template <class T>
+  typename std::remove_const<T>::type* ShareExternal(
+      typename std::remove_const<T>::type* allocated) {
+    return static_cast<T*>(
+        ShareExternal(static_cast<void*>(allocated),
+                      TypeMeta::Make<typename std::remove_const<T>::type>()));
+  }
+
+  void* ShareExternal(void* allocated, const TypeMeta& meta) {
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
+    }
+    meta_ = meta;
+    pointer_ = static_cast<void*>(allocated);
+    destroy_ = nullptr;
+    return allocated;
+  }
+
   /**
    * Resets the Blob to an empty one.
    */
   inline void Reset() {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
-      pointer_ = nullptr;
-      meta_ = TypeMeta();
-      destroy_ = nullptr;
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
+    pointer_ = nullptr;
+    meta_ = TypeMeta();
+    destroy_ = nullptr;
   }
 
   /**
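A note on the new Blob interface above: Reset() takes ownership and installs a typed destructor, while ShareExternal() leaves destroy_ as nullptr so the Blob never frees memory it does not own. A minimal sketch of the intended usage (Payload is a made-up type; only the Blob calls mirror the header above):

#include "caffe2/core/blob.h"

using caffe2::Blob;

struct Payload { int value = 42; };  // hypothetical user type, not in the commit

void OwnershipSketch() {
  Blob owned, shared;

  // Reset takes ownership: the Blob will delete this pointer when it is
  // Reset again or destructed (destroy_ = &Destroy<Payload>).
  Payload* heap_obj = new Payload();
  owned.Reset(heap_obj);

  // ShareExternal does NOT take ownership: destroy_ stays nullptr, so the
  // Blob never frees stack_obj. The caller must keep it alive for as long
  // as the Blob may be read.
  Payload stack_obj;
  shared.ShareExternal<Payload>(&stack_obj);

  // Both blobs now answer Get<Payload>() / IsType<Payload>() identically;
  // only their deallocation behavior differs.
}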
caffe2/core/blob_test.cc:
@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
   ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
 }
 
+TEST(BlobTest, BlobReset) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalPointer) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalObject) {
+  Blob blob;
+  Foo foo;
+  EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
 TEST(BlobTest, StringSerialization) {
   const std::string kTestString = "Hello world?";
   Blob blob;
@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
       "DUMMY_ENGINE");
   Workspace ws;
   auto load_op = CreateOperator(op_def, &ws);
+  EXPECT_TRUE(load_op != nullptr);
   LOG(INFO) << "Running operator";
 
   load_op->Run();
 
caffe2/core/context_gpu_test.cc:
@@ -1,3 +1,4 @@
+#include <chrono>
 #include <future>
 #include <random>
 #include <thread>
@@ -55,6 +56,8 @@ namespace {
 void TEST_GetStreamAddress(cudaStream_t* ptr) {
   CUDAContext context(0);
   *ptr = context.cuda_stream();
+  // Sleep for a while so we have concurrent thread executions
+  std::this_thread::sleep_for(std::chrono::seconds(1));
 }
 }  // namespace
 
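The added sleep keeps several threads inside TEST_GetStreamAddress at the same time, so a stream-per-thread property is actually exercised under concurrency. A sketch of how such a check could be written with std::async; the test name and the EXPECT_NE assertion are illustrative assumptions, not copied from the file:

#include <future>

// Sketch only: assumes the gtest fixture and helper above are available.
TEST(CUDAContextTest, StreamsAreThreadLocalSketch) {
  cudaStream_t s1, s2;
  // Launch the helper on two threads; the sleep guarantees they overlap.
  auto f1 = std::async(std::launch::async, TEST_GetStreamAddress, &s1);
  auto f2 = std::async(std::launch::async, TEST_GetStreamAddress, &s2);
  f1.get();
  f2.get();
  // Assumed property: each thread's CUDAContext owns a distinct stream.
  EXPECT_NE(s1, s2);
}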
caffe2/mpi/mpi_ops.h:
@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
+    CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
+                  "Output is of wrong type.");
     auto* output = Output(0);
     // Make sure that output is already allocated.
-    CHECK_GT(output->size(), 0);
+    CAFFE_ENFORCE(output->size() > 0,
+                  "Broadcast op uses in-place operation so the output "
+                  "should be already allocated.");
     MPI_CHECK(MPI_Bcast(
         output->raw_mutable_data(),
         output->nbytes(),
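The new enforce message documents an MPI invariant: MPI_Bcast is an in-place collective that writes the root's bytes into the same pre-allocated buffer on every rank, so a zero-sized output has nowhere to receive. A standalone illustration in plain MPI (buffer size and root rank chosen arbitrarily):

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Every rank must pre-allocate the same-sized buffer: MPI_Bcast is
  // in-place, overwriting the receivers' buffers with the root's data.
  std::vector<float> buffer(16, rank == 0 ? 1.0f : 0.0f);
  MPI_Bcast(buffer.data(), static_cast<int>(buffer.size()), MPI_FLOAT,
            /*root=*/0, MPI_COMM_WORLD);
  // After the call, buffer holds 1.0f on all ranks.

  MPI_Finalize();
  return 0;
}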
caffe2/operators/operator_fallback_gpu.h:
@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
 
   bool RunOnDevice() override {
     for (int i = 0; i < InputSize(); ++i) {
+      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
         local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
             Input(i), &context_);
+      } else {
+        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
+        // Note(jiayq): This removes a const but conceptually
+        // local_input_blobs will only be used as const blob input for the
+        // base op so we are still fine.
+        local_input_blobs_[i]->ShareExternal(
+            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
+            OperatorBase::Inputs()[i]->meta());
+      }
     }
     // Sync to make sure copies are done.
     context_.FinishDeviceComputation();
@@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
       return false;
     }
     for (int i = 0; i < OutputSize(); ++i) {
+      CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
+                    "GPU fallback op currently does not support non-TensorCPU "
+                    "output type.");
       Output(i)->CopyFrom(
           local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
     }
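This is where the new GetRaw/ShareExternal Blob methods pay off: inputs that are not TensorCUDA are handed to the wrapped CPU op without a copy. The copy-in, run, copy-out shape of the fallback is sketched below with plain standard C++ stand-ins (no real CUDA or Caffe2 types; everything here is a placeholder):

#include <cstring>
#include <vector>

// Host-side stand-in for a tensor.
struct HostTensor { std::vector<float> data; };

// Placeholder for the wrapped CPU operator implementation.
void RunCpuOp(const HostTensor& in, HostTensor* out) {
  out->data = in.data;  // identity op, for illustration only
}

// Fallback pattern: 1) copy device inputs to host, 2) run the CPU op,
// 3) copy host outputs back to the device. The memcpys stand in for
// device-to-host and host-to-device transfers.
bool FallbackRun(const float* device_in, float* device_out, size_t n) {
  HostTensor in, out;
  in.data.resize(n);
  std::memcpy(in.data.data(), device_in, n * sizeof(float));
  RunCpuOp(in, &out);
  std::memcpy(device_out, out.data.data(), n * sizeof(float));
  return true;
}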
caffe2/proto/BREW:
@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe2_proto',
     srcs = Glob(['*.proto']),
 )
-
-filegroup(
-    name = "caffe2_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe2:caffe2_python",
-    ]
-)
caffe2/python/BREW:
@@ -39,8 +39,8 @@ py_library(
     srcs=Glob(["*.py"], excludes=["*_test.py"]),
     deps=[
         ":caffe2_python_cpu",
-        "//caffe/proto:caffe_proto_py",
-        "//caffe2/proto:caffe2_proto_py",
+        "//caffe/proto:caffe_proto",
+        "//caffe2/proto:caffe2_proto",
         "//caffe2/python/mint:mint",
     ],
     optional_deps=[
caffe2/python/__init__.py:
@@ -1,4 +0,0 @@
-import atexit
-
-from . import core, utils, workspace
-from caffe2.proto import caffe2_pb2
@@ -14,7 +14,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 from caffe2.python import core, workspace
-from caffe2.python.io import Reader, Writer
+from caffe2.python.dataio import Reader, Writer
 from caffe2.python.schema import Struct
 import numpy as np
 
caffe2/python/mint/BREW:
@@ -1,7 +1,6 @@
 py_library(
     name = "mint",
     srcs = [
-        "__init__.py",
         "app.py",
         "static/css/simple-sidebar.css",
         "templates/index.html",
third_party/BREW (vendored, 47 lines changed)
@@ -92,10 +92,49 @@ cc_thirdparty_target(
     ],
 )
 
-cc_thirdparty_target(
-    name="cnmen",
-    deps=["//third_party/cnmem:cnmem"],
-    cc_obj_files = [],
-)
+shell_script(
+    name = "cnmem_header",
+    srcs = ["cnmem/include/cnmem.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
+    ],
+)
+
+cc_library(
+    name = "cnmem",
+    srcs = [
+        "cnmem/src/cnmem.cpp",
+    ],
+    deps = [
+        ":cnmem_header",
+        ":cuda",
+    ]
+)
+
+shell_script(
+    name = "nccl_header",
+    srcs = ["nccl/src/nccl.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
+    ],
+)
+
+cuda_library(
+    name = "nccl",
+    srcs = Glob(["nccl/src/*.cu"]),
+    deps = [
+        ":nccl_header",
+        ":cuda",
+    ],
+    compiler_flags=[
+        "-Wno-switch",  # NCCL does not follow strict switch enum check.
+        "-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
+        "-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
+    ],
+)
+
 ###############################################################################
 
third_party/cnmem (new submodule, vendored): added at 28a182d495
third_party/cnmem/BREW (vendored, 24 lines deleted)
@@ -1,24 +0,0 @@
-# We need to copy over the header to the right folder.
-shell_script(
-    name = "cnmem_header",
-    srcs = ["cnmem.h"],
-    commands=[
-        "DST=$CAFFE2_GENDIR/third_party/include/",
-        "mkdir -p $DST",
-        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/",
-    ],
-)
-
-cuda_library(
-    name = "cnmem",
-    srcs = [
-        "cnmem.cpp",
-    ],
-    hdrs = [
-        "cnmem.h",
-    ],
-    deps = [
-        "cnmem_header",
-        "//third_party:cuda",
-    ]
-)
third_party/cnmem/cnmem.cpp (vendored, 1287 lines deleted): file diff suppressed because it is too large
third_party/cnmem/cnmem.h (vendored, 263 lines deleted)
@@ -1,263 +0,0 @@
-/* **********************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * ********************************************************************** */
-#pragma once
-
-#ifdef __cplusplus
-#include "cstdio"
-#else
-#include "stdio.h"
-#endif
-#include "cuda_runtime_api.h"
-
-#if defined(_MSC_VER) || defined(WIN32)
-#ifdef CNMEM_DLLEXPORT
-#define CNMEM_API __declspec(dllexport)
-#else
-#define CNMEM_API __declspec(dllimport)
-#endif
-#else
-#ifdef CNMEM_DLLEXPORT
-#define CNMEM_API __attribute__((visibility ("default")))
-#else
-#define CNMEM_API
-#endif
-#endif
-
-#define CNMEM_VERSION 100 // It corresponds to 1.0.0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* ********************************************************************************************* */
-
-typedef enum
-{
-    CNMEM_STATUS_SUCCESS = 0,
-    CNMEM_STATUS_CUDA_ERROR,
-    CNMEM_STATUS_INVALID_ARGUMENT,
-    CNMEM_STATUS_NOT_INITIALIZED,
-    CNMEM_STATUS_OUT_OF_MEMORY,
-    CNMEM_STATUS_UNKNOWN_ERROR
-} cnmemStatus_t;
-
-/* ********************************************************************************************* */
-
-typedef enum
-{
-    CNMEM_FLAGS_DEFAULT = 0,      /// Default flags.
-    CNMEM_FLAGS_CANNOT_GROW = 1,  /// Prevent the manager from growing its memory consumption.
-    CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
-} cnmemManagerFlags_t;
-
-/* ********************************************************************************************* */
-
-typedef struct cnmemDevice_t_
-{
-    /** The device number. */
-    int device;
-    /** The size to allocate for that device. If 0, the implementation chooses the size. */
-    size_t size;
-    /** The number of named streams associated with the device. The NULL stream is not counted. */
-    int numStreams;
-    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
-    cudaStream_t *streams;
-    /** The size reserved for each streams. It can be 0. */
-    size_t *streamSizes;
-
-} cnmemDevice_t;
-
-/**
- * \brief Initialize the library and allocate memory on the listed devices.
- *
- * For each device, an internal memory manager is created and the specified amount of memory is
- * allocated (it is the size defined in device[i].size). For each, named stream an additional
- * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
- * manager for the device and a list of children, one for each named stream.
- *
- * This function must be called before any other function in the library. It has to be called
- * by a single thread since it is not thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
- * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
- */
-cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
-
-/**
- * \brief Release all the allocated memory.
- *
- * This function must be called by a single thread and after all threads that called
- * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemFinalize();
-
-/**
- * \brief Increase the internal reference counter of the context object.
- *
- * This function increases the internal reference counter of the library. The purpose of that
- * reference counting mechanism is to give more control to the user over the lifetime of the
- * library. It is useful with scoped memory allocation which may be destroyed in a final
- * memory collection after the end of main(). That function is thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- */
-cnmemStatus_t CNMEM_API cnmemRetain();
-
-/**
- * \brief Decrease the internal reference counter of the context object.
- *
- * This function decreases the internal reference counter of the library. The purpose of that
- * reference counting mechanism is to give more control to the user over the lifetime of the
- * library. It is useful with scoped memory allocation which may be destroyed in a final
- * memory collection after the end of main(). That function is thread-safe.
- *
- * You can use \c cnmemRelease to explicitly finalize the library.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- */
-cnmemStatus_t CNMEM_API cnmemRelease();
-
-/**
- * \brief Add a new stream to the pool of managed streams on a device.
- *
- * This function registers a new stream into a device memory manager. It is thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
- */
-cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
-
-/**
- * \brief Allocate memory.
- *
- * This function allocates memory and initializes a pointer to device memory. If no memory
- * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
- *
- * The behavior of that function is the following:
- *
- * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
- *   memory. If there's a buffer of size larger or equal to the requested size in the list of
- *   free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
- *   its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
- *   cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
- *   allowed to grow, the manager attempts to steal memory from one of its children (unless
- *   CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
- *   CNMEM_STATUS_OUT_OF_MEMORY.
- *
- * - If the stream is a named stream, the initial request goes to the memory manager associated
- *   with that stream. If a free node is available in the lists of that manager, it is returned.
- *   Otherwise, the request is passed to the root node and works as if the request were made on
- *   the NULL stream.
- *
- * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
- * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
- * make sure no kernel uses a given buffer before stealing it) and it the execution is
- * sequential (in a multi-threaded context, the code is executed in a critical section inside
- * the cnmem library - no need for the user to wrap cnmemMalloc with locks).
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
- * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
-
-/**
- * \brief Release memory.
- *
- * This function releases memory and recycles a memory block in the manager. This function is
- * thread safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
-
-/* ********************************************************************************************* */
-/* Utility functions. */
-/* ********************************************************************************************* */
-
-/**
- * \brief Returns the amount of memory managed by the memory manager associated with a stream.
- *
- * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
- * xity linear in the number of allocated blocks so do not call it in performance critical
- * sections.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
-
-/**
- * \brief Print a list of nodes to a file.
- *
- * This function is intended to be used in case of complex scenarios to help understand the
- * behaviour of the memory managers/application. It is thread safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
- * or free_mem == 0,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
-
-/**
- * \brief Converts a cnmemStatus_t value to a string.
- */
-const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
-
-/* ********************************************************************************************* */
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
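Although the vendored copy of cnmem.h is deleted here in favor of the submodule, the documented API is unchanged. A minimal init/allocate/free round trip following the doc comments above (pool size 0 lets cnmem choose the size; error handling abbreviated):

#include <cstdio>
#include "cnmem.h"

int main() {
  // Describe device 0. size = 0 lets the implementation pick the pool
  // size; with numStreams = 0 only the NULL stream is managed.
  cnmemDevice_t device;
  device.device = 0;
  device.size = 0;
  device.numStreams = 0;
  device.streams = NULL;
  device.streamSizes = NULL;

  cnmemStatus_t status = cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);
  if (status != CNMEM_STATUS_SUCCESS) {
    std::printf("cnmemInit failed: %s\n", cnmemGetErrorString(status));
    return 1;
  }

  // Allocate and release one buffer on the NULL stream; the freed block
  // is recycled by the manager rather than returned to the driver.
  void* ptr = NULL;
  cnmemMalloc(&ptr, 1024, /*stream=*/NULL);
  cnmemFree(ptr, /*stream=*/NULL);

  cnmemFinalize();
  return 0;
}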
third_party/nccl (new submodule, vendored): added at b3a9e1333d