more build updates:

(1) Add nccl and cnmem as submodules.
(2) Add an MPI ops fallback test.
(3) A bit more Blob interface (GetRaw, ShareExternal).
(4) Fix tests.
(5) Rename caffe2.python.io to caffe2.python.dataio to avoid name conflicts.
(6) In the build system, autogenerate __init__.py instead of keeping manual
rules just to copy over an empty __init__.py.
Yangqing Jia
2016-07-22 23:58:24 -07:00
parent b2c2d0b70c
commit 1ede7a7ff0
29 changed files with 156 additions and 1629 deletions

6
.gitmodules vendored
View File

@@ -1,3 +1,9 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl.git
[submodule "third_party/cnmem"]
path = third_party/cnmem
url = https://github.com/nvidia/cnmem.git

View File

@@ -16,6 +16,6 @@ lint:
@find caffe2 -type f -exec python brewtool/cpplint.py {} \;
linecount:
@cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
@cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
echo "Cloc is not available on the machine. You can install cloc with " && \
echo " sudo apt-get install cloc"

View File

@@ -108,6 +108,7 @@ class Config(object):
'arch=compute_30,code=sm_30',
'arch=compute_35,code=sm_35',
'arch=compute_50,code=sm_50',
'arch=compute_61,code=sm_61',
]
# additional CUDA cflags to pass to nvcc.
CUDA_CFLAGS = []

View File

@@ -1,4 +0,0 @@
filegroup(
name = "caffe_python",
srcs = ["__init__.py"],
)

View File

View File

@@ -4,11 +4,3 @@ proto_library(
name = 'caffe_proto',
srcs = ['caffe.proto'],
)
filegroup(
name = "caffe_proto_py",
srcs = ["__init__.py"],
deps = [
"//caffe:caffe_python",
]
)

View File

@@ -26,7 +26,7 @@ cc_library(
deps = [
":core",
":core_gpu_cu",
"//third_party/cnmem:cnmem",
"//third_party:cnmem",
"//third_party:cuda",
],
whole_archive = True,
@@ -48,6 +48,7 @@ cc_test(
excludes=["*gpu_test*"]),
deps = [
":core",
"//caffe2/operators:core_ops",
"//third_party:gtest",
"//caffe2/test:caffe2_gtest_main",
],
@@ -63,11 +64,6 @@ cc_test(
],
)
filegroup(
name = "caffe2_python",
srcs = ["__init__.py"],
)
cc_library(
name = "all_available_ops",
srcs = [],
@@ -79,6 +75,7 @@ cc_library(
optional_deps = [
"//caffe2/operators:core_ops_gpu",
"//caffe2/operators:core_ops_cudnn",
"//caffe2/contrib/nccl:nccl_ops",
"//caffe2/cuda_rtc:rtc_ops",
"//caffe2/db:db_gpu",
"//caffe2/image:image_ops",

View File

@@ -1,5 +0,0 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

10
caffe2/contrib/nccl/BREW Normal file
View File

@@ -0,0 +1,10 @@
cc_library(
name = "nccl_ops",
srcs = Glob(["*.cc"]),
hdrs = Glob(["*.h"]),
deps = [
"//caffe2:core_gpu",
"//third_party:nccl",
],
whole_archive = True,
)

View File

@@ -58,6 +58,9 @@ class Blob {
return *static_cast<const T*>(pointer_);
}
const void* GetRaw() const { return pointer_; }
void* GetRaw() { return pointer_; }
/**
* @brief Gets a mutable pointer to the stored object.
*
@@ -73,6 +76,7 @@
return static_cast<T*>(pointer_);
} else {
if (is_new_object) *is_new_object = true;
VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
return Reset<T>(new T());
}
}
@@ -87,28 +91,53 @@
*/
template <class T>
T* Reset(T* allocated) {
if (pointer_) {
CHECK_NOTNULL(destroy_)(pointer_);
if (pointer_ && destroy_) {
destroy_(pointer_);
}
VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
meta_ = TypeMeta::Make<T>();
pointer_ = static_cast<void*>(allocated);
destroy_ = &Destroy<T>;
return allocated;
}
/**
* Sets the underlying object to the allocated one, but does not take over
* the ownership of the passed in pointer. If there is already an object in
* the Blob, the old object is freed.
*
* Unlike Reset, this does not take over the ownership of the pointer and the
* caller is responsible for making sure that the lifetime of the allocated
* object outlasts the lifetime of any access to this blob, until another Reset
* call is made or the blob is destructed.
*/
template <class T>
typename std::remove_const<T>::type* ShareExternal(
typename std::remove_const<T>::type* allocated) {
return static_cast<T*>(
ShareExternal(static_cast<void*>(allocated),
TypeMeta::Make<typename std::remove_const<T>::type>()));
}
void* ShareExternal(void* allocated, const TypeMeta& meta) {
if (pointer_ && destroy_) {
destroy_(pointer_);
}
meta_ = meta;
pointer_ = static_cast<void*>(allocated);
destroy_ = nullptr;
return allocated;
}
/**
* Resets the Blob to an empty one.
*/
inline void Reset() {
if (pointer_) {
CHECK_NOTNULL(destroy_)(pointer_);
pointer_ = nullptr;
meta_ = TypeMeta();
destroy_ = nullptr;
if (pointer_ && destroy_) {
destroy_(pointer_);
}
pointer_ = nullptr;
meta_ = TypeMeta();
destroy_ = nullptr;
}
/**

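For context, a minimal usage sketch of the Reset/ShareExternal semantics documented above; Foo stands in for any default-constructible type and the snippet is illustrative only:

Blob blob;
// Reset takes ownership: the Blob calls the registered destructor on the
// stored pointer at the next Reset/ShareExternal or at destruction.
blob.Reset(new Foo());
// ShareExternal frees the previously owned Foo, then points at `external`
// without taking ownership; the caller must keep `external` alive until the
// Blob stops referring to it.
Foo external;
blob.ShareExternal<Foo>(&external);
CHECK(blob.IsType<Foo>());
// Reset() clears the Blob; destroy_ is nullptr for shared objects, so
// `external` is left untouched.
blob.Reset();
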
View File

@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
}
TEST(BlobTest, BlobReset) {
Blob blob;
std::unique_ptr<Foo> foo(new Foo());
EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
// Also test that Reset works.
blob.Reset();
}
TEST(BlobTest, BlobShareExternalPointer) {
Blob blob;
std::unique_ptr<Foo> foo(new Foo());
EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
EXPECT_TRUE(blob.IsType<Foo>());
// Also test that Reset works.
blob.Reset();
}
TEST(BlobTest, BlobShareExternalObject) {
Blob blob;
Foo foo;
EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
EXPECT_TRUE(blob.IsType<Foo>());
// Also test that Reset works.
blob.Reset();
}
TEST(BlobTest, StringSerialization) {
const std::string kTestString = "Hello world?";
Blob blob;
@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
"DUMMY_ENGINE");
Workspace ws;
auto load_op = CreateOperator(op_def, &ws);
EXPECT_TRUE(load_op != nullptr);
LOG(INFO) << "Running operator";
load_op->Run();

View File

@@ -1,3 +1,4 @@
#include <chrono>
#include <future>
#include <random>
#include <thread>
@@ -55,6 +56,8 @@ namespace {
void TEST_GetStreamAddress(cudaStream_t* ptr) {
CUDAContext context(0);
*ptr = context.cuda_stream();
// Sleep for a while so we have concurrent thread executions
std::this_thread::sleep_for(std::chrono::seconds(1));
}
} // namespace

View File

@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {
bool RunOnDevice() override {
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
"Output is of wrong type.");
auto* output = Output(0);
// Make sure that output is already allocated.
CHECK_GT(output->size(), 0);
CAFFE_ENFORCE(output->size() > 0,
"Broadcast op uses in-place operation so the output "
"should be already allocated.");
MPI_CHECK(MPI_Bcast(
output->raw_mutable_data(),
output->nbytes(),

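The enforce above encodes the contract that MPIBroadcast runs in place, so every rank must have the output tensor allocated before the op executes. A hedged sketch of that pre-allocation, with illustrative blob name and size:

Workspace ws;
// Pre-allocate the tensor that MPIBroadcast will overwrite in place.
auto* tensor = ws.CreateBlob("X")->GetMutable<TensorCPU>();
tensor->Resize(16);
tensor->mutable_data<float>();  // make sure memory is actually allocated
// ... then run the MPIBroadcast operator with "X" as its in-place tensor
// blob, alongside the common-world input shown above.
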
View File

@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
bool RunOnDevice() override {
for (int i = 0; i < InputSize(); ++i) {
if (OperatorBase::InputIsType<TensorCUDA>(i)) {
local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
Input(i), &context_);
} else {
VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
// Note(jiayq): This removes a const but conceptually
// local_input_blobs will only be used as const blob input for the
// base op so we are still fine.
local_input_blobs_[i]->ShareExternal(
const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
OperatorBase::Inputs()[i]->meta());
}
}
// Sync to make sure copies are done.
context_.FinishDeviceComputation();
@@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
return false;
}
for (int i = 0; i < OutputSize(); ++i) {
CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
"GPU fallback op currently does not support non-TensorCPU "
"output type.");
Output(i)->CopyFrom(
local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
}

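For illustration, a hedged sketch of how a CPU-only operator might be exposed on CUDA devices through this wrapper; MyCPUOnlyOp is a hypothetical operator class, and the template-argument usage is assumed from the wrapper's design rather than shown in this hunk:

// Hypothetical registration: TensorCUDA inputs are copied to CPU, CPU blobs
// are shared zero-copy via ShareExternal, the wrapped CPU op runs, and its
// TensorCPU outputs are copied back to the CUDA outputs.
REGISTER_CUDA_OPERATOR(MyCPUOnly, GPUFallbackOp<MyCPUOnlyOp>);
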
View File

@@ -4,11 +4,3 @@ proto_library(
name = 'caffe2_proto',
srcs = Glob(['*.proto']),
)
filegroup(
name = "caffe2_proto_py",
srcs = ["__init__.py"],
deps = [
"//caffe2:caffe2_python",
]
)

View File

@@ -39,8 +39,8 @@ py_library(
srcs=Glob(["*.py"], excludes=["*_test.py"]),
deps=[
":caffe2_python_cpu",
"//caffe/proto:caffe_proto_py",
"//caffe2/proto:caffe2_proto_py",
"//caffe/proto:caffe_proto",
"//caffe2/proto:caffe2_proto",
"//caffe2/python/mint:mint",
],
optional_deps=[

View File

@@ -1,4 +0,0 @@
import atexit
from . import core, utils, workspace
from caffe2.proto import caffe2_pb2

View File

@@ -14,7 +14,7 @@ from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.io import Reader, Writer
from caffe2.python.dataio import Reader, Writer
from caffe2.python.schema import Struct
import numpy as np

View File

@@ -1,7 +1,6 @@
py_library(
name = "mint",
srcs = [
"__init__.py",
"app.py",
"static/css/simple-sidebar.css",
"templates/index.html",

47
third_party/BREW vendored
View File

@@ -92,10 +92,49 @@ cc_thirdparty_target(
],
)
cc_thirdparty_target(
name="cnmen",
deps=["//third_party/cnmem:cnmem"],
cc_obj_files = [],
shell_script(
name = "cnmem_header",
srcs = ["cnmem/include/cnmem.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
],
)
cc_library(
name = "cnmem",
srcs = [
"cnmem/src/cnmem.cpp",
],
deps = [
":cnmem_header",
":cuda",
]
)
shell_script(
name = "nccl_header",
srcs = ["nccl/src/nccl.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
],
)
cuda_library(
name = "nccl",
srcs = Glob(["nccl/src/*.cu"]),
deps = [
":nccl_header",
":cuda",
],
compiler_flags=[
"-Wno-switch", # NCCL does not follow strict switch enum check.
"-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
"-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
],
)
###############################################################################

1
third_party/cnmem vendored Submodule

Submodule third_party/cnmem added at 28a182d495

View File

@@ -1,24 +0,0 @@
# We need to copy over the header to the right folder.
shell_script(
name = "cnmem_header",
srcs = ["cnmem.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/",
],
)
cuda_library(
name = "cnmem",
srcs = [
"cnmem.cpp",
],
hdrs = [
"cnmem.h",
],
deps = [
"cnmem_header",
"//third_party:cuda",
]
)

File diff suppressed because it is too large.

View File

@@ -1,263 +0,0 @@
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
typedef enum
{
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
/** The size to allocate for that device. If 0, the implementation chooses the size. */
size_t size;
/** The number of named streams associated with the device. The NULL stream is not counted. */
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The size reserved for each stream. It can be 0. */
size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each named stream, an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
* The pointers totalMem and freeMem must be valid. At the moment, this function has a
* complexity linear in the number of allocated blocks so do not call it in performance
* critical sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, used_mem == 0
* or free_mem == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
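
For reference, a minimal usage sketch of the cnmem API declared above; the device id, pool size, and CHECK_EQ-style error handling are illustrative only:

cnmemDevice_t device = {};  // zero-initialize all fields
device.device = 0;          // manage GPU 0
device.size = 0;            // 0 lets the implementation choose the pool size
device.numStreams = 0;      // only the NULL stream is managed
CHECK_EQ(cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT), CNMEM_STATUS_SUCCESS);
void* ptr = nullptr;
CHECK_EQ(cnmemMalloc(&ptr, 1024, /* stream = */ NULL), CNMEM_STATUS_SUCCESS);
CHECK_EQ(cnmemFree(ptr, /* stream = */ NULL), CNMEM_STATUS_SUCCESS);
CHECK_EQ(cnmemFinalize(), CNMEM_STATUS_SUCCESS);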

1
third_party/nccl vendored Submodule

Submodule third_party/nccl added at b3a9e1333d